In [4]:
#importing dependencies

from sklearn import tree
import pandas as pd
import os
import matplotlib.pyplot as plt

In [5]:
#Loading CSV

# df = pd.read_csv(os.path.join("..", "Resources", "tennis_data.csv"))
df = pd.read_csv(os.path.join("Resources", "tennis_data.csv"))

df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ATP,Location,Tournament,Date,Series,Court,Surface,Round,Best of,Winner,...,pl1_flag,pl1_year_pro,pl1_weight,pl1_height,pl1_hand,pl2_flag,pl2_year_pro,pl2_weight,pl2_height,pl2_hand
0,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Kwon S.W.,...,KOR,2015.0,72.0,180.0,Right-Handed,JPN,2014.0,64.0,170.0,Left-Handed
1,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Monteiro T.,...,BRA,2011.0,78.0,183.0,Left-Handed,GER,2014.0,80.0,188.0,Right-Handed
2,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Djere L.,...,SRB,2013.0,80.0,185.0,Right-Handed,ESP,2011.0,76.0,180.0,Right-Handed
3,1,Adelaide,Adelaide International 1,2022-01-03,ATP250,Outdoor,Hard,1st Round,3,Johnson S.,...,USA,2012.0,86.0,188.0,Right-Handed,AUS,2018.0,85.0,188.0,Right-Handed
4,1,Adelaide,Adelaide International 1,2022-01-04,ATP250,Outdoor,Hard,1st Round,3,Moutet C.,...,FRA,2016.0,71.0,175.0,Left-Handed,DEN,2020.0,77.0,188.0,Right-Handed


In [6]:
# dataframe 1 : percentage of games won per player

df_winner = df.groupby(by=["Winner"]).count().rename(columns={"ATP":"games_played"}).reset_index()[["Winner","games_played"]]
df_loser = df.groupby(by=["Loser"]).count().rename(columns={"ATP":"games_played"}).reset_index()[["Loser","games_played"]]
df_games = pd.merge(df_winner, df_loser, left_on="Winner", right_on="Loser", how="inner", suffixes=("_win", "_lose"))
df_games["games_played"] = df_games["games_played_win"] + df_games["games_played_lose"]
df_games = df_games.rename(columns={"Winner":"player_name"})[["player_name", "games_played","games_played_win"]]
df_games["win_percentage"] = df_games["games_played_win"]/df_games["games_played"]
df_games.head()

Unnamed: 0,player_name,games_played,games_played_win,win_percentage
0,Acasuso J.,99,50,0.505051
1,Ahouda A.,4,1,0.25
2,Ajdukovic D.,2,1,0.5
3,Albot R.,184,73,0.396739
4,Alcaraz C.,49,30,0.612245


In [7]:
# dataframe 2 : player statistics for each player

df_player_stats = df.groupby(by=["Winner"]).first().reset_index().rename(columns={
    "Winner":"player_name", 
    "pl1_weight":"weight",
    "pl1_height": "height",
    "pl1_hand": "hand"})[["player_name","weight","height","hand"]]
df_player_stats.head()

Unnamed: 0,player_name,weight,height,hand
0,Acasuso J.,86.0,191.0,Right-Handed
1,Ahouda A.,76.0,185.0,Right-Handed
2,Ajdukovic D.,77.0,185.0,Right-Handed
3,Albot R.,69.0,175.0,Right-Handed
4,Alcaraz C.,72.0,185.0,Right-Handed


In [8]:
# dataframe merged : merge of df_games, df_player_stats on player name
df_merged = pd.merge(df_games, df_player_stats, on="player_name", how="inner")
df_merged.head()

Unnamed: 0,player_name,games_played,games_played_win,win_percentage,weight,height,hand
0,Acasuso J.,99,50,0.505051,86.0,191.0,Right-Handed
1,Ahouda A.,4,1,0.25,76.0,185.0,Right-Handed
2,Ajdukovic D.,2,1,0.5,77.0,185.0,Right-Handed
3,Albot R.,184,73,0.396739,69.0,175.0,Right-Handed
4,Alcaraz C.,49,30,0.612245,72.0,185.0,Right-Handed


In [9]:
#Cleaning Data Frame

df_merged = df_merged.dropna()
df_merged


Unnamed: 0,player_name,games_played,games_played_win,win_percentage,weight,height,hand
0,Acasuso J.,99,50,0.505051,86.0,191.0,Right-Handed
1,Ahouda A.,4,1,0.250000,76.0,185.0,Right-Handed
2,Ajdukovic D.,2,1,0.500000,77.0,185.0,Right-Handed
3,Albot R.,184,73,0.396739,69.0,175.0,Right-Handed
4,Alcaraz C.,49,30,0.612245,72.0,185.0,Right-Handed
...,...,...,...,...,...,...,...
657,Zopp J.,47,16,0.340426,88.0,191.0,Right-Handed
658,Zovko L.,5,2,0.400000,80.0,191.0,Right-Handed
659,Zverev A.,430,297,0.690698,86.0,198.0,Right-Handed
660,Zverev M.,313,125,0.399361,88.0,191.0,Left-Handed


# Pre-processing

In [10]:
df_clean = pd.get_dummies(df_merged, columns=["hand"])
df_clean.tail()

Unnamed: 0,player_name,games_played,games_played_win,win_percentage,weight,height,hand_Left-Handed,hand_Right-Handed
657,Zopp J.,47,16,0.340426,88.0,191.0,0,1
658,Zovko L.,5,2,0.4,80.0,191.0,0,1
659,Zverev A.,430,297,0.690698,86.0,198.0,0,1
660,Zverev M.,313,125,0.399361,88.0,191.0,1,0
661,de Voest R.,4,2,0.5,68.0,180.0,0,1


In [11]:
X = df_clean[["weight","height","hand_Left-Handed","hand_Right-Handed"]]
y = df_clean["win_percentage"].values.reshape(-1,1)

In [12]:
print(X)

     weight  height  hand_Left-Handed  hand_Right-Handed
0      86.0   191.0                 0                  1
1      76.0   185.0                 0                  1
2      77.0   185.0                 0                  1
3      69.0   175.0                 0                  1
4      72.0   185.0                 0                  1
..      ...     ...               ...                ...
657    88.0   191.0                 0                  1
658    80.0   191.0                 0                  1
659    86.0   198.0                 0                  1
660    88.0   191.0                 1                  0
661    68.0   180.0                 0                  1

[633 rows x 4 columns]


In [13]:
print(y.max())

0.8578784757981462


In [14]:
print(y.max())

0.8578784757981462


In [15]:
print(y)

[[0.50505051]
 [0.25      ]
 [0.5       ]
 [0.39673913]
 [0.6122449 ]
 [0.61386139]
 [0.41176471]
 [0.41176471]
 [0.1875    ]
 [0.5       ]
 [0.61428571]
 [0.58375635]
 [0.25      ]
 [0.48444444]
 [0.30555556]
 [0.41223404]
 [0.05882353]
 [0.5       ]
 [0.28571429]
 [0.11764706]
 [0.60674157]
 [0.5       ]
 [0.35714286]
 [0.46153846]
 [0.5199115 ]
 [0.32978723]
 [0.25      ]
 [0.37209302]
 [0.15384615]
 [0.53333333]
 [0.4       ]
 [0.28205128]
 [0.5       ]
 [0.5       ]
 [0.44680851]
 [0.47470817]
 [0.28571429]
 [0.63245033]
 [0.59821429]
 [0.38297872]
 [0.22916667]
 [0.40728477]
 [0.46043165]
 [0.47229551]
 [0.31944444]
 [0.16666667]
 [0.50125313]
 [0.41949153]
 [0.66476462]
 [0.4       ]
 [0.42045455]
 [0.37433155]
 [0.65142857]
 [0.35      ]
 [0.36842105]
 [0.54028436]
 [0.42857143]
 [0.22222222]
 [0.41880342]
 [0.43209877]
 [0.33333333]
 [0.31818182]
 [0.30769231]
 [0.5       ]
 [0.5       ]
 [0.2       ]
 [0.23076923]
 [0.38255034]
 [0.33333333]
 [0.65517241]
 [0.39240506]
 [0.25

# Splitting training data

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

# Scaling data

In [17]:
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

# Transforming data

In [18]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

Deep Learning

In [19]:
from tensorflow.keras.utils import to_categorical

In [20]:
# One-hot encoding
y_train_categorical = to_categorical(y_train_scaled)
y_test_categorical = to_categorical(y_test_scaled)

In [28]:
# Creating a normal neural network with 3 inputs, 6 hidden nodes, and 1 output
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=6, activation='relu', input_dim=3))
model.add(Dense(units=1, activation='softmax'))

In [29]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 6)                 24        
                                                                 
 dense_5 (Dense)             (None, 1)                 7         
                                                                 
Total params: 31
Trainable params: 31
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Compile the model
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['accuracy'])

In [31]:
# Fit the model to the training data
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=100,
    shuffle=True,
    verbose=2
)

Epoch 1/100


ValueError: in user code:

    File "C:\Users\Laurent\anaconda3\envs\PythonData\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Laurent\anaconda3\envs\PythonData\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Laurent\anaconda3\envs\PythonData\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Laurent\anaconda3\envs\PythonData\lib\site-packages\keras\engine\training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\Laurent\anaconda3\envs\PythonData\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\Laurent\anaconda3\envs\PythonData\lib\site-packages\keras\engine\input_spec.py", line 264, in assert_input_compatibility
        raise ValueError(f'Input {input_index} of layer "{layer_name}" is '

    ValueError: Input 0 of layer "sequential_2" is incompatible with the layer: expected shape=(None, 3), found shape=(None, 4)
