In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn import preprocessing
from sklearn import model_selection 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split 

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

dataset_file = 'datasets/2019.xlsx'
df = pd.read_excel(dataset_file) 

In [63]:
df = df.drop(['ATP','Tournament','Date','Series','B365W','B365L','EXW','EXL','LBW','LBL','PSW','PSL','SJW','SJL','MaxW','MaxL','AvgW','AvgL','LPts','W1','L1','W2','L2','W3','L3','W4','L4','W5','L5','Wsets','Lsets','Comment','WPts','Best of','Location','Court'], axis=1)
df.index.name   = "Match ID"
df.columns.name = "Attributes"

In [64]:
# I couldn't find any dataset that gave me the the players qualified to AustralianOpen 
# I "googled" the players and made a list with the first 32 players of AO2018 => this can be considered as a dataset

players = ["Nadal R.", "Federer R.", "Dimitrov G.", "Zverev A.", "Thiem D.", "Cilic M.", "Goffin D.", "Sock J.", "Wawrinka S.",
          "Carreno Busta P.", "Anderson K.", "Del Potro J.M.", "Querrey S.", "Djokovic N.", "Tsonga J.W.", "Isner J.", "Kyrgios N.",           
    "Pouille L.", "Berdych T.", "Bautista Agut R.", "Ramos-Vinolas A.", "Raonic M.", "Muller G.", "Schwartzman D.",
          "Fognini F.", "Mannarino A.", "Kohlschreiber P.", "Dzumhur D.", "Gasquet R.", "Rublev A.", "Cuevas P.","Zverev M."]
keys = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
dictionary = dict(zip(keys, players))

In [65]:
indexNames = df[(~df['Winner'].isin(dictionary.values())) | (~df['Loser'].isin(dictionary.values())) | (df['Surface'] != 'Hard')].index
df.drop(indexNames, inplace = True)
df = df.reset_index(drop=True)

In [66]:
print ("Dataframe shape is:", df.shape)
num_matches, num_columns = df.shape
print ("The number of matches is:", num_matches)

Dataframe shape is: (1252, 6)
The number of matches is: 1252


In [67]:
# I can drop now the surface too because we are considering just matches on the Hard Surface and I already
# filtered the matches in this way ;

df = df.drop('Surface', axis = 1)

In [68]:
players = df['Winner'].value_counts() - df['Loser'].value_counts()
win_players = df['Winner'].value_counts() 
los_players = df['Loser'].value_counts()
new_frame = pd.concat([win_players, los_players, players], axis=1)
new_frame = new_frame.rename(index={1: "W", 2: "L", 3: "W-L"})
df

Attributes,Round,Winner,Loser,WRank,LRank
0,Semifinals,Kyrgios N.,Dimitrov G.,21.0,3.0
1,1st Round,Schwartzman D.,Querrey S.,52.0,31.0
2,2nd Round,Raonic M.,Schwartzman D.,3.0,52.0
3,2nd Round,Nadal R.,Zverev M.,9.0,51.0
4,Quarterfinals,Dimitrov G.,Thiem D.,17.0,8.0
5,Quarterfinals,Raonic M.,Nadal R.,3.0,9.0
6,Semifinals,Dimitrov G.,Raonic M.,17.0,3.0
7,2nd Round,Pouille L.,Goffin D.,78.0,16.0
8,Quarterfinals,Raonic M.,Pouille L.,14.0,78.0
9,Quarterfinals,Thiem D.,Cilic M.,20.0,13.0


In [69]:
new_frame.columns = ["W","L","W-L"]

In [70]:
new_frame.index.name   = "Player"
new_frame.columns.name = "Attributes"

In [71]:
new_frame.head()

Attributes,W,L,W-L
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anderson K.,42,61,-19
Bautista Agut R.,18,21,-3
Berdych T.,65,65,0
Carreno Busta P.,18,20,-2
Cilic M.,39,58,-19


In [72]:
df.head()

Attributes,Round,Winner,Loser,WRank,LRank
0,Semifinals,Kyrgios N.,Dimitrov G.,21.0,3.0
1,1st Round,Schwartzman D.,Querrey S.,52.0,31.0
2,2nd Round,Raonic M.,Schwartzman D.,3.0,52.0
3,2nd Round,Nadal R.,Zverev M.,9.0,51.0
4,Quarterfinals,Dimitrov G.,Thiem D.,17.0,8.0


In [73]:
df2020 = df
df2020 = df2020.rename({'Winner': 'Player_1', 'Loser': 'Player_2'}, axis='columns')
df2020["Winner"] = 1
df2020.head()

Attributes,Round,Player_1,Player_2,WRank,LRank,Winner
0,Semifinals,Kyrgios N.,Dimitrov G.,21.0,3.0,1
1,1st Round,Schwartzman D.,Querrey S.,52.0,31.0,1
2,2nd Round,Raonic M.,Schwartzman D.,3.0,52.0,1
3,2nd Round,Nadal R.,Zverev M.,9.0,51.0,1
4,Quarterfinals,Dimitrov G.,Thiem D.,17.0,8.0,1


In [74]:
final = pd.get_dummies(df2020, prefix = ['Player_1','Player_2'], columns = ['Player_1','Player_2'])
final = final.drop("Round", axis = 1)

In [75]:
df
clasbyseconds= {'Djokovic N.' : 1025654400, 'Nadal R.' : 1056153600,
                'Federer R.' : 1208217600, 'Zverev A.' : 712800000,
                'Anderson K.' : 1057536000,'Čilić M.' : 982886400,
                'Thiem D.' : 827280000,'Nishikori K.' : 943401600,
                'Isner J.' : 1090972800,'Khachanov K.' : 741657600,
                'Ćorić B.' : 726364800,
                'Fognini F' : 1025481600,'Edmund K.' : 784771200,
                'Tsitsipas S.' : 671414400,'Medvedev D.' : 750297600,
                'Raonic M.' : 912038400,
                'Cecchinato M.' : 856483200,'Schwartzman D.' : 860371200,
                'Basilashvili N.' : 875491200,
                'Dimitrov G.' : 899942400,' David Goffin' : 913766400,
                'Agut R.' : 997315200,
                'Busta P.' : 895017600,'Hyeon C.' : 741830400,
                'Shapovalov D.' : 650160000,'Verdasco F.' : 1136592000,
                'Minaur A.' : 655084800,
                'Pouille L' : 812332800,'Simon G.' : 1101340800,
                'Monfils G.' : 1048377600,
                'Johnson S.' : 943833600,'Kohlschreiber P.' : 1139184000,}


In [76]:
df2020['Player_1'] = df2020['Player_1'].map({'Djokovic N.' : 1025654400, 'Nadal R.' : 1056153600,
                'Federer R.' : 1208217600, 'Zverev A.' : 712800000,
                'Anderson K.' : 1057536000,'Čilić M.' : 982886400,
                'Thiem D.' : 827280000,'Nishikori K.' : 943401600,
                'Isner J.' : 1090972800,'Khachanov K.' : 741657600,
                'Ćorić B.' : 726364800,
                'Fognini F' : 1025481600,'Edmund K.' : 784771200,
                'Tsitsipas S.' : 671414400,'Medvedev D.' : 750297600,
                'Raonic M.' : 912038400,
                'Cecchinato M.' : 856483200,'Schwartzman D.' : 860371200,
                'Basilashvili N.' : 875491200,
                'Dimitrov G.' : 899942400,' David Goffin' : 913766400,
                'Agut R.' : 997315200,
                'Busta P.' : 895017600,'Hyeon C.' : 741830400,
                'Shapovalov D.' : 650160000,'Verdasco F.' : 1136592000,
                'Minaur A.' : 655084800,
                'Pouille L' : 812332800,'Simon G.' : 1101340800,
                'Monfils G.' : 1048377600,
                'Johnson S.' : 943833600,'Kohlschreiber P.' : 1139184000})

In [77]:
df2020

Attributes,Round,Player_1,Player_2,WRank,LRank,Winner
0,Semifinals,,Dimitrov G.,21.0,3.0,1
1,1st Round,8.603712e+08,Querrey S.,52.0,31.0,1
2,2nd Round,9.120384e+08,Schwartzman D.,3.0,52.0,1
3,2nd Round,1.056154e+09,Zverev M.,9.0,51.0,1
4,Quarterfinals,8.999424e+08,Thiem D.,17.0,8.0,1
5,Quarterfinals,9.120384e+08,Nadal R.,3.0,9.0,1
6,Semifinals,8.999424e+08,Raonic M.,17.0,3.0,1
7,2nd Round,,Goffin D.,78.0,16.0,1
8,Quarterfinals,9.120384e+08,Pouille L.,14.0,78.0,1
9,Quarterfinals,8.272800e+08,Cilic M.,20.0,13.0,1


In [16]:
X = final.drop(['WRank'], axis=1)

In [17]:
y = final['WRank']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

In [19]:
final.head()

Unnamed: 0,WRank,LRank,Winner,Player_1_Anderson K.,Player_1_Bautista Agut R.,Player_1_Berdych T.,Player_1_Carreno Busta P.,Player_1_Cilic M.,Player_1_Cuevas P.,Player_1_Del Potro J.M.,...,Player_2_Ramos-Vinolas A.,Player_2_Raonic M.,Player_2_Rublev A.,Player_2_Schwartzman D.,Player_2_Sock J.,Player_2_Thiem D.,Player_2_Tsonga J.W.,Player_2_Wawrinka S.,Player_2_Zverev A.,Player_2_Zverev M.
0,21.0,3.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,52.0,31.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,52.0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,9.0,51.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,17.0,8.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [26]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
score = logreg.score(X_train, y_train)
score2 = logreg.score(X_test, y_test)

In [27]:
print(score)

0.3721461187214612


In [28]:
print(score2)

0.23670212765957446
