In [1]:
import pandas as pd
import requests
import json
import time
import numpy as np
from datetime import datetime
#Show all columns
pd.set_option('display.max_columns', None)

import sklearn
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score
from functools import reduce


In [2]:
All_years = [str(2010 + i) for i in range(14)]
timeline_dict = dict()
for year in All_years:
    df_temp = pd.read_excel('All_years/' + year + '.xlsx')
    df_temp.sort_values(by = 'Date', ascending = True, inplace = True)
    timeline_dict[year] = df_temp

  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


In [3]:
Columns = ['Location', 'Tournament', 'Date', 'Series', 'Environment', 'Round', 'Best of', 'Winner', 'Loser', 
           'WPts', 'LPts', 'Comment']
Cleaned_timeline = dict()
for year, df in timeline_dict.items():
    df['Environment'] = df['Court'] + ' ' + df['Surface']
    df = df.dropna(subset = ['WPts', 'LPts'])
    Cleaned_timeline[year] = df.loc[df['Comment'] == 'Completed'].copy()[Columns]

In [4]:
def build_timeline_df(timestep_dict, Cleaned_timeline, lag = 3):
    ##Append all the dfs first
    currentYear = datetime.now().year
    for year, df in Cleaned_timeline.items():
        int_year = int(year)
        year_list = [str(int_year + i) for i in range(lag)]
        all_dfs = [Cleaned_timeline[y] for y in year_list if y in Cleaned_timeline]
        begin_year = year_list[0]
        end_year = year_list[-1]
        timestep_dict[begin_year + '-' + end_year] = pd.concat(all_dfs, axis = 0)
    return timestep_dict

In [5]:
timestep_dict = dict()
timestep_dict = build_timeline_df(timestep_dict, Cleaned_timeline, lag = 2)
timestep_dict['2015-2016']

Unnamed: 0,Location,Tournament,Date,Series,Environment,Round,Best of,Winner,Loser,WPts,LPts,Comment
0,Brisbane,Brisbane International,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Duckworth J.,Simon G.,430.0,1730.0,Completed
56,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Gasquet R.,Andujar P.,1350.0,950.0,Completed
57,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Brown D.,Lorenzi P.,549.0,759.0,Completed
58,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Verdasco F.,Gabashvili T.,1135.0,730.0,Completed
59,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Karlovic I.,Rosol L.,1320.0,1210.0,Completed
...,...,...,...,...,...,...,...,...,...,...,...,...
2621,London,Masters Cup,2016-11-18,Masters Cup,Indoor Hard,Round Robin,3,Murray A.,Wawrinka S.,11185.0,5115.0,Completed
2622,London,Masters Cup,2016-11-18,Masters Cup,Indoor Hard,Round Robin,3,Cilic M.,Nishikori K.,3450.0,4705.0,Completed
2623,London,Masters Cup,2016-11-19,Masters Cup,Indoor Hard,Semifinals,3,Murray A.,Raonic M.,11185.0,5050.0,Completed
2624,London,Masters Cup,2016-11-19,Masters Cup,Indoor Hard,Semifinals,3,Djokovic N.,Nishikori K.,10780.0,4705.0,Completed


In [6]:
Cleaned_timeline['2016']['Series'].unique()

array(['ATP250', 'Grand Slam', 'ATP500', 'Masters 1000', 'Masters Cup'],
      dtype=object)

In [7]:
####This function will generate 5 dataframes
##['ATP250', 'Grand Slam', 'ATP500', 'Masters 1000', 'Masters Cup']
##this function will apply to one key.
##Each datasets represents a 3-year win rate with specific environment

##This function only works for a dataframe given the year (key)
##Update: We won't do Masters Cup
def win_rate(key, Cleaned_timeline):
    Series_list = ['ATP250', 'Grand Slam', 'ATP500', 'Masters 1000']
    series_dict = dict()
    Env_list = ['Outdoor Hard', 'Indoor Hard', 'Outdoor Clay', 'Indoor Clay', 'Outdoor Grass']
    Env_dict = {'Outdoor Hard': 'OH', 'Indoor Hard': 'IH', 'Outdoor Clay': 'OC', 
                'Indoor Clay': 'IC', 'Outdoor Grass': 'OG'}
    features = list()
    for env, abbrv in Env_dict.items():
        ft = abbrv
        features.append(ft)
    
    df = Cleaned_timeline[key]
    ##This is the main dataframe
    winners = list(df['Winner'].unique())
    losers = list(df['Loser'].unique())
    current_players = np.unique(winners + losers)
    data = {player_name: [0] * len(features) for player_name in current_players}
    df_temp = pd.DataFrame.from_dict(data, orient = 'index', columns = features)
    df_temp = df_temp.reset_index().rename(columns = {'index': 'Player Name'})
    
    for serie in Series_list:
        df_serie = df_temp.copy()
        for env in Env_list:
            for player in current_players:
            ##tr: Number of tournaments the player participates in that specific series and sepcific environment
                tr = len(df.loc[(df['Series'] == serie) & (df['Environment'] == env)
                      & ((df['Winner'] == player) | (df['Loser'] == player))]['Tournament'].unique())
                number_of_rounds = len(df.loc[(df['Series'] == serie) & (df['Environment'] == env)]['Round'].unique())
                wins = df.loc[(df['Series'] == serie) & (df['Environment'] == env) & (df['Winner'] == player)].shape[0]

                ###Total_rounds: all the rounds that are in the specific tournaments
                total_rounds = number_of_rounds * tr
                win_prob = 0
                if total_rounds != 0:
                    win_prob = wins/total_rounds
                df_serie.loc[df_serie['Player Name'] == player, Env_dict[env]] = win_prob
        series_dict[serie] = df_serie
    return series_dict

In [8]:
Cleaned_timeline['2016'].loc[Cleaned_timeline['2016']['Series'] == 'Masters Cup'].shape

(15, 12)

In [9]:
###All the years except 2023
All_years = [str(2010 + i) for i in range(13)]
yearly_stats = dict()
###Apply this function to all the timeline dataframes
for year in All_years:
    series_dict = win_rate(year, Cleaned_timeline)
    yearly_stats[year] = series_dict

In [10]:
yearly_stats['2013']['Masters 1000']

Unnamed: 0,Player Name,OH,IH,OC,IC,OG
0,Aguilar J.,0.000000,0.000000,0.000000,0,0
1,Almagro N.,0.171429,0.166667,0.055556,0,0
2,Alund M.,0.000000,0.000000,0.000000,0,0
3,Amritraj P.,0.000000,0.000000,0.000000,0,0
4,Anderson K.,0.171429,0.000000,0.277778,0,0
...,...,...,...,...,...,...
296,Zhang Z.,0.000000,0.000000,0.000000,0,0
297,Ziadi M.,0.000000,0.000000,0.000000,0,0
298,Zopp J.,0.000000,0.000000,0.000000,0,0
299,Zverev A.,0.000000,0.000000,0.000000,0,0


#### Concate all these dicts based on time lags 

In [11]:
timestep_dict['2015-2016']

Unnamed: 0,Location,Tournament,Date,Series,Environment,Round,Best of,Winner,Loser,WPts,LPts,Comment
0,Brisbane,Brisbane International,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Duckworth J.,Simon G.,430.0,1730.0,Completed
56,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Gasquet R.,Andujar P.,1350.0,950.0,Completed
57,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Brown D.,Lorenzi P.,549.0,759.0,Completed
58,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Verdasco F.,Gabashvili T.,1135.0,730.0,Completed
59,Doha,Qatar Exxon Mobil Open,2015-01-05,ATP250,Outdoor Hard,1st Round,3,Karlovic I.,Rosol L.,1320.0,1210.0,Completed
...,...,...,...,...,...,...,...,...,...,...,...,...
2621,London,Masters Cup,2016-11-18,Masters Cup,Indoor Hard,Round Robin,3,Murray A.,Wawrinka S.,11185.0,5115.0,Completed
2622,London,Masters Cup,2016-11-18,Masters Cup,Indoor Hard,Round Robin,3,Cilic M.,Nishikori K.,3450.0,4705.0,Completed
2623,London,Masters Cup,2016-11-19,Masters Cup,Indoor Hard,Semifinals,3,Murray A.,Raonic M.,11185.0,5050.0,Completed
2624,London,Masters Cup,2016-11-19,Masters Cup,Indoor Hard,Semifinals,3,Djokovic N.,Nishikori K.,10780.0,4705.0,Completed


In [12]:
###We concat based on 2 year time lags
lags = 2
lag_stats = dict()
Series_list = ['ATP250', 'Grand Slam', 'ATP500', 'Masters 1000']

for key, df in timestep_dict.items():
    start_year, end_year = key.split('-')
    if end_year == '2023':
        break
    year_list = list(range(int(start_year), int(end_year)+1))
    serie_dict2 = dict()
    for serie in Series_list:
        df_list = list()
        for year in year_list:
            idx = year_list.index(year)
            df_temp = yearly_stats[str(year)][serie].copy()
            df_temp = df_temp.add_suffix('_' + str(idx + 1))
            df_temp['Player Name'] = df_temp['Player Name_' + str(idx + 1)]
            df_temp = df_temp.drop('Player Name_' + str(idx + 1), axis = 1)
            df_list.append(df_temp)
        merged_df = reduce(lambda left, right: pd.merge(left, right, on='Player Name', how='outer'), df_list)
        merged_df = merged_df.fillna(0)
        serie_dict2[serie] = merged_df
    lag_stats[key] = serie_dict2

In [13]:
lag_stats['2015-2016']['Masters 1000']

Unnamed: 0,OH_1,IH_1,OC_1,IC_1,OG_1,Player Name,OH_2,IH_2,OC_2,IC_2,OG_2
0,0.000000,0.000000,0.000000,0.0,0.0,Albot R.,0.000000,0.0,0.000000,0.0,0.0
1,0.142857,0.000000,0.083333,0.0,0.0,Almagro N.,0.047619,0.0,0.000000,0.0,0.0
2,0.228571,0.166667,0.083333,0.0,0.0,Anderson K.,0.190476,0.0,0.083333,0.0,0.0
3,0.000000,0.000000,0.000000,0.0,0.0,Andreozzi G.,0.000000,0.0,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.0,0.0,Androic T.,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
359,0.000000,0.000000,0.000000,0.0,0.0,Weintraub A.,0.000000,0.0,0.000000,0.0,0.0
360,0.000000,0.000000,0.000000,0.0,0.0,Whittington A.,0.000000,0.0,0.000000,0.0,0.0
361,0.000000,0.000000,0.000000,0.0,0.0,Willis M.,0.000000,0.0,0.000000,0.0,0.0
362,0.000000,0.000000,0.000000,0.0,0.0,Zayid M. S.,0.000000,0.0,0.000000,0.0,0.0


### Prepare the data for ML

In [14]:
###We wont predict any Masters Cup
key = '2015-2016'
df_time2019 = Cleaned_timeline['2017'].reset_index(drop = True)
df_test = df_time2019.copy()[['Winner', 'Loser', 'Series', 'Environment', 'WPts', 'LPts']]

df_test[['player 0', 'player 1', 'Pts_0', 'Pts_1']] = df_test.apply(lambda row: pd.Series([row['Winner'], row['Loser'], row['WPts'], row['LPts']]) 
                                                                    if np.random.permutation([0, 1])[0] == 0 
                                                                    else pd.Series([row['Loser'], row['Winner'], row['LPts'], row['WPts']]), axis=1)

df_test = df_test[df_test['Series'] != 'Masters Cup']

In [15]:
###Concat the probability of winning based on series and environment
df_test

Unnamed: 0,Winner,Loser,Series,Environment,WPts,LPts,player 0,player 1,Pts_0,Pts_1
0,Thompson J.,Ymer E.,ATP250,Outdoor Hard,689,372.0,Thompson J.,Ymer E.,689.0,372.0
1,Almagro N.,Lorenzi P.,ATP250,Outdoor Hard,1013,1090.0,Lorenzi P.,Almagro N.,1090.0,1013.0
2,Djokovic N.,Struff J.L.,ATP250,Outdoor Hard,11780,758.0,Struff J.L.,Djokovic N.,758.0,11780.0
3,Zeballos H.,Mayer F.,ATP250,Outdoor Hard,722,903.0,Zeballos H.,Mayer F.,722.0,903.0
4,Goffin D.,Haase R.,ATP250,Outdoor Hard,2750,795.0,Haase R.,Goffin D.,795.0,2750.0
...,...,...,...,...,...,...,...,...,...,...
2506,Benneteau J.,Cilic M.,Masters 1000,Indoor Hard,634,4185.0,Cilic M.,Benneteau J.,4185.0,634.0
2507,Isner J.,Del Potro J.M.,Masters 1000,Indoor Hard,2505,2435.0,Isner J.,Del Potro J.M.,2505.0,2435.0
2508,Krajinovic F.,Isner J.,Masters 1000,Indoor Hard,681,2505.0,Isner J.,Krajinovic F.,2505.0,681.0
2509,Sock J.,Benneteau J.,Masters 1000,Indoor Hard,1945,634.0,Benneteau J.,Sock J.,634.0,1945.0


In [16]:
##p0, p1: performance on the specific series and environment of player 0, player 1
##r0, r1: result of the games 0 means win, 1 means lost
for i in range(lags):
    df_test['p0_' + str(i+1)] = 0
for j in range(lags):
    df_test['p1_' + str(j+1)] = 0
df_test['Match'] = 0

In [17]:
df_test

Unnamed: 0,Winner,Loser,Series,Environment,WPts,LPts,player 0,player 1,Pts_0,Pts_1,p0_1,p0_2,p1_1,p1_2,Match
0,Thompson J.,Ymer E.,ATP250,Outdoor Hard,689,372.0,Thompson J.,Ymer E.,689.0,372.0,0,0,0,0,0
1,Almagro N.,Lorenzi P.,ATP250,Outdoor Hard,1013,1090.0,Lorenzi P.,Almagro N.,1090.0,1013.0,0,0,0,0,0
2,Djokovic N.,Struff J.L.,ATP250,Outdoor Hard,11780,758.0,Struff J.L.,Djokovic N.,758.0,11780.0,0,0,0,0,0
3,Zeballos H.,Mayer F.,ATP250,Outdoor Hard,722,903.0,Zeballos H.,Mayer F.,722.0,903.0,0,0,0,0,0
4,Goffin D.,Haase R.,ATP250,Outdoor Hard,2750,795.0,Haase R.,Goffin D.,795.0,2750.0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2506,Benneteau J.,Cilic M.,Masters 1000,Indoor Hard,634,4185.0,Cilic M.,Benneteau J.,4185.0,634.0,0,0,0,0,0
2507,Isner J.,Del Potro J.M.,Masters 1000,Indoor Hard,2505,2435.0,Isner J.,Del Potro J.M.,2505.0,2435.0,0,0,0,0,0
2508,Krajinovic F.,Isner J.,Masters 1000,Indoor Hard,681,2505.0,Isner J.,Krajinovic F.,2505.0,681.0,0,0,0,0,0
2509,Sock J.,Benneteau J.,Masters 1000,Indoor Hard,1945,634.0,Benneteau J.,Sock J.,634.0,1945.0,0,0,0,0,0


In [18]:
###Since this is 2019, we use data from 2016-2018
data_dict = lag_stats[key]
Env_dict = {'Outdoor Hard': 'OH', 'Indoor Hard': 'IH', 'Outdoor Clay': 'OC', 
            'Indoor Clay': 'IC', 'Outdoor Grass': 'OG'}
for i in range(df_test.shape[0]):
    serie = df_test.iloc[i]['Series']
    env = df_test.iloc[i]['Environment']
    player_0 = df_test.iloc[i]['player 0']
    player_1 = df_test.iloc[i]['player 1']
    df_serie = data_dict[serie]
    
    ###Generate a list of years
    start_year, end_year = key.split('-')
    year_list = list(range(int(start_year), int(end_year)+1))
    
    for j in range(len(year_list)):
        check = df_serie.loc[(df_serie['Player Name'] == player_0)][Env_dict[env] + '_' + str(j + 1)].shape[0]
        prob_0 = 0
        if check != 0:
            prob_0 = df_serie.loc[(df_serie['Player Name'] == player_0)][Env_dict[env] + '_' + str(j + 1)].iloc[0]
        col_index0 = df_test.columns.get_loc('p0_' + str(j+1))
        df_test.iloc[i, col_index0] = prob_0
        
        check = df_serie.loc[(df_serie['Player Name'] == player_1)][Env_dict[env] + '_' + str(j + 1)].shape[0]
        prob_1 = 0
        if check != 0:
            prob_1 = df_serie.loc[(df_serie['Player Name'] == player_1)][Env_dict[env] + '_' + str(j + 1)].iloc[0]
        col_index1 = df_test.columns.get_loc('p1_' + str(j+1))
        df_test.iloc[i, col_index1] = prob_1

In [19]:
df_test.loc[(df_test['Winner'] == df_test['player 0']), 'Match'] = 0
df_test.loc[(df_test['Winner'] == df_test['player 1']), 'Match'] = 1

In [20]:
df_test

Unnamed: 0,Winner,Loser,Series,Environment,WPts,LPts,player 0,player 1,Pts_0,Pts_1,p0_1,p0_2,p1_1,p1_2,Match
0,Thompson J.,Ymer E.,ATP250,Outdoor Hard,689,372.0,Thompson J.,Ymer E.,689.0,372.0,0.000000,0.000000,0.166667,0.000000,0
1,Almagro N.,Lorenzi P.,ATP250,Outdoor Hard,1013,1090.0,Lorenzi P.,Almagro N.,1090.0,1013.0,0.000000,0.055556,0.000000,0.166667,1
2,Djokovic N.,Struff J.L.,ATP250,Outdoor Hard,11780,758.0,Struff J.L.,Djokovic N.,758.0,11780.0,0.166667,0.083333,0.333333,0.833333,1
3,Zeballos H.,Mayer F.,ATP250,Outdoor Hard,722,903.0,Zeballos H.,Mayer F.,722.0,903.0,0.000000,0.250000,0.000000,0.000000,0
4,Goffin D.,Haase R.,ATP250,Outdoor Hard,2750,795.0,Haase R.,Goffin D.,795.0,2750.0,0.000000,0.166667,0.166667,0.083333,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2506,Benneteau J.,Cilic M.,Masters 1000,Indoor Hard,634,4185.0,Cilic M.,Benneteau J.,4185.0,634.0,0.000000,0.500000,0.000000,0.000000,1
2507,Isner J.,Del Potro J.M.,Masters 1000,Indoor Hard,2505,2435.0,Isner J.,Del Potro J.M.,2505.0,2435.0,0.333333,0.833333,0.000000,0.000000,0
2508,Krajinovic F.,Isner J.,Masters 1000,Indoor Hard,681,2505.0,Isner J.,Krajinovic F.,2505.0,681.0,0.333333,0.833333,0.000000,0.000000,1
2509,Sock J.,Benneteau J.,Masters 1000,Indoor Hard,1945,634.0,Benneteau J.,Sock J.,634.0,1945.0,0.000000,0.000000,0.000000,0.500000,1


In [21]:
env_df = pd.get_dummies(df_test['Environment'], prefix = 'env')
series_df = pd.get_dummies(df_test['Series'], prefix = 'serie')
df_ML = pd.concat([df_test.copy()[['Pts_0','Pts_1','p0_1', 'p0_2', 'p1_1', 'p1_2']], 
                   env_df, series_df, df_test.copy()[['Match']]], axis = 1)

In [22]:
df_ML

Unnamed: 0,Pts_0,Pts_1,p0_1,p0_2,p1_1,p1_2,env_Indoor Clay,env_Indoor Hard,env_Outdoor Clay,env_Outdoor Grass,env_Outdoor Hard,serie_ATP250,serie_ATP500,serie_Grand Slam,serie_Masters 1000,Match
0,689.0,372.0,0.000000,0.000000,0.166667,0.000000,0,0,0,0,1,1,0,0,0,0
1,1090.0,1013.0,0.000000,0.055556,0.000000,0.166667,0,0,0,0,1,1,0,0,0,1
2,758.0,11780.0,0.166667,0.083333,0.333333,0.833333,0,0,0,0,1,1,0,0,0,1
3,722.0,903.0,0.000000,0.250000,0.000000,0.000000,0,0,0,0,1,1,0,0,0,0
4,795.0,2750.0,0.000000,0.166667,0.166667,0.083333,0,0,0,0,1,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2506,4185.0,634.0,0.000000,0.500000,0.000000,0.000000,0,1,0,0,0,0,0,0,1,1
2507,2505.0,2435.0,0.333333,0.833333,0.000000,0.000000,0,1,0,0,0,0,0,0,1,0
2508,2505.0,681.0,0.333333,0.833333,0.000000,0.000000,0,1,0,0,0,0,0,0,1,1
2509,634.0,1945.0,0.000000,0.000000,0.000000,0.500000,0,1,0,0,0,0,0,0,1,1


## Prepare cross 5 fold validation and do traditional ML

In [23]:
features = df_ML.iloc[:, :-1].columns
target = df_ML.iloc[:, -1:].columns

In [24]:
splits = 5
kf = KFold(n_splits = splits, shuffle=True, random_state = 0)
train_indices_dict = dict()
test_indices_dict = dict()

train_data_dict = dict()
test_data_dict = dict()

train_target_dict = dict()
test_target_dict = dict()

Count = 1
for train_index, test_index in kf.split(df_ML):
    #get the original indices of the training and test data
    train_indices_dict['Fold ' + str(Count)] = train_index
    test_indices_dict['Fold ' + str(Count)] = test_index
    
    train_data = df_ML.iloc[train_index][features]
    test_data = df_ML.iloc[test_index][features]
    train_data_dict['Fold ' + str(Count)] = train_data
    test_data_dict['Fold ' + str(Count)] = test_data
    
    train_target = df_ML.iloc[train_index][target]
    test_target = df_ML.iloc[test_index][target]
    train_target_dict['Fold ' + str(Count)] = train_target
    test_target_dict['Fold ' + str(Count)] = test_target
    Count += 1

In [25]:
Xtrain = train_data_dict['Fold 1'].to_numpy()
Ytrain = train_target_dict['Fold 1'].to_numpy()
Xtest = test_data_dict['Fold 1'].to_numpy()
Ytest = test_target_dict['Fold 1'].to_numpy()

In [26]:
Ytrain

array([[0],
       [1],
       [0],
       ...,
       [0],
       [1],
       [1]], dtype=int64)

In [27]:
dc = DecisionTreeClassifier()
dc.fit(Xtrain, Ytrain)

DecisionTreeClassifier()

In [28]:
dc.predict(Xtest).shape

(503,)

In [29]:
len(Xtrain)

2008

In [30]:
dc.score(Xtest, Ytest)

0.5944333996023857

In [31]:
lr = LogisticRegression()
lr.fit(Xtrain, Ytrain)

# Use the trained model to predict the labels of the testing data
Ypred = lr.predict(Xtest)

# Compute the accuracy score of the model
accuracy = accuracy_score(Ytest, Ypred)
print('Accuracy:', accuracy)

Accuracy: 0.6640159045725647


  y = column_or_1d(y, warn=True)


In [32]:
lr.score(Xtrain, Ytrain)

0.6573705179282868

In [33]:
def player_ml(model, train_data_dict, train_target_dict, test_data_dict, test_target_dict, verbose = 0):
    Test_accuracy_list = list()
    Test_precision_list = list()
    Train_accuracy_list = list()
    Test_recall_list = list()

    for i in range(5):
        train_set = train_data_dict['Fold ' + str(i + 1)].to_numpy()
        train_target = train_target_dict['Fold ' + str(i + 1)].to_numpy().ravel()
        test_set = test_data_dict['Fold ' + str(i + 1)].to_numpy()
        test_target = test_target_dict['Fold ' + str(i + 1)].to_numpy().ravel()
        model.fit(train_set, train_target)
        train_accuracy = model.score(train_set, train_target)
        
        test_observed = test_target_dict['Fold ' + str(i + 1)].to_numpy().ravel()
        test_predicted = model.predict(test_set)
        accuracy = accuracy_score(test_observed, test_predicted)
        precision = precision_score(test_observed, test_predicted)
        recall = recall_score(test_observed, test_predicted)

        Test_accuracy_list.append(accuracy)
        Test_precision_list.append(precision)
        Train_accuracy_list.append(train_accuracy)
        Test_recall_list.append(recall)
        
        if verbose == 1:
            print("Fold " + str(i + 1) + " Test Accuracy score:", np.round(accuracy, 4))
            print("Fold " + str(i + 1) + " Test Precision score:", np.round(precision, 4))
            print("Fold " + str(i + 1) + " Test Recall score:", np.round(recall, 4))
            print()
    dif = np.round(sum(Train_accuracy_list)/len(Train_accuracy_list), 4) - np.round(sum(Test_accuracy_list)/len(Test_accuracy_list), 4)
    dif = np.round(dif * 100, 2)
    if verbose == 1:
        print("The average Train Accuracy : ", np.round(sum(Train_accuracy_list)/len(Train_accuracy_list), 4))
        print("The average Test Accuracy : ", np.round(sum(Test_accuracy_list)/len(Test_accuracy_list), 4))
        print("The average Test Precision: ", np.round(sum(Test_precision_list)/len(Test_precision_list), 4))
        print("The average Test Recall: ", np.round(sum(Test_recall_list)/len(Test_recall_list), 4))
        print("Difference between Train vs. Test Accuracy: ", dif, "%")
    avg_Train_accuracy = np.round(sum(Train_accuracy_list)/len(Train_accuracy_list), 4)
    avg_Test_accuracy = np.round(sum(Test_accuracy_list)/len(Test_accuracy_list), 4)
    avg_Test_precision = np.round(sum(Test_precision_list)/len(Test_precision_list), 4)
    avg_Test_recall = np.round(sum(Test_recall_list)/len(Test_recall_list), 4)
    return (avg_Train_accuracy, avg_Test_accuracy, avg_Test_precision, avg_Test_recall, dif)

In [34]:
lr = LogisticRegression()
_ = player_ml(lr, train_data_dict, train_target_dict, test_data_dict, test_target_dict, verbose = 1)

Fold 1 Test Accuracy score: 0.664
Fold 1 Test Precision score: 0.6197
Fold 1 Test Recall score: 0.781

Fold 2 Test Accuracy score: 0.6673
Fold 2 Test Precision score: 0.6491
Fold 2 Test Recall score: 0.6992

Fold 3 Test Accuracy score: 0.6753
Fold 3 Test Precision score: 0.6955
Fold 3 Test Recall score: 0.7283

Fold 4 Test Accuracy score: 0.6355
Fold 4 Test Precision score: 0.6886
Fold 4 Test Recall score: 0.5836

Fold 5 Test Accuracy score: 0.6375
Fold 5 Test Precision score: 0.6464
Fold 5 Test Recall score: 0.6564

The average Train Accuracy :  0.653
The average Test Accuracy :  0.6559
The average Test Precision:  0.6598
The average Test Recall:  0.6897
Difference between Train vs. Test Accuracy:  -0.29 %


In [35]:
###naive bayes
NB = GaussianNB()
_ = player_ml(NB, train_data_dict, train_target_dict, test_data_dict, test_target_dict, verbose = 1)

Fold 1 Test Accuracy score: 0.6143
Fold 1 Test Precision score: 0.5719
Fold 1 Test Recall score: 0.7893

Fold 2 Test Accuracy score: 0.6614
Fold 2 Test Precision score: 0.6258
Fold 2 Test Recall score: 0.7683

Fold 3 Test Accuracy score: 0.6534
Fold 3 Test Precision score: 0.6474
Fold 3 Test Recall score: 0.8116

Fold 4 Test Accuracy score: 0.5976
Fold 4 Test Precision score: 0.5994
Fold 4 Test Recall score: 0.7509

Fold 5 Test Accuracy score: 0.5797
Fold 5 Test Precision score: 0.5656
Fold 5 Test Recall score: 0.7992

The average Train Accuracy :  0.6232
The average Test Accuracy :  0.6213
The average Test Precision:  0.602
The average Test Recall:  0.7839
Difference between Train vs. Test Accuracy:  0.19 %


In [36]:
RF = RandomForestClassifier()
_ = player_ml(RF, train_data_dict, train_target_dict, test_data_dict, test_target_dict, verbose = 1)

Fold 1 Test Accuracy score: 0.6441
Fold 1 Test Precision score: 0.6189
Fold 1 Test Recall score: 0.6777

Fold 2 Test Accuracy score: 0.6335
Fold 2 Test Precision score: 0.6131
Fold 2 Test Recall score: 0.6829

Fold 3 Test Accuracy score: 0.6813
Fold 3 Test Precision score: 0.7377
Fold 3 Test Recall score: 0.6522

Fold 4 Test Accuracy score: 0.6255
Fold 4 Test Precision score: 0.6653
Fold 4 Test Recall score: 0.6059

Fold 5 Test Accuracy score: 0.6076
Fold 5 Test Precision score: 0.6183
Fold 5 Test Recall score: 0.6255

The average Train Accuracy :  1.0
The average Test Accuracy :  0.6384
The average Test Precision:  0.6507
The average Test Recall:  0.6488
Difference between Train vs. Test Accuracy:  36.16 %


#### Tuning the logisitc regression

In [37]:
##Based on the result logistic regression is better
lr = LogisticRegression(penalty = 'none', solver = 'newton-cg')
_ = player_ml(lr, train_data_dict, train_target_dict, test_data_dict, test_target_dict, verbose = 1)

Fold 1 Test Accuracy score: 0.6581
Fold 1 Test Precision score: 0.6167
Fold 1 Test Recall score: 0.7645

Fold 2 Test Accuracy score: 0.6633
Fold 2 Test Precision score: 0.636
Fold 2 Test Recall score: 0.7317

Fold 3 Test Accuracy score: 0.6932
Fold 3 Test Precision score: 0.7163
Fold 3 Test Recall score: 0.7319

Fold 4 Test Accuracy score: 0.6275
Fold 4 Test Precision score: 0.6565
Fold 4 Test Recall score: 0.6394

Fold 5 Test Accuracy score: 0.6116
Fold 5 Test Precision score: 0.6168
Fold 5 Test Recall score: 0.6525

The average Train Accuracy :  0.6606
The average Test Accuracy :  0.6507
The average Test Precision:  0.6485
The average Test Recall:  0.704
Difference between Train vs. Test Accuracy:  0.99 %


In [None]:
###Lists of things to do:
##With everything above prep the data for all the years skipping 2010, 2011 due to 2 years time lag.
##Create a prediction table after this.