In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import xgboost as xgb
import sklearn.metrics as metrics
from pandas.api.types import CategoricalDtype

In [2]:
# Load the cleaned match data and discard the columns that are not model inputs.
inputfolder = 'data'
datadf = pd.read_csv(os.path.join(inputfolder,"cleaned_data.csv")).drop(
    columns=[
        'Player1',
        'Player2',
        'Court_Surface',
        'Player1_Rank_Grouped',
        'Player2_Rank_Grouped',
        'Round_Description',
        'datetime',
    ]
)

# Encode the tournament round as an ordered categorical, earliest round first.
orderedlist = ['Qualifying', 'First Round', 'Second Round', 'Third Round', 'Fourth Round', 'Finals']
roundsdtype = CategoricalDtype(categories=orderedlist,ordered=True)
roundscol = datadf['Round_Description_clean'].astype(roundsdtype)
# Keep the categorical mapping so integer codes can be translated back later.
roundscategories = roundscol.cat.categories
# NOTE(review): any round label not present in orderedlist would be encoded as -1
# by .cat.codes -- confirm the cleaned column only contains the six labels above.
datadf['Round_Description_clean'] = roundscol.cat.codes

print(datadf.dtypes)
display(datadf.head())

# Separate the feature matrix from the binary target.
# NOTE(review): X keeps Sets_Won_diff and Games_Won_diff; in the rows shown by
# head() the sign of Sets_Won_diff matches Player1Win -- confirm these in-match
# statistics are not leaking the label into the model.
X = datadf.drop(columns='Player1Win')
Y = datadf['Player1Win']

Rank_diff                  float64
Sets_Won_diff                int64
Games_Won_diff               int64
Aces_diff                  float64
DoubleFaults_diff          float64
FirstServes_Won_diff         int64
FirstServes_In_diff          int64
SecondServes_Won_diff        int64
SecondServes_In_diff         int64
BreakPoints_Won_diff       float64
BreakPoints_diff           float64
ReturnPoints_Won_diff      float64
ReturnPoints_Faced_diff    float64
TotalPoints_Won_diff         int64
FirstServes_ratio_diff     float64
SecondServes_ratio_diff    float64
BreakPoints_ratio_diff     float64
ReturnPoints_ratio_diff    float64
year                         int64
month                        int64
day                          int64
Round_Description_clean       int8
Player1_Rank               float64
Player2_Rank               float64
Player1Win                   int64
dtype: object


Unnamed: 0,Rank_diff,Sets_Won_diff,Games_Won_diff,Aces_diff,DoubleFaults_diff,FirstServes_Won_diff,FirstServes_In_diff,SecondServes_Won_diff,SecondServes_In_diff,BreakPoints_Won_diff,...,SecondServes_ratio_diff,BreakPoints_ratio_diff,ReturnPoints_ratio_diff,year,month,day,Round_Description_clean,Player1_Rank,Player2_Rank,Player1Win
0,75.0,-2,-7,-3.0,2.0,-8,-3,-1,2,-4.0,...,-0.079545,-0.336364,-0.162554,2012,1,16,0,270.0,195.0,0
1,-54.0,-2,-8,-1.0,-2.0,6,30,-3,-5,-4.0,...,-0.055556,0.666667,-0.20383,2012,1,16,0,220.0,274.0,0
2,-75.0,2,3,1.0,-6.0,-8,-13,2,-5,1.0,...,0.11828,-0.692308,0.052072,2012,1,16,0,152.0,227.0,1
3,-229.0,2,7,3.0,1.0,12,1,-5,-5,3.0,...,-0.074442,-0.1,0.142463,2012,1,16,0,267.0,496.0,1
4,17.0,2,6,-1.0,-3.0,6,0,4,7,3.0,...,0.041394,0.245455,0.091947,2012,1,16,0,174.0,157.0,1


In [7]:
#training and then testing a basic model
# Fixed seed so the 70/30 split -- and therefore every score printed below --
# is identical on each Restart & Run All.
SEED = 42
Xtrain,Xtest,ytrain,ytest = train_test_split(X,Y,test_size=.3,random_state=SEED)

# 'n_jobs' is the XGBClassifier keyword for the thread count; the previous
# spelling 'njobs' is not a recognised parameter, so the setting had no effect.
# NOTE(review): 0 is presumably meant as "use all available threads" -- confirm
# against the installed xgboost version.
# NOTE(review): 'silent' is what enables the per-tree log lines here; newer
# xgboost releases use 'verbosity' instead -- confirm before upgrading.
params ={'max_depth':6,
         'n_estimators':200,
         'silent':False,
         'n_jobs':0,
        }
model = xgb.XGBClassifier(**params)
model.fit(Xtrain,ytrain)

def get_scores(predy,ytest,probay=None):
    """Compute standard binary-classification metrics for a set of predictions.

    Parameters
    ----------
    predy : array-like
        Predicted class labels.
    ytest : array-like
        True class labels.
    probay : array-like, optional
        Continuous scores for the positive class (for example the second
        column of ``predict_proba``). When supplied, 'auc' is computed from
        these scores. When omitted, 'auc' falls back to ``predy`` so existing
        two-argument calls keep returning the same values as before.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall' and 'auc', in that order.
    """
    # Hard labels yield a ROC curve with a single operating point, so prefer
    # continuous scores for the AUC whenever the caller provides them.
    aucinput = predy if probay is None else probay
    # Uses the module-level `metrics` import; no per-call re-import is needed.
    return {
        'accuracy': metrics.accuracy_score(ytest,predy),
        'precision': metrics.precision_score(ytest,predy),
        'recall': metrics.recall_score(ytest,predy),
        'auc': metrics.roc_auc_score(ytest,aucinput),
    }
# Evaluate the fitted model on the held-out split.
score = model.score(Xtest,ytest)
predy = model.predict(Xtest)
# NOTE(review): get_scores receives hard class labels here, so the 'auc' entry
# is computed from labels rather than from predicted probabilities.
scoresdict = get_scores(predy,ytest)

# Print each metric between two horizontal rules.
divider = "="*200
print(divider)
for metricname in scoresdict:
    print("{}: {}".format(metricname,scoresdict[metricname]))

print(divider)
# Pair each training column with its importance, in column order.
for col, importance in zip(Xtrain.columns.tolist(), model.feature_importances_):
    print("{} importance: {}".format(col,importance))


[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=4
[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=4
[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 0 pruned nodes, max_depth=5
[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=4
[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=4
[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=4
[19:08:52] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning e

[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[19:08:53] C:\Users\Administrator\Desktop\xgboost\src\tree\updater_prune.cc:74: tree pruning end, 1 

Aces_diff importance: 0.07092198729515076
DoubleFaults_diff importance: 0.05390070751309395
FirstServes_Won_diff importance: 0.022695034742355347
FirstServes_In_diff importance: 0.03971631079912186
SecondServes_Won_diff importance: 0.011347517371177673
SecondServes_In_diff importance: 0.0028368793427944183
BreakPoints_Won_diff importance: 0.09078013896942139
BreakPoints_diff importance: 0.03687943145632744
ReturnPoints_Won_diff importance: 0.005673758685588837
ReturnPoints_Faced_diff importance: 0.059574469923973083
TotalPoints_Won_diff importance: 0.09503546357154846
FirstServes_ratio_diff importance: 0.014184396713972092
SecondServes_ratio_diff importance: 0.0042553190141916275
BreakPoints_ratio_diff importance: 0.0624113492667675
ReturnPoints_ratio_diff importance: 0.05390070751309395
year importance: 0.0042553190141916275
month importance: 0.0
day importance: 0.022695034742355347
Round_Description_clean importance: 0.0014184396713972092
Player1_Rank importance: 0.025531914085149765