In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import xgboost as xgb
import sklearn.metrics as metrics
import datetime

In [2]:
inputfolder = 'data'
datadf = pd.read_csv(os.path.join(inputfolder,"womens_match_data.csv"))
datadf['datetime'] = pd.to_datetime(datadf['datetime'])
with open(os.path.join(inputfolder,'datacolumns.txt'),'r') as f:
    content = f.readlines()
datacols = [x.strip() for x in content]
print(datadf.dtypes)
display(datadf.head())

atpdata = pd.read_csv(os.path.join(inputfolder,"WTA_matches.csv"))
cols = atpdata.drop(['Winner','Loser','Tournament','Tournament_Date','Court_Surface','Round_Description'],axis=1).columns
for col in cols:
    atpdata.loc[:,col]=pd.to_numeric(atpdata[col],errors='coerce')
print(atpdata.dtypes)
atpdata['datetime'] = pd.to_datetime(atpdata['Tournament_Date'])
atpdata['year'] = pd.DatetimeIndex(atpdata['datetime']).year
def fillrank(row,col,ATPframe):
    if np.isnan(row[col]):
            year = row['year']
            tournament = row['Tournament']
            surface = row['Court_Surface']
            yeardf = ATPframe[ATPframe['year']==year] #i can do this slicing in one line, but im doing in two to allow for easier debugging
            tourndf = ATPframe[(ATPframe['Tournament']==tournament)&(ATPframe['Court_Surface']==surface)]
            maxrank = int(max([tourndf['Winner_Rank'].max(),tourndf['Loser_Rank'].max()]))
            return maxrank+1
    else:
        return row[col]
atpdata.loc[:,'Winner_Rank'] = atpdata.apply(fillrank,axis=1,args=('Winner_Rank',atpdata))
atpdata.loc[:,'Loser_Rank'] = atpdata.apply(fillrank,axis=1,args=('Loser_Rank',atpdata))

display(atpdata.head())

#only going to use matches from 2014 onwards, not ussing matches from 2012-13 so we have enough historic data for matches in 2014
atpHardOnly = atpdata.loc[(atpdata['Court_Surface']=='Hard')&(atpdata['year']>2013),['Winner','Loser','datetime']]
frame1,frame2 = train_test_split(atpHardOnly,test_size=.5)
frame1.columns = ['Player1','Player2','datetime']
frame1['Player1Win'] = 1

frame2.columns = ['Player2','Player1','datetime']
frame2['Player1Win'] = 0

traininput = pd.concat([frame1,frame2])
traininput.sort_index(inplace=True)
print(traininput['Player1Win'].mean())
display(traininput.head())

# splitting into X and Y
X = traininput.drop('Player1Win',axis=1)
Y = traininput['Player1Win']

Player                        object
Rank                         float64
Sets_Won                     float64
Games_Won                      int64
Aces                         float64
DoubleFaults                 float64
FirstServes_Won              float64
FirstServes_In               float64
SecondServes_Won             float64
SecondServes_In              float64
BreakPoints_Won              float64
BreakPoints                  float64
ReturnPoints_Won             float64
ReturnPoints_Faced           float64
TotalPoints_Won              float64
won_game?                      int64
FirstServes_ratio            float64
SecondServes_ratio           float64
BreakPoints_ratio            float64
ReturnPoints_ratio           float64
datetime              datetime64[ns]
year                           int64
Tournament                    object
Round_Description             object
Court_Surface                 object
Total_Serves                 float64
Aces%                        float64
S

Unnamed: 0,Player,Rank,Sets_Won,Games_Won,Aces,DoubleFaults,FirstServes_Won,FirstServes_In,SecondServes_Won,SecondServes_In,...,BreakPoints_ratio,ReturnPoints_ratio,datetime,year,Tournament,Round_Description,Court_Surface,Total_Serves,Aces%,ServesWon%
0,Johanna Larsson,84.0,2.0,13,6.0,0.0,31.0,49.0,7.0,13.0,...,0.666667,0.553846,2014-01-01,2014,Auckland,First Round,Hard,62.0,0.096774,0.612903
1,Ana Konjuh,259.0,2.0,15,3.0,10.0,23.0,32.0,15.0,36.0,...,0.5,0.5,2014-01-01,2014,Auckland,First Round,Hard,68.0,0.044118,0.558824
2,Ayumi Morita,60.0,2.0,13,3.0,2.0,33.0,46.0,9.0,22.0,...,1.0,0.484375,2014-01-01,2014,Auckland,First Round,Hard,68.0,0.044118,0.617647
3,Venus Williams,47.0,2.0,13,4.0,7.0,38.0,52.0,14.0,30.0,...,0.6,0.435484,2014-01-01,2014,Auckland,First Round,Hard,82.0,0.04878,0.634146
4,Julia Goerges,72.0,2.0,18,2.0,4.0,45.0,56.0,29.0,46.0,...,0.111111,0.359375,2014-01-01,2014,Auckland,First Round,Hard,102.0,0.019608,0.72549


Winner                        object
Loser                         object
Tournament                    object
Tournament_Date               object
Court_Surface                 object
Round_Description             object
Winner_Rank                  float64
Loser_Rank                   float64
Retirement_Ind                 int64
Winner_Sets_Won              float64
Winner_Games_Won               int64
Winner_Aces                  float64
Winner_DoubleFaults          float64
Winner_FirstServes_Won       float64
Winner_FirstServes_In        float64
Winner_SecondServes_Won      float64
Winner_SecondServes_In       float64
Winner_BreakPoints_Won       float64
Winner_BreakPoints           float64
Winner_ReturnPoints_Won      float64
Winner_ReturnPoints_Faced    float64
Winner_TotalPoints_Won       float64
Loser_Sets_Won               float64
Loser_Games_Won                int64
Loser_Aces                   float64
Loser_DoubleFaults           float64
Loser_FirstServes_Won        float64
L

Unnamed: 0,Winner,Loser,Tournament,Tournament_Date,Court_Surface,Round_Description,Winner_Rank,Loser_Rank,Retirement_Ind,Winner_Sets_Won,...,Loser_FirstServes_In,Loser_SecondServes_Won,Loser_SecondServes_In,Loser_BreakPoints_Won,Loser_BreakPoints,Loser_ReturnPoints_Won,Loser_ReturnPoints_Faced,Loser_TotalPoints_Won,datetime,year
0,Johanna Larsson,Lourdes Dominguez-Lino,Auckland,01-Jan-14,Hard,First Round,84.0,69.0,0,2.0,...,32.0,13.0,33.0,2.0,8.0,24.0,62.0,53.0,2014-01-01,2014
1,Ana Konjuh,Roberta Vinci,Auckland,01-Jan-14,Hard,First Round,259.0,14.0,0,2.0,...,65.0,10.0,23.0,5.0,6.0,30.0,68.0,74.0,2014-01-01,2014
2,Ayumi Morita,Lucie Safarova,Auckland,01-Jan-14,Hard,First Round,60.0,29.0,0,2.0,...,32.0,12.0,32.0,3.0,5.0,26.0,68.0,59.0,2014-01-01,2014
3,Venus Williams,Andrea Hlavackova,Auckland,01-Jan-14,Hard,First Round,47.0,134.0,0,2.0,...,34.0,12.0,28.0,2.0,6.0,30.0,82.0,65.0,2014-01-01,2014
4,Julia Goerges,Karin Knapp,Auckland,01-Jan-14,Hard,First Round,72.0,41.0,0,2.0,...,70.0,30.0,58.0,2.0,2.0,28.0,102.0,110.0,2014-01-01,2014


0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.




Unnamed: 0,Player1,Player1Win,Player2,datetime
0,Johanna Larsson,1,Lourdes Dominguez-Lino,2014-01-01
1,Roberta Vinci,0,Ana Konjuh,2014-01-01
2,Ayumi Morita,1,Lucie Safarova,2014-01-01
3,Venus Williams,1,Andrea Hlavackova,2014-01-01
4,Julia Goerges,1,Karin Knapp,2014-01-01


In [3]:
#defining preprocessing functions (from wrangleDataAttempt2)
def days_difference(date1,date2):
            diff = date2-date1
            return diff.days

def get_player_stats(inputframe,harddf,datacols,playercol):
    #idea is to pass a panda dataframe with columns [playername, tournamentdate] and be able to return a dataframe with the stats
    length = inputframe.shape[0]
    historicnames = ['careeravg_'+x for x in datacols]
    ytdnames = ['ytd_'+x for x in datacols]
    colnames = [playercol,'datetime','Rank','ytd_Aces%_Stddev','ytd_ServesWon%_Stddev'] + historicnames+ytdnames
    outputframe = pd.DataFrame(index=range(0,length),columns=colnames)
    outputframe.loc[:,playercol] = inputframe[playercol]
    outputframe.loc[:,'datetime'] = inputframe['datetime']
    for index in range(0,length):
        #extract player name and date of game
        row = inputframe.iloc[index,:]
        playername = row[playercol]
        date = row['datetime']
        
        #grab only data for that player before that date
        tempdf = harddf[harddf['Player']==playername]
        tempdf.loc[:,'timedelta'] = tempdf['datetime'].apply(days_difference,args=(date,))
        tempdf = tempdf[tempdf['timedelta']>0]
        if tempdf.empty:
            continue
        ytddf = tempdf.loc[tempdf['timedelta']<=365]
#         display(ytddf.head())
        currank = tempdf.loc[tempdf['timedelta'].idxmin(),'Rank']
        historicframe = tempdf.loc[:,datacols].mean()
        historicframe.index = historicnames
        YTDframe = ytddf.loc[:,datacols].mean()
        YTDframe.index = ytdnames
        
        #code for debugging NaNs
#         if tempdf.isna().sum().sum()>0:
#             print(playername)
#             print("ytdframe shape: {}, missing values: {}".format(ytddf.shape,ytddf.isna().sum().sum()))
#             print(ytddf.isna().sum())
#             print("careerdf shape: {}, missing values: {}".format(tempdf.shape,tempdf.isna().sum().sum()))
#             print(tempdf.isna().sum())

        outputframe.loc[index,['Rank','ytd_Aces%_Stddev','ytd_ServesWon%_Stddev']] = [currank,ytddf['Aces%'].std(),ytddf['ServesWon%'].std()]
        outputframe.loc[index,historicnames]=historicframe
        outputframe.loc[index,ytdnames]=YTDframe
    return outputframe

def get_difference(frame1,frame2,colname=None, prefix1='',prefix2='',index1=None,index2=None):
    #check if index between frames are identical, if not, reset both
    idx_check = frame1.index.equals(frame2.index)
    if not(idx_check):
        frame1.reset_index(drop=True, inplace=True)
        frame2.reset_index(drop=True, inplace=True)
    if colname is None:
        colname = frame1.columns
    if frame1.shape[0] != frame2.shape[0]:
        raise ValueError('both frames must contain the same number of rows/n Frame1: %{}/mn Frame2: %{}'.format(frame1.shape[0],frame2.shape[0]))
    tempdiffcolnames = [x+"_diff" for x in colname]
    diffcolnames = list()
    if index1 is not None:
        diffcolnames.append(index1)
    if index2 is not None:
        diffcolnames.append(index2)
    diffcolnames = diffcolnames + tempdiffcolnames
    if idx_check:
        difframe = pd.DataFrame(index=frame1.index,columns=diffcolnames,data=0)
    else:
        difframe = pd.DataFrame(index=range(0,frame1.shape[0]),columns=diffcolnames,data=0)
    if index1 is not None:
        difframe[index1] = frame1[index1]
    if index2 is not None:
        difframe[index2] = frame2[index2]
    for col in colname:
        frame1name = prefix1+col
        frame2name = prefix2+col
        difframename = col+"_diff"
        difframe[difframename]=frame1[frame1name] - frame2[frame2name]
    return difframe

def convert_to_data(inputframe,matchdata,datacols,player1colname="Player1",player2colname="Player2"):
    inputframe.reset_index(drop=True,inplace=True)
    
    player1frame = inputframe.loc[:,[player1colname,'datetime']]
    player2frame = inputframe.loc[:,[player2colname,'datetime']]
    
    player1df = get_player_stats(player1frame,matchdata,datacols,player1colname)
    player2df = get_player_stats(player2frame,matchdata,datacols,player2colname)
    
    #append columns indicating whether this is a players first recorded Hard surface ATPgame (if it's their first game, Rank will return NaNs)
    player1df['Player1fg'] = player1df['Rank'].isna()
    player2df['Player2fg'] = player2df['Rank'].isna()
    
#     player1df = player1df[~player1df['Player1fg']].fillna(0)
#     player2df = player2df[~player2df['Player2fg']].fillna(0)
#     if player1df.isna().sum().sum() >0:
#         display(player1df)
    
    
    #for debugging purposes
    if player1df.shape[0] != player2df.shape[0]:
        return (player1df,player2df)
    
    newdatacols = player1df.drop([player1colname,'datetime','Player1fg'],axis=1).columns
    outputdf = get_difference(player1df,player2df,newdatacols,index1=player1colname,index2=player2colname)
    outputdf.loc[:,'Player1fg'] = player1df['Player1fg']
    outputdf.loc[:,'Player2fg'] = player2df['Player2fg']
    outputdf.loc[:,'datetime'] = player1df['datetime']
    
    return outputdf.infer_objects() #infer_objects soft converts object columns to their correct types

# tempX = X.iloc[:15,:]
# tempX = convert_to_data(tempX,datadf,datacols)
# print(tempX.dtypes)

X = convert_to_data(X,datadf,datacols)
print(X.dtypes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Player1                                      object
Player2                                      object
Rank_diff                                   float64
ytd_Aces%_Stddev_diff                       float64
ytd_ServesWon%_Stddev_diff                  float64
careeravg_Sets_Won_diff                     float64
careeravg_Games_Won_diff                    float64
careeravg_Aces_diff                         float64
careeravg_DoubleFaults_diff                 float64
careeravg_FirstServes_Won_diff              float64
careeravg_FirstServes_In_diff               float64
careeravg_SecondServes_Won_diff             float64
careeravg_SecondServes_In_diff              float64
careeravg_BreakPoints_Won_diff              float64
careeravg_BreakPoints_diff                  float64
careeravg_ReturnPoints_Won_diff             float64
careeravg_ReturnPoints_Faced_diff           float64
careeravg_TotalPoints_Won_diff              float64
careeravg_won_game?_diff                    float64
careeravg_Fi

In [4]:
#training and then testing a basic model
from sklearn.model_selection import GridSearchCV
fullX = X.copy()
if 'Player1' in X.columns:
    X.drop(['Player1','Player2','datetime'],axis=1,inplace=True)
Xtrain,Xtest,ytrain,ytest = train_test_split(X,Y,test_size=.3)
params ={'max_depth':[3,4,5,6],
         'n_estimators':[100,200,300],
         'learning_rate':[.01,.03,.1],
         'reg_alpha':[0,.01,.03,.1],
         'reg_lambda':[0,.01,.03,.1]
        }
estimator = xgb.XGBClassifier(random_state=123)
tuneparam = GridSearchCV(estimator,params,n_jobs=-1,cv=5)
print("tuning model")
tuneparam.fit(Xtrain,ytrain)

model = tuneparam.best_estimator_
best_params = tuneparam.best_params_
cvtable = tuneparam.cv_results_
cvdf = pd.DataFrame(cvtable)
display(cvdf[cvdf['rank_test_score']==1])

def get_scores(predy,ytest):
    import sklearn.metrics as metrics
    acc = metrics.accuracy_score(ytest,predy)
    prec = metrics.precision_score(ytest,predy)
    recall = metrics.recall_score(ytest,predy)
    auc = metrics.roc_auc_score(ytest,predy)
    return {'accuracy':acc,'precision':prec,'recall':recall,'auc':auc}
score = model.score(Xtest,ytest)
predy = model.predict(Xtest)

scoresdict = get_scores(predy,ytest)
print("="*200)
print("Score: {}".format(score))
for key,value in scoresdict.items():
    print("{}: {}".format(key,value))

print("="*200)
for idx, col in enumerate(Xtrain.columns.tolist()):
    print("{} importance: {}".format(col,model.feature_importances_[idx]))


tuning model




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,param_reg_alpha,param_reg_lambda,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
198,4.509041,0.149791,0.017247,0.007881,0.03,3,100,0.01,0.03,"{'learning_rate': 0.03, 'max_depth': 3, 'n_est...",...,0.629035,0.011929,1,0.6708,0.67651,0.667042,0.675383,0.66892,0.671731,0.003659


  if diff:
  if diff:


Score: 0.6482961716449306
accuracy: 0.6482961716449306
precision: 0.6720142602495544
recall: 0.6170212765957447
auc: 0.6492032789905131
Rank_diff importance: 0.12081513553857803
ytd_Aces%_Stddev_diff importance: 0.005822416394948959
ytd_ServesWon%_Stddev_diff importance: 0.014556040987372398
careeravg_Sets_Won_diff importance: 0.03202328830957413
careeravg_Games_Won_diff importance: 0.0029112081974744797
careeravg_Aces_diff importance: 0.0029112081974744797
careeravg_DoubleFaults_diff importance: 0.007278020493686199
careeravg_FirstServes_Won_diff importance: 0.010189228691160679
careeravg_FirstServes_In_diff importance: 0.005822416394948959
careeravg_SecondServes_Won_diff importance: 0.026200873777270317
careeravg_SecondServes_In_diff importance: 0.016011644154787064
careeravg_BreakPoints_Won_diff importance: 0.005822416394948959
careeravg_BreakPoints_diff importance: 0.014556040987372398
careeravg_ReturnPoints_Won_diff importance: 0.0029112081974744797
careeravg_ReturnPoints_Faced_di

What about a model just trained using Ranked Diff? (curious to see how much predictive power rank has)

In [5]:
rankXtrain = Xtrain.loc[:,'Rank_diff'].values.reshape(Xtrain.shape[0],1)
rankXtest = Xtest.loc[:,'Rank_diff'].values.reshape(Xtest.shape[0],1)

# print(rankXtrain.shape)
params ={'max_depth':[4,5,6],
         'n_estimators':[100,200,300],
         'learning_rate':[.01,.03,.1],
        }
estimator = xgb.XGBClassifier(random_state=123)
tuneparam = GridSearchCV(estimator,params,n_jobs=-1)
print("tuning model")
tuneparam.fit(rankXtrain,ytrain)

model2 = tuneparam.best_estimator_
best_params2 = tuneparam.best_params_

score2 = model2.score(rankXtest,ytest)
predy2 = model2.predict(rankXtest)
scoresdict2 = get_scores(predy2,ytest)

print("="*200)
print("Score: {}".format(score2))
for key,value in scoresdict2.items():
    print("{}: {}".format(key,value))

print("="*200)
print("{} importance: {}".format('Rank_diff',model2.feature_importances_))


tuning model
Score: 0.6196886832141355
accuracy: 0.6196886832141355
precision: 0.6574257425742575
recall: 0.5433715220949263
auc: 0.6219022112639134
Rank_diff importance: [1.]


  if diff:
  if diff:


In [6]:
print(best_params)
print(best_params2)

{'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 100, 'reg_alpha': 0.01, 'reg_lambda': 0.03}
{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300}
