In [1]:
import pandas as pd
import numpy as np
import glob
import re
import seaborn as sns
import datetime

# show up to 200 columns and 200 rows when a dataframe is displayed in the notebook
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, 200)

In [2]:
# starting point: the merged table (ATP data + bookmakers); the first CSV column is used as the index

merged_csv_path = 'ATP_merged_clean.csv'
df = pd.read_csv(merged_csv_path, index_col=0)

In [6]:
# harmonise the column names on the winner_/loser_ and w_/l_ prefixes,
# which makes the later switch from winner/loser to P1/P2 a simple prefix replacement
# (errors="raise": a missing source column stops the notebook instead of being ignored)

winner_loser_names = {
  "Winner": 'winner_name',
  "Loser": "loser_name",
  "WRank": "w_rank",
  "LRank": "l_rank",
  "PSW": "w_PS",
  "PSL": 'l_PS',
  'B365W': 'w_B365',
  'B365L': 'l_B365',
  'elo_winner': 'w_elo',
  'elo_loser': 'l_elo',
}
df.rename(columns=winner_loser_names, errors="raise", inplace=True)


In [7]:
# convert the 'Date' column to pandas datetime values

parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [8]:
# The score column cannot be interpreted as is, therefore we made this function which parses the string to extract the games won by each player in every set

def score_split(score):
  """Parse a match score string into per-set game counts.

  The result is laid out as (a_set1, b_set1, a_set2, b_set2, ..., a_set5, b_set5),
  where `a` is the player whose games are written first in each set token.

  Anything between parentheses or square brackets (e.g. the tie-break points
  in "7-6(5)") is removed before parsing. Tokens that are not of the form
  "<digits>-<digits>" (e.g. "RET", "W/O", "DEF") are skipped instead of
  raising ValueError. Sets that were not played are filled with 0s.

  Args:
    score: the score as a string, sets separated by whitespace.

  Returns:
    A tuple of ints, padded with 0s up to 10 values.
  """
  # raw string: "\(" in a normal string literal is an invalid escape sequence
  score = re.sub(r"[\(\[].*?[\)\]]", "", score)

  result = []
  # first split on whitespace to get the sets, then read both players' games in each set
  for set_score in score.split():
    parsed = re.fullmatch(r"(\d+)-(\d+)", set_score)
    if parsed is None:
      # not a set score (retirement / walkover marker, ...): nothing to count
      continue
    result.extend(int(games) for games in parsed.groups())

  # because not all games have 5 sets, we complete the missing sets with 0s
  result.extend([0] * (10 - len(result)))

  return tuple(result)

In [9]:
# score_split is applied to every score and its ten values are spread over the columns p1_1, p2_1, ..., p1_5, p2_5
set_columns = [f'{player}_{set_no}' for set_no in range(1, 6) for player in ('p1', 'p2')]
games_per_column = list(zip(*map(score_split, df['score'])))
# same requirement as a ten-target unpacking: exactly ten value sequences, checked before anything is assigned
if len(games_per_column) != len(set_columns):
  raise ValueError(f'expected {len(set_columns)} score columns, got {len(games_per_column)}')
for set_column, games in zip(set_columns, games_per_column):
  df[set_column] = games

In [10]:
# we rename the columns to p1 and p2 instead of winner and loser
# the patterns are anchored at the start of the name so that only real prefixes are replaced:
# an unanchored 'w_' / 'l_' also rewrites names that merely contain these characters (e.g. 'draw_size' -> 'drap1_size')

pattern = '^(?:' + '|'.join(['winner_', 'w_']) + ')'
df.columns = df.columns.str.replace(pattern, 'p1_', regex=True)
pattern = '^(?:' + '|'.join(['loser_', 'l_']) + ')'
df.columns = df.columns.str.replace(pattern, 'p2_', regex=True)


In [11]:
# we duplicate the df and reverse half in order to get a Game/Player dataset instead of a Game dataset, from this step on we get double the amount of lines

to_keep = df.copy()
to_switch = df.copy()

# in the original orientation p1 is the winner, in the switched one p1 is the loser
to_keep['p1_win'] = True
to_switch['p1_win'] = False

# on the lines that we will switch, proba_elo_p2 is equal to 1 - proba_elo_p1
to_switch['proba_elo'] = 1 - to_switch['proba_elo']

# we get all the columns which are player-related and create a version with reversed p1 and p2
# (p1_win was added to the copies only, so it is not part of this list and is not swapped)
cols = list(df)
p_cols = [x for x in cols if ('p1_' in x) or ('p2_' in x)]

# to swap p1 & p2 we go through a temporary p3 prefix, otherwise the second replace would undo the first one
p_cols_revert = [x.replace('p1_','p3_').replace('p2_', 'p1_').replace('p3_', 'p2_') for x in p_cols]

mydict = dict(zip(p_cols, p_cols_revert))

# rename the player columns of the switched half
to_switch.rename(columns=mydict,
          inplace=True)

# finally we group the two parts (reverted & original) of our dataframe back together
# - pd.concat replaces DataFrame.append, which was removed in pandas 2.0; columns are aligned by name
# - every original index value now appears twice; a stable sort keeps the original row before its
#   switched copy, so the resulting row order is the same on every run
df = pd.concat([to_keep, to_switch]).sort_index(kind='mergesort').reset_index(drop=True)


In [12]:
# number of sets won by each player: a set is counted for the player with strictly more games in it
# (a set with the same number of games on both sides counts for nobody)

p1_set_wins = [(df[f'p1_{set_no}'] > df[f'p2_{set_no}']).astype(int) for set_no in range(1, 6)]
p2_set_wins = [(df[f'p1_{set_no}'] < df[f'p2_{set_no}']).astype(int) for set_no in range(1, 6)]

df['p1_sets'] = sum(p1_set_wins)
df['p2_sets'] = sum(p2_set_wins)



In [13]:
# We will now try to get extra features

# all these features are taken from ultimate tennis statistics, we simply reproduce the formulas with our data
# https://www.ultimatetennisstatistics.com/glossary
# NOTE: every ratio below is a plain division, a zero denominator (e.g. no break point faced) gives NaN or inf

# 1st Serve Effectiveness
# First Serve Effectiveness: 1st serve points won % divided by 2nd serve points won %
# (2nd serve points played = service points - 1st serves in)
df["p1_1stWon%"] = df["p1_1stWon"] / df["p1_1stIn"]
df["p1_2ndWon%"] = df["p1_2ndWon"] / (df["p1_svpt"] - df["p1_1stIn"])
df["p1_1stServeEffectiveness"] = df["p1_1stWon%"]/df["p1_2ndWon%"]

df["p2_1stWon%"] = df["p2_1stWon"] / df["p2_1stIn"]
df["p2_2ndWon%"] = df["p2_2ndWon"] / (df["p2_svpt"] - df["p2_1stIn"])
df["p2_1stServeEffectiveness"] = df["p2_1stWon%"]/df["p2_2ndWon%"]

# Return to Service Points Ratio
# Return to Service Points Ratio - Return points played divided by service points played
# (the return points of a player are the service points of the opponent)
df["p1_Ret2ServPtsRatio"] = df["p2_svpt"] / df["p1_svpt"]
df["p2_Ret2ServPtsRatio"] = df["p1_svpt"] / df["p2_svpt"]

# Point Dominance Ratio
# Points Dominance Ratio: % of return points won divided by % of service points lost
# The return points of a player are the service points of the OPPONENT, so the % of return points won
# is 1 - the opponent's % of service points won (1 - the player's own % would be the % of service points lost).
p1_serve_won = (df["p1_1stWon"] + df["p1_2ndWon"]) / df["p1_svpt"]
p2_serve_won = (df["p2_1stWon"] + df["p2_2ndWon"]) / df["p2_svpt"]

df["p1_ServeWon%"] = p1_serve_won
df["p1_ReturnWon%"] = 1 - p2_serve_won

df["p2_ServeWon%"] = p2_serve_won
df["p2_ReturnWon%"] = 1 - p1_serve_won

# the % of service points lost by a player is the % of return points won by the opponent
df["p1_PtsDominanceRatio"] = df["p1_ReturnWon%"] / df["p2_ReturnWon%"]
df["p2_PtsDominanceRatio"] = df["p2_ReturnWon%"] / df["p1_ReturnWon%"]

# Break Points Ratio
# Break Points Ratio: % of break points converted divided by % of faced break points lost

# break points converted by a player = break points faced and not saved by the opponent
df["p1_BPConverted%"] = (df["p2_bpFaced"] - df["p2_bpSaved"]) / df["p2_bpFaced"]
df["p2_BPConverted%"] = (df["p1_bpFaced"] - df["p1_bpSaved"]) / df["p1_bpFaced"]

# the % of faced break points lost by a player is the % of break points converted by the opponent
df["p1_BPRatio"] = df["p1_BPConverted%"] / df["p2_BPConverted%"]
df["p2_BPRatio"] = df["p2_BPConverted%"] / df["p1_BPConverted%"]

# Points to Sets Over-Performing Ratio
# Points to Sets Over-Performing Ratio - Points to Sets Over-Performing Ratio: % of sets won divided by % of total points won
total_sets = df["p1_sets"] + df["p2_sets"]
total_points = df["p1_svpt"] + df["p2_svpt"]

# points won = own service points won + opponent's 1st and 2nd serve points that the opponent did not win
df["p1_SetWon%"] = df["p1_sets"] / total_sets
df["p1_PtsWon%"] = (df["p1_1stWon"] + df["p1_2ndWon"] + df["p2_1stIn"] - df["p2_1stWon"] + (df["p2_svpt"] - df["p2_1stIn"]) - df["p2_2ndWon"]) / total_points
df["p1_Pts2Sets_OP_Ratio"] = df["p1_SetWon%"] / df["p1_PtsWon%"]

df["p2_SetWon%"] = df["p2_sets"] / total_sets
df["p2_PtsWon%"] = (df["p2_1stWon"] + df["p2_2ndWon"] + df["p1_1stIn"] - df["p1_1stWon"] + (df["p1_svpt"] - df["p1_1stIn"]) - df["p1_2ndWon"]) / total_points
df["p2_Pts2Sets_OP_Ratio"] = df["p2_SetWon%"] / df["p2_PtsWon%"]

# Points to Games Over-Performing Ratio
# Points to Games Over-Performing Ratio - Points to Games Over-Performing Ratio: % of games won divided by % of total points won
p1_games = df["p1_1"] + df["p1_2"] + df["p1_3"] + df["p1_4"] + df["p1_5"]
p2_games = df["p2_1"] + df["p2_2"] + df["p2_3"] + df["p2_4"] + df["p2_5"]
total_games = p1_games + p2_games

df["p1_GmsWon%"] = p1_games / total_games
df["p1_Pts2Gms_OP_Ratio"] = df["p1_GmsWon%"] / df["p1_PtsWon%"]

df["p2_GmsWon%"] = p2_games / total_games
df["p2_Pts2Gms_OP_Ratio"] = df["p2_GmsWon%"] / df["p2_PtsWon%"]

# Games to Sets Over-Performing Ratio
# Games to Sets Over-Performing Ratio - Games to Sets Over-Performing Ratio: % of sets won divided by % of games won
df["p1_Gms2Sets_OP_Ratio"] = df["p1_SetWon%"] / df["p1_GmsWon%"]
df["p2_Gms2Sets_OP_Ratio"] = df["p2_SetWon%"] / df["p2_GmsWon%"]

# Break Points Over-Performing Ratio
# Break Points Over-Performing Ratio - Break Points Over-Performing Ratio: % of break points won (saved + converted) divided by % of total points won
total_break_points = df["p1_bpFaced"] + df["p2_bpFaced"]

df["p1_BPWon%"] = (df["p2_bpFaced"] - df["p2_bpSaved"] + df["p1_bpSaved"]) / total_break_points
df["p1_BP_OP_Ratio"] = df["p1_BPWon%"] / df["p1_PtsWon%"]

df["p2_BPWon%"] = (df["p1_bpFaced"] - df["p1_bpSaved"] + df["p2_bpSaved"]) / total_break_points
df["p2_BP_OP_Ratio"] = df["p2_BPWon%"] / df["p2_PtsWon%"]

# Break Points Saved Over-Performing Ratio
# Break Points Saved Over-Performing Ratio - Break Points Saved Over-Performing Ratio: % of break points saved divided by % of service points won
df["p1_BPSaved%"] = df["p1_bpSaved"] / df["p1_bpFaced"]
df["p1_BPSaved_OP_Ratio"] = df["p1_BPSaved%"] / df["p1_ServeWon%"]

df["p2_BPSaved%"] = df["p2_bpSaved"] / df["p2_bpFaced"]
df["p2_BPSaved_OP_Ratio"] = df["p2_BPSaved%"] / df["p2_ServeWon%"]

# Break Points Converted Over-Performing Ratio
# Break Points Converted Over-Performing Ratio - Break Points Converted Over-Performing Ratio: % of break points converted divided by % of return points won
df["p1_BPConverted_OP_Ratio"] = df["p1_BPConverted%"] / df["p1_ReturnWon%"]
df["p2_BPConverted_OP_Ratio"] = df["p2_BPConverted%"] / df["p2_ReturnWon%"]

# Extras
df["p1_Ace%"] = df["p1_ace"]/df["p1_svpt"]
df["p1_DF%"] = df["p1_df"]/df["p1_svpt"]
df["p1_1stServe%"] = df["p1_1stIn"] / df["p1_svpt"]
df["p1_1stReturnWon%"] = (df["p2_1stIn"] - df["p2_1stWon"]) / df["p2_1stIn"]

df["p2_Ace%"] = df["p2_ace"]/df["p2_svpt"]
df["p2_DF%"] = df["p2_df"]/df["p2_svpt"]
df["p2_1stServe%"] = df["p2_1stIn"] / df["p2_svpt"]
df["p2_1stReturnWon%"] = (df["p1_1stIn"] - df["p1_1stWon"]) / df["p1_1stIn"]

# Upsets
# an upset is a win against a better-ranked opponent; a better ranking is a SMALLER rank number,
# so the player scoring the upset is the one with the larger rank value
# (comparisons with a missing rank are False, such games are never counted as upsets)
p1_won = df["p1_win"].astype(bool)
p1_ranked_worse = df["p1_rank"] > df["p2_rank"]
p2_ranked_worse = df["p1_rank"] < df["p2_rank"]

df["p1_UpsetScored"] = (p1_ranked_worse & p1_won).astype(int)
df["p2_UpsetScored"] = (p2_ranked_worse & ~p1_won).astype(int)
# an upset scored by one player is an upset suffered by the other
df["p1_UpsetAgainst"] = df["p2_UpsetScored"]
df["p2_UpsetAgainst"] = df["p1_UpsetScored"]

In [14]:
# rows with a missing value in any column are removed, then df_ra is created as an independent copy
# NOTE(review): this also removes the games where one of the computed ratios is NaN (0/0, e.g. a player who faced no break point) - confirm this is intended

df = df.dropna(axis=0, how='any')
df_ra = df.copy(deep=True)


In [15]:
# so far our dataframe contains the in-game data for each game, of course we will not have this data available before the games
# in order to base our model on data available at the beginning of each game (when we want to predict its outcome)
# we will replace the in-game data with rolling averages of the previous games of each player

# parameters of the rolling average calculation
min_periods = 1
window = 60

# window_short = 10
# window_long = 30

# in-game features: they are only known once the game is finished, so they have to be replaced by rolling averages
# the names are listed once without prefix, the p1_ / p2_ column names are derived from them
in_game_features = (
  '1stWon%',
  '2ndWon%',
  '1stServeEffectiveness',
  'Ret2ServPtsRatio',
  'ServeWon%',
  'ReturnWon%',
  'PtsDominanceRatio',
  'BPConverted%',
  'BPRatio',
  'SetWon%',
  'PtsWon%',
  'Pts2Sets_OP_Ratio',
  'GmsWon%',
  'Pts2Gms_OP_Ratio',
  'Gms2Sets_OP_Ratio',
  'BPWon%',
  'BP_OP_Ratio',
  'BPSaved%',
  'BPSaved_OP_Ratio',
  'BPConverted_OP_Ratio',
  'Ace%',
  'DF%',
  '1stServe%',
  '1stReturnWon%',
  'UpsetScored',
  'UpsetAgainst',
)

calculated_features_p1 = ['p1_' + feature_name for feature_name in in_game_features]
calculated_features_p2 = ['p2_' + feature_name for feature_name in in_game_features]



In [16]:
# two independent copies of df_ra, used only to experiment with different rolling average techniques

df_ra2, df_ra3 = (df_ra.copy() for _ in range(2))

In the following part we experiment with different rolling-average techniques. To keep things clear, these experiments are done on copies of the dataframe, and only one of these copies is used in the end.
Once we are more confident that we have found the best technique, we can remove the others to improve performance.

In [17]:
# simple exponential: every in-game feature of df_ra2 is replaced by the exponentially weighted mean
# (span=window, rounded to 4 decimals) of the player's previous rows

# calculated_features_p2.extend(['minutes'])
# df_ra.sort_index(ascending=False)

def previous_games_ewm(values):
  # shift(1) leaves the current game out, only the earlier rows of the group are averaged
  return values.shift(1).ewm(span=window).mean().round(4)

for player_column, player_features in (('p1_name', calculated_features_p1), ('p2_name', calculated_features_p2)):
  for feature in player_features:
    df_ra2[feature] = df_ra2.groupby(player_column)[feature].transform(previous_games_ewm)

In [18]:
# time related exponential
# NOTE(review): half_life is not read by any active code visible in this part of the notebook
# (the rest of this cell is commented out) - presumably meant as a half-life in days for a
# time-based ewm; confirm before relying on it

half_life = 365

# calculated_features_p2.extend(['minutes'])

# df_ra.sort_index(ascending=False)

#gpby = df_ra3.groupby('p1_name').transform(lambda s: s[::1].shift(1)).rolling(window=window, min_periods=min_periods, method="table")


#  df_ra[(feature)] = df_ra.groupby('p1_name')[feature].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean().round(4))

#for feature in calculated_features_p1:
#  df_ra2[feature] = df_ra2.groupby('p1_name')[feature].transform(lambda s: s[::1].shift(1).ewm(span=window).mean().round(4))

#for feature in calculated_features_p2:
#  df_ra2[feature] = df_ra2.groupby('p2_name')[feature].transform(lambda s: s[::1].shift(1).ewm(span=window).mean().round(4))

In [19]:
# simple rolling average: every in-game feature of df_ra is replaced by the mean over the
# player's previous rows (at most `window` of them, at least `min_periods`)

def previous_games_mean(values):
  # shift(1) leaves the current game out of its own average
  return values.shift(1).rolling(window=window, min_periods=min_periods).mean()

for player_column, player_features in (('p1_name', calculated_features_p1), ('p2_name', calculated_features_p2)):
  for feature in player_features:
    df_ra[feature] = df_ra.groupby(player_column)[feature].transform(previous_games_mean)

In [20]:

# finally for some features we thought that it could be interesting to get each player's 'trend': to know if they are in a positive or negative progression over their last games
# to do so, we compare the position at the time of the game with the average of the positions over a set amount of previous games.
# we do this with everything which concerns ranking. This data is available pre-game

# window_trend

# df_ra['p1_rank_change'] = (df_ra['p1_rank'] - df_ra.groupby('p1_name')['p1_rank'].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean())) / df_ra['p1_rank']
# df_ra['p2_rank_change'] = (df_ra['p2_rank'] - df_ra.groupby('p2_name')['p2_rank'].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean())) / df_ra['p2_rank']

# df_ra['p1_rank_points_change'] = (df_ra['p1_rank_points'] - df_ra.groupby('p1_name')['p1_rank_points'].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean())) / df_ra['p1_rank_points']
# df_ra['p2_rank_points_change'] = (df_ra['p2_rank_points'] - df_ra.groupby('p2_name')['p2_rank_points'].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean())) / df_ra['p2_rank_points']

# df_ra['p1_elo_change'] = (df_ra['p1_elo'] - df_ra.groupby('p1_name')['p1_elo'].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean())) / df_ra['p1_elo']
# df_ra['p2_elo_change'] = (df_ra['p2_elo'] - df_ra.groupby('p2_name')['p2_elo'].transform(lambda s: s[::1].shift(1).rolling(window=window, min_periods=min_periods).mean())) / df_ra['p2_elo']

# ATP and elo ranks change, expressed as a relative change over the rolling window:
# (last value of the window - first non-missing value) / first non-missing value
def relative_change(window_values):
  first_value = window_values.dropna().iloc[0]
  return (window_values.iloc[-1] - first_value) / first_value

for metric in ('rank', 'rank_points', 'elo'):
  for player in ('p1', 'p2'):
    source_column = f'{player}_{metric}'
    # shift(1): only the values of the previous games are used; at least 2 of them are required
    df_ra[source_column + '_change'] = df_ra.groupby(f'{player}_name')[source_column].transform(
      lambda r: r.shift(1).rolling(window=window, min_periods=2).apply(relative_change))

# #ATP and elo ranks change with moving average crossing
# df_ra['p1_rank_change'] = df_ra.groupby('p1_name')['p1_rank'].transform(lambda r: (r.shift(1).rolling(window=window_short, min_periods=2).mean() / r.shift(1).rolling(window=window_long, min_periods=2).mean()) - 1)
# df_ra['p2_rank_change'] = df_ra.groupby('p2_name')['p2_rank'].transform(lambda r: (r.shift(1).rolling(window=window_short, min_periods=2).mean() / r.shift(1).rolling(window=window_long, min_periods=2).mean()) - 1)

# df_ra['p1_rank_points_change'] = df_ra.groupby('p1_name')['p1_rank_points'].transform(lambda r: (r.shift(1).rolling(window=window_short, min_periods=2).mean() / r.shift(1).rolling(window=window_long, min_periods=2).mean()) - 1)
# df_ra['p2_rank_points_change'] = df_ra.groupby('p2_name')['p2_rank_points'].transform(lambda r: (r.shift(1).rolling(window=window_short, min_periods=2).mean() / r.shift(1).rolling(window=window_long, min_periods=2).mean()) -1) 

# df_ra['p1_elo_change'] = df_ra.groupby('p1_name')['p1_elo'].transform(lambda r: (r.shift(1).rolling(window=window_short, min_periods=2).mean() / r.shift(1).rolling(window=window_long, min_periods=2).mean()) - 1)
# df_ra['p2_elo_change'] = df_ra.groupby('p2_name')['p2_elo'].transform(lambda r: (r.shift(1).rolling(window=window_short, min_periods=2).mean() / r.shift(1).rolling(window=window_long, min_periods=2).mean()) - 1)

In [21]:
# missing changes are set to 0: they belong to the first games of a player in the database,
# where there are not yet enough previous games to measure a progression
change_columns = [f'{player}_{metric}_change' for metric in ('rank', 'rank_points', 'elo') for player in ('p1', 'p2')]
df_ra = df_ra.fillna({change_column: 0 for change_column in change_columns})

In [22]:
# when trying to predict the outcome of the game, the 'strength' of each player is not enough:
# the model needs to process the features of each player in relation to the other's,
# so all the features below are turned into ratios P1 / P2 instead of separate columns

# match statistics
match_stat_features = [
  '1stWon%', '2ndWon%', '1stServeEffectiveness', 'Ret2ServPtsRatio',
  'ServeWon%', 'ReturnWon%', 'PtsDominanceRatio',
  'BPConverted%', 'BPRatio',
  'SetWon%', 'PtsWon%', 'Pts2Sets_OP_Ratio',
  'GmsWon%', 'Pts2Gms_OP_Ratio', 'Gms2Sets_OP_Ratio',
  'BPWon%', 'BP_OP_Ratio', 'BPSaved%', 'BPSaved_OP_Ratio', 'BPConverted_OP_Ratio',
  'Ace%', 'DF%', '1stServe%', '1stReturnWon%',
]

# rankings and player attributes
player_profile_features = ['rank_points', 'rank', 'age', 'ht', 'elo']

features = match_stat_features + player_profile_features

# for every feature the p1 column is overwritten with p1 / p2 and renamed to ratio_<feature>,
# the p2 column is then dropped so that only the ratio remains

for feature in features:
  p1_column, p2_column = 'p1_' + feature, 'p2_' + feature
  #df_ra[('Diff_' + feature)] = df_ra[('p2_' + feature)] - df_ra[('p1_' + feature)]
  df_ra[p1_column] = df_ra[p1_column].div(df_ra[p2_column])
  df_ra.rename(columns={p1_column: 'ratio_' + feature}, inplace=True)
  df_ra.drop(columns=p2_column, inplace=True)



In [None]:
# finally, we save the file for future pre modeling processing and modeling
# at this point the dataframe can still use a dropna and drop inf
# the window size is part of the file name: it is taken from `window` so that the name always
# matches the parameter actually used (window = 60 gives 'DTB_Rolling_Features_ratios_w60.csv')

df_ra.to_csv(f'DTB_Rolling_Features_ratios_w{window}.csv', sep=',')
# df.to_csv('/content/drive/MyDrive/ProjetSports/DataTennis/DTB_Features.csv', sep=',')
