In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GroupShuffleSplit, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Read the raw match CSV and shuffle its rows.
# The fixed random_state makes the shuffle (and every row order, split and
# score derived from it) identical on each "Restart & Run All".
data = (
    pd.read_csv("./raw_data/atp_matches_till_2022.csv")
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [3]:
# Remove every row of tournament '1968-9295', then every row whose score text
# contains a letter. Rows with a missing score are kept (na=False).
unwanted_tourney = data['tourney_id'] == '1968-9295'
data = data.drop(index=data.loc[unwanted_tourney].index)
score_has_letters = data['score'].str.contains('[A-Za-z]', na=False)
data = data[~score_has_letters]

In [4]:
# Columns that are not used by any later cell of this notebook.
cols_to_drop = [
    'winner_seed', 'winner_entry',
    'loser_seed', 'loser_entry',
    'tourney_id', 'tourney_name',
    'match_num', 'tourney_level', 'tourney_date',
    'score',
    'winner_rank_points', 'loser_rank_points',
    'winner_ioc', 'loser_ioc',
]
# new_data is a new frame; `data` itself is left untouched.
new_data = data.drop(labels=cols_to_drop, axis='columns')

In [5]:
# Replace missing values in the handedness columns with each column's mode
# (the first one returned when there are ties).
cols_to_fill = ['winner_hand', 'loser_hand']
modes = {}
for hand_col in cols_to_fill:
    modes[hand_col] = new_data[hand_col].mode()[0]
new_data = new_data.fillna(value=modes)

In [6]:
# Replace missing heights and ages with the mean of their own column.
cols = ['winner_ht','winner_age','loser_ht','loser_age']
for numeric_col in cols:
    column_mean = new_data[numeric_col].mean()
    new_data[numeric_col] = new_data[numeric_col].fillna(column_mean)

In [7]:
# Keep only the rows in which every remaining column has a value.
new_data = new_data.dropna(axis='index', how='any')

In [8]:
# Round heights to two decimals and ages to whole numbers.
height_cols = ['winner_ht', 'loser_ht']
age_cols = ['winner_age', 'loser_age']
new_data[height_cols] = new_data[height_cols].round(decimals=2)
new_data[age_cols] = new_data[age_cols].round(decimals=0)

In [9]:
# Two independent copies of the cleaned matches: one becomes the winner's
# view of each match, the other the loser's view.
winner_df, loser_df = new_data.copy(), new_data.copy()

In [10]:
# Label column: 1 (win) for the winner's view, 0 (lose) for the loser's view.
winner_df = winner_df.assign(target=1)
loser_df = loser_df.assign(target=0)

In [11]:
def _to_player_columns(frame, player_1_role, player_2_role):
    """Return ``frame`` with winner/loser columns renamed to player_1/player_2.

    Only columns that *start* with a role prefix are renamed ('winner_' / 'w_'
    for the winner, 'loser_' / 'l_' for the loser). Anchoring on the prefix is
    what keeps unrelated columns such as 'draw_size' intact; matching 'w_'
    anywhere in the name would turn it into 'draplayer_1_size'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns use the winner/loser naming.
    player_1_role, player_2_role : {'winner', 'loser'}
        Which role becomes player_1 and which becomes player_2.

    Returns
    -------
    pandas.DataFrame
        A renamed copy; ``frame`` itself is not modified.
    """
    role_prefixes = {'winner': ('winner_', 'w_'), 'loser': ('loser_', 'l_')}

    # Map every source prefix to the player slot it should become.
    slot_for_prefix = {}
    for role, slot in ((player_1_role, 'player_1_'), (player_2_role, 'player_2_')):
        for prefix in role_prefixes[role]:
            slot_for_prefix[prefix] = slot

    renamed = {}
    for col in frame.columns:
        for prefix, slot in slot_for_prefix.items():
            if col.startswith(prefix):
                renamed[col] = slot + col[len(prefix):]
                break
    return frame.rename(columns=renamed)


# In winner_df the winner is player_1; in loser_df the loser is player_1.
winner_df = _to_player_columns(winner_df, 'winner', 'loser')
loser_df = _to_player_columns(loser_df, 'loser', 'winner')

In [12]:
# Stack both views and interleave them by original match index.
# 'mergesort' is the documented stable algorithm: for every match the row
# coming from winner_df stays directly ahead of the one from loser_df, so
# after reset_index rows 2k and 2k+1 are the two views of the same match.
df = (
    pd.concat([winner_df, loser_df])
    .sort_index(kind='mergesort')
    .reset_index(drop=True)
)

In [13]:
# Absolute height and age gaps between the two players of each row.
for attribute in ('ht', 'age'):
    gap = df[f'player_1_{attribute}'].sub(df[f'player_2_{attribute}'])
    df[f'{attribute}_diff'] = gap.abs()

In [14]:
# Build one "<id> - <name>" label per player from its ID and its name.
for player in ('player_1', 'player_2'):
    id_col, name_col = f'{player}_id', f'{player}_name'
    df[id_col] = df[id_col].astype(str)
    df[name_col] = df[name_col].astype(str)
    df[player] = df[id_col] + ' - ' + df[name_col]

In [15]:
# The separate ID and name columns are now redundant with the combined
# player_1 / player_2 labels.
df = df.drop(
    columns=[
        'player_1_id', 'player_1_name',
        'player_2_id', 'player_2_name',
    ]
)

In [16]:
# Preview the first five rows of the reshaped frame.
df.head(n=5)

Unnamed: 0,surface,draw_size,player_1_hand,player_1_ht,player_1_age,player_2_hand,player_2_ht,player_2_age,best_of,round,...,player_2_SvGms,player_2_bpSaved,player_2_bpFaced,player_1_rank,player_2_rank,target,ht_diff,age_diff,player_1,player_2
0,Clay,128,R,175.0,29.0,R,185.0,29.0,5,R64,...,13.0,12.0,16.0,7.0,81.0,1,10.0,0.0,103970 - David Ferrer,103898 - Julien Benneteau
1,Clay,128,R,185.0,29.0,R,175.0,29.0,5,R64,...,14.0,1.0,1.0,81.0,7.0,0,10.0,0.0,103898 - Julien Benneteau,103970 - David Ferrer
2,Hard,128,R,190.0,29.0,L,193.0,31.0,5,R128,...,24.0,9.0,12.0,62.0,91.0,1,3.0,2.0,102642 - Cyril Saulnier,102257 - Greg Rusedski
3,Hard,128,L,193.0,31.0,R,190.0,29.0,5,R128,...,23.0,1.0,3.0,91.0,62.0,0,3.0,2.0,102257 - Greg Rusedski,102642 - Cyril Saulnier
4,Clay,56,L,185.0,27.0,R,188.0,22.0,3,QF,...,13.0,2.0,6.0,5.0,34.0,1,3.0,5.0,104745 - Rafael Nadal,105777 - Grigor Dimitrov


In [17]:
# Set the player labels aside; they are not model features but are attached
# to the processed training frame at the end of the notebook.
cols_to_remove = ['player_1', 'player_2']
cols_removed = df.loc[:, cols_to_remove]

In [18]:
# Feature columns used for modelling, followed by the label column 'target'.
columns_to_keep = [
    'player_2_df',
    'player_1_ht',
    'player_1_df',
    'player_2_ht',
    'player_1_bpSaved',
    'player_1_2ndWon',
    'player_1_rank',
    'age_diff',
    'ht_diff',
    'player_2_rank',
    'player_1_1stIn',
    'player_2_age',
    'draw_size',
    'player_1_age',
    'player_2_bpSaved',
    'player_2_2ndWon',
    'player_2_ace',
    'best_of',
    'player_1_ace',
    'player_1_1stWon',
    'player_2_1stIn',
    'target',
]

df_filtered = df.loc[:, columns_to_keep]

In [19]:
# Preview the first five rows of the modelling frame.
df_filtered.head(n=5)

Unnamed: 0,player_2_df,player_1_ht,player_1_df,player_2_ht,player_1_bpSaved,player_1_2ndWon,player_1_rank,age_diff,ht_diff,player_2_rank,...,draw_size,player_1_age,player_2_bpSaved,player_2_2ndWon,player_2_ace,best_of,player_1_ace,player_1_1stWon,player_2_1stIn,target
0,2.0,175.0,0.0,185.0,1.0,19.0,7.0,0.0,10.0,81.0,...,128,29.0,12.0,12.0,4.0,5,3.0,40.0,71.0,1
1,0.0,185.0,2.0,175.0,12.0,12.0,81.0,0.0,10.0,7.0,...,128,29.0,1.0,19.0,3.0,5,4.0,46.0,44.0,0
2,9.0,190.0,4.0,193.0,1.0,40.0,62.0,2.0,3.0,91.0,...,128,29.0,9.0,29.0,18.0,5,18.0,57.0,86.0,1
3,4.0,193.0,9.0,190.0,9.0,29.0,91.0,2.0,3.0,62.0,...,128,31.0,1.0,40.0,18.0,5,18.0,70.0,70.0,0
4,2.0,185.0,0.0,188.0,5.0,8.0,5.0,5.0,3.0,34.0,...,56,27.0,2.0,15.0,8.0,3,2.0,45.0,38.0,1


In [20]:
# List every column of the full frame (df, not df_filtered).
df.columns

Index(['surface', 'draw_size', 'player_1_hand', 'player_1_ht', 'player_1_age',
       'player_2_hand', 'player_2_ht', 'player_2_age', 'best_of', 'round',
       'minutes', 'player_1_ace', 'player_1_df', 'player_1_svpt',
       'player_1_1stIn', 'player_1_1stWon', 'player_1_2ndWon',
       'player_1_SvGms', 'player_1_bpSaved', 'player_1_bpFaced',
       'player_2_ace', 'player_2_df', 'player_2_svpt', 'player_2_1stIn',
       'player_2_1stWon', 'player_2_2ndWon', 'player_2_SvGms',
       'player_2_bpSaved', 'player_2_bpFaced', 'player_1_rank',
       'player_2_rank', 'target', 'ht_diff', 'age_diff', 'player_1',
       'player_2'],
      dtype='object')

In [21]:
# Numeric feature columns fed to the preprocessing pipeline: the same columns
# as columns_to_keep, in the same order, without 'target'.
num_cols = [
    'player_2_df',
    'player_1_ht',
    'player_1_df',
    'player_2_ht',
    'player_1_bpSaved',
    'player_1_2ndWon',
    'player_1_rank',
    'age_diff',
    'ht_diff',
    'player_2_rank',
    'player_1_1stIn',
    'player_2_age',
    'draw_size',
    'player_1_age',
    'player_2_bpSaved',
    'player_2_2ndWon',
    'player_2_ace',
    'best_of',
    'player_1_ace',
    'player_1_1stWon',
    'player_2_1stIn',
]

In [22]:
# Numeric preprocessing: mean-impute missing values, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=numeric_steps)


"cat_transformer = Pipeline(steps=[\n    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),\n    ('onehot', OneHotEncoder(handle_unknown='ignore'))\n])"

In [23]:
# Apply the numeric transformer to num_cols; it is the only branch.
numeric_branch = ('num', num_transformer, num_cols)
preprocessor = ColumnTransformer(transformers=[numeric_branch])

In [24]:
# Final pipeline: preprocessing followed by a random forest classifier.
# The forest is seeded so that repeated runs of the notebook train the same
# model and report the same scores.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [25]:
# Separate the label from the feature columns.
y = df_filtered['target']
X = df_filtered.drop(columns=['target'])


In [26]:
# Splitting the data: hold out 20% of the matches for testing.
# Each match is stored as two mirrored rows (the frame was built by
# interleaving the winner's and the loser's view, so index labels 2k and 2k+1
# belong to the same match). A row-wise random split would put one view in
# train and its mirror in test, so the split is done per match instead.
# random_state makes the split reproducible.
match_ids = X.index // 2
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_pos, test_pos = next(splitter.split(X, y, groups=match_ids))
X_train, X_test = X.iloc[train_pos], X.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]

In [27]:
# Mean accuracy of the pipeline over 5 cross-validation folds of the full X, y.
fold_accuracies = cross_val_score(
    pipeline,
    X,
    y,
    cv=5,
    scoring='accuracy',
)
fold_accuracies.mean()

np.float64(0.8181190617661438)

In [28]:
# Fit the preprocessing steps and the classifier on the training split only.
pipeline.fit(X_train, y_train)

In [29]:
# Predict the label of every held-out row; the result is only displayed,
# it is not stored in a variable.
pipeline.predict(X_test)

array([1, 1, 0, ..., 1, 1, 0])

In [31]:
# Run the training features through the fitted preprocessing step only
# (no classifier), to inspect what the model actually receives.
fitted_preprocessor = pipeline.named_steps['preprocessor']
X_train_processed = fitted_preprocessor.transform(X_train)

In [32]:
# Output column names as reported by the fitted preprocessor.
columns = pipeline.named_steps['preprocessor'].get_feature_names_out()

# Wrap the processed array in a DataFrame. Row i of X_train_processed comes
# from row i of X_train, so reuse X_train's index: this keeps the processed
# frame label-aligned with X_train, y_train and the original frame.
X_train_p = pd.DataFrame(X_train_processed, columns=columns, index=X_train.index)

In [33]:
# (rows, columns) of the processed training frame.
X_train_p.shape


(137958, 21)

In [36]:
# Adding names back.
# Row i of X_train_p was produced from row i of X_train, so look up the
# labels of exactly those rows (by X_train's index) and assign them by
# position. Assigning cols_removed directly would align on index labels and
# could attach the names of unrelated matches to each row.
train_names = cols_removed.loc[X_train.index]
X_train_p['player_1'] = train_names['player_1'].to_numpy()
X_train_p['player_2'] = train_names['player_2'].to_numpy()
