In [1]:
import pandas as pd
import numpy as np

In [2]:
#reading the csv dataset and shuffling its rows
#the shuffle is seeded so that "Restart Kernel -> Run All" always produces the
#same row order (without random_state every run yields a different order)
RANDOM_SEED = 42
data = (
    pd.read_csv("./raw_data/atp_matches_till_2022.csv")
    .sample(frac=1.0, random_state=RANDOM_SEED)
    #discard the pre-shuffle index so rows are numbered 0..n-1 again
    .reset_index(drop=True)
)

In [3]:
#removing the undesired championship ('1968-9295') and the rows whose score
#contains at least one ASCII letter; rows with a missing score are kept (na=False)
is_undesired_tourney = data['tourney_id'] == '1968-9295'
score_has_letters = data['score'].str.contains('[A-Za-z]', na=False)
data = data[~is_undesired_tourney & ~score_has_letters]

In [4]:
#dropping the columns that are not used from here on
cols_to_drop = [
    #seed / entry columns
    'winner_seed', 'winner_entry', 'loser_seed', 'loser_entry',
    #tournament and match columns
    'tourney_id', 'tourney_name', 'match_num', 'tourney_level', 'tourney_date', 'score',
    #rank points columns
    'winner_rank_points', 'loser_rank_points',
    #ioc columns
    'winner_ioc', 'loser_ioc',
]
#drop() returns a new frame, `data` itself is left untouched
new_data = data.drop(labels=cols_to_drop, axis='columns')

In [5]:
#filling the na's of the two *_hand columns with the mode of each column
cols_to_fill = ['winner_hand', 'loser_hand']
#mode() can return several values; [0] takes the first one
modes = dict((col, new_data[col].mode()[0]) for col in cols_to_fill)
#a dict passed to fillna only touches the columns named by its keys
new_data = new_data.fillna(value=modes)

In [6]:
#filling the na's of the height and age columns with the mean of each column
cols = ['winner_ht', 'winner_age', 'loser_ht', 'loser_age']
#one mean per column, computed over the non-missing values
column_means = new_data[cols].mean()
#fillna with a Series fills every column with the value stored under its label
new_data[cols] = new_data[cols].fillna(column_means)

In [7]:
#dropping every row that still has at least one empty value in any column
new_data = new_data.dropna(axis='index', how='any')

In [8]:
#rounding heights to 2 decimals and ages to 0 decimals
#columns that are not listed in the dict are returned unchanged
new_data = new_data.round({
    'winner_ht': 2,
    'loser_ht': 2,
    'winner_age': 0,
    'loser_age': 0,
})

In [9]:
#creating two independent copies of the cleaned data:
#one will hold the winner's point of view, the other the loser's
winner_df, loser_df = (new_data.copy() for _ in range(2))

In [10]:
#appending the target column: 1 (win) in the winner frame, 0 (lose) in the loser frame
winner_df = winner_df.assign(target=1)
loser_df = loser_df.assign(target=0)

In [11]:
#renaming the column titles
#winner frame: 'winner' -> 'player_1' and 'loser' -> 'player_2'
#loser frame:  'loser'  -> 'player_1' and 'winner' -> 'player_2'
#NOTE(review): only titles containing the literal words 'winner' / 'loser' are
#touched; the displayed result still has columns such as l_SvGms, l_bpSaved and
#l_bpFaced, so those keep their original meaning in both frames - confirm this is intended
winner_df.columns = (
    winner_df.columns
    .str.replace('winner', 'player_1', regex=False)
    .str.replace('loser', 'player_2', regex=False)
)
loser_df.columns = (
    loser_df.columns
    .str.replace('loser', 'player_1', regex=False)
    .str.replace('winner', 'player_2', regex=False)
)

In [12]:
#concatenating both df's
#winner_df and loser_df are copies of the same frame, so every index label
#appears twice after the concat; a STABLE sort on the index puts the two rows
#of a label next to each other, the winner_df row first
#'mergesort' is the documented stable sort kind ('merge' is not a documented value)
df = (
    pd.concat([winner_df, loser_df])
    .sort_index(kind='mergesort')
    .reset_index(drop=True)
)

In [13]:
#creating the height and age difference columns
#both are absolute differences, so they never go negative
height_gap = df['player_1_ht'].sub(df['player_2_ht'])
age_gap = df['player_1_age'].sub(df['player_2_age'])
df['ht_diff'] = height_gap.abs()
df['age_diff'] = age_gap.abs()

In [14]:
#putting together the player ID with its name as '<id> - <name>'
#the ID and name columns are cast to str first so that they can be joined
df = df.astype({
    'player_1_id': str,
    'player_1_name': str,
    'player_2_id': str,
    'player_2_name': str,
})
df['player_1'] = df['player_1_id'].str.cat(df['player_1_name'], sep=' - ')
df['player_2'] = df['player_2_id'].str.cat(df['player_2_name'], sep=' - ')

In [15]:
#dropping the separate player ID and name columns
id_and_name_cols = ['player_1_id', 'player_1_name', 'player_2_id', 'player_2_name']
df = df.drop(columns=id_and_name_cols)

In [16]:
#displaying the resulting dataframe (two consecutive rows per match: target 1, then target 0)
df

Unnamed: 0,surface,draw_size,player_1_hand,player_1_ht,player_1_age,player_2_hand,player_2_ht,player_2_age,best_of,round,...,l_SvGms,l_bpSaved,l_bpFaced,player_1_rank,player_2_rank,target,ht_diff,age_diff,player_1,player_2
0,Hard,32,R,190.00,25.0,R,190.00,25.0,3,R32,...,7.0,12.0,16.0,165.0,99.0,1,0.00,0.0,101900 - Nicklas Kulti,101775 - Frederik Fetterlein
1,Hard,32,R,190.00,25.0,R,190.00,25.0,3,R32,...,7.0,12.0,16.0,99.0,165.0,0,0.00,0.0,101775 - Frederik Fetterlein,101900 - Nicklas Kulti
2,Hard,128,R,180.00,30.0,R,193.00,24.0,5,R64,...,19.0,3.0,9.0,10.0,95.0,1,13.00,6.0,105676 - David Goffin,144750 - Lloyd Harris
3,Hard,128,R,193.00,24.0,R,180.00,30.0,5,R64,...,19.0,3.0,9.0,95.0,10.0,0,13.00,6.0,144750 - Lloyd Harris,105676 - David Goffin
4,Grass,32,L,180.00,32.0,R,184.22,18.0,3,R32,...,14.0,6.0,11.0,229.0,558.0,1,4.22,14.0,101373 - Karsten Braasch,103715 - Kirill Ivanov Smolensky
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172443,Carpet,32,L,183.00,28.0,R,185.00,23.0,3,R16,...,9.0,5.0,9.0,727.0,21.0,0,2.00,5.0,101659 - Jan Apell,102450 - Tim Henman
172444,Hard,4,R,184.45,20.0,R,183.00,26.0,5,RR,...,19.0,6.0,14.0,504.0,1205.0,1,1.45,6.0,134120 - Kuan Yi Lee,105579 - Kittipong Wachiramanowong
172445,Hard,4,R,183.00,26.0,R,184.45,20.0,5,RR,...,19.0,6.0,14.0,1205.0,504.0,0,1.45,6.0,105579 - Kittipong Wachiramanowong,134120 - Kuan Yi Lee
172446,Hard,32,R,185.00,26.0,R,190.00,28.0,3,R32,...,13.0,3.0,7.0,34.0,5.0,1,5.00,2.0,103285 - Radek Stepanek,102845 - Carlos Moya
