In [1]:
import pandas as pd

In [2]:
#reading csv dataset and shuffling its rows
#a fixed seed makes the shuffled row order identical on every fresh run of the notebook
SHUFFLE_SEED = 42
data = (
    pd.read_csv("./raw_data/atp_matches_till_2022.csv")
    .sample(frac=1.0, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)  # replace the shuffled labels with a fresh 0..n-1 index
)

In [3]:
#keep only rows outside tournament '1968-9295' whose score holds no letters
#(a missing score counts as "no letters", so those rows are kept)
is_excluded_tourney = data['tourney_id'].eq('1968-9295')
score_has_letters = data['score'].str.contains('[A-Za-z]', na=False)
data = data[~is_excluded_tourney & ~score_has_letters]

In [4]:
#columns removed from the match data before any further processing
cols_to_drop = [
    'winner_seed', 'winner_entry', 'loser_seed', 'loser_entry',
    'tourney_id', 'tourney_name', 'match_num', 'tourney_level', 'tourney_date',
    'score',
    'winner_rank_points', 'loser_rank_points',
    'winner_ioc', 'loser_ioc',
]
#drop returns a new frame, so `data` itself keeps all of its columns
new_data = data.drop(labels=cols_to_drop, axis='columns')

In [5]:
#replace missing values in the hand columns with each column's most frequent value
cols_to_fill = ['winner_hand', 'loser_hand']
modes = {}
for col in cols_to_fill:
    #mode() can return several values on a tie; take the first one
    modes[col] = new_data[col].mode()[0]
#a dict passed to fillna only touches the columns named by its keys
new_data = new_data.fillna(value=modes)

In [6]:
#replace missing heights and ages with the mean of their own column
cols = ['winner_ht','winner_age','loser_ht','loser_age']
#fillna with a Series of column means fills each column with its matching mean
new_data[cols] = new_data[cols].fillna(new_data[cols].mean())

In [7]:
#discard every row that still has at least one missing value in any column
new_data = new_data.dropna(axis=0, how='any')

In [8]:
#round heights to two decimal places and ages to zero decimal places
height_cols = ['winner_ht', 'loser_ht']
age_cols = ['winner_age', 'loser_age']
new_data[height_cols] = new_data[height_cols].round(2)
new_data[age_cols] = new_data[age_cols].round()

In [9]:
#two independent deep copies of the cleaned matches: one for winners, one for losers
winner_df, loser_df = new_data.copy(deep=True), new_data.copy(deep=True)

In [10]:
#add the constant target column: 1 (win) in winner_df, 0 (lose) in loser_df
winner_df = winner_df.assign(target=1)
loser_df = loser_df.assign(target=0)

In [11]:
def _relabel_player_columns(columns, prefix_map):
    """Build a rename mapping that swaps leading role prefixes for player labels.

    Only the start of each column name is inspected. A name that merely
    contains a prefix somewhere inside it (``draw_size`` contains ``w_``) is
    therefore left untouched and needs no patch-up rename afterwards.

    Parameters
    ----------
    columns : iterable of str
        Column names to inspect.
    prefix_map : dict of str to str
        Leading text to look for, mapped to the text that replaces it.
        Entries are tried in insertion order and the first match wins.

    Returns
    -------
    dict of str to str
        Old name -> new name, holding only the columns whose name changes.
    """
    renamed = {}
    for col in columns:
        for old_prefix, new_prefix in prefix_map.items():
            if col.startswith(old_prefix):
                renamed[col] = new_prefix + col[len(old_prefix):]
                break
    return renamed


# In winner_df the 'winner'/'w_' columns become player_1 and the 'loser'/'l_' columns player_2
winner_df = winner_df.rename(columns=_relabel_player_columns(
    winner_df.columns,
    {'winner': 'player_1', 'w_': 'player_1_', 'loser': 'player_2', 'l_': 'player_2_'},
))

# In loser_df the roles are swapped: 'loser'/'l_' become player_1 and 'winner'/'w_' player_2
loser_df = loser_df.rename(columns=_relabel_player_columns(
    loser_df.columns,
    {'loser': 'player_1', 'l_': 'player_1_', 'winner': 'player_2', 'w_': 'player_2_'},
))

In [12]:
#concatenating both df's
#rows are stacked (all of winner_df, then all of loser_df) and then sorted by index label;
#a stable sort keeps rows with equal labels in their stacked order, i.e. winner_df row first
#'mergesort' is one of the documented stable values for sort_index's `kind`
df = pd.concat([winner_df, loser_df]).sort_index(kind='mergesort').reset_index(drop=True)

In [13]:
#absolute gap between player_1 and player_2 in height and in age
df['ht_diff'] = df['player_1_ht'].sub(df['player_2_ht']).abs()
df['age_diff'] = df['player_1_age'].sub(df['player_2_age']).abs()

In [14]:
#build one "<id> - <name>" label column per player
player_label_cols = (
    ('player_1', 'player_1_id', 'player_1_name'),
    ('player_2', 'player_2_id', 'player_2_name'),
)
for label_col, id_col, name_col in player_label_cols:
    #cast both parts to str so they can be joined with +
    df[id_col] = df[id_col].astype(str)
    df[name_col] = df[name_col].astype(str)
    df[label_col] = df[id_col] + ' - ' + df[name_col]

In [15]:
#remove the separate id and name columns from df in place
id_and_name_cols = ['player_1_id', 'player_1_name', 'player_2_id', 'player_2_name']
df.drop(id_and_name_cols, axis=1, inplace=True)

In [18]:
#reading the players csv
player = pd.read_csv(filepath_or_buffer="./raw_data/atp_players_till_2022.csv")