In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw match dataset from its CSV file into a DataFrame.
RAW_MATCHES_PATH = "./raw_data/atp_matches_till_2022.csv"
data = pd.read_csv(RAW_MATCHES_PATH)

In [3]:
# Exclude every row whose tourney_id is '1968-9295' (the championship the
# analysis does not want to keep).
is_excluded = data['tourney_id'] == '1968-9295'
data = data.drop(index=data.index[is_excluded])

In [4]:
# Drop the columns that are not used (original note: too many empty values),
# then build `new_data` from only the first KEPT_COLUMN_COUNT remaining columns.
# NOTE: this cell rebinds `data` without the dropped columns, so running it a
# second time without re-running the load cell raises KeyError.
cols_to_drop = ['winner_seed', 'winner_entry', 'loser_seed', 'loser_entry', 'tourney_id', 'tourney_name', 'match_num', 'draw_size', 'tourney_date']
data = data.drop(columns=cols_to_drop)

# Number of leading columns to keep; everything after this position is dropped.
KEPT_COLUMN_COUNT = 17
# Pass the trailing column *labels* explicitly instead of a DataFrame slice
# (a DataFrame only works as `columns=` because iterating it yields labels).
new_data = data.drop(columns=data.columns[KEPT_COLUMN_COUNT:])

In [5]:
# Replace missing values in the two *_hand columns with that column's mode
# (the first one, when several values tie for most frequent).
cols_to_fill = ['winner_hand', 'loser_hand']
modes = dict((col, new_data[col].mode()[0]) for col in cols_to_fill)
new_data = new_data.fillna(value=modes)

In [6]:
# Replace missing heights and ages with the mean of their own column.
cols = ['winner_ht','winner_age','loser_ht','loser_age']
new_data = new_data.assign(
    **{name: new_data[name].fillna(new_data[name].mean()) for name in cols}
)

In [7]:
# Discard any row that is missing one of the columns below
# (original author's note: roughly 2000 rows are removed in total).
required_cols = ['score', 'loser_ioc', 'winner_ioc', 'surface']
new_data = new_data.dropna(subset=required_cols, how='any')

In [8]:
def clean_score(score):
    """Normalise a raw score string to digits, hyphens and single spaces.

    Steps, in order:
      1. remove every parenthesised group, e.g. the "(5)" in "7-6(5)";
      2. remove every character that is not a digit, a hyphen or a space;
      3. collapse whitespace runs to one space and trim both ends.

    A score containing no digits at all (e.g. "W/O") becomes an empty string.

    Args:
        score: the raw score text (must be a ``str``).

    Returns:
        The cleaned score string.
    """
    without_parens = re.sub(r'\(.*?\)', '', score)
    allowed_only = re.sub(r'[^0-9\- ]', '', without_parens)
    return re.sub(r'\s+', ' ', allowed_only).strip()

In [9]:
# Replace the raw 'score' column with its cleaned, string-typed version.
# The old column is dropped first, so the cleaned 'score' becomes the last column.
cleaned_scores = new_data['score'].apply(clean_score).astype(str)
new_data = new_data.drop(columns=['score']).assign(score=cleaned_scores)

In [10]:
# Round heights to two decimals and ages to whole numbers.
height_cols = ['winner_ht', 'loser_ht']
age_cols = ['winner_age', 'loser_age']
new_data[height_cols] = new_data[height_cols].round(2)
new_data[age_cols] = new_data[age_cols].round()

In [11]:
def calculate_points_and_sets(score_str):
    """Sum per-set numbers and count sets won for both sides of a score.

    The score is split on whitespace into tokens such as "6-4". For each
    token of the form "<int>-<int>", the left number is added to player 1's
    total and the right number to player 2's total; whichever side has the
    larger number is credited with one set (a tie credits nobody).
    Tokens without a hyphen, or that do not parse as exactly two integers,
    are skipped.

    Args:
        score_str: cleaned score string, e.g. "6-4 3-6 7-6".

    Returns:
        pd.Series with four values, in this order:
        [player_1_points, player_2_points, player_1_sets, player_2_sets].
    """
    totals = [0, 0]      # summed numbers: [player 1, player 2]
    sets_won = [0, 0]    # sets credited:  [player 1, player 2]

    for token in score_str.split():
        if '-' not in token:
            continue
        try:
            left, right = map(int, token.split('-'))
        except ValueError:
            # Malformed token (non-numeric part or wrong number of parts).
            continue

        totals[0] += left
        totals[1] += right

        if left > right:
            sets_won[0] += 1
        elif right > left:
            sets_won[1] += 1

    return pd.Series([totals[0], totals[1], sets_won[0], sets_won[1]])

In [12]:
# Two independent copies of the cleaned data: one to be viewed from the
# winner's side and one from the loser's side.
winner_df, loser_df = new_data.copy(), new_data.copy()

In [13]:
# Label column: 1 (win) for every row of winner_df, 0 (lose) for every row of loser_df.
winner_df = winner_df.assign(target=1)
loser_df = loser_df.assign(target=0)

In [14]:
# Remove rows whose cleaned score is one of the placeholder strings below.
# NOTE(review): clean_score returns '' for scores with no digits (e.g. "W/O"),
# and '' is not in this list, so those rows are kept - confirm that is intended.
invalid_scores = ['nan', '0']
winner_df = winner_df[~winner_df['score'].isin(invalid_scores)]
loser_df = loser_df[~loser_df['score'].isin(invalid_scores)]

In [15]:
# Rename columns to a role-neutral scheme.
# winner_df: winner -> player_1, loser -> player_2.
# loser_df:  loser -> player_1, winner -> player_2.
winner_df = winner_df.rename(
    columns=lambda name: name.replace('winner', 'player_1').replace('loser', 'player_2')
)
loser_df = loser_df.rename(
    columns=lambda name: name.replace('loser', 'player_1').replace('winner', 'player_2')
)

In [16]:
# Add four per-match columns derived from the score string: the summed
# numbers and the sets won, for each side of the score.
# The Series returned by calculate_points_and_sets is assigned positionally,
# in the order listed here.
score_stat_cols = ['player_1_points', 'player_2_points', 'player_1_sets', 'player_2_sets']
winner_df[score_stat_cols] = winner_df['score'].apply(calculate_points_and_sets)
loser_df[score_stat_cols] = loser_df['score'].apply(calculate_points_and_sets)

In [17]:
# Swap the player_1/player_2 labels of the points and sets columns in loser_df.
# calculate_points_and_sets credits the left-hand number of each set to
# player_1, but in loser_df player_1 is the loser.
# NOTE(review): presumably the score string lists the winner's number first - confirm.
loser_df = loser_df.rename(columns={
    'player_1_points': 'player_2_points',
    'player_2_points': 'player_1_points',
    'player_1_sets': 'player_2_sets',
    'player_2_sets': 'player_1_sets',
})

In [18]:
# The score string has been converted to numeric columns, so remove it.
winner_df = winner_df.drop(columns='score')
loser_df = loser_df.drop(columns='score')

In [19]:
# Stack the winner-view rows on top of the loser-view rows.
# Columns are matched by name and the result gets a fresh 0..n-1 index.
df = pd.concat((winner_df, loser_df), axis=0, ignore_index=True)

In [20]:
# Absolute height and age gaps between player_1 and player_2.
# Because the absolute value is taken, the result does not depend on which
# player is player_1.
df['ht_diff'] = df['player_1_ht'].sub(df['player_2_ht']).abs()
df['age_diff'] = df['player_1_age'].sub(df['player_2_age']).abs()