# Merging the files and data cleaning


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


def combine_years(where):
    """Load every CSV file in a directory into a single DataFrame.

    Args:
        where: Path of the directory that holds the ``.csv`` files.

    Returns:
        One DataFrame with the rows of all CSV files stacked in file-name
        order and a fresh ``0..n-1`` index.

    Raises:
        ValueError: If the directory contains no ``.csv`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything later derived from the index) reproducible.
    csv_files = sorted(
        name for name in os.listdir(where) if name.endswith('.csv'))
    if not csv_files:
        raise ValueError(f'No .csv files found in {where!r}')

    return pd.concat(
        [pd.read_csv(os.path.join(where, name)) for name in csv_files],
        ignore_index=True)

# Load every CSV found under db/atp into one frame, then compute summary
# statistics (shown as the cell output when run in a notebook).
df = combine_years('db/atp')
df.describe()

In [None]:
# Column groups of the raw table: match-level fields, winner-side columns
# and loser-side columns.
match_stats = ['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
               'tourney_date', 'match_num', 'score', 'best_of', 'round', 'minutes']

# 'w_SvGms' used to be listed twice here; each column now appears once.
winner_stats = ['winner_id', 'winner_seed', 'winner_entry', 'winner_name', 'winner_hand',
                'winner_ht', 'winner_ioc', 'winner_age', 'winner_rank', 'winner_rank_points',
                'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
                'w_bpSaved', 'w_bpFaced']

loser_stats = ['loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand', 'loser_ht',
               'loser_ioc', 'loser_age', 'loser_rank', 'loser_rank_points', 'l_ace', 'l_df',
               'l_svpt', 'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced']

# Null report for the match-level columns only. The statistics are computed
# on that subset directly rather than on the whole frame and then trimmed
# through index alignment.
match_nulls = df[match_stats].isnull()

nulls_summary = pd.DataFrame({
    'Nulls': match_nulls.any(),                              # any null at all?
    'Num_of_nulls [qty]': match_nulls.sum(),                 # absolute count
    'Num_of_nulls [%]': (match_nulls.mean() * 100).round(2)  # share of rows
})

print(nulls_summary)



**Cleaning**


In [None]:
# Keep only best-of-three matches. The explicit .copy() makes the result an
# independent frame, so the clean-up below modifies it directly instead of
# writing into a slice of the original (chained assignment).
df = df[df['best_of'] == 3].copy()

# Placeholder values for players without a ranking entry:
# rank -> 2500, ranking points -> 1.
df = df.fillna({
    'winner_rank': 2500,
    'loser_rank': 2500,
    'winner_rank_points': 1,
    'loser_rank_points': 1,
})

# Height and age are not imputed: drop matches where any of them is missing.
df = df.dropna(subset=['loser_ht', 'winner_ht', 'loser_age', 'winner_age'])

def replace_missing_svgms_with_estimate(df):
    """Fill missing ``w_SvGms`` / ``l_SvGms`` values from the match score.

    The total number of games is parsed from the ``score`` column and split
    between the two players: the winner gets ``(total + 1) // 2`` and the
    loser ``total // 2``. Only null cells are filled; existing values are
    kept as they are.

    Args:
        df: DataFrame with ``score``, ``w_SvGms`` and ``l_SvGms`` columns.
            The two serve-game columns are updated on this object.

    Returns:
        The same DataFrame, with the missing values filled wherever the
        score could be parsed. Rows with an unparsable score stay null.
    """
    def total_games(score):
        """Return the number of games in ``score``, or NaN if unparsable."""
        try:
            total = 0
            for set_score in score.split():
                # Strip a tiebreak suffix such as "7-6(5)" -> "7-6".
                set_score = set_score.split("(")[0]
                # Tokens without a dash (e.g. "RET", "W/O") add no games.
                if "-" in set_score:
                    won, lost = set_score.split("-")
                    total += int(won) + int(lost)
            return total
        except (AttributeError, ValueError):
            # AttributeError: score is not a string (e.g. NaN).
            # ValueError: a token is not "<int>-<int>"; the whole score is
            # then treated as unparsable.
            return float('nan')

    # Built as a Series on df's own index so this also works on an empty
    # frame and needs no temporary columns on the caller's DataFrame.
    totals = pd.Series([total_games(score) for score in df['score']],
                       index=df.index, dtype=float)

    # Assign the result back to the column: calling fillna(inplace=True) on
    # the df[...] selection is chained assignment and does not update df
    # under pandas Copy-on-Write.
    df['w_SvGms'] = df['w_SvGms'].fillna((totals + 1) // 2)
    df['l_SvGms'] = df['l_SvGms'].fillna(totals // 2)

    return df


# Estimate missing serve-game counts from the score.
df = replace_missing_svgms_with_estimate(df)

# Drop the 'minutes' column.
df = df.drop(columns=['minutes'])

# Single null filter replacing three overlapping ones: a match is kept only
# if all of these columns are present.
df = df.dropna(subset=['w_ace', 'w_df', 'l_ace', 'l_df',
                       'winner_ht', 'loser_ht',
                       'winner_rank', 'loser_rank'])

# Categorical gaps get an explicit 'Unknown' label instead of NaN.
df = df.fillna({'surface': 'Unknown', 'tourney_level': 'Unknown'})

Adding a match index and scrambling the results


In [None]:
from utils.features import result_scrambler

# 1-based match identifier derived from the current row labels.
df['match_id'] = df.index + 1

df = result_scrambler(df)

# Forward-fill the remaining gaps. The previous call, fillna('ffill'),
# passed 'ffill' as the fill VALUE, which wrote the literal string "ffill"
# into every null cell. Nulls in the leading rows of a column have nothing
# to copy from and stay null.
df = df.ffill()

# NOTE(review): 'date' is not among the raw column names listed earlier in
# this file; presumably result_scrambler produces it - confirm.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

In [None]:
# to_csv does not create missing directories, so make sure the target
# folder exists before writing. The row index is written as the first,
# unnamed column (pandas default).
os.makedirs('db/out', exist_ok=True)
df.to_csv('db/out/output_std.csv')