In [None]:
# Importing necessary files.
import numpy as np
import pandas as pd
import seaborn as sns
import glob, os

# Added dask to accelerate data handling.
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Collect all dates in a tuple.
dates_tuple = tuple([f'202402'+str(i) if i>=10 else '2024020'+str(i)  for i in range(1,30)] + 
                    [f'202403'+str(i) if i>=10 else '2024030'+str(i)  for i in range(1,32)])
# Define subfolders shorthand.
subfolders_dic = {'match':'raw_match_parquet',
                  'odds':'raw_odds_parquet',
                  'pbp':'raw_point_by_point_parquet',
                  'stats':'raw_statistics_parquet',
                  'power':'raw_tennis_power_parquet',
                  'votes':'raw_votes_parquet'
                  }
# Define function to render addresses.
def render_addresses(subfolder:str, date_arr = None, kind:str = ""):
    """
    Function to render addresses.
    It takes a subfolder, array of dates and kind of file as inputs.
    """
    result = []
    date_arr = dates_tuple if not date_arr else date_arr
    if kind:
        result = [file for dt in date_arr
        for file in glob.glob(f"Tennis Schema\\{dt}\\data\\raw\\{subfolders_dic[subfolder]}\\{kind}_[!s]*.parquet")]
    else:
        result = [file for dt in date_arr
        for file in glob.glob(f"Tennis Schema\\{dt}\\data\\raw\\{subfolders_dic[subfolder]}\\*.parquet")]
    return result

In [None]:
# How many players are there in the dataset?
players_id_df = dd.read_parquet(render_addresses('match', kind='*_team'),columns=['player_id'])

In [207]:
players_id_df.compute()

Unnamed: 0,player_id
0,192013
0,273680
0,77223
0,88992
0,248846
...,...
0,375214
0,215205
0,398073
0,303293


In [217]:
# The average height of players.
players_height_df =dd.read_parquet(render_addresses('match', kind='*_team'),columns=['player_id','height'])

In [219]:
players_height_df.drop_duplicates(subset='player_id', keep='first').compute()

Unnamed: 0,player_id,height
0,197546,1.85
0,340741,1.88
0,17046,1.80
0,130690,
0,156826,1.85
...,...,...
0,46651,1.73
0,265808,1.75
0,383458,
0,383428,


In [54]:
# Extracting the necessary coulumns for the player with highest number of wins.
event_addresses = render_addresses('match', date_arr = dates_tuple,kind='event')
home_addresses = render_addresses('match', date_arr = dates_tuple, kind='home_team')
away_addresses = render_addresses('match', date_arr = dates_tuple, kind='away_team')

match_df = dd.read_parquet(event_addresses, columns=['match_id','winner_code'])
home_df = dd.read_parquet(home_addresses, columns=['match_id','player_id','full_name'])
away_df = dd.read_parquet(away_addresses, columns=['match_id','player_id','full_name'])

In [61]:
# Extracting the results into csv for faster manipulation with pandas.
with ProgressBar():
    result = away_df.compute()

[########################################] | 100% Completed | 78.32 s


In [85]:
# Opening folders and then dropping duplicates.
match_df = pd.read_csv("CSVs\\3_match_df.csv")
home_df = pd.read_csv("CSVs\\3_home_df.csv")
away_df = pd.read_csv("CSVs\\3_away_df.csv")
match_df.drop_duplicates(subset='match_id', inplace=True)
home_df.drop_duplicates(subset='match_id', inplace=True)
away_df.drop_duplicates(subset='match_id', inplace=True)

In [None]:
# Data Wranglign with grouping and filtering.
winner_home_df = match_df.query('winner_code == 1').merge(home_df, on='match_id', how='inner')
winner_home_df = winner_home_df.drop(columns=['winner_code'])
winner_away_df = match_df.query('winner_code == 2').merge(away_df, on='match_id', how='inner')
winner_away_df = winner_away_df.drop(columns=['winner_code'])
winners_df = pd.concat([winner_home_df,winner_away_df])
winners_df = winners_df.dropna().drop_duplicates()
winners_df.player_id = winners_df.player_id.astype(int)

In [97]:
winners_df.groupby('player_id').agg({'match_id':'count',
                                    'full_name':'min'}).rename(
                                    columns={'match_id':'matches_won'}).sort_values(
                                    'matches_won',ascending=False)

Unnamed: 0_level_0,matches_won,full_name
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1
50901,29,"Popko, Dmitry"
231620,22,"Chidekh, Clement"
202572,21,"Gengel, Marek"
230049,21,"Jianu, Filip Cristian"
82133,20,"Dellien Velasco, Murkel Alejandro"
...,...,...
175258,1,"Benaissa, Amira"
175160,1,"Lavino, Irene"
59213,1,"Perez Garcia, Maria Paulina"
383459,1,"Jang, Gio"


In [108]:
# Extracting the time results into csv for faster manipulation with pandas.
time_df = dd.read_parquet(render_addresses('match', kind='time'))

with ProgressBar():
    result = time_df.compute()

[########################################] | 100% Completed | 145.17 s


In [158]:
time_df = pd.read_csv("CSVs\\4_time_df.csv")
time_df.drop_duplicates(subset='match_id', inplace=True)
time_df.dropna(subset=['period_1'], inplace=True)
time_df.drop(columns=['period_4','period_5','current_period_start_timestamp'], inplace=True)
time_df['duration'] = time_df[['period_1', 'period_2', 'period_3']].sum(axis=1, skipna=True).div(60).round(2)
time_df.sort_values(by=['duration'],ascending=False)

Unnamed: 0,match_id,period_1,period_2,period_3,duration
5799,12063611,167352.0,169438.0,,5613.17
5795,12063587,159144.0,161086.0,,5337.17
30980,12185562,4163.0,84588.0,88380.0,2952.18
17230,12121829,172605.0,810.0,,2890.25
3803,12054403,2170.0,80162.0,81551.0,2731.38
...,...,...,...,...,...
1026,12039701,77.0,26.0,,1.72
35668,12213484,3.0,5.0,,0.13
6344,12064959,2.0,6.0,,0.13
5853,12063889,4.0,4.0,,0.13


In [146]:
with ProgressBar():
    res = dd.read_parquet(render_addresses('match', kind='event')).compute()

[########################################] | 100% Completed | 170.04 s


In [163]:
# Data entry might be wrong!
# Look at https://www.sofascore.com/fr/tennis/match/o-gavrila-carolina-alves/OLvsnbR
res[res.match_id == 12177252]

Unnamed: 0,match_id,first_to_serve,home_team_seed,away_team_seed,custom_id,winner_code,default_period_count,start_datetime,match_slug,final_result_only
0,12177252,1,,,kLysfnxc,1,3,1711017000,beckley-caruso,False
0,12177252,1,,,kLysfnxc,1,3,1711017000,beckley-caruso,False
0,12177252,1,,,kLysfnxc,1,3,1711017000,beckley-caruso,False
