## Tennis Project
### Question 8

In [17]:
import pandas as pd
from pathlib import Path

In [16]:
root_path = Path("../data")

In [32]:
stat_files = list(root_path.rglob("statistics_*.parquet"))
player_files = list(root_path.rglob("*.parquet"))

print(f"Found {len(stat_files)} statistics files.")
print(f"Found {len(player_files)} player files.")

Found 23291 statistics files.
Found 442812 player files.


In [33]:
stat_files = stat_files[:20]
player_files = player_files[:20]

In [34]:
dfs_stats = []

for file in stat_files:
    df = pd.read_parquet(file)
    dfs_stats.append(df)

df_stats = pd.concat(dfs_stats, ignore_index=True)
df_stats.head()

Unnamed: 0,match_id,period,statistic_category_name,statistic_name,home_stat,away_stat,compare_code,statistic_type,value_type,home_value,away_value,home_total,away_total
0,12156295,ALL,service,aces,4,5,2,positive,event,4,5,,
1,12156295,ALL,service,double_faults,3,0,1,negative,event,3,0,,
2,12156295,ALL,service,first_serve,59/95 (62%),33/62 (53%),1,positive,team,59,33,95.0,62.0
3,12156295,ALL,service,second_serve,33/36 (92%),29/29 (100%),2,positive,team,33,29,36.0,29.0
4,12156295,ALL,service,first_serve_points,44/59 (75%),23/33 (70%),1,positive,team,44,23,59.0,33.0


In [35]:
df_faults = df_stats[df_stats["statistic_name"] == "double_faults"]
df_faults = df_faults[["match_id", "home_value", "away_value"]]
df_faults.head()

Unnamed: 0,match_id,home_value,away_value
1,12156295,3,0
21,12156295,2,0
38,12156295,1,0
55,12157171,1,1
75,12157171,1,1


In [36]:
df_home = df_faults[["match_id", "home_value"]].rename(columns={"home_value": "double_faults"})
df_home["team"] = "home"

df_away = df_faults[["match_id", "away_value"]].rename(columns={"away_value": "double_faults"})
df_away["team"] = "away"

df_combined = pd.concat([df_home, df_away], ignore_index=True)
df_combined.head()

Unnamed: 0,match_id,double_faults,team
0,12156295,3,home
1,12156295,2,home
2,12156295,1,home
3,12157171,1,home
4,12157171,1,home


In [37]:
dfs_players = []

for file in player_files:
    df = pd.read_parquet(file)
    dfs_players.append(df)

df_players = pd.concat(dfs_players, ignore_index=True)
df_players.head()

Unnamed: 0,match_id,name,slug,gender,user_count,residence,birthplace,height,weight,plays,turned_pro,current_prize,total_prize,player_id,current_rank,name_code,country,full_name
0,12156295,Rybakina E.,rybakina-elena,F,22673,,"Moscow, Russia",1.84,,right-handed,2016.0,1226279,11104942,186312,4,RYB,Kazakhstan,"Rybakina, Elena"
1,12157171,Sinner J.,sinner-jannik,M,92881,"Monte Carlo, Monaco","San Candido, Italy",1.88,68.0,right-handed,2018.0,3301073,17518144,206570,2,SIN,Italy,"Sinner, Jannik"
2,12173700,Martínez P.,martinez-pedro,M,3165,"Valencia, Spain","Alzira, Spain",1.85,76.0,right-handed,2016.0,94362,2787963,77223,77,MAR,Spain,"Martinez, Pedro"
3,12173714,Martínez P.,martinez-pedro,M,3050,"Valencia, Spain","Alzira, Spain",1.85,76.0,right-handed,2016.0,94362,2787963,77223,77,MAR,Spain,"Martinez, Pedro"
4,12173950,Nardi L.,nardi-luca,M,7432,,Pesaro,1.85,,right-handed,,123573,687717,289233,75,NAR,Italy,"Nardi, Luca"


In [39]:
df_gender = df_players[["match_id", "gender"]].drop_duplicates()
df_gender.head()

Unnamed: 0,match_id,gender
0,12156295,F
1,12157171,M
2,12173700,M
3,12173714,M
4,12173950,M


In [40]:
df_final = df_combined.merge(df_gender, on="match_id", how="left")


df_final = df_final.dropna(subset=["gender", "double_faults"])

df_final["double_faults"] = pd.to_numeric(df_final["double_faults"], errors="coerce")

df_final.head()

Unnamed: 0,match_id,double_faults,team,gender
0,12156295,3,home,F
1,12156295,2,home,F
2,12156295,1,home,F
3,12157171,1,home,M
4,12157171,1,home,M


In [41]:
df_final = df_combined.merge(df_gender, on="match_id", how="left")


df_final = df_final.dropna(subset=["gender", "double_faults"])


df_final["double_faults"] = pd.to_numeric(df_final["double_faults"], errors="coerce")


df_final.head()

Unnamed: 0,match_id,double_faults,team,gender
0,12156295,3,home,F
1,12156295,2,home,F
2,12156295,1,home,F
3,12157171,1,home,M
4,12157171,1,home,M


In [42]:
df_final.groupby("gender")["double_faults"].agg(["count", "mean", "std", "min", "max"])

Unnamed: 0_level_0,count,mean,std,min,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,28,3.285714,3.320611,0,14
M,92,1.804348,1.922905,0,10


### Result:Based on the statistics from 120 players (28 female and 92 male), female players made more double faults on average than male players.
- Average for females: 3.2
- Average for males: 1.8
This suggests that gender may influence the tendency to commit double faults in matches.
