In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Load the transformed ATP match data (path is relative to the notebook's working directory)
df = pd.read_csv("atp_transformed/2000-2024 with covid.csv")

# Only the last expression of a cell is rendered automatically, so the preview
# is passed to display() explicitly; display is a builtin inside Jupyter/IPython kernels.

# Display first rows
display(df.head())

# Display dataframe info (df.info() prints its report directly)
df.info()

# Check for missing values (last expression, rendered as the cell output)
df.isnull().sum()



We drop the round and score columns because our goal is to predict a player's future best ranking, i.e. their peak ATP rank.
For that we need input features that describe a player's ability, form and performance up to now.

- round_1 to round_5: many matches are best of 3, which means round_4 and round_5 are empty. Some tournaments report detailed set scores and some don't.
- score: holds the full score string; it is unstructured text, and parsing it into usable numbers would take a LOT of cleaning.

Why they don’t add much value for our use case:
1. They describe only a single match outcome.

We already have match_outcome (win/loss) and can aggregate win rates per surface, tournament, etc.
The actual set-by-set scores help less in predicting a future peak rank than long-term performance trends do.


Format:
TODO: list the possible values of the ground types here, plus the other format values.

In [None]:
# Columns holding the raw score string and the per-set results
cols = ['score', 'round_1', 'round_2', 'round_3', 'round_4', 'round_5']

# Fraction of missing entries per column, largest first
missing = df[cols].isnull().mean().sort_values(ascending=False)

# Bar chart of the missing-value ratios, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 4))
missing.plot(kind='bar', color='tomato', ax=ax)
ax.set_title('Missing value ratio for set/score columns')
ax.set_ylabel('Proportion missing')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

# Show the underlying numbers as the cell output
missing

In [None]:

# Columns removed from the frame: per-set results, the raw score text,
# and the tournament / match identifier columns
drop_cols = [
    'round_1', 'round_2', 'round_3', 'round_4', 'round_5',
    'score',
    'tourney_id', 'tourney_name', 'tourney_date',
    'player_tourney_match_id',
]
# errors='ignore' skips names that are not present in the frame
df = df.drop(drop_cols, axis='columns', errors='ignore')
# df['tourney_type'] = df['tourney_type'].replace({'matches': 1, 'futures': 2})

# Encode handedness codes: 'L' -> 1 and 'R' -> 2; every other value is left unchanged
hand_codes = {'L': 1, 'R': 2}
df['player_hand'] = df['player_hand'].replace(hand_codes)

df.head()

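Every player_hand value that is unknown or zero is filled in with the most common hand. (Note: the cell further below currently fills missing values with 'U' / Unknown instead.)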

In [None]:
# Frequency of each handedness value; missing entries are counted as their own row
df.loc[:, 'player_hand'].value_counts(dropna=False)

After cleaning, the player_hand column contains 973,260 right-handed, 147,640 left-handed, 567 ambidextrous, and 168,513 unknown entries.
The 15 missing values were filled with Unknown, resulting in a more complete categorical feature.

We also changed the string codes of player_hand into numeric values so that a model can be trained on them more easily.

In [None]:
# Create missing flag (1 where the handedness value is NaN, 0 otherwise)
df['player_hand_missing'] = df['player_hand'].isna().astype(int)

#  Fill missing values with 'U' (Unknown)
df['player_hand'] = df['player_hand'].fillna('U')

# Normalize all codes to readable text.
# An earlier cell already replaced 'L' with 1 and 'R' with 2, so the numeric
# codes are mapped here as well; without them the left/right rows would stay
# as 1/2 while the other categories become text, leaving a mixed-type column.
df['player_hand'] = df['player_hand'].replace({
    'R': 'Right',
    'L': 'Left',
    'U': 'Unknown',
    'A': 'Ambidextrous',
    1: 'Left',
    2: 'Right',
})

df['player_hand'].head()

For minutes, we group the values into three buckets and store them in a new column called match_length.
Rows where match_length is empty are filled with the most common bucket. The minutes column is meant to be deleted afterwards (the following cells still keep it, with match_length placed directly after it).

In [None]:
# Count entries that are either NaN or whitespace-only strings
minutes_raw = df["minutes"]
nan_count = minutes_raw.isna().sum()
blank_count = minutes_raw.astype(str).str.strip().eq("").sum()
empty_minutes = nan_count + blank_count
print(f"Empty or blank 'minutes': {empty_minutes}")

# Coerce to a numeric dtype so comparisons work; unparseable values become NaN
df["minutes"] = pd.to_numeric(minutes_raw, errors="coerce")

In [None]:
# Define the classifier function
def classify_match_length(x, short_below=30, medium_up_to=90):
    """Bucket a match duration into 'short', 'medium' or 'long'.

    Parameters
    ----------
    x : number or missing value
        Match duration; compared directly against the two thresholds.
    short_below : number, default 30
        Durations strictly below this value are labelled 'short'.
    medium_up_to : number, default 90
        Durations from ``short_below`` up to and including this value are
        labelled 'medium'; anything larger is 'long'.

    Returns
    -------
    str or float
        The bucket label, or ``np.nan`` when ``x`` is missing so that the
        gap can still be imputed afterwards.
    """
    if pd.isna(x):
        return np.nan
    if x < short_below:
        return 'short'
    if x <= medium_up_to:
        return 'medium'
    return 'long'

In [None]:
# Bucket each match duration with the classifier
length_buckets = df["minutes"].apply(classify_match_length)

# The most frequent bucket is used to fill rows without a duration
most_common_length = length_buckets.mode()[0]
df["match_length"] = length_buckets.fillna(most_common_length)

# Reposition 'match_length' so it sits directly after 'minutes'
minutes_index = df.columns.get_loc("minutes")
match_length_col = df.pop("match_length")
df.insert(minutes_index + 1, "match_length", match_length_col)

We fill in the missing values of player_height with the average height.

In [None]:
# Remember which rows lacked a height before anything is imputed
height = df['player_height']
df['player_height_missing'] = height.isnull().astype(int)

# Mean over the available heights (NaN entries are skipped by mean())
mean_height = height.mean()
print(f"Mean height used for imputation: {mean_height:.2f} cm")

# Replace the gaps with the mean, then round every height to a whole number
height_filled = height.fillna(mean_height)
df['player_height'] = height_filled.round(0)

# Sanity check: no NaNs should remain; preview the column next to its flag
print("Remaining NaNs:", df['player_height'].isna().sum())
print("Example values:\n", df[['player_height', 'player_height_missing']].head(10))

In [None]:
# Flag rows that were originally missing
df['player_age_missing'] = df['player_age'].isna().astype(int)

# Compute the mean (only from valid ages; NaN entries are skipped by mean())
mean_age = df['player_age'].mean()
print(f"Mean age used for imputation: {mean_age:.2f} age")

# Fill missing values with the mean age, then round every age to a whole number
df['player_age'] = df['player_age'].fillna(mean_age).round(0)

# Verify the result
print("Remaining NaNs:", df['player_age'].isna().sum())
# Preview the age column next to its own flag (player_height was shown here by mistake)
print("Example values:\n", df[['player_age', 'player_age_missing']].head(10))

In [None]:
# Column means used to fill the gaps in rank and rank points
rank_mean = df["player_rank"].mean()
rank_points_mean = df["player_rank_points"].mean()

# Fill, round to whole numbers and store as integers, one column at a time
for rank_col, rank_fill in (
    ("player_rank", rank_mean),
    ("player_rank_points", rank_points_mean),
):
    df[rank_col] = df[rank_col].fillna(rank_fill).round(0).astype(int)

In [None]:
# Impute missing player_country with the mode (first entry of the mode result)
country = df["player_country"]
most_common_country = country.mode()[0]
df["player_country"] = country.fillna(most_common_country)

In [None]:
# Impute missing surface with the mode (first entry of the mode result)
surface = df["surface"]
most_common_surface = surface.mode()[0]
df["surface"] = surface.fillna(most_common_surface)

In [None]:
# Share of missing values per column, largest first
null_share = df.isnull().mean()
missing_ratio = null_share.sort_values(ascending=False)

# The 15 columns with the highest missing share
missing_ratio.iloc[:15]

In [None]:
# Column whose missingness is broken down by tournament type
column_to_check = 'points_on_serve'


def _pct_missing(values):
    """Return the percentage of NaN entries in a Series."""
    return values.isna().mean() * 100


# Missing percentage within each tourney_type group, highest first
per_type = df.groupby('tourney_type')[column_to_check].apply(_pct_missing)
missing_by_tourney = per_type.sort_values(ascending=False)

print(f"Percentage of missing '{column_to_check}' by tourney_type:")
print(missing_by_tourney)


In [None]:
# Bar chart of the missing percentage per tournament type, styled through the Axes object
ax = missing_by_tourney.plot.bar(color='tomato', figsize=(8, 4))
ax.set_title(f"Missing '{column_to_check}' (%) by Tournament Type")
ax.set_ylabel("Percentage Missing (%)")
ax.set_xlabel("Tournament Type")
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.6)
plt.show()

In [None]:
# Show the dataframe as the cell output (e.g. to open it in Data Wrangler)
df

# Save cleaned dataframe to new CSV (disabled; uncomment the lines below to write the file)
# NOTE(review): output_path is built but to_csv is given a separate relative string -
# presumably both should point to the same file; confirm before enabling.
# output_path = Path.cwd() / "atp_transformed" / "2000-2024_clean.csv"
# df.to_csv("atp_transformed/2000-2024_clean.csv", index=False)