In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("../atp_transformed/2000-2024 players_3.csv", low_memory=False) # We take the preprocessed dataset

# Quick inspection
print(df.shape)
df.head()

(1282807, 133)


Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,score,best_of,tourney_round,...,total_diff_rolling_mean_10,mean_tb_numb_rolling_mean_10,median_tb_numb_rolling_mean_10,total_tb_numb_rolling_mean_10,mean_tb_diff_rolling_mean_10,median_tb_diff_rolling_mean_10,total_tb_diff_rolling_mean_10,elo_next_match,filled_player_rank,player_hand_missing
0,1969-6818,St. Petersburg,Clay,64.0,A,1969-03-17,262,6-3 6-1,3,R64,...,,,,,,,,1515.966092,2268.0,0
1,1969-2047,Jacksonville,Clay,32.0,A,1969-04-02,274,8-6 6-2,3,R32,...,7.0,,,0.0,,,0.0,1531.231343,2268.0,0
2,1970-2047,Jacksonville,Clay,32.0,A,1970-03-25,274,6-3 6-4,3,R32,...,8.5,,,0.0,,,0.0,,2268.0,0
3,1971-583,Manchester,Grass,128.0,A,1971-05-31,247,7-5 6-4,3,R64,...,,,,,,,,1515.32969,2268.0,0
4,1971-9341,Brisbane,Grass,64.0,A,1971-12-06,241,6-4 6-0,3,R64,...,9.0,,,0.0,,,0.0,,2268.0,0


In [None]:
# We want to predict the highest possible ELO 

# Add ELO max
elo_max_df  = df.groupby('player_id')['elo'].max().reset_index()    # We add a column 'elo max' this is the highest achived elo rank for each player
elo_max_df.rename(columns={'elo': 'elo_max'}, inplace=True)  # The current elo column is replaced by the max elo column 

# Merge elo_max into the main dataframe
df = df.merge(elo_max_df, on='player_id', how='left')      # We merge the max elo column into the df


df['target'] = df['elo_max']    # We define our target: The highest possible elo


features = [      # The features are all player based features
    'player_age', # The current age of the player
    'days_of_experience', # The amount of days since the player had his/her first tournament
    'months_of_experience', # The amount of months since the player had his/her first tournament
    'years_of_experience',  # The amount of years since the player had his/her first tournament
    'player_rank',            # The current rank of this player (1 is current best player)
    'player_rank_points',     # The total amount of ranking points this player has scored (accumulation of points)
    'set_dominance_rolling_mean_10', # Rolling median over the last 10 set_dominance rows of the player. Set_dominance: Amount of sets won/ total sets played (per record/game)
    'tb_dominance_rolling_mean_10', # Rolling median over the last 10 tb_dominance rows of the player. Tb_dominance: Amount of tie break sets won/ total tie break sets played (per record/game)
    'mean_numb_rolling_mean_10', # Average of the last 10 number of games the winner won per round in the match (mean_numb = (6 + 7 + 6) / 3 = 6.33)
    'mean_diff_rolling_mean_10', # Average per-set game dominance (outscored his opponent per game) measures how decisively a player outplays opponents, across the last 10 matches.
    'mean_tb_numb_rolling_mean_10',  # Average number of tie-break points the player wins per tie-break, averaged over their previous 10 matches.
    'mean_tb_diff_rolling_mean_10', # Average per-tie-break set game dominance (outscored his opponent per game) measures how decisively a player outplays opponents, across the last 10 tie-break sets and smoothed across recent matches.
    'draw_size_rolling_med_10', # Rolling median over the last 10 total number of players in a tournament this player played.
    'highest_finish_position_rolling_med_10' # The player’s typical best tournament result over their last 10 tournaments, where lower values indicate deeper runs.
]

for f in features:
    df[f] = df.groupby('player_id')[f].shift(1) # Shifts the column down by one row within each group. This prevents data leakage

# Don't fill with mean, it produces worst results
df = df.dropna(subset=features + ['target'])     # Drop all columns from the dataframe except features and target



In [None]:
df.__len__() # Number of records in the df

1100563

In [None]:
# One-hot encode tourney_level and drop one level as baseline
df = pd.get_dummies(df, columns=['tourney_level'], drop_first=True)  
features += [c for c in df.columns if c.startswith('tourney_level_')]  # Add encoded tourney_level columns to feature list

X = df[features].copy()  # Create feature matrix using selected features
y = df['target']  # Define target variable for modeling

In [None]:
# Define x/y train/test
df['tourney_date'] = pd.to_datetime(df['tourney_date'])  # Convert tournament date to datetime for time-based splitting
train = df[df['tourney_date'] < '2020-01-01']  # Use matches before 2020 as training data
test = df[df['tourney_date'] >= '2020-01-01']  # Use matches from 2020 onward as test data
# Time-based split: Mimics real-world prediction (no future data leaks into training). Often better for tournaments or time-series data.

X_train = train[features].fillna(train[features].median())  
y_train = train['target'] 
X_test = test[features].fillna(train[features].median())  
y_test = test['target'] 

dtrain = xgb.DMatrix(X_train, label=y_train)  
dtest  = xgb.DMatrix(X_test, label=y_test)  

params = {  # Define XGBoost hyperparameters
    "objective": "reg:squarederror",  # Use squared error loss for regression... ELO is a continuous variable, so regression with squared error (MSE) is appropriate.
    "learning_rate": 0.03,  # Control contribution of each tree... Low learning rate ensures gradual updates, preventing overfitting on 1.2M records. Slower learning allows the model to capture subtle patterns in player performance.
    "max_depth": 6,  # Controls tree complexity... Depth 6 balances learning non-linear interactions (like age × experience or rolling dominance metrics) without overfitting.
    "subsample": 0.8,  # Sample fraction of rows for each tree... Randomly samples 80% of rows per tree, improving generalization and speeding up training.
    "colsample_bytree": 0.8,  # Sample fraction of features for each tree... Randomly samples 80% of features per tree. Avoids over-reliance on a few correlated features (like: rolling averages of dominance).
    "seed": 42  # Ensure reproducible results
}

model = xgb.train(  # Train the XGBoost model
    params,
    dtrain,
    num_boost_round=1000,  # Maximum number of boosting iterations... Using a large num_boost_round combined with early stopping ensures the model has enough capacity without overfitting.
    evals=[(dtest, "eval")],  # Evaluate performance on test set
    early_stopping_rounds=50,  # Stop training if no improvement for 50 rounds... Prevents overfitting: Once the model stops improving on the validation/test set, we stop adding trees.
    verbose_eval=True  # Print evaluation metrics during training
)

y_pred = model.predict(dtest)  


[0]	eval-rmse:273.58309
[1]	eval-rmse:267.85688
[2]	eval-rmse:262.65051
[3]	eval-rmse:257.34982
[4]	eval-rmse:252.19704
[5]	eval-rmse:247.21858
[6]	eval-rmse:242.72319
[7]	eval-rmse:238.10903
[8]	eval-rmse:233.55843
[9]	eval-rmse:229.23030
[10]	eval-rmse:225.15235
[11]	eval-rmse:221.11329
[12]	eval-rmse:220.09058
[13]	eval-rmse:216.45281
[14]	eval-rmse:212.85372
[15]	eval-rmse:209.40894
[16]	eval-rmse:206.01528
[17]	eval-rmse:202.73106
[18]	eval-rmse:199.59182
[19]	eval-rmse:196.55681
[20]	eval-rmse:193.65913
[21]	eval-rmse:190.94883
[22]	eval-rmse:188.31307
[23]	eval-rmse:185.86498
[24]	eval-rmse:183.36687
[25]	eval-rmse:182.12328
[26]	eval-rmse:179.69764
[27]	eval-rmse:177.54999
[28]	eval-rmse:175.37848
[29]	eval-rmse:173.37372
[30]	eval-rmse:171.47126
[31]	eval-rmse:169.54144
[32]	eval-rmse:167.61837
[33]	eval-rmse:165.76824
[34]	eval-rmse:164.12427
[35]	eval-rmse:162.46458
[36]	eval-rmse:161.04364
[37]	eval-rmse:159.65243
[38]	eval-rmse:158.27691
[39]	eval-rmse:156.96277
[40]	eval-

In [None]:
mse = mean_squared_error(y_test, y_pred)  # Compute mean squared error between predictions and true values
rmse = np.sqrt(mse)  # Convert MSE to root mean squared error for interpretability
r2 = r2_score(y_test, y_pred)  # Compute R-squared to measure explained variance

print(f"RMSE: {rmse:.2f}")  
print(f"R2: {r2:.3f}")  


RMSE: 129.69
R2: 0.782


In [None]:
importance_dict = model.get_score(importance_type='weight')  # Get feature importance, options: 'weight', 'gain', 'cover'

feat_imp = pd.DataFrame({  # Convert importance dictionary to a DataFrame
    'feature': list(importance_dict.keys()),  
    'importance': list(importance_dict.values())  
}).sort_values(by='importance', ascending=False)  # Sort features by descending importance

print(feat_imp) 


                                   feature  importance
0                               player_age      3106.0
1                       days_of_experience      2927.0
4                              player_rank      1900.0
5                       player_rank_points      1825.0
2                     months_of_experience       929.0
13  highest_finish_position_rolling_med_10       611.0
6            set_dominance_rolling_mean_10       512.0
12                draw_size_rolling_med_10       435.0
10            mean_tb_numb_rolling_mean_10       360.0
9                mean_diff_rolling_mean_10       333.0
8                mean_numb_rolling_mean_10       266.0
19                         tourney_level_S       159.0
7             tb_dominance_rolling_mean_10       133.0
11            mean_tb_diff_rolling_mean_10       122.0
3                      years_of_experience       108.0
15                         tourney_level_C       105.0
14                         tourney_level_A        75.0
18        

In [None]:
# Example: early-career snapshot of a new player
new_player = pd.DataFrame({
    'player_age': [20],
    'days_of_experience': [150],
    'months_of_experience': [5],
    'years_of_experience': [0.4],
    'player_rank': [300],
    'player_rank_points': [120], # Important that a player has accumulated rank_points in order to be ranked
    'set_dominance_rolling_mean_10': [0.6],
    'tb_dominance_rolling_mean_10': [0.5],
    'mean_numb_rolling_mean_10': [6.0],
    'mean_diff_rolling_mean_10': [0.5],
    'mean_tb_numb_rolling_mean_10': [3.0],
    'mean_tb_diff_rolling_mean_10': [0.2],
    'draw_size_rolling_med_10': [32],
    'highest_finish_position_rolling_med_10': [16],
    # Dummy variables for tourney_level if used in training
    'tourney_level_A': [0],
    'tourney_level_D': [0],
    'tourney_level_F': [0],
    'tourney_level_G': [0],
    'tourney_level_M': [1]
})

# Fill any missing features with 0
for col in X_train.columns:
    if col not in new_player.columns:
        new_player[col] = 0

# Ensure column order matches training data
new_player = new_player[X_train.columns]

# Convert to DMatrix for Booster prediction
dnew = xgb.DMatrix(new_player)

# Predict highest Elo
pred_elo_max = model.predict(dnew)
print("Predicted highest Elo:", pred_elo_max[0])

Predicted highest Elo: 2013.0415
