In [100]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pandas as pd
import numpy as np

## Load and manage data ##

In [101]:
# Load the dataset
df = pd.read_csv("data/player_10yr_stats.csv")
print('Dataset loaded. Shape:', df.shape)

Dataset loaded. Shape: (265, 123)


In [102]:
df.head(3)

Unnamed: 0,name,rookie_year,rookie_age,Y1_pts_per_g,Y1_ast_per_g,Y1_trb_per_g,Y1_stl_per_g,Y1_blk_per_g,Y1_mp_per_g,Y1_games_started,...,Y10_trb_per_g,Y10_stl_per_g,Y10_blk_per_g,Y10_mp_per_g,Y10_games_started,Y10_per,Y10_ts_pct,Y10_usg_pct,Y10_ws,Y10_bpm
0,Carmelo Anthony*,2004,19,21.0,2.8,6.1,1.2,0.5,36.5,82.0,...,6.9,0.8,0.5,37.0,67.0,24.8,0.56,35.6,9.5,4.3
1,Leandro Barbosa,2004,21,7.9,2.4,1.8,1.3,0.1,21.4,46.0,...,1.1,0.4,0.1,12.5,2.0,13.5,0.509,21.4,0.9,-0.7
2,Matt Barnes,2004,23,4.5,1.3,4.0,0.7,0.1,19.1,9.0,...,4.6,1.0,0.8,25.7,4.0,15.5,0.566,18.1,6.3,1.8


In [103]:
# add more useful columns
for year in range(1, 11):
    df[f"Y{year}_pts_per_36"] = (df[f"Y{year}_pts_per_g"] / df[f"Y{year}_mp_per_g"]) * 36
df["pts_growth"] = df["Y3_pts_per_g"] - df["Y1_pts_per_g"]
df["ast_growth"] = df["Y3_ast_per_g"] - df["Y1_ast_per_g"]
df["pp36_growth"] = df["Y3_pts_per_36"] - df["Y1_pts_per_36"]
df["per_growth"] = df["Y3_per"] - df["Y1_per"]
df["ts_growth"] = df["Y3_ts_pct"] - df["Y1_ts_pct"]
df["usg_growth"] = df["Y3_usg_pct"] - df["Y1_usg_pct"]
df["ws_growth"] = df["Y3_ws"] - df["Y1_ws"]
df["bpm_growth"] = df["Y3_bpm"] - df["Y1_bpm"]


## select features and train/test split ##

In [104]:
input_cols = ["rookie_age", "pts_growth", "ast_growth", "per_growth", "ts_growth", "usg_growth", "ws_growth", "bpm_growth"] + [
                    f"Y{year}_{stat}" 
                    for year in range(1, 4)
                    for stat in ["pts_per_g", "ast_per_g", "trb_per_g", "stl_per_g", "blk_per_g", "mp_per_g", "per", "usg_pct", "bpm"]
            ]

In [105]:
# Target: average points per game in years 4–10
df["avg_pts_future"] = df[[f"Y{y}_pts_per_g" for y in range(4, 11)]].mean(axis=1)
df["avg_ast_future"] = df[[f"Y{y}_ast_per_g" for y in range(4, 11)]].mean(axis=1)
df["avg_trb_future"] = df[[f"Y{y}_trb_per_g" for y in range(4, 11)]].mean(axis=1)
df["avg_stl_future"] = df[[f"Y{y}_stl_per_g" for y in range(4, 11)]].mean(axis=1)
df["avg_blk_future"] = df[[f"Y{y}_blk_per_g" for y in range(4, 11)]].mean(axis=1)
df["avg_per_future"] = df[[f"Y{y}_per" for y in range(4, 11)]].mean(axis=1)
df["avg_ts_future"] = df[[f"Y{y}_ts_pct" for y in range(4, 11)]].mean(axis=1)
df["avg_usg_future"] = df[[f"Y{y}_usg_pct" for y in range(4, 11)]].mean(axis=1)
df["avg_ws_future"] = df[[f"Y{y}_ws" for y in range(4, 11)]].mean(axis=1)
df["avg_bpm_future"] = df[[f"Y{y}_bpm" for y in range(4, 11)]].mean(axis=1)

target_cols = ["avg_pts_future", "avg_ast_future", "avg_trb_future", "avg_stl_future", "avg_blk_future", "avg_bpm_future"]

In [106]:
df_model = df[input_cols + target_cols + ["name"]].dropna()

X = df_model[input_cols]
y = df_model[target_cols]
names = df_model["name"]

X_train, X_test, y_train, y_test, names_train, names_test = train_test_split(
    X, y, names, test_size=0.2
)


## Gradient Boosting ##

In [107]:
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor

# Train XGBoost model
base_model = XGBRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=6,
    objective="reg:squarederror",
    random_state=42
)
model = MultiOutputRegressor(base_model)
model.fit(X_train, y_train)


In [108]:
total_mae = 0
total_rmse = 0
total_r2 = 0

for i in range(50):
    y_pred = model.predict(X_test)

    total_mae += mean_absolute_error(y_test, y_pred)
    total_rmse += np.sqrt(mean_squared_error(y_test, y_pred))
    total_r2 += r2_score(y_test, y_pred)

print("\nModel Performance over 50 runs:")
print(f"MAE:  {total_mae/50:.2f}")
print(f"RMSE: {total_rmse/50:.2f}")
print(f"R²:   {total_r2/50:.3f}")




Model Performance over 50 runs:
MAE:  1.26
RMSE: 2.18
R²:   0.518


In [109]:
# separate run to visualise
y_pred = model.predict(X_test)

# Round predictions first
ppg_pred = np.round(y_pred[:, 0].astype(float), 2)
apg_pred = np.round(y_pred[:, 1], 2)
rpg_pred = np.round(y_pred[:, 2], 2)


# Create DataFrame
results_df = pd.DataFrame({
    "Player": names_test.values,
    "Actual PPG": y_test["avg_pts_future"].values.round(2),
    "Predicted PPG": ppg_pred,
    "Actual APG": y_test["avg_ast_future"].values.round(2),
    "Predicted APG": apg_pred,
    "Actual RPG": y_test["avg_trb_future"].values.round(2),
    "Predicted RPG": rpg_pred,
    "Actual SPG": y_test["avg_stl_future"].values.round(2),
    "Predicted SPG": np.round(y_pred[:, 3], 2),
    "Actual BPG": y_test["avg_blk_future"].values.round(2),
    "Predicted BPG": np.round(y_pred[:, 4], 2),
    #"Actual WS": y_test["avg_ws_future"].values.round(2),
    #"Predicted WS": np.round(y_pred[:, 5], 2),
    "Actual BPM": y_test["avg_bpm_future"].values.round(2),
    "Predicted BPM": np.round(y_pred[:, 5], 2)
})

In [110]:
print("\nSample Results:")
print(results_df.sort_values("Predicted BPM", ascending=False).head(10).to_string(index=False))


Sample Results:
         Player  Actual PPG  Predicted PPG  Actual APG  Predicted APG  Actual RPG  Predicted RPG  Actual SPG  Predicted SPG  Actual BPG  Predicted BPG  Actual BPM  Predicted BPM
   Derrick Rose       16.37          25.02        4.62           6.95        3.07           4.75        0.63           1.22        0.30           0.37       -1.03           6.44
      John Wall       20.00          26.00        9.67           8.51        4.18           5.50        1.72           1.32        0.75           0.48        3.07           6.36
   LeBron James       28.00          21.87        7.07           6.53        7.56          10.49        1.70           1.58        0.89           0.72       10.67           5.97
   Kevin Durant       27.79          24.96        4.31           3.09        7.60          11.46        1.16           1.44        1.13           1.19        8.60           5.46
    Rajon Rondo       11.77          17.95       10.46           5.82        5.17           5