# 08 - XGBoost model

Goals:
- Load the data `features_matches_long_elo_22_23.csv`
- Choose X (features) and y (target = match result H/D/A)
- Split train/test in chronological order
- Feature standardization (skipped in the code below; tree-based models such as XGBoost are insensitive to feature scaling)
- Train XGBoost
- Compute: log loss, Brier score, accuracy
- Plot the calibration curves

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.calibration import calibration_curve

import matplotlib.pyplot as plt

import xgboost as xgb  # XGBoost library

# Show up to 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

In [None]:
# Long-format match dataset, already enriched with Elo features
DATA_PATH = "../data/processed/features_matches_long_elo_22_23.csv"

df = pd.read_csv(DATA_PATH)

# Quick sanity check: dimensions, then the first rows
print("Long + Elo features shape:", df.shape)
df.head()

In [None]:
# Build the modelling table: pick the target and feature columns, drop
# incomplete rows, order the rows chronologically, and expose X / y.
target_col = "result"

feature_cols = [
    "is_home",
    "rolling_xg_for_5",
    "rolling_xg_against_5",
    "rolling_xg_diff_5",
    "rolling_points_5",
    "strength_points_5",
    "elo_team_before",
    "elo_diff_for_team",
]

# Every column that must be populated for a row to be usable
required_cols = feature_cols + [target_col]

# Check NaNs before dropping
print("NaN per column BEFORE drop:")
print(df[required_cols].isna().sum())

# Drop rows with missing values in features or target
df_clean = df.dropna(subset=required_cols).reset_index(drop=True)

print("\nShape AFTER drop:", df_clean.shape)
print("NaN per column AFTER drop:")
print(df_clean[required_cols].isna().sum())

# Ensure date is datetime and sort chronologically.
# A stable sort (mergesort) keeps rows that share the same date in their
# original file order; the default quicksort may permute such ties, which
# would change which rows land on each side of a positional train/test split.
df_clean["date"] = pd.to_datetime(df_clean["date"])
df_clean = df_clean.sort_values("date", kind="mergesort").reset_index(drop=True)

# Independent copies so later edits to X / y never touch df_clean
X = df_clean[feature_cols].copy()
y = df_clean[target_col].copy()

print("\nX shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Chronological train/test split: the first ~80% of rows (df_clean is sorted
# by date) train the model, the remainder is held out for evaluation.
train_ratio = 0.8
n = len(df_clean)
train_size = int(train_ratio * n)

# Align the cut with a date boundary. Cutting at a raw row index can place
# rows that share the same date (presumably including the two team rows of a
# single match in this long-format table) on both sides of the split.
# Rows in the training part that carry the boundary date are moved to the
# test part instead.
if 0 < train_size < n:
    boundary_date = df_clean["date"].iloc[train_size]
    n_straddling = int(
        (df_clean["date"].iloc[:train_size] == boundary_date).sum()
    )
    # Keep the original cut if aligning would leave the training set empty
    if n_straddling < train_size:
        train_size -= n_straddling

X_train = X.iloc[:train_size]
X_test  = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test  = y.iloc[train_size:]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)

# Sanity check: both parts should be free of missing feature values
print("\nNaN in TRAIN features:")
print(X_train.isna().sum())

print("\nNaN in TEST features:")
print(X_test.isna().sum())

In [None]:
# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both parts of the split.
le = LabelEncoder()
le.fit(y_train)

y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Classes are stored in sorted order, so ['A', 'D', 'H'] is expected here
print("Classes:", le.classes_)
print("First 10 encoded y_train:", y_train_enc[:10])

In [None]:
# Hyper-parameters of the multiclass XGBoost classifier, gathered in one
# place so they are easy to inspect or tweak.
xgb_params = {
    "objective": "multi:softprob",  # output one probability per class
    "num_class": 3,
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "mlogloss",      # set explicitly to avoid a warning
}

xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training window with integer-encoded targets
xgb_model.fit(X_train, y_train_enc)
print("XGBoost model trained!")

In [None]:
# Evaluate the fitted model on the held-out window: accuracy, multiclass
# log loss, and a one-vs-rest Brier score per class plus their mean.

# Predicted probabilities and class predictions
y_proba_xgb = xgb_model.predict_proba(X_test)
y_pred_xgb  = xgb_model.predict(X_test)

print("Predicted probabilities shape:", y_proba_xgb.shape)
print("Sample predictions:", y_pred_xgb[:10])
print("Sample true labels:", y_test_enc[:10])

# Accuracy
acc_xgb = accuracy_score(y_test_enc, y_pred_xgb)
print("\nAccuracy on test set (XGBoost):", round(acc_xgb, 3))

# Log loss.
# Pass the full set of encoded class ids explicitly: without `labels`,
# log_loss infers the classes from y_true and raises if the test window
# happens to miss one of the classes the model predicts.
class_ids = np.arange(len(le.classes_))
ll_xgb = log_loss(y_test_enc, y_proba_xgb, labels=class_ids)
print("Log loss on test set (XGBoost):", round(ll_xgb, 4))

# Brier score per class (one-vs-rest)
brier_scores_xgb = []
for class_idx, class_label in enumerate(le.classes_):
    y_true_binary = (y_test_enc == class_idx).astype(int)
    y_prob_class  = y_proba_xgb[:, class_idx]
    # pos_label=1 pins "1 == this class" even when the class is absent from
    # the test window and y_true_binary is therefore all zeros.
    brier = brier_score_loss(y_true_binary, y_prob_class, pos_label=1)
    brier_scores_xgb.append(brier)
    print(f"Brier score for class {class_label} (XGB): {brier:.4f}")

# Unweighted average of the per-class scores
mean_brier_xgb = float(np.mean(brier_scores_xgb))
print("Mean Brier score (XGB):", round(mean_brier_xgb, 4))


In [None]:
# Reliability diagram: one one-vs-rest calibration curve per outcome class,
# drawn against the diagonal that represents perfect calibration.
fig, ax = plt.subplots(figsize=(8, 6))

for class_idx, class_label in enumerate(le.classes_):
    # Binary indicator "true label is this class" and its predicted probability
    is_class = (y_test_enc == class_idx).astype(int)
    class_proba = y_proba_xgb[:, class_idx]

    # calibration_curve returns (observed fraction, mean predicted prob) per bin
    observed_freq, mean_predicted = calibration_curve(
        is_class, class_proba, n_bins=10, strategy="uniform"
    )

    ax.plot(mean_predicted, observed_freq, marker="o", label=f"Class {class_label}")

ax.plot([0, 1], [0, 1], "--", color="black", label="Perfect calibration")

ax.set_xlabel("Predicted probability")
ax.set_ylabel("Observed frequency")
ax.set_title("Calibration curves (XGBoost)")
ax.legend()
ax.grid(True)
plt.show()