# 09 - Calibration Platt Isotonic

Goals:
- Load the data `features_matches_long_elo_22_23`
- Choose X (features) and y (target = result H/D/A)
- Split train/test by time order
- Train three models:
    - Logistic regression
    - Random Forest
    - XGBoost
- Apply to each model:
    - No calibration (raw)
    - Platt scaling (sigmoid)
    - Isotonic regression
- For each model/calibration pair, calculate:
    - Accuracy
    - Log Loss
    - Brier Score
- Compare all results in a single summary table

In [31]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

import matplotlib.pyplot as plt
import xgboost as xgb

# Let wide frames render up to 50 columns before pandas truncates them
pd.set_option("display.max_columns", 50)

# Long-format feature table with Elo columns (two rows per match, one per team)
features_csv = "../data/processed/features_matches_long_elo_22_23.csv"
df = pd.read_csv(features_csv)

print("Long + Elo features shape:", df.shape)
df.head()


Long + Elo features shape: (760, 33)


Unnamed: 0,match_id,season,date,team,opponent,is_home,result,goals_for,goals_against,points,prob_book_home,prob_book_draw,prob_book_away,goal_diff,xg_diff,xg_for,xg_against,rolling_xg_for_5,rolling_xg_against_5,rolling_xg_diff_5,rolling_points_5,rolling_home_points_5,rolling_away_points_5,strength_points_5,rolling_goal_diff_5,rolling_home_goal_diff_5,rolling_away_goal_diff_5,elo_home_before,elo_away_before,elo_diff_home,elo_team_before,elo_opponent_before,elo_diff_for_team
0,1,2022-2023,2022-08-05,Arsenal,Crystal Palace,0,A,2,0,3,0.218795,0.266625,0.514581,2,-0.2,1.0,1.2,,,,,,,,,,,1500.0,1500.0,0.0,1500.0,1500.0,0.0
1,16,2022-2023,2022-08-13,Arsenal,Leicester City,1,H,4,2,3,0.616659,0.222323,0.161018,2,2.2,2.7,0.5,,,,,,,,,,,1515.328421,1497.171579,18.156841,1515.328421,1497.171579,18.156841
2,23,2022-2023,2022-08-20,Arsenal,Bournemouth,0,A,3,0,3,0.121887,0.204302,0.673812,3,1.0,1.3,0.3,,,,,,,,,,,1500.192375,1524.387995,-24.19562,1524.387995,1500.192375,24.19562
3,35,2022-2023,2022-08-27,Arsenal,Fulham,1,H,2,1,3,0.73085,0.167736,0.101414,1,1.8,2.6,0.8,,,,,,,,,,,1538.87863,1509.853905,29.024725,1538.87863,1509.853905,29.024725
4,49,2022-2023,2022-08-31,Arsenal,Aston Villa,1,H,2,1,3,0.647946,0.209616,0.142438,1,2.0,2.4,0.4,,,,,,,,,,,1547.580022,1473.449924,74.130098,1547.580022,1473.449924,74.130098


In [32]:
target_col = "result"

feature_cols = [
    "is_home",
    "rolling_xg_for_5",
    "rolling_xg_against_5",
    "rolling_xg_diff_5",
    "rolling_points_5",
    "strength_points_5",
    "elo_home_before",
    "elo_away_before",
    "elo_diff_home"
]

# Columns that must be fully populated for a row to be usable
model_cols = feature_cols + [target_col]

# Check NaNs before dropping (shape and NaN counts are labelled separately)
print("Shape BEFORE drop:", df.shape)
print("NaN per column BEFORE drop:")
print(df[model_cols].isna().sum())

# Drop rows with missing values in features or target, then parse the date
# and order the rows chronologically so a positional split is a time split.
df_clean = (
    df.dropna(subset=model_cols)
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)

print("\nShape AFTER drop:", df_clean.shape)
print("NaN per column AFTER drop:")
print(df_clean[model_cols].isna().sum())

# The train/test split downstream relies on chronological row order
assert df_clean["date"].is_monotonic_increasing, "df_clean must be sorted by date"

X = df_clean[feature_cols].copy()
y = df_clean[target_col].copy()

print("\nX shape:", X.shape)
print("y shape:", y.shape)

NaN per column BEFORE drop: (760, 33)
is_home                   0
rolling_xg_for_5        100
rolling_xg_against_5    100
rolling_xg_diff_5       100
rolling_points_5        100
strength_points_5       200
elo_home_before           0
elo_away_before           0
elo_diff_home             0
result                    0
dtype: int64

Shape AFTER drop: (560, 33)
NaN per column AFTER drop:
is_home                 0
rolling_xg_for_5        0
rolling_xg_against_5    0
rolling_xg_diff_5       0
rolling_points_5        0
strength_points_5       0
elo_home_before         0
elo_away_before         0
elo_diff_home           0
result                  0
dtype: int64

X shape: (560, 9)
y shape: (560,)


In [33]:
train_ratio = 0.8
train_size = int(train_ratio * len(df_clean))

# A purely positional cut can fall in the middle of a matchday, which would put
# rows sharing the same date (including the two long-format rows of one match,
# which carry the same label) on both sides of the split. Snap the cut back to
# the first row of the boundary date so every date is entirely train or test.
# NOTE(review): assumes both rows of a match share the same `date` - confirm.
if 0 < train_size < len(df_clean):
    boundary_date = df_clean["date"].iloc[train_size]
    train_size = int(df_clean["date"].searchsorted(boundary_date, side="left"))

if train_size == 0:
    raise ValueError("Time-based split produced an empty training set")

X_train = X.iloc[:train_size]
X_test  = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test  = y.iloc[train_size:]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)

# Encode target labels (H/D/A) into integers; the encoder is fitted on the
# training labels only and then reused for the test labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

print("\nClasses:", le.classes_)     # expected ['A', 'D', 'H']
print("First 10 encoded y_train:", y_train_enc[:10])

Train shapes: (448, 9) (448,)
Test shapes : (112, 9) (112,)

Classes: ['A' 'D' 'H']
First 10 encoded y_train: [2 1 2 2 0 2 0 2 2 1]


In [34]:
# Standardise the features (the scaled copies are only fed to logistic regression).
# The scaler is fitted on the training split and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Scaled shapes:", X_train_scaled.shape, X_test_scaled.shape)

Scaled shapes: (448, 9) (112, 9)


In [35]:
# Compute accuracy, log loss and Brier scores for given probabilities.
def evaluate_probas(y_test_enc: np.ndarray, y_proba: np.ndarray, le: LabelEncoder, label: str = "") -> dict:
    """Score a matrix of predicted class probabilities and print a short report.

    Parameters
    ----------
    y_test_enc : integer-encoded true labels; values index into ``le.classes_``.
    y_proba : predicted probabilities, one row per sample and one column per
        class, columns ordered like ``le.classes_``.
    le : fitted label encoder; only ``le.classes_`` is read.
    label : heading printed above the metrics.

    Returns
    -------
    dict with keys ``accuracy``, ``log_loss``, ``brier_scores`` (one
    one-vs-rest Brier score per class, in ``le.classes_`` order) and
    ``mean_brier`` (their unweighted mean).
    """
    y_test_enc = np.asarray(y_test_enc)
    y_proba = np.asarray(y_proba)
    class_indices = np.arange(len(le.classes_))

    # Predicted classes from probabilities
    y_pred_enc = np.argmax(y_proba, axis=1)

    accuracy = accuracy_score(y_test_enc, y_pred_enc)
    # Pass the full label set explicitly: without it log_loss infers the labels
    # from y_test_enc and fails (or misaligns columns) when a class is missing
    # from the evaluated slice.
    ll = log_loss(y_test_enc, y_proba, labels=class_indices)

    # One-vs-rest Brier score per class; pos_label=1 keeps the positive class
    # explicit even when the binary target contains a single value.
    brier_scores = [
        brier_score_loss(
            (y_test_enc == class_idx).astype(int),
            y_proba[:, class_idx],
            pos_label=1,
        )
        for class_idx in class_indices
    ]

    mean_brier = float(np.mean(brier_scores))

    print(f"\n=== {label} ===")
    print(f"Accuracy : {accuracy:.3f}")
    print(f"Log loss : {ll:.4f}")
    print(f"Mean Brier score: {mean_brier:.4f}")

    return {
        "accuracy": float(accuracy),
        "log_loss": float(ll),
        "brier_scores": brier_scores,
        "mean_brier": mean_brier,
    }

In [36]:
# Base logistic regression model (multinomial).
# `multi_class` is deliberately not passed: it is deprecated in scikit-learn 1.5,
# and with the lbfgs solver a 3-class target is already fitted as a multinomial
# model by default.
logit = LogisticRegression(
    solver="lbfgs",
    max_iter=500,
    random_state=42,
)

logit.fit(X_train_scaled, y_train_enc)
y_proba_logit_raw = logit.predict_proba(X_test_scaled)

metrics_logit_raw = evaluate_probas(
    y_test_enc,
    y_proba_logit_raw,
    le,
    label="Logistic (raw, uncalibrated)",
)


=== Logistic (raw, uncalibrated) ===
Accuracy : 0.607
Log loss : 0.9486
Mean Brier score: 0.1863




In [37]:
# Logistic regression with Platt scaling (sigmoid).
# `multi_class` is deliberately not passed to the base estimator: it is
# deprecated in scikit-learn 1.5, and with the lbfgs solver a 3-class target is
# already fitted as a multinomial model by default.
logit_platt = CalibratedClassifierCV(
    estimator=LogisticRegression(
        solver="lbfgs",
        max_iter=500,
        random_state=42,
    ),
    method="sigmoid",   # Platt scaling
    cv=3,               # 3-fold CV on training set
)

logit_platt.fit(X_train_scaled, y_train_enc)
y_proba_logit_platt = logit_platt.predict_proba(X_test_scaled)

metrics_logit_platt = evaluate_probas(
    y_test_enc,
    y_proba_logit_platt,
    le,
    label="Logistic + Platt (sigmoid)",
)


=== Logistic + Platt (sigmoid) ===
Accuracy : 0.616
Log loss : 0.9357
Mean Brier score: 0.1842




In [38]:
# Logistic regression with isotonic calibration.
# `multi_class` is deliberately not passed to the base estimator: it is
# deprecated in scikit-learn 1.5, and with the lbfgs solver a 3-class target is
# already fitted as a multinomial model by default.
logit_iso = CalibratedClassifierCV(
    estimator=LogisticRegression(
        solver="lbfgs",
        max_iter=500,
        random_state=42,
    ),
    method="isotonic",
    cv=3,               # 3-fold CV on training set
)

logit_iso.fit(X_train_scaled, y_train_enc)
y_proba_logit_iso = logit_iso.predict_proba(X_test_scaled)

metrics_logit_iso = evaluate_probas(
    y_test_enc,
    y_proba_logit_iso,
    le,
    label="Logistic + Isotonic",
)


=== Logistic + Isotonic ===
Accuracy : 0.607
Log loss : 1.0154
Mean Brier score: 0.1934




In [39]:
# Uncalibrated Random Forest baseline, trained on the unscaled features
rf_raw_params = {
    "n_estimators": 500,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,
    "n_jobs": 1,
}
rf = RandomForestClassifier(**rf_raw_params)
rf.fit(X_train, y_train_enc)

# Class probabilities on the held-out split
y_proba_rf_raw = rf.predict_proba(X_test)

metrics_rf_raw = evaluate_probas(
    y_test_enc=y_test_enc,
    y_proba=y_proba_rf_raw,
    le=le,
    label="Random Forest (raw, uncalibrated)",
)


=== Random Forest (raw, uncalibrated) ===
Accuracy : 0.562
Log loss : 0.9901
Mean Brier score: 0.1923


In [40]:
# Random Forest with Platt scaling.
# The base estimator uses exactly the same settings (including n_jobs=1) as the
# raw and isotonic Random Forest cells, so the three variants differ only in
# the calibration method being compared.
rf_platt = CalibratedClassifierCV(
    estimator=RandomForestClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=1,
    ),
    method="sigmoid",   # Platt scaling
    cv=3,               # 3-fold CV on training set
)

rf_platt.fit(X_train, y_train_enc)
y_proba_rf_platt = rf_platt.predict_proba(X_test)

metrics_rf_platt = evaluate_probas(
    y_test_enc,
    y_proba_rf_platt,
    le,
    label="Random Forest + Platt (sigmoid)",
)


=== Random Forest + Platt (sigmoid) ===
Accuracy : 0.607
Log loss : 0.9396
Mean Brier score: 0.1840


In [41]:
# Random Forest wrapped in an isotonic calibrator (3 calibration folds)
rf_iso_params = {
    "n_estimators": 500,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,
    "n_jobs": 1,
}
rf_iso = CalibratedClassifierCV(
    estimator=RandomForestClassifier(**rf_iso_params),
    method="isotonic",
    cv=3,
)
rf_iso.fit(X_train, y_train_enc)

# Calibrated class probabilities on the held-out split
y_proba_rf_iso = rf_iso.predict_proba(X_test)

metrics_rf_iso = evaluate_probas(
    y_test_enc=y_test_enc,
    y_proba=y_proba_rf_iso,
    le=le,
    label="Random Forest + Isotonic",
)



=== Random Forest + Isotonic ===
Accuracy : 0.571
Log loss : 0.9814
Mean Brier score: 0.1880


In [42]:
# Uncalibrated XGBoost baseline, trained on the unscaled features
xgb_raw_params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "mlogloss",
}
xgb_model = xgb.XGBClassifier(**xgb_raw_params)
xgb_model.fit(X_train, y_train_enc)

# Class probabilities on the held-out split
y_proba_xgb_raw = xgb_model.predict_proba(X_test)

metrics_xgb_raw = evaluate_probas(
    y_test_enc=y_test_enc,
    y_proba=y_proba_xgb_raw,
    le=le,
    label="XGBoost (raw, uncalibrated)",
)


=== XGBoost (raw, uncalibrated) ===
Accuracy : 0.527
Log loss : 1.2208
Mean Brier score: 0.2162


In [43]:
# XGBoost wrapped in a sigmoid (Platt) calibrator (3 calibration folds)
xgb_platt_params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "mlogloss",
}
xgb_platt = CalibratedClassifierCV(
    estimator=xgb.XGBClassifier(**xgb_platt_params),
    method="sigmoid",
    cv=3,
)
xgb_platt.fit(X_train, y_train_enc)

# Calibrated class probabilities on the held-out split
y_proba_xgb_platt = xgb_platt.predict_proba(X_test)

metrics_xgb_platt = evaluate_probas(
    y_test_enc=y_test_enc,
    y_proba=y_proba_xgb_platt,
    le=le,
    label="XGBoost + Platt (sigmoid)",
)


=== XGBoost + Platt (sigmoid) ===
Accuracy : 0.589
Log loss : 0.9550
Mean Brier score: 0.1878


In [44]:
# XGBoost wrapped in an isotonic calibrator (3 calibration folds)
xgb_iso_params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "n_estimators": 400,
    "learning_rate": 0.05,
    "max_depth": 4,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
    "eval_metric": "mlogloss",
}
xgb_iso = CalibratedClassifierCV(
    estimator=xgb.XGBClassifier(**xgb_iso_params),
    method="isotonic",
    cv=3,
)
xgb_iso.fit(X_train, y_train_enc)

# Calibrated class probabilities on the held-out split
y_proba_xgb_iso = xgb_iso.predict_proba(X_test)

metrics_xgb_iso = evaluate_probas(
    y_test_enc=y_test_enc,
    y_proba=y_proba_xgb_iso,
    le=le,
    label="XGBoost + Isotonic",
)


=== XGBoost + Isotonic ===
Accuracy : 0.562
Log loss : 1.5808
Mean Brier score: 0.1918


In [45]:
# Rows of the comparison table; reset on every run so re-executing the cell
# does not duplicate entries.
summary = []

def add_row(model_name, calib_type, metrics):
    """Append one (model, calibration) result to the module-level `summary` list.

    Only the scalar metrics (accuracy, log loss, mean Brier) are copied from
    `metrics`; the per-class Brier list is left out of the table.
    """
    row = {"model": model_name, "calibration": calib_type}
    for key in ("accuracy", "log_loss", "mean_brier"):
        row[key] = metrics[key]
    summary.append(row)

# (model, calibration, metrics) triples in the order they were computed
all_results = [
    ("Logistic", "raw", metrics_logit_raw),
    ("Logistic", "Platt", metrics_logit_platt),
    ("Logistic", "Isotonic", metrics_logit_iso),
    ("RandomForest", "raw", metrics_rf_raw),
    ("RandomForest", "Platt", metrics_rf_platt),
    ("RandomForest", "Isotonic", metrics_rf_iso),
    ("XGBoost", "raw", metrics_xgb_raw),
    ("XGBoost", "Platt", metrics_xgb_platt),
    ("XGBoost", "Isotonic", metrics_xgb_iso),
]
for result in all_results:
    add_row(*result)

df_summary = pd.DataFrame(summary)
df_summary.sort_values(["model", "calibration"])

Unnamed: 0,model,calibration,accuracy,log_loss,mean_brier
2,Logistic,Isotonic,0.607143,1.015391,0.193393
1,Logistic,Platt,0.616071,0.935696,0.184179
0,Logistic,raw,0.607143,0.948638,0.186287
5,RandomForest,Isotonic,0.571429,0.981431,0.188008
4,RandomForest,Platt,0.607143,0.939642,0.183973
3,RandomForest,raw,0.5625,0.990133,0.192254
8,XGBoost,Isotonic,0.5625,1.580755,0.191785
7,XGBoost,Platt,0.589286,0.954976,0.187843
6,XGBoost,raw,0.526786,1.220824,0.216242
