# 06 - Random Forest model

Goals:
- Load the data `features_matches_long_22_23.csv` and `elo_rating_22_23.csv`
- Left-merge the Elo ratings onto the features on `match_id` (to keep every row of the long-format features)
- Calculate the Elo difference from each team perspective
- Choose X (features) and Y (target = result H/D/A)
- Split train/test by time order
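- Feature standardization (listed as a goal but not applied below; tree-based models such as Random Forest do not need scaled features)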
- Train Random Forest
- Calculate: Log Loss, Brier Score, Accuracy
- Plot the calibration curves

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, brier_score_loss
from sklearn.calibration import calibration_curve

import matplotlib.pyplot as plt

# Raise pandas' display limit to 50 columns for DataFrame output in this notebook.
pd.set_option("display.max_columns", 50)

In [None]:
# Load long-format features (two rows per match: one per team).
# `date` is parsed on load so both inputs carry the same datetime dtype
# (the Elo file below is already read with parse_dates=["date"]).
df_feat = pd.read_csv(
    "../data/processed/features_matches_long_22_23.csv",
    parse_dates=["date"],
)
print("Features shape:", df_feat.shape)

# Load match-level Elo ratings (one row per match, wide format).
df_elo = pd.read_csv("../data/processed/elo_rating_22_23.csv", parse_dates=["date"])
print("Elo shape:", df_elo.shape)

# Preview the first feature rows (last expression of the cell, so it is displayed).
df_feat.head()

In [None]:
# Left-merge the Elo columns onto the long-format features by match_id
# (safer than joining on date/teams). Every feature row is kept; rows whose
# match_id has no Elo entry get NaN in the Elo columns.
#
# validate="many_to_one" makes pandas raise a MergeError if match_id is not
# unique in the Elo table. Without it, a duplicated Elo row would silently
# duplicate feature rows and inflate the dataset.
df = df_feat.merge(
    df_elo[[
        "match_id",
        "elo_home_before",
        "elo_away_before",
        "elo_diff_home"
    ]],
    on="match_id",
    how="left",
    validate="many_to_one",
)

print("Merged shape:", df.shape)
# Quick visual check that both rows of a match received the same Elo values.
df[["match_id", "team", "is_home", "elo_home_before", "elo_away_before", "result"]].head(10)

In [None]:
# Re-express the match-level Elo columns from the perspective of the team
# each row describes:
#   elo_team_before     - this team's Elo before the match
#   elo_opponent_before - the opponent's Elo before the match
#   elo_diff_for_team   - elo_team_before minus elo_opponent_before

# True on rows where the described team plays at home.
plays_at_home = df["is_home"] == 1

# Home rows take the home rating, all other rows take the away rating.
df["elo_team_before"] = df["elo_home_before"].where(
    plays_at_home, df["elo_away_before"]
)

# The opponent gets whichever rating the team did not get.
df["elo_opponent_before"] = df["elo_away_before"].where(
    plays_at_home, df["elo_home_before"]
)

# Positive values mean the team is rated higher than its opponent.
df["elo_diff_for_team"] = df["elo_team_before"].sub(df["elo_opponent_before"])

preview_cols = [
    "date", "team", "opponent", "is_home",
    "elo_team_before", "elo_opponent_before", "elo_diff_for_team",
]
df[preview_cols].head(10)

In [None]:
# Model inputs: the target column and the predictors fed to the classifier.
target_col = "result"

feature_cols = [
    "is_home",
    "rolling_xg_for_5",
    "rolling_xg_against_5",
    "rolling_xg_diff_5",
    "rolling_points_5",
    "strength_points_5",
    "elo_team_before",
    "elo_diff_for_team",
]

# Every column that must be non-missing for a row to be usable.
model_cols = feature_cols + [target_col]

# Missing-value report before any row is removed.
print("NaN per column BEFORE drop:")
print(df[model_cols].isna().sum())

# Keep only rows that are complete in all features and in the target.
df_clean = df.dropna(subset=model_cols)
df_clean = df_clean.reset_index(drop=True)

print("\nShape AFTER drop:", df_clean.shape)
print("NaN per column AFTER drop:")
print(df_clean[model_cols].isna().sum())

# Make sure `date` is a datetime column, then order the rows chronologically
# so that a later split can respect time order.
df_clean = (
    df_clean
    .assign(date=pd.to_datetime(df_clean["date"]))
    .sort_values("date")
    .reset_index(drop=True)
)

# Independent copies, so later edits to X / y never touch df_clean.
X = df_clean.loc[:, feature_cols].copy()
y = df_clean.loc[:, target_col].copy()

print("\nX shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Chronological train/test split (about 80% / 20%).
#
# df_clean is sorted by date and holds up to two rows per match (one per
# team). A purely positional cut can fall between rows that share a date,
# including the two rows of one match, which would put that match's outcome
# in both the training and the test set. To avoid this leak the cut is moved
# back to the nearest date boundary: every row dated on or after the cutoff
# date goes to the test set.
train_ratio = 0.8

if df_clean.empty:
    raise ValueError("df_clean is empty: nothing to split into train/test.")

# Date of the first row that a positional split would place in the test set.
nominal_train_size = int(train_ratio * len(df_clean))
cutoff_date = df_clean["date"].iloc[nominal_train_size]

# Rows are date-sorted, so the rows strictly before the cutoff form a prefix
# and positional slicing stays valid.
train_size = int((df_clean["date"] < cutoff_date).sum())

X_train = X.iloc[:train_size]
X_test  = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test  = y.iloc[train_size:]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)

# Sanity check: no missing values should remain on either side of the split.
print("\nNaN in TRAIN features:")
print(X_train.isna().sum())

print("\nNaN in TEST features:")
print(X_test.isna().sum())

In [None]:
# Turn the string outcomes into integer class codes. The encoder learns its
# classes from the training labels only; the test labels are mapped with the
# same fitted encoder.
le = LabelEncoder()
le.fit(y_train)

y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

# Expected classes for an H/D/A target: ['A', 'D', 'H']
print("Classes:", le.classes_)
print("First 10 encoded y_train:", y_train_enc[:10])

In [None]:
# Hyper-parameters of the forest, gathered in one place so they are easy to
# tune later.
rf_params = {
    "n_estimators": 500,
    "max_depth": None,        # no depth limit: trees grow until the split rules stop them
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 42,       # fixed seed
    "n_jobs": -1,             # use all available cores
}

rf = RandomForestClassifier(**rf_params)

# Fit on the training window only, using the integer-encoded target.
rf.fit(X_train, y_train_enc)
print("Random Forest trained!")

In [None]:
# Evaluate the fitted forest on the held-out test window.

# Class probabilities (one column per class) and hard class predictions.
y_proba_rf = rf.predict_proba(X_test)
y_pred_rf  = rf.predict(X_test)

print("Predicted probabilities shape:", y_proba_rf.shape)
print("Sample predictions:", y_pred_rf[:10])
print("Sample true labels:", y_test_enc[:10])

# Accuracy
acc_rf = accuracy_score(y_test_enc, y_pred_rf)
print("\nAccuracy on test set (Random Forest):", round(acc_rf, 3))

# Log loss. The labels are passed explicitly so the probability columns are
# matched to the classes the model was trained on. Without `labels`,
# scikit-learn infers them from y_true and fails when a class is absent from
# the test window.
ll_rf = log_loss(y_test_enc, y_proba_rf, labels=rf.classes_)
print("Log loss on test set (Random Forest):", round(ll_rf, 4))

# Brier score per class (one-vs-rest): each class in turn is treated as the
# positive outcome of a binary problem.
# NOTE: column `class_idx` of y_proba_rf is taken to be encoded class
# `class_idx`; this holds as long as every class of `le` is present in the
# training labels.
brier_scores_rf = []
for class_idx, class_label in enumerate(le.classes_):
    y_true_binary = (y_test_enc == class_idx).astype(int)
    y_prob_class  = y_proba_rf[:, class_idx]
    brier = brier_score_loss(y_true_binary, y_prob_class)
    brier_scores_rf.append(brier)
    print(f"Brier score for class {class_label} (RF): {brier:.4f}")

# Unweighted average of the per-class scores.
mean_brier_rf = float(np.mean(brier_scores_rf))
print("Mean Brier score (RF):", round(mean_brier_rf, 4))

In [None]:
# Calibration (reliability) curves: one one-vs-rest curve per outcome class,
# drawn against the diagonal of a perfectly calibrated model.
fig, ax = plt.subplots(figsize=(8, 6))

for col, outcome in enumerate(le.classes_):
    # Binary indicator for this class and the probability predicted for it.
    is_outcome = (y_test_enc == col).astype(int)
    p_outcome = y_proba_rf[:, col]

    # Observed positive fraction and mean predicted probability per bin,
    # using 10 equal-width probability bins.
    observed, predicted = calibration_curve(
        is_outcome,
        p_outcome,
        n_bins=10,
        strategy="uniform",
    )

    ax.plot(predicted, observed, marker="o", label=f"Class {outcome}")

# Reference line: predicted probability equals observed frequency.
ax.plot([0, 1], [0, 1], "--", color="black", label="Perfect calibration")

ax.set_xlabel("Predicted probability")
ax.set_ylabel("Observed frequency")
ax.set_title("Calibration curves (Random Forest)")
ax.legend()
ax.grid(True)
plt.show()