# 04 - Baseline Model Logistic

Goals:
- Load the data `features_matches_long_22_23.csv` and inspect it
- Clean / filter what is needed
- Choose X (features) and Y (target = result H/D/A)
- Split train/test by time order
- Feature standardization
- Train Multinomial Logistic Regression
- Calculate: Log Loss, Brier Score, Accuracy
- Plot the calibration curves

In [None]:
import pandas as pd
import numpy as np

# Optional: let pandas show up to 50 columns when a DataFrame is displayed
# (affects display output only, not the data)
pd.options.display.max_columns = 50

In [None]:
# Read the engineered long-format feature table from disk
csv_path = "../data/processed/features_matches_long_22_23.csv"
df = pd.read_csv(csv_path)

# Report the raw dimensions, then preview the first rows as the cell output
print("Initial shape :", df.shape)
df.head()

In [None]:
# Show the column index of the loaded frame (rendered as the cell output)
df.columns

In [None]:
# Label the model has to predict
target_col = "result"

# Predictors fed to the model
feature_cols = [
    "is_home",
    "rolling_xg_for_5",
    "rolling_xg_against_5",
    "rolling_xg_diff_5",
    "rolling_points_5",
    "strength_points_5",
]

# Every column that must be non-missing for a row to be usable
required_cols = [*feature_cols, target_col]

# Missing-value report on the frame as loaded
print("\nShape BEFORE dropna:", df.shape)
print("NaN per column BEFORE drop:")
print(df[required_cols].isna().sum())

# Discard rows with a gap in any required column, then renumber the index 0..n-1
df = df.dropna(subset=required_cols)
df = df.reset_index(drop=True)

# Same report after filtering; all counts should now be zero
print("\nShape AFTER dropna:", df.shape)
print("NaN per column AFTER drop:")
print(df[required_cols].isna().sum())

In [None]:
# Convert the date column to real datetimes so that sorting is chronological
df["date"] = pd.to_datetime(df["date"])

# Oldest rows first, with a fresh 0..n-1 index
df = df.sort_values(by="date")
df = df.reset_index(drop=True)

# Separate predictors (X) from the label (y); copies keep them independent of df
X, y = df[feature_cols].copy(), df[target_col].copy()

print("X shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# Time-based train/test split (about 80% train / 20% test).
# Relies on df being sorted by "date" and on X / y being row-aligned with df.
train_size = int(0.8 * len(df))

# A purely positional cut can fall in the middle of a match date, leaving rows
# that share one date on both sides of the split (and, if this long-format
# table holds one row per team per match, both rows of a single match --
# TODO confirm against the feature-building step). Move the cut back to the
# first row of the boundary date so that whole date goes to the test set.
if 0 < train_size < len(df):
    cutoff_date = df["date"].iloc[train_size]
    rows_before_cutoff = int((df["date"] < cutoff_date).sum())
    # If nothing precedes the boundary date, keep the positional cut rather
    # than end up with an empty training set.
    if rows_before_cutoff > 0:
        train_size = rows_before_cutoff

# Because df is sorted, the first `train_size` rows are exactly the rows
# dated strictly before the cutoff.
X_train = X.iloc[:train_size]
X_test  = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test  = y.iloc[train_size:]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes :", X_test.shape, y_test.shape)

# Sanity check: no missing values in train/test features
print("\nNaN in TRAIN features:")
print(X_train.isna().sum())

print("\nNaN in TEST features:")
print(X_test.isna().sum())

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training targets only,
# then apply that same mapping to both splits
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

print("Classes:", le.classes_)  # Expected: ['A' 'D' 'H']
print("First 10 encoded y_train:", y_train_enc[:10])

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so that no test-set statistics
# leak into the preprocessing
scaler = StandardScaler().fit(X_train)

# Apply the train-fitted scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Train scaled shape:", X_train_scaled.shape)
print("Test scaled shape :", X_test_scaled.shape)

# Both counts should be zero after scaling
train_nan_count = np.isnan(X_train_scaled).sum()
test_nan_count = np.isnan(X_test_scaled).sum()
print("NaN in X_train_scaled:", train_nan_count)
print("NaN in X_test_scaled :", test_nan_count)

In [None]:
from sklearn.linear_model import LogisticRegression

# Multinomial (softmax) logistic regression.
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 (it emits a FutureWarning and is scheduled for removal).
# With the lbfgs solver and more than two target classes, the default
# already fits the multinomial model, so the fitted model is unchanged.
model = LogisticRegression(
    solver="lbfgs",
    max_iter=500,  # higher than the library default of 100
)

# Fit on the standardized training features and integer-encoded labels
model.fit(X_train_scaled, y_train_enc)

print("Model trained!")

In [None]:
# --- Step 1: Get predictions on the test set ---

# Number of leading test rows to print as a quick eyeball check
n_preview = 10

# Hard class predictions, in the same integer encoding the model was trained on
y_pred_enc = model.predict(X_test_scaled)

# Per-class probabilities, one column per class (A, D, H)
y_proba = model.predict_proba(X_test_scaled)

print("Predicted probabilities shape:", y_proba.shape)
print("Encoded predictions sample:", y_pred_enc[:n_preview])
print("True encoded labels sample:", y_test_enc[:n_preview])

In [None]:
# --- Step 2: Accuracy score ---

from sklearn.metrics import accuracy_score

# Share of test rows whose predicted class equals the true class
accuracy = accuracy_score(y_true=y_test_enc, y_pred=y_pred_enc)
print("Accuracy on test set:", round(accuracy, 3))

In [None]:
# --- Step 3: Log loss (cross-entropy) ---

from sklearn.metrics import log_loss

# Pass the model's class list explicitly: the columns of y_proba follow
# model.classes_, and without `labels` log_loss infers the classes from
# y_test_enc, which raises if some class never occurs in the test window.
logloss = log_loss(y_test_enc, y_proba, labels=model.classes_)
print("Log loss on test set:", round(logloss, 4))

In [None]:
# --- Step 4: Brier score per class ---

from sklearn.metrics import brier_score_loss
import numpy as np

brier_scores = []

# Score each outcome one-vs-rest: column k of y_proba against the
# indicator "the true class is k"
for k, class_label in enumerate(le.classes_):
    is_class_k = (y_test_enc == k).astype(int)
    brier = brier_score_loss(is_class_k, y_proba[:, k])
    brier_scores.append(brier)

    print(f"Brier score for class {class_label}: {brier:.4f}")

# Unweighted average of the per-class scores
print("Mean Brier score:", round(np.mean(brier_scores), 4))

In [None]:
# --- Step 5: Calibration (reliability) curves ---

from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

# One reliability curve per outcome, each treated one-vs-rest
for k, class_label in enumerate(le.classes_):
    # calibration_curve returns (observed positive fraction, mean predicted
    # probability) for each non-empty bin
    observed, predicted = calibration_curve(
        (y_test_enc == k).astype(int),
        y_proba[:, k],
        n_bins=10,
        strategy="uniform",
    )
    ax.plot(predicted, observed, marker="o", label=f"Class {class_label}")

# Diagonal reference: predicted probability equals observed frequency
ax.plot([0, 1], [0, 1], "--", color="black", label="Perfect calibration")

ax.set_xlabel("Predicted probability")
ax.set_ylabel("Observed frequency")
ax.set_title("Calibration curves (multinomial logistic regression)")
ax.legend()
ax.grid(True)
plt.show()