# Building a Baseline Logistic Regression Classifier

In [20]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, log_loss, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV
import matplotlib.pyplot as plt
import warnings
import os
import time
import math
import requests
import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')


In [21]:
data = pd.read_csv('numeric_df.csv')
data.head()

Unnamed: 0,home_team_elo,away_team_elo,home_xG_to_date,away_xG_to_date,home_xG_against_to_date,away_xG_against_to_date,home_goals_scored_to_date,away_goals_scored_to_date,home_goals_conceded_to_date,away_goals_conceded_to_date,home_points_to_date,away_points_to_date,home_form,away_form,match_result
0,1884.934448,1697.498169,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0
1,1673.780518,1576.490356,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,0
2,1633.799683,1692.95166,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,2
3,1567.101318,1837.004272,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,2
4,1670.871338,1914.848877,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0,2


In [22]:
# Define the features and target variable
X = data.drop('match_result', axis=1)
y = data['match_result']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training set and test set using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)



In [23]:
# Initialize and train the multinomial logistic regression model with balanced class weights
logistic_model = LogisticRegression(multi_class='multinomial', class_weight='balanced')
logistic_model.fit(X_train, y_train)


LogisticRegression(class_weight='balanced', multi_class='multinomial')

In [24]:
# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Generate a classification report
class_report = classification_report(y_test, y_pred, target_names=['Home Win', 'Away Win', 'Draw'])
print(class_report)


              precision    recall  f1-score   support

    Home Win       0.69      0.61      0.65       204
    Away Win       0.22      0.22      0.22       101
        Draw       0.52      0.60      0.56       151

    accuracy                           0.52       456
   macro avg       0.48      0.48      0.48       456
weighted avg       0.53      0.52      0.52       456



In [25]:
# Calibrate the logistic regression model to improve probability estimates
calibrated_model = CalibratedClassifierCV(logistic_model, method='sigmoid', cv=5)
calibrated_model.fit(X_train, y_train)

# Predict probabilities on the test set
y_prob_calibrated = calibrated_model.predict_proba(X_test)


In [26]:
# Binarize the test labels for ROC AUC calculation
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])

# Calculate the AUC ROC score using the binarized y target for the test set
try:
    roc_auc_ovr_corrected = roc_auc_score(y_test_bin, y_prob_calibrated, multi_class="ovr", average="macro")
    print(f"ROC AUC Score (One-vs-Rest, Macro-Averaged): {roc_auc_ovr_corrected:.4f}")
except ValueError as e:
    print("ROC AUC score cannot be computed for one or more classes:", e)

# Calculate Log Loss
log_loss_score = log_loss(y_test, y_prob_calibrated)
print(f"Log Loss: {log_loss_score:.4f}")

from sklearn.metrics import brier_score_loss

# Initialize a list to store the Brier scores for each class
brier_scores = []

# Loop over each class
for i in range(y_prob_calibrated.shape[1]):
    # Binarize the true labels for the current class
    y_test_bin_class = (y_test == i).astype(int)
    
    # Compute the Brier score for the current class
    brier_score_class = brier_score_loss(y_test_bin_class, y_prob_calibrated[:, i])
    brier_scores.append(brier_score_class)

# Calculate the mean Brier score across all classes
mean_brier_score = np.mean(brier_scores)
print(f"Mean Brier Score: {mean_brier_score:.4f}")

ROC AUC Score (One-vs-Rest, Macro-Averaged): 0.6846
Log Loss: 0.9483
Mean Brier Score: 0.1863


In [27]:
# Perform stratified cross-validation for further model validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in skf.split(X_scaled, y):
    X_train_cv, X_test_cv = X_scaled[train_index], X_scaled[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]

    # Train and evaluate on each fold (optional step for in-depth analysis)
    logistic_model_cv = LogisticRegression(multi_class='multinomial', class_weight='balanced')
    logistic_model_cv.fit(X_train_cv, y_train_cv)
    
    y_prob_cv = logistic_model_cv.predict_proba(X_test_cv)
    roc_auc_cv = roc_auc_score(label_binarize(y_test_cv, classes=[0, 1, 2]), y_prob_cv, multi_class="ovr", average="macro")
    
    print(f"Fold ROC AUC: {roc_auc_cv:.4f}")


Fold ROC AUC: 0.7024
Fold ROC AUC: 0.6735
Fold ROC AUC: 0.6451
Fold ROC AUC: 0.6711
Fold ROC AUC: 0.6479


In [None]:
# Binarize the y_test data for multi-class ROC AUC calculation
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])

# Calculate the ROC curve and AUC for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(3):  # Assuming 3 classes
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Calculate the macro-average ROC curve and ROC area
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(3)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(3):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= 3

# Calculate the AUC
macro_auc_roc = auc(all_fpr, mean_tpr)

# Print the AUC
print(f"Macro-average AUC ROC: {macro_auc_roc}")


Macro-Avg computes the AUC ROC for each class and then averages them, mitigating the class imbalance in the target variable

In [None]:
# Assuming logistic_model is your trained logistic regression model
y_prob_ovr = logistic_model_ovr.predict_proba(X_test)

# Binarize the y_test data for multi-class ROC AUC calculation
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test_binarized.shape[1]

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_prob_ovr[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_binarized.ravel(), y_prob_ovr.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(10, 8))

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='ROC curve of class {0} (area = {1:0.2f})'
                   ''.format(i, roc_auc[i]))



plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)


plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()
