In [1]:
import numpy as np
import pandas as pd

# Load full matches data
matches = pd.read_csv("Matches.csv")
print("Original shape of matches:", matches.shape)

# Make date usable and extract year
matches["MatchDate"] = pd.to_datetime(matches["MatchDate"], errors="coerce")
matches["Year"] = matches["MatchDate"].dt.year

# Keep only rows with valid full-time result
valid_results = ["H", "D", "A"]
matches = matches[matches["FTResult"].isin(valid_results)].copy()
print("After keeping FTResult in {H,D,A}:", matches.shape)

# only on Premier League, 2005–2024
matches = matches[matches["Division"] == "E0"].copy()
matches = matches[(matches["Year"] >= 2005) & (matches["Year"] <= 2024)].copy()
print("After filtering Division=E0 and 2005–2024:", matches.shape)

# Remove draws (we only predict home vs away)
matches_no_draw = matches[matches["FTResult"] != "D"].copy()

# Target: 1 = home win, 0 = away win
matches_no_draw["home_win"] = (matches_no_draw["FTResult"] == "H").astype(int)

print("After removing draws:", matches_no_draw.shape)
print("home_win value counts:")
print(matches_no_draw["home_win"].value_counts())

# If odds are present, convert them to implied probabilities
if all(col in matches_no_draw.columns for col in ["OddHome", "OddDraw", "OddAway"]):

    def odds_to_probs(row):
        oh, od, oa = row["OddHome"], row["OddDraw"], row["OddAway"]
        if oh <= 0 or od <= 0 or oa <= 0:
            return pd.Series([np.nan, np.nan, np.nan])
        inv = np.array([1/oh, 1/od, 1/oa], dtype=float)
        p = inv / inv.sum()
        return pd.Series(p)

    matches_no_draw[["p_home", "p_draw", "p_away"]] = (
        matches_no_draw[["OddHome", "OddDraw", "OddAway"]]
        .apply(odds_to_probs, axis=1)
    )

# Columns that are "after the match", NOT be used as inputs
leak_cols = [
    "FTResult",
    "home_win",
    "FTHome", "FTAway",
    "HTHome", "HTAway", "HTResult",
    "HomeShots", "AwayShots",
    "HomeTarget", "AwayTarget",
    "HomeFouls", "AwayFouls",
    "HomeCorners", "AwayCorners",
    "HomeYellow", "AwayYellow",
    "HomeRed", "AwayRed",
]

# All numeric columns
numeric_cols = [
    col for col in matches_no_draw.columns
    if pd.api.types.is_numeric_dtype(matches_no_draw[col])
]

# Features = numeric columns minus leakage
feature_cols_all = [
    col for col in numeric_cols
    if col not in leak_cols
]

print("Number of numeric feature columns used:", len(feature_cols_all))
print("Sample of feature columns:", feature_cols_all[:20])

# Build full feature matrix and label
X_full = matches_no_draw[feature_cols_all].copy()
y_full = matches_no_draw["home_win"].copy()

# Drop any rows with missing values in X or y
data_full = pd.concat([X_full, y_full], axis=1).dropna()
X_full = data_full[feature_cols_all]
y_full = data_full["home_win"]

print("Final shape of X_full:", X_full.shape)
print("Final shape of y_full:", y_full.shape)

X_full.head()

  matches = pd.read_csv("Matches.csv")


Original shape of matches: (230557, 48)
After keeping FTResult in {H,D,A}: (230554, 49)
After filtering Division=E0 and 2005–2024: (7543, 49)
After removing draws: (5711, 50)
home_win value counts:
home_win
1    3460
0    2251
Name: count, dtype: int64
Number of numeric feature columns used: 29
Sample of feature columns: ['HomeElo', 'AwayElo', 'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away', 'OddHome', 'OddDraw', 'OddAway', 'MaxHome', 'MaxDraw', 'MaxAway', 'Over25', 'Under25', 'MaxOver25', 'MaxUnder25', 'HandiSize', 'HandiHome', 'HandiAway', 'C_LTH']
Final shape of X_full: (5601, 29)
Final shape of y_full: (5601,)


Unnamed: 0,HomeElo,AwayElo,Form3Home,Form5Home,Form3Away,Form5Away,OddHome,OddDraw,OddAway,MaxHome,...,C_LTH,C_LTA,C_VHD,C_VAD,C_HTB,C_PHB,Year,p_home,p_draw,p_away
27662,1702.64,1862.81,6.0,6.0,1.0,5.0,5.0,3.4,1.72,5.65,...,0.0906,0.7048,0.041,0.0191,0.0887,0.0559,2005,0.185958,0.273467,0.540575
27666,1638.74,1705.45,4.0,5.0,5.0,9.0,2.75,3.2,2.5,3.25,...,0.5329,0.0171,0.0205,0.0531,0.1235,0.253,2005,0.337909,0.290391,0.3717
27667,1628.42,1657.04,7.0,10.0,1.0,2.0,2.37,3.2,2.9,2.6,...,0.0941,0.1359,0.01,0.5768,0.1287,0.0545,2005,0.390951,0.289548,0.319501
27668,1573.54,1682.25,7.0,11.0,7.0,8.0,2.5,3.2,2.75,2.6,...,0.2148,0.0575,0.5646,0.0099,0.0249,0.1282,2005,0.3717,0.290391,0.337909
27735,1911.87,1723.82,7.0,13.0,0.0,4.0,1.4,4.2,8.0,1.5,...,0.0103,0.0099,0.0099,0.9153,0.0449,0.0099,2005,0.662983,0.220994,0.116022


In [62]:
import numpy as np
import random
import tensorflow as tf

SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.random.set_seed(SEED)

In [63]:
# how much stronger / in-form home team is

# Elo difference (home strength - away strength)
matches_no_draw["EloDiff"] = matches_no_draw["HomeElo"] - matches_no_draw["AwayElo"]

# Recent form differences (points in last 3 and 5 games)
matches_no_draw["FormDiff3"] = matches_no_draw["Form3Home"] - matches_no_draw["Form3Away"]
matches_no_draw["FormDiff5"] = matches_no_draw["Form5Home"] - matches_no_draw["Form5Away"]

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Outer train/test split
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X_full,
    y_full,
    test_size=0.2,
    random_state=42,
    stratify=y_full
)

print("Train size:", X_train_full.shape[0])
print("Test size:", X_test_full.shape[0])

# Inner split: train vs validation
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
    stratify=y_train_full
)

print("Inner train size:", X_tr.shape[0])
print("Validation size:", X_val.shape[0])

# Scale features (fit on inner train only)
scaler = StandardScaler()
X_tr_scaled   = scaler.fit_transform(X_tr)
X_val_scaled  = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_full)

print("Scaled shapes:", X_tr_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)

Train size: 4480
Test size: 1121
Inner train size: 3584
Validation size: 896
Scaled shapes: (3584, 29) (896, 29) (1121, 29)


In [65]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

input_dim = X_tr_scaled.shape[1]
print("Number of input features:", input_dim)

mlp_model = keras.Sequential([
    layers.Input(shape=(input_dim,)),

    # Bigger first hidden layer
    layers.Dense(256, activation="relu", kernel_regularizer=keras.regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    # Second hidden layer
    layers.Dense(128, activation="relu", kernel_regularizer=keras.regularizers.l2(1e-4)),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    # Third hidden layer
    layers.Dense(64, activation="relu", kernel_regularizer=keras.regularizers.l2(1e-4)),
    layers.BatchNormalization(),

    # Output layer: probability that home team wins
    layers.Dense(1, activation="sigmoid")
])

mlp_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

mlp_model.summary()

Number of input features: 29


In [73]:
from tensorflow.keras import callbacks

checkpoint = callbacks.ModelCheckpoint(
    "best_val_model.keras",      
    monitor="val_accuracy",      
    mode="max",                  
    save_best_only=True,        
    verbose=1
)

In [74]:
from tensorflow.keras import callbacks

early_stop = callbacks.EarlyStopping(
    monitor="val_loss",
    patience=6,
    restore_best_weights=True
)

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1
)

history = mlp_model.fit(
    X_tr_scaled, y_tr,
    validation_data=(X_val_scaled, y_val),
    epochs=90,
    batch_size=256,
    callbacks=[early_stop, checkpoint],
    verbose=1,
    shuffle=False
)

Epoch 1/90
[1m 1/14[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 47ms/step - accuracy: 0.6953 - loss: 0.5924
Epoch 1: val_accuracy improved from None to 0.70982, saving model to best_val_model.keras
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7299 - loss: 0.5574 - val_accuracy: 0.7098 - val_loss: 0.5875
Epoch 2/90
[1m 1/14[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 7ms/step - accuracy: 0.7070 - loss: 0.5833
Epoch 2: val_accuracy did not improve from 0.70982
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7374 - loss: 0.5562 - val_accuracy: 0.7054 - val_loss: 0.5878
Epoch 3/90
[1m 1/14[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.7344 - loss: 0.5745
Epoch 3: val_accuracy did not improve from 0.70982
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7369 - loss: 0.5504 - val_accuracy: 0.7020 - val_loss: 0.5888
Epoch 

In [85]:
from sklearn.metrics import accuracy_score

# Probabilities on validation set
y_val_proba = mlp_model.predict(X_val_scaled).ravel()

best_t = 0.1
best_acc = 0.0

# Try thresholds between 0.3 and 0.7
for t in np.linspace(0.3, 0.7, 41):   # step of 0.01
    y_val_pred = (y_val_proba >= t).astype(int)
    acc = accuracy_score(y_val, y_val_pred)
    if acc > best_acc:
        best_acc = acc
        best_t = t

print("Best threshold on validation set:", best_t)
print("Best validation accuracy:", best_acc)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 619us/step
Best threshold on validation set: 0.48
Best validation accuracy: 0.71875


In [86]:
best_mlp = keras.models.load_model("best_val_model.keras")

y_test_proba = best_mlp.predict(X_test_scaled).ravel()
y_test_pred  = (y_test_proba >= 0.5).astype(int)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

test_acc = accuracy_score(y_test_full, y_test_pred)
print("Test accuracy (best val model, t=0.5):", test_acc)

print("\nClassification report:")
print(classification_report(y_test_full, y_test_pred))
print("Confusion matrix:")
print(confusion_matrix(y_test_full, y_test_pred))

auc = roc_auc_score(y_test_full, y_test_proba)
print("\nROC AUC:", auc)

[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Test accuracy (best val model, t=0.5): 0.7234611953612846

Classification report:
              precision    recall  f1-score   support

           0       0.69      0.55      0.61       443
           1       0.74      0.83      0.79       678

    accuracy                           0.72      1121
   macro avg       0.71      0.69      0.70      1121
weighted avg       0.72      0.72      0.72      1121

Confusion matrix:
[[245 198]
 [112 566]]

ROC AUC: 0.7857627998961224
