### English Premier League Match Predictor
#### Data Loading and Preprocessing


In [59]:
import pandas as pd

#Loading Dataset
file_path = "C:\\Users\\CheyoChomba\\OneDrive - MOYO Business Advisory\\Documents\Presentations\\COP - Software Engineering\\Match Predictor\\dataset\\premier-league-matches.csv"
df = pd.read_csv(file_path)

df.head(10)

Unnamed: 0,Season_End_Year,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A
5,1993,1,1992-08-15,Ipswich Town,1,1,Aston Villa,D
6,1993,1,1992-08-15,Everton,1,1,Sheffield Weds,D
7,1993,1,1992-08-15,Southampton,0,0,Tottenham,D
8,1993,1,1992-08-15,Chelsea,1,1,Oldham Athletic,D
9,1993,1,1992-08-16,Nott'ham Forest,1,0,Liverpool,H


### Feature Engineering

In [60]:
# Convert the date column to datetime format
df["Date"] = pd.to_datetime(df["Date"])


# Converting categorical feature 'FTR' to numerical representation
df["NFTR"] = df["FTR"].map({"H": 0, "D": 1, "A": 2})

df.head(10)

Unnamed: 0,Season_End_Year,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,NFTR
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H,0
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H,0
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H,0
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D,1
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A,2
5,1993,1,1992-08-15,Ipswich Town,1,1,Aston Villa,D,1
6,1993,1,1992-08-15,Everton,1,1,Sheffield Weds,D,1
7,1993,1,1992-08-15,Southampton,0,0,Tottenham,D,1
8,1993,1,1992-08-15,Chelsea,1,1,Oldham Athletic,D,1
9,1993,1,1992-08-16,Nott'ham Forest,1,0,Liverpool,H,0


In [61]:
# Since we are using this model to predict future match outcomes, our challenge is that recent form won’t always be available at the time of prediction
# Season long statistics will be used (N = 38)
# Since it ensures data will always be available at time of prediction and won't require recent form

def compute_season_stats(df):
    season_stats = {}
    
    # Group by season and team to calculate per-season metrics
    grouped = df.groupby(["Season_End_Year", "Home"])

    for (season, team), matches in grouped:
        total_games = len(matches)
        total_goals_scored = matches["HomeGoals"].sum()
        total_goals_conceded = matches["AwayGoals"].sum()
        total_wins = (matches["NFTR"] == 0).sum()
        total_draws = (matches["NFTR"] == 1).sum()

        season_stats[(season, team)] = {
            # The aim here is to incorporate additional features that can indicate 'xG' expected goals
            # Expected goals measures the likelihood a team will score; it is a prevalent statistic in modern football
            # Helpful indicator for attacking strength
            "Season_Home_xG": total_goals_scored / total_games if total_games > 0 else 0,
            
            # The aim here is to incorporate additional features that can indicate 'xGA' expected goals against
            # Expected goals against measures the likelihood a team will concede
            # Helpful indicator for defensive strength
            "Season_Home_xGA": total_goals_conceded / total_games if total_games > 0 else 0,
            
            # Incorporate season win percentage, relevant since the model will be used to calculate home/away win percentage
            "Season_Home_WinPerc": total_wins / total_games if total_games > 0 else 0,
            
            # Incorporate season draw percentage, important for improving draw predictions
            "Season_Home_DrawPerc": total_draws / total_games if total_games > 0 else 0,
            
            # Goal Difference, another key performance metric
            "Season_Home_GD": total_goals_scored - total_goals_conceded,
        }
    
    grouped_away = df.groupby(["Season_End_Year", "Away"])
    for (season, team), matches in grouped_away:
        total_games = len(matches)
        total_goals_scored = matches["AwayGoals"].sum()
        total_goals_conceded = matches["HomeGoals"].sum()
        total_wins = (matches["NFTR"] == 2).sum()
        total_draws = (matches["NFTR"] == 1).sum()

        if (season, team) in season_stats:
            season_stats[(season, team)].update({
                "Season_Away_xG": total_goals_scored / total_games if total_games > 0 else 0,
                "Season_Away_xGA": total_goals_conceded / total_games if total_games > 0 else 0,
                "Season_Away_WinPerc": total_wins / total_games if total_games > 0 else 0,
                "Season_Away_DrawPerc": total_draws / total_games if total_games > 0 else 0,
                "Season_Away_GD": total_goals_scored - total_goals_conceded,
            })
        else:
            season_stats[(season, team)] = {
                "Season_Away_xG": total_goals_scored / total_games if total_games > 0 else 0,
                "Season_Away_xGA": total_goals_conceded / total_games if total_games > 0 else 0,
                "Season_Away_WinPerc": total_wins / total_games if total_games > 0 else 0,
                "Season_Away_DrawPerc": total_draws / total_games if total_games > 0 else 0,
                "Season_Away_GD": total_goals_scored - total_goals_conceded,
            }
    
    return season_stats

# Compute season statistics
season_stats = compute_season_stats(df)

# Compute Head-to-Head Statistics
# Since head-to-head matchups are important in football, we calculate win/draw percentages between teams
head_to_head_stats = df.groupby(["Home", "Away"]).agg(
    Head_to_Head_HomeWinPerc=("FTR", lambda x: (x == "H").mean()),
    Head_to_Head_AwayWinPerc=("FTR", lambda x: (x == "A").mean()),
    Head_to_Head_DrawPerc=("FTR", lambda x: (x == "D").mean())
).fillna(0).to_dict("index")

def add_season_features(row):
    season = row["Season_End_Year"]
    home_team = row["Home"] #Home Team
    away_team = row["Away"] #Away Team
    home_stats = season_stats.get((season, home_team), {})
    away_stats = season_stats.get((season, away_team), {})
    head_to_head = head_to_head_stats.get((home_team, away_team), {
        "Head_to_Head_HomeWinPerc": 0, "Head_to_Head_AwayWinPerc": 0, "Head_to_Head_DrawPerc": 0 #Checks head to season head stats between each team
    })
    # Merges mutiple feature sources to a panda series
    return pd.Series({
        **home_stats, 
        **away_stats,
        "Season_Home_DrawPerc": home_stats.get("Season_Home_DrawPerc", 0),
        "Season_Away_DrawPerc": away_stats.get("Season_Away_DrawPerc", 0),
        "Head_to_Head_HomeWinPerc": head_to_head["Head_to_Head_HomeWinPerc"],
        "Head_to_Head_AwayWinPerc": head_to_head["Head_to_Head_AwayWinPerc"],
        "Head_to_Head_DrawPerc": head_to_head["Head_to_Head_DrawPerc"]
    })

# Apply season-long features to each match
season_features = df.apply(add_season_features, axis=1)

# Combine with the original dataset
df = pd.concat([df, season_features], axis=1)

df.head(10)

Unnamed: 0,Season_End_Year,Wk,Date,Home,HomeGoals,AwayGoals,Away,FTR,NFTR,Season_Home_xG,...,Season_Home_DrawPerc,Season_Home_GD,Season_Away_xG,Season_Away_xGA,Season_Away_WinPerc,Season_Away_DrawPerc,Season_Away_GD,Head_to_Head_HomeWinPerc,Head_to_Head_AwayWinPerc,Head_to_Head_DrawPerc
0,1993,1,1992-08-15,Coventry City,2,1,Middlesbrough,H,0,1.571429,...,0.190476,6.0,1.0,2.285714,0.142857,0.285714,-27.0,0.5,0.333333,0.166667
1,1993,1,1992-08-15,Leeds United,2,1,Wimbledon,H,0,1.52381,...,0.380952,9.0,1.142857,1.52381,0.238095,0.380952,-8.0,0.625,0.0,0.375
2,1993,1,1992-08-15,Sheffield Utd,2,1,Manchester Utd,H,0,1.857143,...,0.285714,25.0,1.333333,0.809524,0.47619,0.333333,11.0,0.2,0.6,0.2
3,1993,1,1992-08-15,Crystal Palace,3,3,Blackburn,D,1,1.809524,...,0.428571,20.0,1.428571,1.333333,0.333333,0.333333,2.0,0.0,0.5,0.5
4,1993,1,1992-08-15,Arsenal,2,4,Norwich City,A,2,1.47619,...,0.285714,12.0,1.428571,2.190476,0.380952,0.142857,-16.0,0.7,0.1,0.2
5,1993,1,1992-08-15,Ipswich Town,1,1,Aston Villa,D,1,1.714286,...,0.428571,20.0,1.0,1.142857,0.380952,0.285714,-3.0,0.0,0.6,0.4
6,1993,1,1992-08-15,Everton,1,1,Sheffield Weds,D,1,1.619048,...,0.285714,8.0,1.0,1.190476,0.285714,0.285714,-4.0,0.125,0.5,0.375
7,1993,1,1992-08-15,Southampton,0,0,Tottenham,D,1,1.904762,...,0.285714,15.0,0.952381,1.952381,0.238095,0.285714,-21.0,0.416667,0.291667,0.291667
8,1993,1,1992-08-15,Chelsea,1,1,Oldham Athletic,D,1,2.047619,...,0.333333,13.0,0.952381,2.095238,0.142857,0.190476,-24.0,0.0,0.5,0.5
9,1993,1,1992-08-16,Nott'ham Forest,1,0,Liverpool,H,0,1.952381,...,0.190476,23.0,1.0,1.761905,0.142857,0.333333,-16.0,0.5,0.0,0.5


### Model Training

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Select relevant features for the model
features = [
    "Season_Home_xG", "Season_Home_xGA", "Season_Home_WinPerc", "Season_Home_GD",
    "Season_Away_xG", "Season_Away_xGA", "Season_Away_WinPerc", "Season_Away_GD",
    "Season_Home_DrawPerc", "Season_Away_DrawPerc",
    "Head_to_Head_HomeWinPerc", "Head_to_Head_AwayWinPerc", "Head_to_Head_DrawPerc"
]

# Define X (features) and y (target variable)
X = df[features]
y = df["NFTR"]

# Split data into training and testing sets (60% training, 40% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Initialize and train the model (XGBoost Classifier)
model = XGBClassifier(
    objective="multi:softmax",
    num_class=3,
    eval_metric="mlogloss",
    use_label_encoder=False,
    learning_rate=0.01,
    n_estimators=200,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8
)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=["Home Win", "Draw", "Away Win"])

print(f"XGBoost Model Accuracy: {accuracy:.4f}")
print(report)


XGBoost Model Accuracy: 0.6279
              precision    recall  f1-score   support

    Home Win       0.67      0.76      0.71      2208
        Draw       0.60      0.47      0.52      1239
    Away Win       0.57      0.56      0.56      1364

    accuracy                           0.63      4811
   macro avg       0.61      0.60      0.60      4811
weighted avg       0.62      0.63      0.62      4811



In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
from sklearn.metrics import accuracy_score, classification_report

# Select relevant features
features = [
    "Season_Home_xG", "Season_Home_xGA", "Season_Home_WinPerc", "Season_Home_GD",
    "Season_Away_xG", "Season_Away_xGA", "Season_Away_WinPerc", "Season_Away_GD",
    "Season_Home_DrawPerc", "Season_Away_DrawPerc",
    "Head_to_Head_HomeWinPerc", "Head_to_Head_AwayWinPerc", "Head_to_Head_DrawPerc"
]

X = df[features]
y = df["NFTR"]  

if "HomeGoals" not in df.columns or "AwayGoals" not in df.columns:
    raise KeyError("Ensure your dataset contains 'HomeGoals' and 'AwayGoals' columns.")

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Train Poisson regression models separately for home and away goals
model_home = PoissonRegressor(alpha=0.1, max_iter=500)
model_away = PoissonRegressor(alpha=0.1, max_iter=500)

model_home.fit(X_train, df.loc[X_train.index, "HomeGoals"])  # Home goals as target
model_away.fit(X_train, df.loc[X_train.index, "AwayGoals"])  # Away goals as target

# Predict home and away goals
y_pred_home = model_home.predict(X_test)
y_pred_away = model_away.predict(X_test)

# Convert goal predictions into match results
y_pred = []
for home, away in zip(y_pred_home, y_pred_away):
    if home > away:
        y_pred.append(0)  # Home Win
    elif home < away:
        y_pred.append(2)  # Away Win
    else:
        y_pred.append(1)  # Draw

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=["Home Win", "Draw", "Away Win"], zero_division=1)

print(f"Poisson Regression Model Accuracy: {accuracy:.4f}")
print(report)


Poisson Regression Model Accuracy: 0.5481
              precision    recall  f1-score   support

    Home Win       0.56      0.87      0.68      2208
        Draw       1.00      0.00      0.00      1239
    Away Win       0.52      0.52      0.52      1364

    accuracy                           0.55      4811
   macro avg       0.69      0.46      0.40      4811
weighted avg       0.66      0.55      0.46      4811



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, LeakyReLU

# Define features and target
features = [
    "Season_Home_xG", "Season_Home_xGA", "Season_Home_WinPerc", "Season_Home_GD",
    "Season_Away_xG", "Season_Away_xGA", "Season_Away_WinPerc", "Season_Away_GD",
    "Season_Home_DrawPerc", "Season_Away_DrawPerc",
    "Head_to_Head_HomeWinPerc", "Head_to_Head_AwayWinPerc", "Head_to_Head_DrawPerc"
]

X = df[features]

# Use converted categorical feature as target variable
y = df["NFTR"]

# Split dataset into features and target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Convert target to categorical
y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

# Define Neural Network
from tensorflow.keras.layers import LeakyReLU

nn_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128),
    LeakyReLU(negative_slope=0.1),
    Dropout(0.3),
    Dense(64),
    LeakyReLU(negative_slope=0.1),
    Dropout(0.3),
    Dense(32),
    LeakyReLU(negative_slope=0.1),
    Dense(3, activation="softmax")
])

# Optimizers are algorithms used to update the weights of a neural network to minimize loss and improve accuracy during training.
# Adam is a widely used optimizer and works well without requiring extensive training
optimizer = Adam(learning_rate=0.001)

nn_model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
nn_model.fit(X_train, y_train_cat, epochs=30, batch_size=32, validation_data=(X_test, y_test_cat))

# Predictions
y_pred_nn = np.argmax(nn_model.predict(X_test), axis=1)

Epoch 1/30
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4644 - loss: 1.1850 - val_accuracy: 0.5205 - val_loss: 1.0107
Epoch 2/30
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5067 - loss: 1.0297 - val_accuracy: 0.5178 - val_loss: 0.9962
Epoch 3/30
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5161 - loss: 1.0107 - val_accuracy: 0.5242 - val_loss: 0.9909
Epoch 4/30
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5105 - loss: 1.0053 - val_accuracy: 0.5321 - val_loss: 0.9568
Epoch 5/30
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5305 - loss: 0.9599 - val_accuracy: 0.5650 - val_loss: 0.9204
Epoch 6/30
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5513 - loss: 0.9512 - val_accuracy: 0.5797 - val_loss: 0.8921
Epoch 7/30
[1m226/226[0m 

### Evaluation

In [71]:
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
import numpy as np

# Function to evaluate model performance
def evaluate_predictions(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))
    exact_matches = np.mean(np.round(y_pred) == y_true)
    within_one = np.mean(np.abs(np.round(y_pred) - y_true) <= 1)

    print(f"\n {model_name} Metrics:")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE: {mae:.3f}")
    print(f"Exact Match Accuracy: {exact_matches:.1%}")
    print(f"Within ±1 Goal: {within_one:.1%}\n")

    # Print classification report
    print(classification_report(y_true, y_pred, target_names=["Home Win", "Draw", "Away Win"], zero_division=1))

# --- Neural Network Evaluation ---
y_pred_nn = np.argmax(nn_model.predict(X_test), axis=1)
print("Neural Network Evaluation:")
evaluate_predictions(y_test, y_pred_nn, "Neural Network")

# --- XGBoost Evaluation ---
y_pred_xgb = model.predict(X_test)  # XGBoost predictions
print("XGBoost Evaluation:")
evaluate_predictions(y_test, y_pred_xgb, "XGBoost")

# --- Poisson Regression Evaluation ---
y_pred_home_poisson = model_home.predict(X_test)
y_pred_away_poisson = model_away.predict(X_test)

# Convert predicted goals into match results
y_pred_poisson = []
for home, away in zip(y_pred_home_poisson, y_pred_away_poisson):
    if home > away:
        y_pred_poisson.append(0)  # Home Win
    elif home < away:
        y_pred_poisson.append(2)  # Away Win
    else:
        y_pred_poisson.append(1)  # Draw

print("Poisson Regression Evaluation:")
evaluate_predictions(y_test, y_pred_poisson, "Poisson Regression")


[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 816us/step
Neural Network Evaluation:

 Neural Network Metrics:
RMSE: 0.909
MAE: 0.522
Exact Match Accuracy: 63.0%
Within ±1 Goal: 84.8%

              precision    recall  f1-score   support

    Home Win       0.68      0.74      0.71      2208
        Draw       0.59      0.49      0.54      1239
    Away Win       0.57      0.58      0.58      1364

    accuracy                           0.63      4811
   macro avg       0.61      0.60      0.61      4811
weighted avg       0.63      0.63      0.63      4811

XGBoost Evaluation:

 XGBoost Metrics:
RMSE: 0.914
MAE: 0.527
Exact Match Accuracy: 62.8%
Within ±1 Goal: 84.5%

              precision    recall  f1-score   support

    Home Win       0.67      0.76      0.71      2208
        Draw       0.60      0.47      0.52      1239
    Away Win       0.57      0.56      0.56      1364

    accuracy                           0.63      4811
   macro avg       0.61      0

### Saving trained Neural Network Model and Dataframe

In [72]:
nn_model.save("neural_net_model.keras")
print("Model saved successfully as 'neural_net_model.keras'")

# Save processed dataset to CSV (including computed season stats & head-to-head stats)
df.to_csv("processed_premier_league_matches.csv", index=False)
print("Processed dataset saved successfully.")

Model saved successfully as 'neural_net_model.keras'
Processed dataset saved successfully.


In [67]:
# Get unique team names from both home and away columns
unique_teams = sorted(set(df["Home"].unique()).union(set(df["Away"].unique())))

# Print and save team names
print("Teams in dataset:", unique_teams)

# Save to a CSV for manual checking if needed
pd.DataFrame(unique_teams, columns=["Team"]).to_csv("team_names.csv", index=False)


Teams in dataset: ['Arsenal', 'Aston Villa', 'Barnsley', 'Birmingham City', 'Blackburn', 'Blackpool', 'Bolton', 'Bournemouth', 'Bradford City', 'Brentford', 'Brighton', 'Burnley', 'Cardiff City', 'Charlton Ath', 'Chelsea', 'Coventry City', 'Crystal Palace', 'Derby County', 'Everton', 'Fulham', 'Huddersfield', 'Hull City', 'Ipswich Town', 'Leeds United', 'Leicester City', 'Liverpool', 'Manchester City', 'Manchester Utd', 'Middlesbrough', 'Newcastle Utd', 'Norwich City', "Nott'ham Forest", 'Oldham Athletic', 'Portsmouth', 'QPR', 'Reading', 'Sheffield Utd', 'Sheffield Weds', 'Southampton', 'Stoke City', 'Sunderland', 'Swansea City', 'Swindon Town', 'Tottenham', 'Watford', 'West Brom', 'West Ham', 'Wigan Athletic', 'Wimbledon', 'Wolves']
