<a href="https://colab.research.google.com/github/Mubby03/Netwalker/blob/master/Premier%20League%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
import pandas as pd
from google.colab import drive

# Attach Google Drive so files stored there are visible to this runtime.
drive.mount('/content/drive')

# Location of the match CSV on the mounted drive; edit if the file lives elsewhere.
file_path = '/content/drive/MyDrive/epl_final.csv'
df = pd.read_csv(file_path)

# Sanity check: row count followed by the first few records.
print("Dataset loaded! Number of rows:", df.shape[0])
print(df.head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset loaded! Number of rows: 9380
    Season   MatchDate  HomeTeam       AwayTeam  FullTimeHomeGoals  \
0  2000/01  2000-08-19  Charlton       Man City                  4   
1  2000/01  2000-08-19   Chelsea       West Ham                  4   
2  2000/01  2000-08-19  Coventry  Middlesbrough                  1   
3  2000/01  2000-08-19     Derby    Southampton                  2   
4  2000/01  2000-08-19     Leeds        Everton                  2   

   FullTimeAwayGoals FullTimeResult  HalfTimeHomeGoals  HalfTimeAwayGoals  \
0                  0              H                  2                  0   
1                  2              H                  1                  0   
2                  3              A                  1                  1   
3                  2              D                  1                  2   
4                  0        

In [None]:
# Column names, per-column missing-value counts and dtypes, printed one after
# another (a newline separator gives the same text as three separate prints).
print(df.columns, df.isnull().sum(), df.dtypes, sep="\n")


Index(['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals',
       'FullTimeAwayGoals', 'FullTimeResult', 'HalfTimeHomeGoals',
       'HalfTimeAwayGoals', 'HalfTimeResult', 'HomeShots', 'AwayShots',
       'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners',
       'HomeFouls', 'AwayFouls', 'HomeYellowCards', 'AwayYellowCards',
       'HomeRedCards', 'AwayRedCards'],
      dtype='object')
Season               0
MatchDate            0
HomeTeam             0
AwayTeam             0
FullTimeHomeGoals    0
FullTimeAwayGoals    0
FullTimeResult       0
HalfTimeHomeGoals    0
HalfTimeAwayGoals    0
HalfTimeResult       0
HomeShots            0
AwayShots            0
HomeShotsOnTarget    0
AwayShotsOnTarget    0
HomeCorners          0
AwayCorners          0
HomeFouls            0
AwayFouls            0
HomeYellowCards      0
AwayYellowCards      0
HomeRedCards         0
AwayRedCards         0
dtype: int64
Season               object
MatchDate            object

In [None]:
# Column holding the full-time outcome the models predict.
target = 'FullTimeResult'
# Every column except the target and the two calendar columns.
features = df.drop([target, 'Season', 'MatchDate'], axis=1)

In [None]:
# Team names are the categorical columns to expand.
categorical_cols = ['HomeTeam', 'AwayTeam']

# Replace each team column with one indicator column per distinct team.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols)


In [None]:
# Columns that must not be used as predictors:
#   - FullTimeHomeGoals / FullTimeAwayGoals determine FullTimeResult exactly,
#     so keeping them lets any model read the answer off the inputs (leakage);
#   - Season / MatchDate / HalfTimeResult are still raw text in df_encoded
#     (only the team columns were one-hot encoded).
excluded_cols = ['FullTimeHomeGoals', 'FullTimeAwayGoals',
                 'Season', 'MatchDate', 'HalfTimeResult']

X = df_encoded.drop(columns=[target] + excluded_cols)
y = df[target]

# Convert H/D/A to integer class labels
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# LabelEncoder numbers the classes in sorted order, so the mapping is
# A -> 0, D -> 1, H -> 2 (check le.classes_ to confirm).
y_encoded = le.fit_transform(y)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the matches; stratifying on the encoded result keeps the
# class proportions the same in both parts, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Columns to drop
drop_cols = ['Season', 'MatchDate', 'HalfTimeResult', 'FullTimeResult']  # target is separate

X = df.drop(columns=drop_cols)
y = df['FullTimeResult']

# One-hot encode the teams
X = pd.get_dummies(X, columns=['HomeTeam', 'AwayTeam'])

# Convert all remaining object columns (if any) to numeric
for col in X.select_dtypes(include=['object']).columns:
    X[col] = pd.to_numeric(X[col], errors='coerce')  # convert or set NaN


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode target as integer class ids.
# NOTE: LabelEncoder numbers classes in sorted order, so for the labels
# 'A', 'D', 'H' the mapping is A -> 0, D -> 1, H -> 2 (see le.classes_).
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # A -> 0, D -> 1, H -> 2

# Train/test split: 20% held out, stratified on the encoded result, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)


In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from google.colab import drive
import tensorflow as tf
from tensorflow.keras import layers, Model, Input
from tensorflow.keras.utils import to_categorical

In [None]:

# Optional: install shap in runtime if not already installed
# try:
#     import shap
# except Exception:
#     print("shap not present; installing...")
#     !pip install -q shap
#     import shap

# --------------------------
# 1) Load dataset (mount)
# --------------------------
drive.mount('/content/drive', force_remount=True)
file_path = '/content/drive/MyDrive/epl_final.csv'
df = pd.read_csv(file_path)

# Basic cleaning / prepare
# Normalise headers: trim surrounding whitespace, replace inner spaces with '_'.
df.columns = df.columns.str.strip().str.replace(' ', '_')

# The first column whose name contains "date" is used as the match date.
date_cols = [c for c in df.columns if 'date' in c.lower()]
if not date_cols:
    raise RuntimeError("No date column found.")
date_col = date_cols[0]

# `infer_datetime_format` is deprecated (pandas warns about it) and is no
# longer needed; unparseable dates still become NaT and are dropped below.
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Chronological order; a stable sort keeps the file order of same-day matches
# so the result is deterministic.
df = (
    df.dropna(subset=[date_col])
      .sort_values(by=date_col, kind='mergesort')
      .reset_index(drop=True)
)

# Columns the rest of this cell reads; abort early if the CSV lacks any.
required = [
    'FullTimeHomeGoals', 'FullTimeAwayGoals',
    'HomeShots', 'AwayShots',
    'HomeShotsOnTarget', 'AwayShotsOnTarget',
    'HomeCorners', 'AwayCorners',
    'HomeFouls', 'AwayFouls',
    'FullTimeResult', 'HomeTeam', 'AwayTeam',
]
missing = [name for name in required if name not in df.columns]
if len(missing) > 0:
    raise RuntimeError(f"Missing columns: {missing}")

def result_to_label(r):
    """Map a full-time result code to an integer class id.

    'H' (home win) -> 2, 'D' (draw) -> 1, 'A' (away win) -> 0.

    Raises:
        ValueError: if ``r`` is not one of 'H', 'D', 'A'. Previously any other
            value (NaN, a typo, stray whitespace) was silently labelled as an
            away win, which would corrupt the training targets.
    """
    codes = {'H': 2, 'D': 1, 'A': 0}
    try:
        return codes[r]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs; both mean "not a valid code".
        raise ValueError(f"Unexpected full-time result: {r!r}") from None
# Integer class id for every match, derived from the result code.
df['label'] = df['FullTimeResult'].map(result_to_label)

# --------------------------
# 2) Build per-team sequences
# --------------------------
N = 5  # length of the per-team game window fed to the model

def build_team_sequences(df, sort_col=None):
    """Collect each team's match statistics in chronological order.

    Every row of a team's array describes one match from that team's point of
    view, with the columns
    [goals for, goals against, shots, opponent shots, shots on target,
     opponent shots on target, corners, opponent corners, fouls, opponent fouls].

    Args:
        df: match table with HomeTeam / AwayTeam, the ten home/away statistic
            columns used below, and the column named by ``sort_col``.
        sort_col: column to order each team's matches by. Defaults to the
            notebook-level ``date_col`` so existing one-argument calls keep
            working.

    Returns:
        dict mapping team name -> float array of shape (games, 10); a team
        with no rows gets an array of shape (0, 10).
    """
    if sort_col is None:
        sort_col = date_col  # notebook global set while loading the data

    # Statistic columns as seen by the home side, and the mirrored order that
    # gives the same "for / against" layout for the away side.
    home_view = ['FullTimeHomeGoals', 'FullTimeAwayGoals', 'HomeShots', 'AwayShots',
                 'HomeShotsOnTarget', 'AwayShotsOnTarget', 'HomeCorners', 'AwayCorners',
                 'HomeFouls', 'AwayFouls']
    away_view = ['FullTimeAwayGoals', 'FullTimeHomeGoals', 'AwayShots', 'HomeShots',
                 'AwayShotsOnTarget', 'HomeShotsOnTarget', 'AwayCorners', 'HomeCorners',
                 'AwayFouls', 'HomeFouls']

    team_history = {}
    teams = pd.concat([df['HomeTeam'], df['AwayTeam']]).unique()
    for team in teams:
        involved = (df['HomeTeam'] == team) | (df['AwayTeam'] == team)
        # Stable sort: matches sharing a date keep their order from df.
        tdf = df[involved].sort_values(by=sort_col, kind='mergesort')
        # Column vector so the per-row home/away choice broadcasts over stats.
        played_at_home = (tdf['HomeTeam'] == team).to_numpy()[:, None]
        # Pick each row from the home or away layout in one vectorised step
        # instead of iterating row by row.
        team_history[team] = np.where(
            played_at_home,
            tdf[home_view].to_numpy(dtype=float),
            tdf[away_view].to_numpy(dtype=float),
        )
    return team_history

team_sequences = build_team_sequences(df)
print("Team sequences built for:", len(team_sequences))

# --------------------------
# 3) Build match-level arrays
# --------------------------
def lastN(seq):
    """Return the final N rows of ``seq``, zero-padded at the front if it has fewer."""
    if seq.shape[0] >= N:
        return seq[-N:]
    pad = np.zeros((N - seq.shape[0], seq.shape[1]))
    return np.vstack((pad, seq))

# Fix for temporal leakage: the features of a match must come only from games
# played BEFORE it. Taking team_sequences[team][-N:] for every row (as before)
# gave every fixture of a team the same window: the team's last N games of the
# whole dataset, including the match being predicted and later ones.
#
# df is sorted by date and each team's sequence is in date order, so the k-th
# time a team appears while walking df corresponds to row k of its sequence.
# (Assumes a team never has two matches with the same timestamp.)
assert df[date_col].is_monotonic_increasing, "df must be sorted by match date"

X_home, X_away, y, match_dates = [], [], [], []
games_seen = {}  # team -> number of that team's matches already walked past
for _, row in df.iterrows():
    h, a = row['HomeTeam'], row['AwayTeam']
    seen_h, seen_a = games_seen.get(h, 0), games_seen.get(a, 0)
    # Advance the counters even if the row is skipped below, so they stay
    # aligned with the per-team sequences.
    games_seen[h] = seen_h + 1
    games_seen[a] = seen_a + 1
    if h not in team_sequences or a not in team_sequences:
        continue
    # Slices stop just before the current match: history only.
    X_home.append(lastN(team_sequences[h][:seen_h]))
    X_away.append(lastN(team_sequences[a][:seen_a]))
    y.append(int(row['label']))
    match_dates.append(row[date_col])

X_home, X_away, y = np.array(X_home), np.array(X_away), np.array(y)
match_dates = pd.Series(match_dates)
y_cat = to_categorical(y, 3)  # one-hot targets, 3 classes
print(" Shapes:", X_home.shape, X_away.shape, y_cat.shape)

# --------------------------
# 4) Time-based train/test split
# --------------------------
# Match dates as whole seconds since the epoch. `Series.view` is deprecated
# (pandas emits a FutureWarning); `astype("int64")` yields the same
# integer timestamps for a datetime Series.
match_dates_num = match_dates.astype("int64") // 10**9

# Chronological split: matches before the 80th-percentile date are used for
# training, the rest for testing.
split_val = np.percentile(match_dates_num, 80)
train_idx = match_dates_num < split_val
test_idx = match_dates_num >= split_val

Xh_train, Xh_test = X_home[train_idx], X_home[test_idx]
Xa_train, Xa_test = X_away[train_idx], X_away[test_idx]
y_train, y_test = y_cat[train_idx], y_cat[test_idx]
print("Train/Test:", len(y_train), len(y_test))

# --------------------------
# 5) Build Dual-BiLSTM model with goals regression heads
# --------------------------
# Number of statistics per game (last axis of the 3-D sequence array).
F = X_home.shape[-1]

# One sequence input per side, each a window of N games with F statistics.
home_in = Input(shape=(N, F))
away_in = Input(shape=(N, F))

# Separate bidirectional LSTM encoders for the two sides.
home_lstm = layers.Bidirectional(layers.LSTM(64))(home_in)
away_lstm = layers.Bidirectional(layers.LSTM(64))(away_in)

# Shared trunk on top of the concatenated encodings.
x = layers.Concatenate()([home_lstm, away_lstm])
x = layers.Dense(128, activation='relu')(x)
x = layers.Dropout(0.3)(x)

# Three heads: a 3-way softmax for the result and one linear unit per side
# for the goal counts.
clf_out = layers.Dense(3, activation='softmax', name='result_out')(x)
home_goal_out = layers.Dense(1, activation='linear', name='home_goal_out')(x)
away_goal_out = layers.Dense(1, activation='linear', name='away_goal_out')(x)

model = Model(inputs=[home_in, away_in],
              outputs=[clf_out, home_goal_out, away_goal_out])
model.compile(
    optimizer='adam',
    loss={
        'result_out': 'categorical_crossentropy',
        'home_goal_out': 'mse',
        'away_goal_out': 'mse',
    },
    # The two goal losses are weighted at half the classification loss.
    loss_weights={
        'result_out': 1.0,
        'home_goal_out': 0.5,
        'away_goal_out': 0.5,
    },
    metrics={'result_out': 'accuracy'},
)
model.summary()

# --------------------------
# 6) Train
# --------------------------
# Regression targets: actual goals per side as float arrays, split with the
# same boolean masks as the sequence inputs.
home_goals = df['FullTimeHomeGoals'].to_numpy(dtype=float)
away_goals = df['FullTimeAwayGoals'].to_numpy(dtype=float)
hg_train, hg_test = home_goals[train_idx], home_goals[test_idx]
ag_train, ag_test = away_goals[train_idx], away_goals[test_idx]

# Fit all three heads together; the held-out part is passed as validation
# data so per-epoch validation metrics are reported.
history = model.fit(
    x=[Xh_train, Xa_train],
    y={'result_out': y_train, 'home_goal_out': hg_train, 'away_goal_out': ag_train},
    validation_data=(
        [Xh_test, Xa_test],
        {'result_out': y_test, 'home_goal_out': hg_test, 'away_goal_out': ag_test},
    ),
    batch_size=64,
    epochs=20,
    verbose=2,
)

# --------------------------
# 7) Predict fixture with Poisson simulation
# --------------------------
from collections import Counter
def predict_fixture_with_scores(home_team, away_team, sims=5000, top_n=6):
    """Predict a fixture and simulate scorelines from the predicted goal means.

    Each team's most recent N games (zero-padded at the front when fewer are
    available) are fed to the model. The two regression outputs are used as
    Poisson rates and ``sims`` independent scorelines are drawn from them.

    Args:
        home_team, away_team: keys of ``team_sequences``.
        sims: number of simulated scorelines; must be positive.
        top_n: how many of the most frequent scorelines to report.

    Returns:
        dict with
          'means'       - (home, away) goal rates used for the simulation,
          'model_probs' - softmax output of the classifier head; following the
                          label encoding (A=0, D=1, H=2) the order is
                          [away win, draw, home win],
          'sim_probs'   - home / draw / away frequencies among the simulations,
          'top_scores'  - list of ((home_goals, away_goals), count).

    Raises:
        KeyError: if a team is not in ``team_sequences``.
        ValueError: if ``sims`` is not positive.
    """
    if sims <= 0:
        # Previously this ended in a ZeroDivisionError further down.
        raise ValueError("sims must be a positive integer")

    def build_seq(team):
        seq = team_sequences[team]
        if seq.shape[0] < N:
            pad = np.zeros((N - seq.shape[0], seq.shape[1]))
            seq = np.vstack((pad, seq))
        return seq[-N:].reshape(1, N, F)

    hs, as_ = build_seq(home_team), build_seq(away_team)
    clf_p, h_mean, a_mean = model.predict([hs, as_])

    # The goal heads are linear and can dip below zero, which would make
    # np.random.poisson raise; a Poisson rate cannot be negative, so clamp.
    h_mean = max(float(h_mean[0][0]), 0.0)
    a_mean = max(float(a_mean[0][0]), 0.0)
    clf_probs = clf_p[0]

    # Draw all simulations in two vectorised calls; tolist() gives plain
    # Python ints so the scoreline tuples print as before.
    home_draws = np.random.poisson(h_mean, size=sims).tolist()
    away_draws = np.random.poisson(a_mean, size=sims).tolist()
    sims_results = list(zip(home_draws, away_draws))

    cnt = Counter(sims_results)
    top = cnt.most_common(top_n)
    hw = sum(1 for h, a in sims_results if h > a) / sims
    dr = sum(1 for h, a in sims_results if h == a) / sims
    aw = sum(1 for h, a in sims_results if h < a) / sims
    return {'means': (h_mean, a_mean), 'model_probs': clf_probs,
            'sim_probs': {'home': hw, 'draw': dr, 'away': aw},
            'top_scores': top}

# --------------------------
# 8) SHAP explainability (safe version)
# --------------------------
def explain_fixture_shap(home_team, away_team, bg_samples=100):
    """Estimate per-feature importance for one fixture with SHAP DeepExplainer.

    Only the classification head ('result_out') is explained. Absolute SHAP
    values are averaged over classes and over the N time steps, giving one
    score per input statistic for each side.

    Args:
        home_team, away_team: keys of ``team_sequences``.
        bg_samples: upper bound on the number of training samples drawn at
            random (without replacement) as the explainer background.

    Returns:
        dict with 'home_feature_importance' and 'away_feature_importance',
        each mapping a feature name to its aggregated importance.

    Raises:
        ValueError: if either team is unknown.
        ImportError: if the ``shap`` package is not installed.
    """
    # Validate
    if home_team not in team_sequences or away_team not in team_sequences:
        raise ValueError("Team not found.")

    # `shap` is not imported anywhere in the notebook (the install/import
    # snippet is commented out), so calling this function raised NameError.
    # Import it here and fail with an actionable message if it is missing.
    try:
        import shap
    except ImportError as exc:
        raise ImportError(
            "explain_fixture_shap requires the 'shap' package; "
            "install it in this runtime (e.g. `%pip install shap`)."
        ) from exc

    # Background samples (for DeepExplainer baseline)
    m = min(bg_samples, Xh_train.shape[0])
    bg_idx = np.random.choice(np.arange(Xh_train.shape[0]), m, replace=False)
    bg = [Xh_train[bg_idx], Xa_train[bg_idx]]

    # Build a submodel for only the classification output
    clf_model = Model(
        inputs=model.inputs,
        outputs=model.get_layer('result_out').output
    )

    # Create the SHAP DeepExplainer
    explainer = shap.DeepExplainer(clf_model, bg)

    # Build sequences for the fixture: last N games, zero-padded at the front
    def build_seq(team):
        s = team_sequences[team]
        if s.shape[0] >= N:
            return s[-N:].reshape(1, N, F)
        elif s.shape[0] == 0:
            return np.zeros((1, N, F))
        else:
            pad = np.zeros((N - s.shape[0], s.shape[1]))
            return np.vstack((pad, s)).reshape(1, N, F)

    hs = build_seq(home_team)
    as_ = build_seq(away_team)

    # Compute SHAP values for the classification output only
    shap_values = explainer.shap_values([hs, as_])

    # The indexing below treats shap_values as a list with one entry per class,
    # each holding [home_input_values, away_input_values].
    # NOTE(review): the return layout of DeepExplainer.shap_values differs
    # between shap releases; confirm it matches the installed version.
    shap_mean_home = np.mean([np.abs(sv[0]) for sv in shap_values], axis=0)[0]
    shap_mean_away = np.mean([np.abs(sv[1]) for sv in shap_values], axis=0)[0]

    # Aggregate importance over the time-step axis
    home_feature_importance = np.mean(shap_mean_home, axis=0)
    away_feature_importance = np.mean(shap_mean_away, axis=0)

    # Same order as the statistic columns produced by build_team_sequences
    feature_names = [
        'goals_for','goals_against','shots','opp_shots',
        'sot','opp_sot','corners','opp_corners','fouls','opp_fouls'
    ]

    explanation = {
        'home_feature_importance': dict(zip(feature_names, home_feature_importance.tolist())),
        'away_feature_importance': dict(zip(feature_names, away_feature_importance.tolist()))
    }

    return explanation




# --------------------------
# 9) Example usage
# --------------------------
# teams = sorted(df['HomeTeam'].unique())
# print("\nTeams (sample):", teams[:10])

# example_home, example_away = teams[0], teams[1]
# print(f"\nExample: {example_home} vs {example_away}")
# res = predict_fixture_with_scores(example_home, example_away)
# print("Expected goals:", np.round(res['means'],2))
# print("Model probs:", np.round(res['model_probs'],3))
# print("Simulated win probs:", res['sim_probs'])
# print("Top simulated scores:", res['top_scores'])

# print("\nComputing SHAP explanation (this can take a bit)...")
# shap_expl = explain_fixture_shap(example_home, example_away, bg_samples=80)
# print("Home feature importance:")
# for k,v in shap_expl['home_feature_importance'].items():
#     print(f"  {k}: {v:.6f}")
# print("Away feature importance:")
# for k,v in shap_expl['away_feature_importance'].items():
#     print(f"  {k}: {v:.6f}")





Mounted at /content/drive


  df[date_col] = pd.to_datetime(df[date_col], errors='coerce', infer_datetime_format=True)


Team sequences built for: 46
 Shapes: (9380, 5, 10) (9380, 5, 10) (9380, 3)
Train/Test: 7500 1880


  match_dates_num = match_dates.view("int64") // 10**9


Epoch 1/20
118/118 - 14s - 115ms/step - away_goal_out_loss: 1.3253 - home_goal_out_loss: 1.6856 - loss: 2.5444 - result_out_accuracy: 0.4807 - result_out_loss: 1.0352 - val_away_goal_out_loss: 1.4558 - val_home_goal_out_loss: 1.7773 - val_loss: 2.6342 - val_result_out_accuracy: 0.5261 - val_result_out_loss: 1.0101
Epoch 2/20
118/118 - 2s - 17ms/step - away_goal_out_loss: 1.2275 - home_goal_out_loss: 1.5847 - loss: 2.4204 - result_out_accuracy: 0.5052 - result_out_loss: 1.0136 - val_away_goal_out_loss: 1.4562 - val_home_goal_out_loss: 1.6963 - val_loss: 2.6000 - val_result_out_accuracy: 0.4995 - val_result_out_loss: 1.0183
Epoch 3/20
118/118 - 2s - 18ms/step - away_goal_out_loss: 1.2048 - home_goal_out_loss: 1.5598 - loss: 2.3906 - result_out_accuracy: 0.5148 - result_out_loss: 1.0045 - val_away_goal_out_loss: 1.4459 - val_home_goal_out_loss: 1.7135 - val_loss: 2.6017 - val_result_out_accuracy: 0.5080 - val_result_out_loss: 1.0179
Epoch 4/20
118/118 - 2s - 17ms/step - away_goal_out_loss

In [None]:
# --------------------------
# 9) Interactive example usage
# --------------------------
# (A stray "2" in front of this header comment was removed.)
teams = sorted(df['HomeTeam'].unique())
print("\nTeams:")
for i, t in enumerate(teams):
    print(f"{i}: {t}")

# Let user pick home and away by index
home_idx = int(input("Enter the number for the HOME team: "))
away_idx = int(input("Enter the number for the AWAY team: "))

# Reject out-of-range numbers explicitly: a negative index would otherwise
# silently select a team from the end of the list.
for picked in (home_idx, away_idx):
    if not 0 <= picked < len(teams):
        raise ValueError(
            f"Team number must be between 0 and {len(teams) - 1}, got {picked}"
        )
# A team cannot play itself.
if home_idx == away_idx:
    raise ValueError("HOME and AWAY must be different teams.")

example_home, example_away = teams[home_idx], teams[away_idx]
print(f"\nSelected Fixture: {example_home} vs {example_away}")

# Run prediction
res = predict_fixture_with_scores(example_home, example_away)
print("\nExpected goals:", np.round(res['means'],2))
print("Model probs:", np.round(res['model_probs'],3))
print("Simulated win probabilities:", res['sim_probs'])
print("Top simulated scores:", res['top_scores'])

# # SHAP explanation
# print("\nComputing SHAP explanation (this can take a bit)...")
# shap_expl = explain_fixture_shap(example_home, example_away, bg_samples=80)
# print("\nHome feature importance:")
# for k,v in shap_expl['home_feature_importance'].items():
#     print(f"  {k}: {v:.6f}")
# print("\nAway feature importance:")
# for k,v in shap_expl['away_feature_importance'].items():
#     print(f"  {k}: {v:.6f}")



Teams:
0: Arsenal
1: Aston Villa
2: Birmingham
3: Blackburn
4: Blackpool
5: Bolton
6: Bournemouth
7: Bradford
8: Brentford
9: Brighton
10: Burnley
11: Cardiff
12: Charlton
13: Chelsea
14: Coventry
15: Crystal Palace
16: Derby
17: Everton
18: Fulham
19: Huddersfield
20: Hull
21: Ipswich
22: Leeds
23: Leicester
24: Liverpool
25: Luton
26: Man City
27: Man United
28: Middlesbrough
29: Newcastle
30: Norwich
31: Nott'm Forest
32: Portsmouth
33: QPR
34: Reading
35: Sheffield United
36: Southampton
37: Stoke
38: Sunderland
39: Swansea
40: Tottenham
41: Watford
42: West Brom
43: West Ham
44: Wigan
45: Wolves
Enter the number for the HOME team: 17
Enter the number for the AWAY team: 40

Selected Fixture: Everton vs Tottenham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step

Expected goals: [1.49 1.35]
Model probs: [0.299 0.307 0.394]
Simulated win probabilities: {'home': 0.4282, 'draw': 0.2282, 'away': 0.3436}
Top simulated scores: [((1, 1), 545), ((1, 0), 466), ((2, 1),

In [None]:
from sklearn.metrics import mean_absolute_error

# --------------------------
# 1) Predict on test set
# --------------------------
# One forward pass over the held-out matches; the model returns the class
# probabilities followed by the two goal predictions.
y_pred_result, y_pred_home, y_pred_away = model.predict(x=[Xh_test, Xa_test], verbose=0)

# --------------------------
# 2) Mean absolute error of the predicted goal counts, per side
# --------------------------
mae_home = mean_absolute_error(y_true=hg_test, y_pred=y_pred_home)
mae_away = mean_absolute_error(y_true=ag_test, y_pred=y_pred_away)

print(f"MAE - Home Goals: {mae_home:.3f}")
print(f"MAE - Away Goals: {mae_away:.3f}")


MAE - Home Goals: 1.037
MAE - Away Goals: 0.931
