In [2]:
import pandas as pd
import numpy as np
import math
import pickle
from pathlib import Path
from collections import defaultdict
from math import exp, factorial
from sklearn.linear_model import PoissonRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier


In [3]:
# ------------------------------------------------
# 1. Load dataset
# ------------------------------------------------
results = pd.read_csv("results.csv")
results.columns = results.columns.str.strip()

# Raw dataset had no date column → create synthetic timeline using seasons.
# Every season receives one consecutive calendar day per match (in file order)
# and the next season starts 20 days after the previous one ends.
# The column is created with an explicit datetime64 dtype (instead of an
# object column initialised with None) so sorting and any later date
# arithmetic operate on real timestamps.
results["date"] = pd.Series(pd.NaT, index=results.index, dtype="datetime64[ns]")
start = pd.Timestamp("2019-08-09")

for season in sorted(results["season"].unique()):
    mask = results["season"] == season
    n = int(mask.sum())
    results.loc[mask, "date"] = pd.date_range(start=start, periods=n, freq="D")
    start += pd.Timedelta(days=n + 20)

# Chronological order with a fresh 0..n-1 index; later cells walk the frame
# in row order.
results = results.sort_values("date").reset_index(drop=True)
print("✅ Dataset loaded and timeline created!\n")

# Every club that appears on either side of a fixture, alphabetically.
teams = sorted(pd.concat([results['home_team'], results['away_team']]).unique())


✅ Dataset loaded and timeline created!



In [4]:
# ------------------------------------------------
# 2. Train Poisson to get attack/defence strengths
# ------------------------------------------------
# Long format: each match contributes two rows, one per scoring side, with
# `home` = 1 for the home side's row and 0 for the away side's row.
# Built column-wise instead of appending inside an iterrows() loop.
_home_side = pd.DataFrame({
    'team': results['home_team'],
    'opp': results['away_team'],
    'home': 1,
    'goals': results['home_goals'],
})
_away_side = pd.DataFrame({
    'team': results['away_team'],
    'opp': results['home_team'],
    'home': 0,
    'goals': results['away_goals'],
})
# A stable sort on the shared match index interleaves the rows as
# (home side, away side) per match, i.e. the same row order the loop produced.
_long = (
    pd.concat([_home_side, _away_side])
    .sort_index(kind="stable")
    .reset_index(drop=True)
)

df_g = pd.get_dummies(_long[['team', 'opp', 'home']], columns=['team', 'opp'])
yg = _long['goals'].to_numpy()

poisson = PoissonRegressor(max_iter=300)
poisson.fit(df_g, yg)

# Multiplicative strengths: exp(coefficient) of each one-hot column.
# `team_<name>` columns give the scoring side's factor, `opp_<name>` columns
# give the factor applied when <name> is the opponent.  Teams without a
# fitted column keep the neutral value 1.0.
attack = {t: 1.0 for t in teams}
defense = {t: 1.0 for t in teams}

for col, coef in zip(df_g.columns, poisson.coef_):
    prefix, _, team_name = col.partition("_")
    if prefix == "team":
        attack[team_name] = math.exp(coef)
    elif prefix == "opp":
        defense[team_name] = math.exp(coef)

print("✅ Attack/Defense ratings trained!\n")

✅ Attack/Defense ratings trained!



In [5]:
# ------------------------------------------------
# 3. Compute features from tuned v1
# ------------------------------------------------
# Elo configuration.
HOME_ADV = 65  # rating points added to the home side before a match is scored
K = 30         # step size used by update_elo
# Every team starts from the same rating.
elo = dict.fromkeys(teams, 1500)

def expected(a,b):
    """Return the expected score of a side rated `a` against a side rated `b`.

    Logistic curve in the rating gap with base 10 and a 400-point scale:
    equal ratings give 0.5, and the value rises towards 1 as `a` exceeds `b`.
    """
    rating_gap = b - a
    odds_against = 10 ** (rating_gap / 400)
    return 1 / (1 + odds_against)

def update_elo(Rh, Ra, Sh):
    """Return the post-match ratings ``(home, away)``.

    Rh, Ra -- pre-match ratings of the home and away side, exactly as passed in.
    Sh     -- the home side's score for the match; the away side's score is
              taken as ``1 - Sh``.

    Each side moves by ``K`` times the difference between its score and its
    expected score.
    """
    home_expect = expected(Rh, Ra)
    away_score, away_expect = 1 - Sh, 1 - home_expect
    return Rh + K * (Sh - home_expect), Ra + K * (away_score - away_expect)

# Walk the matches in row order and update both sides' ratings after each one.
# The rating gap is recorded BEFORE the match's own result is applied, so
# EloDiff never contains information from the fixture it describes or from
# later fixtures.  (Computing it after the loop would stamp every row with the
# end-of-dataset ratings.)
pre_match_elo_diff = []

for h, a, outcome in zip(results['home_team'], results['away_team'], results['result']):
    pre_match_elo_diff.append(elo[h] - elo[a])

    # The home side is scored with a temporary HOME_ADV boost, removed again
    # before the rating is stored.
    Rh, Ra = elo[h] + HOME_ADV, elo[a]

    if outcome == 'H': Sh = 1
    elif outcome == 'A': Sh = 0
    else: Sh = 0.5

    new_Rh, new_Ra = update_elo(Rh, Ra, Sh)
    elo[h], elo[a] = new_Rh - HOME_ADV, new_Ra

# Home-minus-away rating gap as it stood at kick-off (home boost not included).
# After the loop `elo` itself holds the ratings following the last match.
results['EloDiff'] = pre_match_elo_diff

# tuned lambda: league normalized + Elo influenced
# NOTE(review): this is the mean of home goals only, yet tuned_lambda uses it
# as the base rate for both sides — confirm that is intended.
league_goal_avg = results["home_goals"].mean()

def tuned_lambda(team, opp, home=False):
    """Return the goal rate for `team` when facing `opp`.

    rate = league_goal_avg * attack[team] * defense[opp]
           * exp((elo[team] - elo[opp]) / 1000) * home_factor

    team, opp -- team names; a name missing from `attack` / `defense` uses the
                 neutral factor 1.0, while a name missing from `elo` raises
                 KeyError.
    home      -- True applies the 1.15 home multiplier, False applies 1.0.

    The result is floored at 0.10.
    """
    att = attack.get(team, 1.0)
    defn = defense.get(opp, 1.0)
    elo_factor = exp((elo[team] - elo[opp]) / 1000)
    home_factor = 1.15 if home else 1.0
    # The home factor scales the rate.  Adding it would lift every rate by a
    # full goal or more and make the 0.10 floor below unreachable.
    lam = league_goal_avg * att * defn * elo_factor * home_factor
    return max(lam, 0.10)

def _team_rolling_mean(frame, team_col, value_col, window, lag=0):
    """Per-team rolling mean of `value_col`, aligned to `frame`'s own index.

    Rows are grouped by `team_col` and averaged over the last `window` rows of
    each group (fewer near the start of a group).  With ``lag=1`` each group is
    shifted by one row first, so a row only sees rows that precede it and the
    first row of every group is NaN.

    `transform` returns the values on the original row index.  The earlier
    ``groupby().rolling().mean().reset_index(drop=True)`` form returned them in
    group order under a fresh 0..n-1 index, which put them on the wrong rows.
    """
    def _roll(values):
        return values.shift(lag).rolling(window, min_periods=1).mean()

    return frame.groupby(team_col)[value_col].transform(_roll)


results["lam_home"] = [
    tuned_lambda(h, a, home=True)
    for h, a in zip(results["home_team"], results["away_team"])
]
results["lam_away"] = [
    tuned_lambda(a, h, home=False)
    for h, a in zip(results["home_team"], results["away_team"])
]
results["xG_diff"]  = results["lam_home"] - results["lam_away"]

# fix rolling form flags
results['home_win_flag'] = (results['result'] == 'H').astype(int)
results['away_win_flag'] = (results['result'] == 'A').astype(int)

# Win rate over the team's previous 5 home (resp. away) matches.  lag=1 keeps
# the current match's own result out of its feature; the resulting NaN on a
# team's first home/away match is filled when the feature matrix is built.
results['form_home'] = _team_rolling_mean(results, 'home_team', 'home_win_flag', 5, lag=1)
results['form_away'] = _team_rolling_mean(results, 'away_team', 'away_win_flag', 5, lag=1)

# avg recent goals (previous 10 home / away matches, current match excluded)
results['goal_avg_home_10'] = _team_rolling_mean(results, 'home_team', 'home_goals', 10, lag=1)
results['goal_avg_away_10'] = _team_rolling_mean(results, 'away_team', 'away_goals', 10, lag=1)

# xG form last 5 approximated
# lam_* is not derived from the row's own result, so the current row stays in
# the window here.
results['xg_form_home_5'] = _team_rolling_mean(results, 'home_team', 'lam_home', 5)
results['xg_form_away_5'] = _team_rolling_mean(results, 'away_team', 'lam_away', 5)

# expected points from lambda comparison
def get_exp_points(lam_h, lam_a):
    """Return 3, 0 or 1 depending on how the two rates compare.

    3 when `lam_h` exceeds `lam_a` by more than 0.3, 0 when `lam_a` exceeds
    `lam_h` by more than 0.3, and 1 in every other case.
    """
    margin = 0.3
    if lam_h > lam_a + margin:
        return 3
    if lam_a > lam_h + margin:
        return 0
    return 1

# One expected-points value per fixture, derived from its two rates.
results['expected_points'] = [
    get_exp_points(lam_h, lam_a)
    for lam_h, lam_a in zip(results['lam_home'], results['lam_away'])
]

# Classifier target: home win → 2, draw → 1, away win → 0.
y_data = results['result'].map(dict(H=2, D=1, A=0))

# The nine model inputs, in the column order the scaler and classifier see;
# any missing value is replaced by 0.33.
X_full = results.loc[:, [
    'xG_diff',
    'form_home',
    'form_away',
    'EloDiff',
    'goal_avg_home_10',
    'goal_avg_away_10',
    'xg_form_home_5',
    'xg_form_away_5',
    'expected_points',
]].fillna(0.33)

In [6]:
# ------------------------------------------------
# 4. Train XGBoost match outcome classifier using meaningful 9 features
# ------------------------------------------------
# Standardise the features; the fitted scaler is kept so the same
# transformation can be applied again later.
scaler = StandardScaler().fit(X_full)
X_scaled = scaler.transform(X_full)

# Three-class soft-probability booster: many shallow trees, small step size,
# light row/column subsampling, fixed seed.
xgb = XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    max_depth=3,
    n_estimators=350,
    learning_rate=0.035,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
)
xgb.fit(X_scaled, y_data)
print("✅ XGBoost outcome classifier trained!\n")


✅ XGBoost outcome classifier trained!



In [7]:
# ------------------------------------------------
# 5. Save all model artifacts for Streamlit
# ------------------------------------------------
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

# File name → object to pickle.  The scaler is stored together with the
# feature column order it was fitted on.
artifacts = {
    "attack.pkl": attack,
    "defense.pkl": defense,
    "elo_ratings.pkl": elo,
    "xgboost_model.pkl": xgb,
    "feature_scaler.pkl": {"scaler": scaler, "cols": list(X_full.columns)},
}

for filename, obj in artifacts.items():
    # `with` closes (and therefore flushes) each file deterministically;
    # passing a bare open(...) to pickle.dump leaves the handle open.
    with open(models_dir / filename, "wb") as fh:
        pickle.dump(obj, fh)

print("✅ All models & ratings saved in `/models` folder!")

✅ All models & ratings saved in `/models` folder!
