In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import PoissonRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
# Workbook holding the league data; only its "Base" sheet is read here.
main_path = "League Table (2026).xlsx"
base = pd.read_excel(io=main_path, sheet_name="Base")

def norm(s):
    """Normalise a label (used in this notebook for team names).

    The value is coerced to ``str``, surrounding whitespace is removed, the
    text is lower-cased, and every space character becomes an underscore.
    Consecutive spaces are not collapsed: each one yields its own underscore.
    """
    text = str(s)
    trimmed = text.strip()
    lowered = trimmed.lower()
    return lowered.replace(" ", "_")

# Put both team columns through norm() so the same team always carries the
# same label regardless of case or spacing in the workbook.
for col in ("home_team", "away_team"):
    base[col] = base[col].map(norm)

# Regulation only (so ties mean something): keep rows whose "max went_ot"
# value equals 0. The copy keeps later edits to `train` from touching `base`.
train = base.loc[base["max went_ot"].eq(0)].copy()

# Build the long-format training set from `train`: every match contributes two
# rows, one per scoring side. `team_for` is the side whose goals are counted,
# `team_against` is its opponent, and `is_home` flags the home side's row.
home_df = pd.DataFrame({
    "goals": train["sum home_goals"].astype(int),
    "team_for": train["home_team"],
    "team_against": train["away_team"],
    "is_home": 1,
})
away_df = pd.DataFrame({
    "goals": train["sum away_goals"].astype(int),
    "team_for": train["away_team"],
    "team_against": train["home_team"],
    "is_home": 0,
})
long = pd.concat([home_df, away_df], ignore_index=True)

# Team labels were already passed through norm() on `base` before `train` was
# derived from it, so a second normalisation pass over `long` is unnecessary.

# Sanity check on the reshape: exactly two rows per training match.
assert len(long) == 2 * len(train), "long-format frame must have two rows per match"

# Model inputs (two categorical team columns + home indicator) and target.
X_raw = long[["team_for", "team_against", "is_home"]]
y = long["goals"].values

# Feature preparation: one-hot encode the two team columns and pass the home
# indicator through unchanged. With handle_unknown="ignore", a label not seen
# during fit is encoded as all zeros at predict time instead of raising.
pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["team_for", "team_against"]),
    ("num", "passthrough", ["is_home"]),
])

# Poisson regression on the encoded features; Pipeline.fit returns the fitted
# pipeline itself, so `model` is ready for predict() after this statement.
model = Pipeline([
    ("pre", pre),
    ("pois", PoissonRegressor(alpha=0.5, max_iter=5000)),
]).fit(X_raw, y)

def predict_lambdas(home_team, away_team):
    """Return the fitted model's expected goals for a fixture.

    Parameters
    ----------
    home_team, away_team : str
        Team names. Both are passed through ``norm`` so that they match the
        labels the model was trained on (e.g. "UAE" and "uae" are equivalent).

    Returns
    -------
    tuple[float, float]
        ``(lambda_home, lambda_away)``: the predicted Poisson means for the
        home side (``is_home=1``) and the away side (``is_home=0``).

    Warns
    -----
    UserWarning
        If a team is not among the categories the encoder saw during fit.
        The encoder uses ``handle_unknown="ignore"``, so such a team is
        encoded as all zeros and the prediction would otherwise silently
        lose that team's effect.
    """
    import warnings

    home_key = norm(home_team)
    away_key = norm(away_team)

    # Each team is used once as `team_for` and once as `team_against`, so it
    # must be known to the encoder in both columns.
    encoder = model.named_steps["pre"].named_transformers_["cat"]
    known_for, known_against = (set(cats) for cats in encoder.categories_)
    for key in (home_key, away_key):
        if key not in known_for or key not in known_against:
            warnings.warn(
                f"team {key!r} was not seen during training; "
                "its effect is ignored in the prediction",
                UserWarning,
                stacklevel=2,
            )

    # Score both perspectives of the fixture in a single predict() call.
    fixture = pd.DataFrame([
        {"team_for": home_key, "team_against": away_key, "is_home": 1},
        {"team_for": away_key, "team_against": home_key, "is_home": 0},
    ])
    lam_home, lam_away = model.predict(fixture)
    return float(lam_home), float(lam_away)

def win_prob_home(lam_h, lam_a, rng, sims=100_000, p_ot=0.5):
    """Monte Carlo estimate of the home side's probability of winning.

    Regulation scores are drawn as independent Poisson counts with means
    ``lam_h`` (home) and ``lam_a`` (away). Simulations that are level after
    regulation are settled by an overtime draw which the home side wins with
    probability ``p_ot``.

    Parameters
    ----------
    lam_h, lam_a : float
        Poisson means for the home and away goal counts.
    rng : numpy.random.Generator
        Source of randomness; its state is advanced by this call.
    sims : int
        Number of simulated matches.
    p_ot : float
        Probability that the home side wins a match that goes to overtime.

    Returns
    -------
    Fraction of simulated matches won by the home side, in regulation or
    overtime (every simulated match has a winner).
    """
    home_goals = rng.poisson(lam_h, size=sims)
    away_goals = rng.poisson(lam_a, size=sims)

    n_reg_wins = (home_goals > away_goals).sum()
    n_level = (home_goals == away_goals).sum()

    # One uniform draw per level match decides the overtime winner.
    n_ot_wins = (rng.random(n_level) < p_ot).sum()

    return (n_reg_wins + n_ot_wins) / sims


# Shared random generator for the simulations, created with a fixed seed.
rng = np.random.default_rng(seed=0)

In [56]:
# Fixture to evaluate, given in the lower-case/underscore form produced by norm().
home = "uae"
away = "indonesia"

# The fitted model is deterministic, so the expected-goal rates are computed
# once and bound to both pairs of names used below.
lam_h, lam_a = predict_lambdas(home, away)
home_x, away_x = lam_h, lam_a

# First Monte Carlo estimate of the home side's win probability.
win_prob = win_prob_home(home_x, away_x, rng)
print(f"{home} vs {away}, win prob: {win_prob}")

print("lambdas:", lam_h, lam_a)
# Second, independent estimate from the same lambdas: the shared `rng` has
# advanced since the first call, so the two values differ by simulation noise.
print("win prob:", win_prob_home(lam_h, lam_a, rng))


uae vs indonesia, win prob: 0.52578
lambdas: 2.8349218440860313 2.6716472010877785
win prob: 0.52838
