In [31]:
import numpy as np
import pandas as pd
from typing import Tuple, Optional, Iterable, Dict, List

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, accuracy_score


In [32]:
def compute_pi_ratings(
    leagues: pd.DataFrame,
    c_scale: float = 800.0,
    mu1: float = 0.05,
    mu2: float = 0.5,
    date_col: str = "date",
    home_col: str = "home_team",
    away_col: str = "away_team",
    hg_col: str = "home_goals",
    ag_col: str = "away_goals",
    score_valid_col: Optional[str] = "is_valid_score",
    sort_by: Iterable[str] = ("date", "league", "season"),
    today: Optional[pd.Timestamp] = None,
) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """
    Compute PRE/POST PI ratings per match. Only valid, nonâ€‘future rows update ratings.
    """
    def exp_goal_diff(c: float, hr: float, ar: float) -> float:
        if not (c > 0):
            raise ValueError("c must be > 0")
        return np.sinh(hr / c) - np.sinh(ar / c)

    def update_ratings(
        wehome: float, weaway: float,
        hrhome: float, hraway: float,
        arhome: float, araway: float,
        mu1_local: float, mu2_local: float
    ) -> Tuple[float, float, float, float]:
        dh = wehome * mu1_local
        da = weaway * mu1_local
        hrhome_new = hrhome + dh
        hraway_new = hraway - mu2_local * dh
        araway_new = araway + da
        arhome_new = arhome - mu2_local * da
        return hrhome_new, hraway_new, arhome_new, araway_new

    required = {home_col, away_col, hg_col, ag_col}
    missing = required - set(leagues.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")

    df = leagues.copy()
    sort_cols = [c for c in sort_by if c in df.columns] if sort_by else []
    df = df.sort_values(sort_cols).reset_index(drop=True) if sort_cols else df.reset_index(drop=True)

    # Init team dict
    teams = pd.unique(pd.concat([df[home_col], df[away_col]]).dropna())
    home_key = "Home {}"
    away_key = "Away {}"
    pi_dictionary: Dict[str, float] = {home_key.format(t): 0.0 for t in teams}
    pi_dictionary.update({away_key.format(t): 0.0 for t in teams})

    # Ensure output cols
    pre_cols = ["Home Home Rating", "Home Away Rating", "Away Home Rating", "Away Away Rating"]
    post_cols = ["Home Home Rating Post", "Home Away Rating Post", "Away Home Rating Post", "Away Away Rating Post"]
    aux_cols = ["Expected GD", "Obs GD", "Err (Obs-Exp)"]
    for col in pre_cols + post_cols + aux_cols:
        if col not in df.columns:
            df[col] = np.nan

    # Today boundary
    if today is None:
        today = pd.Timestamp("today").normalize()

    # Future mask
    if date_col in df.columns:
        dates = pd.to_datetime(df[date_col], errors="coerce")
        try:
            dates = dates.dt.tz_convert(None)
        except Exception:
            pass
        is_future = dates.dt.normalize() > today
    else:
        is_future = pd.Series(False, index=df.index)

    # Valid scored matches
    has_scores = df[hg_col].notna() & df[ag_col].notna()
    if score_valid_col and score_valid_col in df.columns:
        has_scores &= df[score_valid_col].fillna(False)
    valid_mask = has_scores & (~is_future)

    # Iterate chronologically
    for i, row in df.loc[valid_mask].iterrows():
        home = row[home_col]
        away = row[away_col]
        home_score = float(row[hg_col])
        away_score = float(row[ag_col])

        h_hr = pi_dictionary[home_key.format(home)]
        h_ar = pi_dictionary[away_key.format(home)]
        a_hr = pi_dictionary[home_key.format(away)]
        a_ar = pi_dictionary[away_key.format(away)]

        # PRE
        df.at[i, "Home Home Rating"] = h_hr
        df.at[i, "Home Away Rating"] = h_ar
        df.at[i, "Away Home Rating"] = a_hr
        df.at[i, "Away Away Rating"] = a_ar

        # Error
        obs_gd = home_score - away_score
        exp_gd = exp_goal_diff(c_scale, h_hr, a_ar)
        err = obs_gd - exp_gd
        df.at[i, "Expected GD"] = exp_gd
        df.at[i, "Obs GD"] = obs_gd
        df.at[i, "Err (Obs-Exp)"] = err

        # Symmetric weighted update
        if np.isclose(err, 0.0):
            wehome = 0.0
            weaway = 0.0
        else:
            mag = np.log1p(abs(err))
            sgn = np.sign(err)
            wehome = sgn * mag
            weaway = -wehome
        wehome *= mu1
        weaway *= mu1

        h_hr_new, h_ar_new, a_hr_new, a_ar_new = update_ratings(
            wehome, weaway, h_hr, h_ar, a_hr, a_ar, 1.0, mu2
        )

        # POST
        df.at[i, "Home Home Rating Post"] = h_hr_new
        df.at[i, "Home Away Rating Post"] = h_ar_new
        df.at[i, "Away Home Rating Post"] = a_hr_new
        df.at[i, "Away Away Rating Post"] = a_ar_new

        # Commit
        pi_dictionary[home_key.format(home)] = h_hr_new
        pi_dictionary[away_key.format(home)] = h_ar_new
        pi_dictionary[home_key.format(away)] = a_hr_new
        pi_dictionary[away_key.format(away)] = a_ar_new

    return df, pi_dictionary


def make_labels(df: pd.DataFrame) -> pd.Series:
    """0=Away win, 1=Draw, 2=Home win."""
    res = np.sign(df["home_goals"].values - df["away_goals"].values)
    return pd.Series(res, index=df.index).map({-1: 0, 0: 1, 1: 2})

In [34]:

# Cell 3: xG rolling features (last 3, past-only)
def add_xg_last3_features(
    df: pd.DataFrame,
    date_col: str = "date",
    home_col: str = "home_team",
    away_col: str = "away_team",
    home_xg_col: str = "home_xg",
    away_xg_col: str = "away_xg",
) -> pd.DataFrame:
    """
    Adds per-team last-3 average xG scored and conceded (past-only via shift):
      home_xg3_for, home_xg3_against, away_xg3_for, away_xg3_against
    Only uses historic xG columns; requires xG present in the source rows used in the rolling window.
    """
    out = df.copy()
    # Normalize date
    if date_col in out.columns:
        dates = pd.to_datetime(out[date_col], errors="coerce")
        try:
            dates = dates.dt.tz_convert(None)
        except Exception:
            pass
        out[date_col] = dates

    # Long team timeline with xG for/against
    home_part = out[[date_col, home_col, home_xg_col, away_xg_col]].rename(
        columns={home_col: "team", home_xg_col: "xg_for", away_xg_col: "xg_against"}
    )
    away_part = out[[date_col, away_col, home_xg_col, away_xg_col]].rename(
        columns={away_col: "team", away_xg_col: "xg_for", home_xg_col: "xg_against"}
    )
    team_games = pd.concat([home_part, away_part], axis=0, ignore_index=True)
    team_games = team_games.sort_values([date_col, "team"]).reset_index(drop=True)

    # Past-only rolling means (window=3, requires >=3 prior games with xG)
    def _roll3(s: pd.Series) -> pd.Series:
        return s.shift(1).rolling(window=3, min_periods=3).mean()

    team_games["xg3_for"] = team_games.groupby("team", group_keys=False)["xg_for"].apply(_roll3)
    team_games["xg3_against"] = team_games.groupby("team", group_keys=False)["xg_against"].apply(_roll3)

    # Merge back for home/away teams at match date
    # Home features
    home_feats = team_games[[date_col, "team", "xg3_for", "xg3_against"]].rename(
        columns={
            "team": home_col,
            "xg3_for": "home_xg3_for",
            "xg3_against": "home_xg3_against",
        }
    )
    out = out.merge(home_feats, on=[date_col, home_col], how="left")

    # Away features
    away_feats = team_games[[date_col, "team", "xg3_for", "xg3_against"]].rename(
        columns={
            "team": away_col,
            "xg3_for": "away_xg3_for",
            "xg3_against": "away_xg3_against",
        }
    )
    out = out.merge(away_feats, on=[date_col, away_col], how="left")

    return out


In [18]:
# load pickle file

# Cell 4: dataset builders with xG features
def build_Xy_with_xg(df_with_pi: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """
    Build training data using PRE PI + per-team last-3 xG features.
    Trains only on rows where:
      - Scores are present and valid.
      - home_xg and away_xg exist.
      - Both teams have >=3 prior games with xG (rolling features not null).
    """
    df_feat = add_xg_last3_features(df_with_pi)

    # Base masks
    scored = df_feat["home_goals"].notna() & df_feat["away_goals"].notna()
    if "is_valid_score" in df_feat.columns:
        scored &= df_feat["is_valid_score"].fillna(False)

    has_xg = df_feat["home_xg"].notna() & df_feat["away_xg"].notna()

    # PRE PI present
    has_pi = df_feat["Home Home Rating"].notna() & df_feat["Away Away Rating"].notna()

    # Last-3 xG present for both teams
    has_xg3 = (
        df_feat["home_xg3_for"].notna()
        & df_feat["home_xg3_against"].notna()
        & df_feat["away_xg3_for"].notna()
        & df_feat["away_xg3_against"].notna()
    )

    mask = scored & has_xg & has_pi & has_xg3

    feat_cols = [
        "Home Home Rating",
        "Away Away Rating",
        "home_xg3_for",
        "home_xg3_against",
        "away_xg3_for",
        "away_xg3_against",
    ]
    X = df_feat.loc[mask, feat_cols].copy()
    y = make_labels(df_feat.loc[mask])
    return X, y


def ts_cv_score_logit(X: pd.DataFrame, y: pd.Series, n_splits: int = 5) -> Tuple[float, float]:
    """
    TimeSeriesSplit evaluation returning mean log loss and mean accuracy.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    losses: List[float] = []
    accs: List[float] = []
    for tr, te in tscv.split(X):
        pipe = make_pipeline(
            StandardScaler(),
            LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=2000)
        )
        pipe.fit(X.iloc[tr], y.iloc[tr])
        y_proba = pipe.predict_proba(X.iloc[te])
        y_pred = np.argmax(y_proba, axis=1)
        losses.append(log_loss(y.iloc[te], y_proba, labels=[0, 1, 2]))
        accs.append(accuracy_score(y.iloc[te], y_pred))
    return float(np.mean(losses)), float(np.mean(accs))


In [19]:

# Cell 5: tuning with xG features
def tune_pi_for_logit(
    leagues: pd.DataFrame,
    c_grid: Iterable[float],
    mu1_grid: Iterable[float],
    mu2_grid: Iterable[float],
    n_splits: int = 5,
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Grid-search over (c_scale, mu1, mu2) using PRE PI + xG last-3 features.
    Selects by mean log loss (lower is better).
    """
    rows: List[Dict] = []
    best = {"c_scale": None, "mu1": None, "mu2": None, "mean_log_loss": np.inf, "mean_acc": 0.0}

    for c in c_grid:
        for m1 in mu1_grid:
            for m2 in mu2_grid:
                df_pi, _ = compute_pi_ratings(
                    leagues,
                    c_scale=c,
                    mu1=m1,
                    mu2=m2,
                    sort_by=("date", "league", "season"),
                )
                X, y = build_Xy_with_xg(df_pi)
                # Need enough samples
                if len(X) < max(60, 2 * n_splits):
                    continue

                mean_ll, mean_acc = ts_cv_score_logit(X, y, n_splits=n_splits)
                rows.append({"c_scale": c, "mu1": m1, "mu2": m2,
                             "mean_log_loss": mean_ll, "mean_acc": mean_acc})

                if mean_ll < best["mean_log_loss"]:
                    best = {"c_scale": c, "mu1": m1, "mu2": m2,
                            "mean_log_loss": mean_ll, "mean_acc": mean_acc}

    results = pd.DataFrame(rows).sort_values(
        ["mean_log_loss", "mean_acc"], ascending=[True, False]
    ).reset_index(drop=True)
    return best, results

In [13]:

# Cell 6: final training pipeline (returns 7 items for convenient unpacking)
def train_logit_pipeline(
    leagues: pd.DataFrame,
    c_grid: Iterable[float],
    mu1_grid: Iterable[float],
    mu2_grid: Iterable[float],
    n_splits: int = 5,
):
    """
    Orchestrate tuning + final fit with xG features and return:
      final_model, best_params, cv_results, df_with_pi_and_xg, final_pi_dict, X, y
    """
    best_params, cv_results = tune_pi_for_logit(leagues, c_grid, mu1_grid, mu2_grid, n_splits=n_splits)

    # Recompute PI with best params
    df_with_pi, final_pi_dict = compute_pi_ratings(
        leagues,
        c_scale=best_params["c_scale"],
        mu1=best_params["mu1"],
        mu2=best_params["mu2"],
        sort_by=("date", "league", "season"),
    )

    # Build features and labels
    X, y = build_Xy_with_xg(df_with_pi)

    # Final multinomial logit
    final_model = make_pipeline(
        StandardScaler(),
        LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=2000)
    )
    final_model.fit(X, y)

    # Return df augmented with xG 3-game features for convenience
    df_with_pi_and_xg = add_xg_last3_features(df_with_pi)
    return final_model, best_params, cv_results, df_with_pi_and_xg, final_pi_dict, X, y



In [35]:

# Cell 7: helpers for predictions with xG features
def _get_classes(model) -> np.ndarray:
    if hasattr(model, "classes_"):
        return np.asarray(model.classes_)
    if hasattr(model, "steps") and len(model.steps) and hasattr(model.steps[-1][1], "classes_"):
        return np.asarray(model.steps[-1][1].classes_)
    return np.array([0, 1, 2])


def predict_next_game_probs_for_each_team(
    leagues: pd.DataFrame,
    model,
    c_scale: float,
    mu1: float,
    mu2: float,
    date_col: str = "date",
    home_col: str = "home_team",
    away_col: str = "away_team",
    sort_by: Iterable[str] = ("date", "league", "season"),
    today: Optional[pd.Timestamp] = None,
) -> pd.DataFrame:
    """
    For each team, pick its next scheduled match and return probabilities using:
      PRE PI + last-3 xG scored/conceded. Uses only past xG (shifted).
    Rows without 3 prior xG matches for both teams are skipped.
    """
    # Ratings up to today (no future updates)
    df_pi, pi_dict = compute_pi_ratings(
        leagues,
        c_scale=c_scale, mu1=mu1, mu2=mu2,
        date_col=date_col, home_col=home_col, away_col=away_col,
        sort_by=sort_by, today=today
    )
    df_feat = add_xg_last3_features(df_pi, date_col=date_col, home_col=home_col, away_col=away_col)

    if today is None:
        today = pd.Timestamp("today").normalize()

    dates = pd.to_datetime(df_feat[date_col], errors="coerce")
    try:
        dates = dates.dt.tz_convert(None)
    except Exception:
        pass
    is_future = dates.dt.normalize() > today
    future = df_feat.loc[is_future].copy()
    if future.empty:
        return pd.DataFrame(columns=[date_col, home_col, away_col, "p_away", "p_draw", "p_home"])

    order_cols = [date_col] + [c for c in ["league", "season"] if c in future.columns]
    future = future.sort_values(order_cols)

    # One next match per team
    teams = pd.unique(pd.concat([df_feat[home_col], df_feat[away_col]]).dropna())
    next_idx: List[int] = []
    for t in teams:
        rows = future[(future[home_col] == t) | (future[away_col] == t)]
        if not rows.empty:
            next_idx.append(rows.iloc[0].name)

    fut_subset = df_feat.loc[pd.unique(next_idx)].sort_values(order_cols)

    # Features: fill PRE PI via dict (future rows), xG last-3 must exist
    X_pred = pd.DataFrame({
        "Home Home Rating": [pi_dict.get(f"Home {h}", 0.0) for h in fut_subset[home_col]],
        "Away Away Rating": [pi_dict.get(f"Away {a}", 0.0) for a in fut_subset[away_col]],
        "home_xg3_for": fut_subset["home_xg3_for"].values,
        "home_xg3_against": fut_subset["home_xg3_against"].values,
        "away_xg3_for": fut_subset["away_xg3_for"].values,
        "away_xg3_against": fut_subset["away_xg3_against"].values,
    }, index=fut_subset.index)

    # Require xG last-3 present
    ok = X_pred.notna().all(axis=1)
    X_pred = X_pred.loc[ok]
    fut_subset = fut_subset.loc[ok]
    if X_pred.empty:
        return pd.DataFrame(columns=[date_col, home_col, away_col, "p_away", "p_draw", "p_home"])

    proba = model.predict_proba(X_pred)
    classes = _get_classes(model)

    def _col_for(label: int) -> int:
        loc = np.where(classes == label)[0]
        return int(loc[0]) if len(loc) else label

    out = fut_subset[[c for c in [date_col, "league", "season", home_col, away_col] if c in fut_subset.columns]].copy()
    out["p_away"] = proba[:, _col_for(0)]
    out["p_draw"] = proba[:, _col_for(1)]
    out["p_home"] = proba[:, _col_for(2)]
    return out.sort_values(order_cols).reset_index(drop=True)


def add_probabilities_for_all_games(
    leagues: pd.DataFrame,
    model,
    c_scale: float,
    mu1: float,
    mu2: float,
    date_col: str = "date",
    home_col: str = "home_team",
    away_col: str = "away_team",
    sort_by: Iterable[str] = ("date", "league", "season"),
    today: Optional[pd.Timestamp] = None,
) -> pd.DataFrame:
    """
    Add p_away, p_draw, p_home for all rows where features exist:
      PRE PI (future PRE filled from dict) + last-3 xG scored/conceded for both teams.
    Rows without last-3 xG for both teams are left without probabilities.
    """
    df_pi, pi_dict = compute_pi_ratings(
        leagues,
        c_scale=c_scale, mu1=mu1, mu2=mu2,
        date_col=date_col, home_col=home_col, away_col=away_col,
        sort_by=sort_by, today=today
    )
    df_feat = add_xg_last3_features(df_pi, date_col=date_col, home_col=home_col, away_col=away_col)

    # Build features for all rows; fill PRE PI via dict if missing
    hh = df_feat["Home Home Rating"].copy()
    aa = df_feat["Away Away Rating"].copy()
    hh_fallback = df_feat[home_col].map(lambda h: pi_dict.get(f"Home {h}", 0.0))
    aa_fallback = df_feat[away_col].map(lambda a: pi_dict.get(f"Away {a}", 0.0))

    X_pred = pd.DataFrame({
        "Home Home Rating": hh.fillna(hh_fallback),
        "Away Away Rating": aa.fillna(aa_fallback),
        "home_xg3_for": df_feat["home_xg3_for"],
        "home_xg3_against": df_feat["home_xg3_against"],
        "away_xg3_for": df_feat["away_xg3_for"],
        "away_xg3_against": df_feat["away_xg3_against"],
    }, index=df_feat.index)

    ok = X_pred.notna().all(axis=1)
    proba = np.full((len(df_feat), 3), np.nan, dtype=float)
    if ok.any():
        proba_ok = model.predict_proba(X_pred.loc[ok])
        classes = _get_classes(model)

        def _col_for(label: int) -> int:
            loc = np.where(classes == label)[0]
            return int(loc[0]) if len(loc) else label

        proba[ok, _col_for(0)] = proba_ok[:, _col_for(0)]
        proba[ok, _col_for(1)] = proba_ok[:, _col_for(1)]
        proba[ok, _col_for(2)] = proba_ok[:, _col_for(2)]

    df_out = df_feat.copy()
    df_out["p_away"] = proba[:, 0]
    df_out["p_draw"] = proba[:, 1]
    df_out["p_home"] = proba[:, 2]

    order_cols = [c for c in (date_col, "league", "season") if c in df_out.columns]
    if order_cols:
        df_out = df_out.sort_values(order_cols).reset_index(drop=True)
    return df_out


In [38]:
leagues = pd.read_pickle('/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/top5_leagues_schedule.pkl')
pi_rating_grid = {
     'K': [600.0, 700.0, 800.0, 900.0, 1000.0],
     'decay': [0.02, 0.05, 0.08],
     'home_adv': [0.3, 0.5, 0.7]
 }
final_model, best_params, cv_results, df_with_pi_and_xg, final_pi_dict, X, y = train_logit_pipeline(
     leagues,
     c_grid=pi_rating_grid['K'],
     mu1_grid=pi_rating_grid['decay'],
     mu2_grid=pi_rating_grid['home_adv'],
     n_splits=5
 )




In [39]:
all_probs = add_probabilities_for_all_games(
     leagues,
     model=final_model,
     c_scale=best_params["c_scale"],
     mu1=best_params["mu1"],
     mu2=best_params["mu2"],
 )
 next_probs = predict_next_game_probs_for_each_team(
     leagues,
     model=final_model,
     c_scale=best_params["c_scale"],
     mu1=best_params["mu1"],
     mu2=best_params["mu2"],
 )

IndentationError: unexpected indent (999312248.py, line 8)

In [None]:
e