In [1]:
# If first time in this environment:
!pip install -q MLB-StatsAPI pandas numpy

import math
from dataclasses import dataclass
from typing import Dict, Tuple, Optional

import numpy as np
import pandas as pd
import statsapi

In [2]:
# Logistic and container
_SIGMOID = lambda x: 1.0 / (1.0 + np.exp(-x))

@dataclass
class DivisionStrengths:
    thetas: Dict[str, float]  # division -> theta (mean-centered)
    home_field: float         # league-wide home-field (log-odds)
    converged: bool
    iters: int

def _prepare_interdivision(df: pd.DataFrame) -> pd.DataFrame:
    req_cols = {"home_team","away_team","home_division","away_division","home_score","away_score","date"}
    missing = req_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")
    x = df.copy()
    x["date"] = pd.to_datetime(x["date"])
    x = x.loc[x["home_division"] != x["away_division"]].copy()
    x["home_win"] = (x["home_score"] > x["away_score"]).astype(int)
    return x

def fit_division_strengths(
    df: pd.DataFrame,
    max_iter: int = 1200,
    tol: float = 1e-7,
    l2: float = 0.1,
    verbose: bool = False,
) -> DivisionStrengths:
    """
    Fit Bradley–Terry strengths for divisions + a single home-field parameter,
    using only inter-division games in df.
    """
    games = _prepare_interdivision(df)
    if games.empty:
        return DivisionStrengths(thetas={}, home_field=0.0, converged=True, iters=0)

    divisions: np.ndarray = np.unique(pd.concat([games["home_division"], games["away_division"]], ignore_index=True).values)
    D = len(divisions)
    div_index = {d:i for i,d in enumerate(divisions)}

    i_home = games["home_division"].map(div_index).to_numpy()
    i_away = games["away_division"].map(div_index).to_numpy()
    y = games["home_win"].to_numpy().astype(float)

    theta = np.zeros(D, dtype=float)
    h = 0.0
    g2_theta = np.zeros_like(theta)
    g2_h = 0.0
    base_lr = 0.2

    def ll_and_grad(theta, h):
        eta = theta[i_home] - theta[i_away] + h
        p = _SIGMOID(eta)
        eps = 1e-12
        ll = np.sum(y*np.log(p+eps) + (1-y)*np.log(1-p+eps)) - 0.5*l2*np.sum(theta**2)
        diff = (y - p)
        g_theta = np.zeros_like(theta)
        np.add.at(g_theta, i_home, diff)
        np.add.at(g_theta, i_away, -diff)
        g_theta -= l2*theta
        g_h = np.sum(diff)
        return ll, g_theta, g_h

    prev_ll = -np.inf
    converged = False
    iters = 0
    for t in range(1, max_iter+1):
        ll, g_theta, g_h = ll_and_grad(theta, h)
        g2_theta += g_theta**2
        g2_h += g_h**2
        theta += (base_lr / (np.sqrt(g2_theta) + 1e-8)) * g_theta
        h     += (base_lr / (np.sqrt(g2_h)    + 1e-8)) * g_h
        theta -= np.mean(theta)  # identifiability

        if abs(ll - prev_ll) < tol:
            converged, iters = True, t
            break
        prev_ll, iters = ll, t

    thetas = {d: float(theta[div_index[d]]) for d in divisions}
    return DivisionStrengths(thetas=thetas, home_field=float(h), converged=converged, iters=iters)

def predict_home_win_prob(home_div: str, away_div: str, strengths: DivisionStrengths) -> float:
    a = strengths.thetas.get(home_div, 0.0)
    b = strengths.thetas.get(away_div, 0.0)
    h = strengths.home_field
    return float(_SIGMOID(a - b + h))


In [3]:
def _fetch_team_divisions(season: int) -> pd.DataFrame:
    teams = statsapi.get('teams', {'season': season, 'sportId': 1})['teams']
    rows = []
    for t in teams:
        rows.append({
            'team_id': t['id'],
            'team_name': t['name'],
            'abbrev': t.get('abbreviation') or t.get('teamCode'),
            'division_id': (t.get('division') or {}).get('id'),
            'division_name': (t.get('division') or {}).get('name'),
        })
    return pd.DataFrame(rows)

def fetch_mlb_scores_with_divisions(season: int, include_postseason: bool = True) -> pd.DataFrame:
    team_lu = _fetch_team_divisions(season)
    id_to_div = dict(zip(team_lu.team_id, team_lu.division_name))
    id_to_name = dict(zip(team_lu.team_id, team_lu.team_name))

    sched = statsapi.get('schedule', {'sportId': 1, 'season': season})
    rows = []
    for d in sched.get('dates', []):
        for g in d.get('games', []):
            game_type = g.get('gameType')  # 'R','S','F','D','L','W','E', etc.
            is_post = game_type not in ('S', 'R')
            if not include_postseason and is_post:
                continue

            status = g.get('status', {}).get('abstractGameState')
            detailed = g.get('status', {}).get('detailedState', '')
            if status != 'Final' and 'Final' not in detailed:
                continue

            home = g['teams']['home']; away = g['teams']['away']
            home_id = home['team']['id']; away_id = away['team']['id']

            rows.append({
                'date': g['gameDate'],  # UTC ISO string
                'season_type': ('Postseason' if is_post else 'Regular'),
                'game_pk': g['gamePk'],
                'home_team': id_to_name.get(home_id, home['team']['name']),
                'away_team': id_to_name.get(away_id, away['team']['name']),
                'home_division': id_to_div.get(home_id),
                'away_division': id_to_div.get(away_id),
                'home_score': home.get('score'),
                'away_score': away.get('score'),
                'venue': (g.get('venue') or {}).get('name'),
            })

    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['date'])
    # make ordering stable
    try:
        df['date'] = df['date'].dt.tz_convert(None)  # drop tz if present
    except Exception:
        pass
    df = df.dropna(subset=['home_score','away_score']).astype({'home_score': int, 'away_score': int})
    df = df.sort_values(['date','game_pk']).reset_index(drop=True)
    return df

season = 2024
games_2024 = fetch_mlb_scores_with_divisions(season=season, include_postseason=True)
print(games_2024.head(3))
print("Total final games:", len(games_2024), "Dates:", games_2024['date'].min(), "→", games_2024['date'].max())


                 date season_type  game_pk         home_team  \
0 2024-02-22 20:10:00     Regular   748266  San Diego Padres   
1 2024-02-23 18:05:00  Postseason   748344    Boston Red Sox   
2 2024-02-23 20:05:00     Regular   748263     Texas Rangers   

              away_team         home_division            away_division  \
0   Los Angeles Dodgers  National League West     National League West   
1  Northeastern Huskies  American League East                     None   
2    Kansas City Royals  American League West  American League Central   

   home_score  away_score             venue  
0           1          14    Peoria Stadium  
1           7           2      JetBlue Park  
2           5           4  Surprise Stadium  
Total final games: 2943 Dates: 2024-02-22 20:10:00 → 2024-10-31 00:08:00


In [6]:
# --- PATCH: robust handling of missing divisions ---

def _prepare_interdivision(df: pd.DataFrame) -> pd.DataFrame:
    """Validate, keep inter-division games, drop rows with missing divisions, add home_win."""
    req_cols = {"home_team","away_team","home_division","away_division","home_score","away_score","date"}
    missing = req_cols - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {sorted(missing)}")
    x = df.copy()
    x["date"] = pd.to_datetime(x["date"])
    # Drop games where either division is missing (e.g., All-Star games)
    x = x.dropna(subset=["home_division","away_division"])
    # Only inter-division games
    x = x.loc[x["home_division"] != x["away_division"]].copy()
    # Outcome from home perspective
    x["home_win"] = (x["home_score"] > x["away_score"]).astype(int)
    return x

def fetch_mlb_scores_with_divisions(season: int, include_postseason: bool = True) -> pd.DataFrame:
    import statsapi, pandas as pd
    teams = statsapi.get('teams', {'season': season, 'sportId': 1})['teams']
    id_to_div = {t['id']: (t.get('division') or {}).get('name') for t in teams}
    id_to_name = {t['id']: t['name'] for t in teams}

    sched = statsapi.get('schedule', {'sportId': 1, 'season': season})
    rows = []
    for d in sched.get('dates', []):
        for g in d.get('games', []):
            gt = g.get('gameType')  # 'R', 'S', 'A', etc.
            is_post = gt not in ('S', 'R')
            if not include_postseason and is_post:
                continue
            status = g.get('status', {}).get('abstractGameState')
            detailed = g.get('status', {}).get('detailedState', '')
            if status != 'Final' and 'Final' not in detailed:
                continue

            home = g['teams']['home']; away = g['teams']['away']
            hid = home['team']['id']; aid = away['team']['id']
            rows.append({
                'date': g['gameDate'],
                'season_type': ('Postseason' if is_post else 'Regular'),
                'game_pk': g['gamePk'],
                'home_team': id_to_name.get(hid, home['team']['name']),
                'away_team': id_to_name.get(aid, away['team']['name']),
                'home_division': id_to_div.get(hid),
                'away_division': id_to_div.get(aid),
                'home_score': home.get('score'),
                'away_score': away.get('score'),
                'venue': (g.get('venue') or {}).get('name'),
                'game_type': gt,
            })

    df = pd.DataFrame(rows)
    df['date'] = pd.to_datetime(df['date'], utc=True).dt.tz_convert(None)
    df = df.dropna(subset=['home_score','away_score']).astype({'home_score': int, 'away_score': int})
    # Drop games with missing divisions up-front (e.g., All-Star Game)
    df = df.dropna(subset=['home_division','away_division'])
    df = df.sort_values(['date','game_pk']).reset_index(drop=True)
    return df


In [10]:
def rolling_division_backtest(games: pd.DataFrame, l2: float = 0.1) -> pd.DataFrame:
    """
    Chronological loop:
      - Fit division strengths on PRIOR inter-division games only
      - Predict current game's winner using division strengths + home-field
    """
    preds = []
    played = pd.DataFrame(columns=games.columns)  # accumulator of prior games

    for _, row in games.iterrows():
        strengths = fit_division_strengths(played, l2=l2)
        p_home = predict_home_win_prob(row['home_division'], row['away_division'], strengths)
        pred_home = (p_home >= 0.5)
        actual_home_win = row['home_score'] > row['away_score']

        preds.append({
            'date': row['date'],
            'season_type': row['season_type'],
            'game_pk': row['game_pk'],
            'home_team': row['home_team'],
            'away_team': row['away_team'],
            'home_division': row['home_division'],
            'away_division': row['away_division'],
            'home_score': row['home_score'],
            'away_score': row['away_score'],
            'pred_home_prob': p_home,
            'predicted_winner': row['home_team'] if pred_home else row['away_team'],
            'actual_winner': row['home_team'] if actual_home_win else row['away_team'],
            'correct': bool(pred_home == actual_home_win),
            'home_field_snapshot': strengths.home_field,
            'theta_snapshot': strengths.thetas,
        })

        # add the just-finished game to history
        played = pd.concat([played, row.to_frame().T], ignore_index=True)

    pred_df = pd.DataFrame(preds)
    pred_df['actual_home_win'] = (pred_df['home_score'] > pred_df['away_score'])
    return pred_df

In [12]:
def backtest_division_model(
    season: int,
    include_postseason: bool = True,
    l2: float = 0.1,
    calib_low: float = 0.3,
    calib_high: float = 0.7,
    calib_bins: int = 9,
    save_csv: bool = True,
    outfile: str | None = None,
):
    """
    End-to-end backtest for a given MLB season using division-only strengths.
    Returns (preds_df, by_type_df, calibration_df).
    """
    # 1) Pull games
    games = fetch_mlb_scores_with_divisions(season=season, include_postseason=include_postseason)
    if games.empty:
        raise ValueError(f"No final games found for season {season} with include_postseason={include_postseason}.")

    # 2) Rolling backtest (no peeking)
    preds = rolling_division_backtest(games, l2=l2)
    if preds.empty:
        raise RuntimeError("Backtest produced no predictions (unexpected empty DataFrame).")

    # 3) Metrics
    overall = preds['correct'].mean()
    by_type = preds.groupby('season_type', observed=True)['correct'].mean().rename('accuracy').reset_index()

    # 4) Calibration (around coin-flip region by default)
    preds['actual_home_win'] = (preds['home_score'] > preds['away_score'])
    bins = np.linspace(calib_low, calib_high, calib_bins)
    cal = (
        preds
        .assign(bin=pd.cut(preds['pred_home_prob'], bins))
        .dropna(subset=['bin'])
        .groupby('bin', observed=True)
        .agg(
            n=('correct','size'),
            home_prob=('pred_home_prob','mean'),
            home_win_rate=('actual_home_win','mean')
        )
        .reset_index()
    )

    # 5) Save (optional)
    if save_csv:
        path = outfile or f"predictions_division_model_{season}.csv"
        preds.to_csv(path, index=False)
        print(f"[saved] {path}")

    # 6) Print quick summary
    print(f"Season {season} — division-only model")
    print(f"Overall accuracy: {overall:.3%}")
    print(by_type)
    print("\nCalibration (bins around 0.5):")
    print(cal)

    return preds, by_type, cal

In [13]:
preds_2024, by_type_2024, cal_2024 = backtest_division_model(
    season=2024,
    include_postseason=True,
    l2=0.1,
    calib_low=0.3,
    calib_high=0.7,
    calib_bins=9,
    save_csv=True,
)

[saved] predictions_division_model_2024.csv
Season 2024 — division-only model
Overall accuracy: 51.485%
  season_type  accuracy
0  Postseason  0.604651
1     Regular  0.513514

Calibration (bins around 0.5):
           bin     n  home_prob  home_win_rate
0  (0.3, 0.35]     9   0.330523       0.333333
1  (0.35, 0.4]    37   0.381418       0.513514
2  (0.4, 0.45]   181   0.431484       0.535912
3  (0.45, 0.5]   686   0.479294       0.497085
4  (0.5, 0.55]  1529   0.522184       0.518640
5  (0.55, 0.6]   394   0.572059       0.553299
6  (0.6, 0.65]    51   0.616265       0.490196
7  (0.65, 0.7]     5   0.668703       0.200000


In [14]:
preds_2023, by_type_2023, cal_2023 = backtest_division_model(
    season=2023,
    include_postseason=True,
    l2=0.1,
    calib_low=0.3,
    calib_high=0.7,
    calib_bins=9,
    save_csv=True,
)

[saved] predictions_division_model_2023.csv
Season 2023 — division-only model
Overall accuracy: 51.822%
  season_type  accuracy
0  Postseason  0.317073
1     Regular  0.521064

Calibration (bins around 0.5):
           bin     n  home_prob  home_win_rate
0  (0.3, 0.35]     7   0.329342       0.571429
1  (0.35, 0.4]    52   0.384637       0.423077
2  (0.4, 0.45]   245   0.430283       0.526531
3  (0.45, 0.5]   626   0.479486       0.488818
4  (0.5, 0.55]  1421   0.518807       0.517241
5  (0.55, 0.6]   410   0.572143       0.546341
6  (0.6, 0.65]   121   0.624024       0.553719
7  (0.65, 0.7]    18   0.675911       0.555556


In [15]:
preds_2022, by_type_2022, cal_2022 = backtest_division_model(
    season=2022,
    include_postseason=True,
    l2=0.1,
    calib_low=0.3,
    calib_high=0.7,
    calib_bins=9,
    save_csv=True,
)

[saved] predictions_division_model_2022.csv
Season 2022 — division-only model
Overall accuracy: 53.888%
  season_type  accuracy
0  Postseason  0.525000
1     Regular  0.539089

Calibration (bins around 0.5):
           bin     n  home_prob  home_win_rate
0  (0.3, 0.35]     5   0.321940       0.600000
1  (0.35, 0.4]     9   0.380672       0.888889
2  (0.4, 0.45]   123   0.429365       0.560976
3  (0.45, 0.5]   408   0.478090       0.473039
4  (0.5, 0.55]  1426   0.524595       0.546283
5  (0.55, 0.6]   527   0.568707       0.550285
6  (0.6, 0.65]   150   0.618551       0.566667
7  (0.65, 0.7]    24   0.665435       0.458333


In [16]:
preds_2021, by_type_2021, cal_2021 = backtest_division_model(
    season=2021,
    include_postseason=True,
    l2=0.1,
    calib_low=0.3,
    calib_high=0.7,
    calib_bins=9,
    save_csv=True,
)

[saved] predictions_division_model_2021.csv
Season 2021 — division-only model
Overall accuracy: 51.402%
  season_type  accuracy
0  Postseason  0.621622
1     Regular  0.512623

Calibration (bins around 0.5):
           bin     n  home_prob  home_win_rate
0  (0.3, 0.35]    14   0.332427       0.500000
1  (0.35, 0.4]    23   0.377250       0.652174
2  (0.4, 0.45]    58   0.429431       0.655172
3  (0.45, 0.5]   502   0.484946       0.535857
4  (0.5, 0.55]  1630   0.531982       0.519018
5  (0.55, 0.6]   610   0.566920       0.563934
6  (0.6, 0.65]    21   0.607206       0.619048
7  (0.65, 0.7]     2   0.660105       0.000000
