In [52]:
import pandas as pd
import os
import json
import matplotlib.pyplot as plt
import pickle

In [53]:
# Read the pickled schedule from disk into `leagues`; later cells copy it
# before working on it.
# NOTE(review): absolute, user-specific path - this only runs on the author's machine.
leagues = pd.read_pickle(
    '/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/top5_leagues_schedule.pkl'
)

In [54]:
# Rating hyper-parameters, positional; a later cell unpacks them as c, mu1, mu2.
params = [
    800,   # c
    0.05,  # mu1
    0.5,   # mu2
]


In [55]:
import numpy as np

def exp_goal_diff(c, hr, ar):
    """Return the expected goal difference ``sinh(hr / c) - sinh(ar / c)``.

    sinh is an odd function, so equal ratings give exactly zero and swapping
    ``hr`` and ``ar`` flips the sign of the result.

    Args:
        c: Scale that the ratings are divided by; must be strictly positive.
        hr: First rating (the rating loop passes the home team's home rating).
        ar: Second rating (the rating loop passes the away team's away rating).

    Raises:
        ValueError: If ``c > 0`` does not hold (this includes NaN).
    """
    if c > 0:
        home_term = np.sinh(hr / c)
        away_term = np.sinh(ar / c)
        return home_term - away_term
    raise ValueError("c must be > 0")

def get_error(obs_gd, exp_gd):
    """Return the prediction error: observed minus expected goal difference."""
    residual = obs_gd - exp_gd
    return residual

def get_weighted_error(eta, err):
    """Turn a raw error into a pair of damped, opposite-signed errors.

    The magnitude is ``eta * log10(1 + |err|)`` and carries the sign of
    ``err``; the second element is always the negation of the first.

    Args:
        eta: Multiplier applied to the log-damped magnitude.
        err: Scalar prediction error.

    Returns:
        ``(weighted, -weighted)``, or ``(0.0, 0.0)`` when ``err`` is
        numerically zero according to ``np.isclose``.
    """
    if not np.isclose(err, 0.0):
        damped = eta * np.log10(1.0 + np.abs(err))
        signed = damped * np.sign(err)
        return signed, -signed
    return 0.0, 0.0

def update_ratings(wehome, weaway, hrhome, hraway, arhome, araway, mu1, mu2):
    """Apply one match's weighted errors to four ratings.

    As called from the rating loop in this notebook, ``hrhome``/``hraway`` are
    the home team's home and away ratings, and ``arhome``/``araway`` are the
    away team's home and away ratings.

    Args:
        wehome: Weighted error for the home side.
        weaway: Weighted error for the away side.
        hrhome, hraway, arhome, araway: Ratings before the match.
        mu1: Factor both weighted errors are multiplied by to get the step.
        mu2: Fraction of each step that is subtracted from the same team's
            other-venue rating.

    Returns:
        ``(hrhome_new, hraway_new, arhome_new, araway_new)``.
    """
    home_step = wehome * mu1
    away_step = weaway * mu1
    return (
        hrhome + home_step,          # primary move for the home side
        hraway - mu2 * home_step,    # reallocation within the home team
        arhome - mu2 * away_step,    # reallocation within the away team
        araway + away_step,          # primary move for the away side
    )


In [56]:
# Hyper-parameters, positional: c (scale used for the expected goal
# difference), mu1 and mu2 (passed to the rating update).
params = [800, 0.05, 0.5]

df = leagues.copy()
# Renumber rows 0..n-1 (this does not sort; row order is kept as loaded).
df = df.reset_index(drop=True)

# Collect every team that appears as home OR away and initialise its ratings.
teams = pd.concat([df['home_team'], df['away_team']]).unique()
# Previously built from `home_team` twice, which dropped any team that only
# ever appears in `away_team`; derive the keys from the full union instead.
keys = list(teams)

# One "Home <team>" and one "Away <team>" entry per team, all starting at 0.
pi_dictionary = {f"Home {key}": 0 for key in keys}
pi_dictionary.update({f"Away {key}": 0 for key in keys})

c = params[0]
mu1 = params[1]
mu2 = params[2]

# Templates for the dictionary keys above.
home_key = 'Home {}'
away_key = 'Away {}'

# Plain list-of-rows copy of the frame.
df_list = df.values.tolist()

In [57]:

import numpy as np
import pandas as pd

# Model constants: c_scale is the divisor used for the expected goal
# difference, mu1 the learning rate, mu2 the within-team reallocation share.
c_scale = 800.0
mu1 = 0.05
mu2 = 0.5

# Work on a copy of the loaded schedule, ordered by whichever of the sort keys
# (date, league, season) the frame actually has, then renumber the rows.
df = leagues.copy()
sort_cols = [name for name in ('date', 'league', 'season') if name in df.columns]
if sort_cols:
    df = df.sort_values(sort_cols)
df = df.reset_index(drop=True)

# Key templates for the rating dictionary.
home_key = 'Home {}'
away_key = 'Away {}'

# Every distinct, non-missing team name seen in either column.
all_sides = pd.concat([df['home_team'], df['away_team']]).dropna()
teams = pd.unique(all_sides)

# Each team gets a home and an away rating, both starting at 0.0
# (all "Home ..." keys first, then all "Away ..." keys).
pi_dictionary = {
    template.format(t): 0.0
    for template in (home_key, away_key)
    for t in teams
}

# Columns the rating loop writes to: ratings before the match, ratings after
# it, and the goal-difference diagnostics.
pre_cols = ['Home Home Rating', 'Home Away Rating', 'Away Home Rating', 'Away Away Rating']
post_cols = ['Home Home Rating Post', 'Home Away Rating Post', 'Away Home Rating Post', 'Away Away Rating Post']
aux_cols = ['Expected GD', 'Obs GD', 'Err (Obs-Exp)']

# Create whichever of them the frame lacks, filled with NaN; columns that
# already exist are left untouched.
missing_cols = [col for col in pre_cols + post_cols + aux_cols if col not in df.columns]
for col in missing_cols:
    df[col] = np.nan

# Build mask: rows with a usable score, known team names, and a date that is
# not in the future.
today = pd.Timestamp('today').normalize()

# Convert dates and strip timezone if present
if 'date' in df.columns:
    dates = pd.to_datetime(df['date'], errors='coerce')
    # tz_convert raises on tz-naive data, so test for a timezone explicitly
    # instead of wrapping the call in a catch-all try/except that would also
    # hide unrelated errors.
    if dates.dt.tz is not None:
        dates = dates.dt.tz_convert(None)
    # Unparseable dates become NaT; NaT > today is False, so such rows are
    # treated as "not in the future".
    is_future = dates.dt.normalize() > today
else:
    is_future = pd.Series(False, index=df.index)

# Score validity: both goal columns present; also respect optional 'is_valid_score'
has_scores = df['home_goals'].notna() & df['away_goals'].notna()
if 'is_valid_score' in df.columns:
    has_scores &= df['is_valid_score'].fillna(False)

# The team universe is built with dropna(), so a row with a missing team name
# has no entry in the rating dictionary and would raise KeyError in the rating
# loop. Exclude such rows here.
has_teams = df['home_team'].notna() & df['away_team'].notna()

valid_mask = has_scores & has_teams & (~is_future)

# Walk the valid, already-played fixtures in frame order. For each one: record
# the ratings both sides had going in, score the prediction, then store the
# updated ratings so later fixtures start from them.
played = df.loc[valid_mask]
fixtures = zip(
    played.index,
    played['home_team'],
    played['away_team'],
    played['home_goals'],
    played['away_goals'],
)

for i, home, away, home_goals, away_goals in fixtures:
    home_score = float(home_goals)
    away_score = float(away_goals)

    # Dictionary keys for the four ratings involved in this fixture.
    key_hh = home_key.format(home)   # home team, home rating
    key_ha = away_key.format(home)   # home team, away rating
    key_ah = home_key.format(away)   # away team, home rating
    key_aa = away_key.format(away)   # away team, away rating

    # Ratings as they stand before this fixture is processed.
    h_hr = pi_dictionary[key_hh]
    h_ar = pi_dictionary[key_ha]
    a_hr = pi_dictionary[key_ah]
    a_ar = pi_dictionary[key_aa]

    # Prediction uses the home side's home rating against the away side's
    # away rating.
    obs_gd = home_score - away_score
    exp_gd = exp_goal_diff(c_scale, h_hr, a_ar)
    err = obs_gd - exp_gd

    pre_match = {
        'Home Home Rating': h_hr,
        'Home Away Rating': h_ar,
        'Away Home Rating': a_hr,
        'Away Away Rating': a_ar,
        'Expected GD': exp_gd,
        'Obs GD': obs_gd,
        'Err (Obs-Exp)': err,
    }
    for column, value in pre_match.items():
        df.at[i, column] = value

    # Log-damped error with the sign of err; the away side gets the negation.
    # NOTE(review): this uses the natural log (np.log1p), whereas the
    # get_weighted_error helper in this notebook uses log10 - confirm which
    # base is intended.
    if np.isclose(err, 0.0):
        wehome = weaway = 0.0
    else:
        wehome = np.sign(err) * np.log1p(abs(err))
        weaway = -wehome

    # The learning rate is applied here, so update_ratings receives 1.0 in
    # the mu1 slot and does not scale a second time.
    wehome = wehome * mu1
    weaway = weaway * mu1
    h_hr_new, h_ar_new, a_hr_new, a_ar_new = update_ratings(
        wehome, weaway, h_hr, h_ar, a_hr, a_ar, 1.0, mu2
    )

    post_match = {
        'Home Home Rating Post': h_hr_new,
        'Home Away Rating Post': h_ar_new,
        'Away Home Rating Post': a_hr_new,
        'Away Away Rating Post': a_ar_new,
    }
    for column, value in post_match.items():
        df.at[i, column] = value

    # Make the post-match ratings the starting point for later fixtures.
    pi_dictionary[key_hh] = h_hr_new
    pi_dictionary[key_ha] = h_ar_new
    pi_dictionary[key_ah] = a_hr_new
    pi_dictionary[key_aa] = a_ar_new

In [61]:
# Persist the results: first the latest pi ratings (the dictionary), then the
# schedule frame that now carries the rating columns.
# NOTE(review): both targets are absolute, user-specific paths.
from pathlib import Path

pkl_path = Path("/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/pi_dictionary.pkl")

# Write the rating dictionary with the highest pickle protocol available.
with pkl_path.open("wb") as f:
    pickle.dump(pi_dictionary, f, protocol=pickle.HIGHEST_PROTOCOL)

# Write the updated schedule alongside it.
df.to_pickle('/Users/luisenriquekaiser/Documents/soccer_betting_forecast/data/processed/top5_leagues_schedule_with_pi.pkl')