In [79]:
import pickle
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.optimize import minimize
from scipy.stats import poisson,skellam

from bettools import get_data, generate_seasons, calculate_poisson_match_outcomes, calculate_ev_from_odds

# Suppress RuntimeWarnings
# NOTE(review): this is global, so it also hides numerical warnings raised while the
# likelihood functions below take np.log of pmf / correction values — confirm that is intended.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Show every column and do not wrap wide frames when displaying DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# Silence pandas' chained-assignment (SettingWithCopy) warning; later cells assign
# a 'time_diff' column directly on filtered / sliced frames.
pd.options.mode.chained_assignment = None

In [80]:
def rho_correction(x, y, lambda_x, mu_y, rho):
    """Return the Dixon-Coles low-score adjustment factor for scoreline (x, y).

    Only the four scorelines in which each side scores at most one goal are
    adjusted; every other scoreline gets the neutral factor 1.0.

    Parameters:
    - x, y: goals scored by the first and second team.
    - lambda_x, mu_y: expected goals of the first and second team.
    - rho: dependence parameter controlling the size of the adjustment.

    Returns:
    - The multiplicative factor to apply to the independent-Poisson probability.
    """
    if x == 0:
        if y == 0:
            return 1 - (lambda_x * mu_y * rho)
        if y == 1:
            return 1 + (lambda_x * rho)
    elif x == 1:
        if y == 0:
            return 1 + (mu_y * rho)
        if y == 1:
            return 1 - rho
    # Any scoreline with two or more goals for either side is left untouched.
    return 1.0

def dc_log_like(x, y, alpha_x, beta_x, alpha_y, beta_y, rho, gamma):
    """Log-likelihood of one match result under the Dixon-Coles model.

    Parameters:
    - x, y: goals scored by the home and away team.
    - alpha_x, beta_x: attack and defence parameters of the home team.
    - alpha_y, beta_y: attack and defence parameters of the away team.
    - rho: low-score dependence parameter passed to rho_correction.
    - gamma: home-advantage term added to the home team's log scoring rate.

    Returns:
    - log(rho_correction) + log Poisson(x; lambda) + log Poisson(y; mu).
    """
    lambda_x = np.exp(alpha_x + beta_y + gamma)  # expected home goals
    mu_y = np.exp(alpha_y + beta_x)              # expected away goals
    # logpmf evaluates the log-probability directly; np.log(poisson.pmf(...)) underflows
    # to -inf when the optimiser visits parameters that make a scoreline very unlikely.
    return (np.log(rho_correction(x, y, lambda_x, mu_y, rho)) +
            poisson.logpmf(x, lambda_x) + poisson.logpmf(y, mu_y))

In [81]:
def solve_parameters(dataset, debug = False, init_vals=None, options=None,
                     constraints=None, **kwargs):
    """Fit Dixon-Coles parameters to a set of match results by maximum likelihood.

    Parameters:
    - dataset: DataFrame with columns HomeTeam, AwayTeam, FTHG and FTAG.
    - debug: when True, return the raw scipy optimisation result instead of a dict.
    - init_vals: optional start vector laid out as n attack values, n defence values,
      then rho and the home advantage (teams in np.sort order). Random when omitted.
    - options: options dict passed to scipy's minimize
      (defaults to {'disp': True, 'maxiter': 100}).
    - constraints: constraints passed to minimize. When omitted, the attack parameters
      are constrained to sum to the number of teams (i.e. to average 1).
    - **kwargs: forwarded to minimize.

    Returns:
    - dict keyed 'attack_<team>', 'defence_<team>', 'rho' and 'home_adv'
      (or the optimisation result when debug is True).

    Raises:
    - ValueError: if the sets of home teams and away teams differ.
    """
    teams = np.sort(dataset['HomeTeam'].unique())
    # check for no weirdness in dataset
    away_teams = np.sort(dataset['AwayTeam'].unique())
    if not np.array_equal(teams, away_teams):
        raise ValueError("Something's not right")
    n_teams = len(teams)
    if init_vals is None:
        # random initialisation of model parameters
        init_vals = np.concatenate((np.random.uniform(0,1,(n_teams)), # attack strength
                                      np.random.uniform(0,-1,(n_teams)), # defence strength
                                      np.array([0, 1.0]) # rho (score correction), gamma (home advantage)
                                     ))
    if options is None:
        options = {'disp': True, 'maxiter': 100}
    if constraints is None:
        # The identifiability constraint must cover exactly the attack block. A fixed
        # slice of 20 is only right for 20 teams: with more teams some attack values are
        # left unconstrained, with fewer the slice spills into the defence values.
        constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x[:n_teams]) - n_teams}]

    # The match rows do not change between objective evaluations, so extract them once.
    matches = list(zip(dataset['HomeTeam'], dataset['AwayTeam'],
                       dataset['FTHG'], dataset['FTAG']))

    def estimate_parameters(params):
        """Negative log-likelihood of all matches for the parameter vector `params`."""
        score_coefs = dict(zip(teams, params[:n_teams]))
        defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
        rho, gamma = params[-2:]
        return -sum(dc_log_like(home_goals, away_goals,
                                score_coefs[home], defend_coefs[home],
                                score_coefs[away], defend_coefs[away], rho, gamma)
                    for home, away, home_goals, away_goals in matches)

    opt_output = minimize(estimate_parameters, init_vals, options=options, constraints=constraints, **kwargs)
    if debug:
        # sort of hacky way to investigate the output of the optimisation process
        return opt_output
    return dict(zip(["attack_"+team for team in teams] +
                    ["defence_"+team for team in teams] +
                    ['rho', 'home_adv'],
                    opt_output.x))

# season_list = generate_seasons(2017, 2018)

# df_ls = get_data(season_list, leagues, additional_cols=['HS','AS','FTR'])

# dc_df = pd.concat(df_ls)

# params = solve_parameters(dc_df)

In [82]:
def calc_means(param_dict, homeTeam, awayTeam):
    """Return [expected home goals, expected away goals] for a fixture.

    Parameters:
    - param_dict: mapping with 'attack_<team>', 'defence_<team>' and 'home_adv' entries.
    - homeTeam, awayTeam: team names used to build the lookup keys.

    Returns:
    - Two-element list: exp(home attack + away defence + home advantage) and
      exp(home defence + away attack).
    """
    home_log_rate = (param_dict['attack_'+homeTeam]
                     + param_dict['defence_'+awayTeam]
                     + param_dict['home_adv'])
    away_log_rate = param_dict['defence_'+homeTeam] + param_dict['attack_'+awayTeam]
    return [np.exp(home_log_rate), np.exp(away_log_rate)]

def dixon_coles_simulate_match(params_dict, homeTeam, awayTeam, max_goals=10):
    """Build the scoreline probability matrix for a fixture.

    Parameters:
    - params_dict: fitted parameters ('attack_<team>', 'defence_<team>', 'rho', 'home_adv').
    - homeTeam, awayTeam: team names.
    - max_goals: largest goal count per side included in the matrix.

    Returns:
    - (max_goals + 1) x (max_goals + 1) array; entry [i, j] is the probability of the
      home team scoring i and the away team scoring j, with the rho correction applied
      to the low-score corner.
    """
    home_mean, away_mean = calc_means(params_dict, homeTeam, awayTeam)
    goals = np.arange(max_goals + 1)
    output_matrix = np.outer(poisson.pmf(goals, home_mean), poisson.pmf(goals, away_mean))
    # rho must come from the parameters passed in, not from whatever global `params`
    # happens to exist in the notebook at call time.
    rho = params_dict['rho']
    correction_matrix = np.array([[rho_correction(home_goals, away_goals, home_mean, away_mean, rho)
                                   for away_goals in range(2)]
                                  for home_goals in range(2)])
    # Clip the corrected corner to the matrix size so max_goals=0 also works.
    corner = min(2, max_goals + 1)
    output_matrix[:corner, :corner] = output_matrix[:corner, :corner] * correction_matrix[:corner, :corner]
    return output_matrix

In [None]:
# Scoreline matrix for Arsenal (home) v Luton (away), up to 10 goals per side.
ars_lut_dc = dixon_coles_simulate_match(params, 'Arsenal', 'Luton', max_goals=10)

# Strictly-lower triangle, diagonal and strictly-upper triangle of the matrix.
home_win = np.sum(np.tril(ars_lut_dc, -1))
draw_win = np.sum(np.diag(ars_lut_dc))
away_win = np.sum(np.triu(ars_lut_dc, 1))

In [83]:
def dc_log_like_decay(x, y, alpha_x, beta_x, alpha_y, beta_y, rho, gamma, t, xi=0):
    """Time-weighted Dixon-Coles log-likelihood of one match result.

    Parameters:
    - x, y: goals scored by the home and away team.
    - alpha_x, beta_x: attack and defence parameters of the home team.
    - alpha_y, beta_y: attack and defence parameters of the away team.
    - rho: low-score dependence parameter passed to rho_correction.
    - gamma: home-advantage term added to the home team's log scoring rate.
    - t: age of the match; the weight is exp(-xi * t).
    - xi: decay rate (0 weights every match equally).

    Returns:
    - exp(-xi * t) times the match log-likelihood.
    """
    lambda_x = np.exp(alpha_x + beta_y + gamma)  # expected home goals
    mu_y = np.exp(alpha_y + beta_x)              # expected away goals
    # logpmf evaluates the log-probability directly; np.log(poisson.pmf(...)) underflows
    # to -inf when the optimiser visits parameters that make a scoreline very unlikely.
    return np.exp(-xi*t) * (np.log(rho_correction(x, y, lambda_x, mu_y, rho)) +
                            poisson.logpmf(x, lambda_x) + poisson.logpmf(y, mu_y))

In [84]:
def solve_parameters_decay(dataset, xi=0.001, debug = False, init_vals=None, options=None,
                     constraints=None, **kwargs):
    """Fit time-weighted Dixon-Coles parameters by maximum likelihood.

    Parameters:
    - dataset: DataFrame with columns HomeTeam, AwayTeam, FTHG, FTAG and time_diff.
    - xi: decay rate; each match is weighted by exp(-xi * time_diff).
    - debug: when True, return the raw scipy optimisation result instead of a dict.
    - init_vals: optional start vector laid out as n attack values, n defence values,
      then rho and the home advantage (teams in np.sort order). Random when omitted.
    - options: options dict passed to scipy's minimize
      (defaults to {'disp': True, 'maxiter': 100}).
    - constraints: constraints passed to minimize. When omitted, the attack parameters
      are constrained to sum to the number of teams (i.e. to average 1).
    - **kwargs: forwarded to minimize.

    Returns:
    - dict keyed 'attack_<team>', 'defence_<team>', 'rho' and 'home_adv'
      (or the optimisation result when debug is True).

    Raises:
    - ValueError: if the sets of home teams and away teams differ.
    """
    teams = np.sort(dataset['HomeTeam'].unique())
    # check for no weirdness in dataset
    away_teams = np.sort(dataset['AwayTeam'].unique())
    if not np.array_equal(teams, away_teams):
        raise ValueError("something not right")
    n_teams = len(teams)
    if init_vals is None:
        # random initialisation of model parameters
        init_vals = np.concatenate((np.random.uniform(0,1,(n_teams)), # attack strength
                                      np.random.uniform(0,-1,(n_teams)), # defence strength
                                      np.array([0,1.0]) # rho (score correction), gamma (home advantage)
                                     ))
    if options is None:
        options = {'disp': True, 'maxiter': 100}
    if constraints is None:
        # The identifiability constraint must cover exactly the attack block. A fixed
        # slice of 20 is only right for 20 teams: with more teams some attack values are
        # left unconstrained, with fewer the slice spills into the defence values.
        constraints = [{'type': 'eq', 'fun': lambda x: np.sum(x[:n_teams]) - n_teams}]

    # The match rows do not change between objective evaluations, so extract them once.
    matches = list(zip(dataset['HomeTeam'], dataset['AwayTeam'],
                       dataset['FTHG'], dataset['FTAG'], dataset['time_diff']))

    def estimate_parameters(params):
        """Negative weighted log-likelihood of all matches for `params`."""
        score_coefs = dict(zip(teams, params[:n_teams]))
        defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
        rho, gamma = params[-2:]
        return -sum(dc_log_like_decay(home_goals, away_goals,
                                      score_coefs[home], defend_coefs[home],
                                      score_coefs[away], defend_coefs[away],
                                      rho, gamma, age, xi=xi)
                    for home, away, home_goals, away_goals, age in matches)

    # **kwargs is part of the signature, so it has to reach minimize (e.g. method=, tol=).
    opt_output = minimize(estimate_parameters, init_vals, options=options, constraints=constraints, **kwargs)
    if debug:
        # sort of hacky way to investigate the output of the optimisation process
        return opt_output
    return dict(zip(["attack_"+team for team in teams] +
                    ["defence_"+team for team in teams] +
                    ['rho', 'home_adv'],
                    opt_output.x))

In [85]:
def get_1x2_probs(match_score_matrix):
    """Collapse a scoreline matrix into home / away / draw totals.

    Parameters:
    - match_score_matrix: square 2-D array of scoreline probabilities.

    Returns:
    - dict with keys "H" (sum strictly below the diagonal), "A" (sum strictly above
      the diagonal) and "D" (sum of the diagonal).
    """
    below_diagonal = np.tril(match_score_matrix, -1)
    above_diagonal = np.triu(match_score_matrix, 1)
    on_diagonal = np.diag(match_score_matrix)
    return {
        "H": np.sum(below_diagonal),
        "A": np.sum(above_diagonal),
        "D": np.sum(on_diagonal),
    }

def build_temp_model(dataset, time_diff, xi=0.000, init_params=None):
    """Score a model fitted on older matches against a three-day test window.

    Matches whose time_diff lies in [time_diff - 2, time_diff] form the test set;
    matches with a larger time_diff form the training set, re-based so that
    their time_diff is measured from the cut-off.

    Parameters:
    - dataset: DataFrame with HomeTeam, AwayTeam, FTHG, FTAG, FTR and time_diff columns.
    - time_diff: cut-off separating the training matches from the test window.
    - xi: decay rate passed to solve_parameters_decay.
    - init_params: optional start vector passed to solve_parameters_decay.

    Returns:
    - Sum over the test matches of the log of the predicted probability of the
      observed result, or 0 when the test window is empty.
    """
    ages = dataset['time_diff']
    in_test_window = (ages <= time_diff) & (ages >= (time_diff-2))
    test_dataset = dataset[in_test_window]
    if len(test_dataset) == 0:
        return 0

    train_dataset = dataset[ages > time_diff]
    train_dataset = train_dataset.assign(time_diff=train_dataset['time_diff'] - time_diff)
    params = solve_parameters_decay(train_dataset, xi=xi, init_vals=init_params)

    predictive_score = 0
    for row in test_dataset.itertuples():
        outcome_probs = get_1x2_probs(dixon_coles_simulate_match(params, row.HomeTeam, row.AwayTeam))
        predictive_score = predictive_score + np.log(outcome_probs[row.FTR])
    return predictive_score

def get_total_score_xi(xi, dataset=None):
    """Evaluate one decay rate over a sequence of cut-offs and pickle the scores.

    Parameters:
    - xi: decay rate to evaluate.
    - dataset: match DataFrame passed to build_temp_model. Defaults to the notebook-level
      `dc_df`. NOTE(review): no live cell in the visible notebook assigns `dc_df`,
      so either pass `dataset` or define `dc_df` first.

    Returns:
    - List of predictive scores, one per cut-off 99, 96, ..., 0. The same list is
      pickled to 'find_xi_1season_<digits>.txt' in the working directory, where
      <digits> is str(xi) with its first two characters removed.
    """
    if dataset is None:
        dataset = dc_df
    xi_result = [build_temp_model(dataset, day, xi=xi) for day in range(99,-1,-3)]
    with open('find_xi_1season_{}.txt'.format(str(xi)[2:]), 'wb') as thefile:
        pickle.dump(xi_result, thefile)
    return xi_result

In [86]:
def kelly_criterion(probability, odds, bankroll, kelly_fraction=1.0):
    """
    Calculate the optimal betting amount using the Kelly Criterion, with an option to use a fraction of the full recommendation.
    
    Parameters:
    - probability: The probability of the outcome occurring.
    - odds: The decimal odds offered for the bet.
    - bankroll: The current amount in your bankroll.
    - kelly_fraction: Fraction of the Kelly bet to use (default is 1.0 for 100%).
    
    Returns:
    - The optimal amount to bet from your bankroll, adjusted by the specified Kelly fraction.
      Zero when the bet has no positive edge or when the odds offer no profit (odds <= 1).
    """
    b = odds - 1  # Convert decimal odds to b in the formula

    # Decimal odds of 1 or less pay no profit: the Kelly formula would divide by zero
    # (odds == 1) or flip sign and recommend a stake (odds < 1), so never bet on them.
    if b <= 0:
        return 0.0

    q = 1 - probability  # Probability of losing
    
    # Calculate the fraction of the bankroll to bet, according to the Kelly Criterion
    f_star = (b * probability - q) / b
    
    # A negative fraction means a negative edge: clip to zero, then scale
    f_star = max(f_star, 0) * kelly_fraction
    
    # Calculate the amount to bet
    return f_star * bankroll

In [117]:
leagues = ['E0']
season_list = generate_seasons(2014, 2024)
df_ls = get_data(season_list, leagues, additional_cols=['HS','AS','FTR'])

# Keep only the last 500 rows of the concatenated seasons, renumbered from 0.
main_df = pd.concat(df_ls)[-500:].reset_index(drop=True)

# Parse the day-first dates, then express each match's age in whole days
# relative to the latest date in the frame.
main_df = main_df.assign(Date=pd.to_datetime(main_df['Date'],  format='%d/%m/%y'))
main_df = main_df.assign(time_diff=(max(main_df['Date']) - main_df['Date']).dt.days)

# Only these columns are needed by the model-fitting code.
model_columns = ['HomeTeam','AwayTeam','FTHG','FTAG', 'FTR', 'time_diff']
main_df = main_df[model_columns]
del model_columns

In [118]:
# Fit the time-weighted model on main_df. xi is the decay rate applied to time_diff,
# which the data-preparation cell computes in whole days.
params = solve_parameters_decay(main_df, xi=0.00325)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 825.3474170622521
            Iterations: 53
            Function evaluations: 2672
            Gradient evaluations: 53


In [121]:
def make_betting_prediction(home_odds, draw_odds, away_odds, params, home_team, away_team, bankroll,
                            kelly_fraction=0.05):
    """Pick the 1X2 outcome with the highest expected value and size the stake.

    Parameters:
    - home_odds, draw_odds, away_odds: decimal odds for each outcome.
    - params: fitted model parameters passed to dixon_coles_simulate_match.
    - home_team, away_team: team names.
    - bankroll: current bankroll passed to kelly_criterion.
    - kelly_fraction: fraction of the full Kelly stake to use (default 0.05).

    Returns:
    - (bet_selection, bet_amount) where bet_selection is 'Home', 'Away' or 'Draw'.
      When expected values tie, the first of Home, Away, Draw is chosen.
    """
    predicted_probs = get_1x2_probs(dixon_coles_simulate_match(params, home_team, away_team, max_goals=10))

    # (label, odds, model probability) in priority order. Selecting with a single max()
    # gives one well-defined winner; the separate `if` / `if` / `elif` tests used before
    # let a tied later outcome overwrite an earlier one and left the result unbound
    # when no comparison matched (e.g. a NaN expected value).
    candidates = [
        ('Home', home_odds, predicted_probs['H']),
        ('Away', away_odds, predicted_probs['A']),
        ('Draw', draw_odds, predicted_probs['D']),
    ]
    expected_values = [calculate_ev_from_odds(odds, prob) for _, odds, prob in candidates]
    best = max(range(len(candidates)), key=lambda i: expected_values[i])

    bet_selection, best_odds, best_prob = candidates[best]
    bet_amount = kelly_criterion(best_prob, best_odds, bankroll, kelly_fraction=kelly_fraction)
    return bet_selection, bet_amount

# Example: Chelsea (home) v Everton (away) at home/draw/away odds 1.68 / 4.23 / 4.5, bankroll 110.
make_betting_prediction(1.68, 4.23, 4.5, params, "Chelsea", 'Everton', 110)

('Away', 0.18325254664701998)

In [114]:
# Display the fitted parameter dictionary.
# NOTE(review): this cell's execution count (114) is lower than that of the cell that
# fits `params` (118), so the saved output may come from an earlier fit — re-run to refresh.
params

{'attack_Birmingham': 0.8050523897496342,
 'attack_Blackburn': 1.0021238194010662,
 'attack_Bristol City': 0.9391071205764208,
 'attack_Cardiff': 0.813067093406044,
 'attack_Coventry': 1.2662105726498234,
 'attack_Huddersfield': 0.8715634067129769,
 'attack_Hull': 1.1040636762199296,
 'attack_Ipswich': 1.4388374074475823,
 'attack_Leeds': 1.336719158971619,
 'attack_Leicester': 1.3932487453042963,
 'attack_Middlesbrough': 1.090096797825093,
 'attack_Millwall': 0.6472968191795301,
 'attack_Norwich': 1.3401056517176584,
 'attack_Plymouth': 0.9978687909828998,
 'attack_Preston': 1.0563956507299663,
 'attack_QPR': 0.7085882861220348,
 'attack_Rotherham': 0.44121515083825785,
 'attack_Sheffield Weds': 0.6232189651769978,
 'attack_Southampton': 1.4202885049157383,
 'attack_Stoke': 0.7049319920724367,
 'attack_Sunderland': 0.876893436390103,
 'attack_Swansea': 0.9620295779839108,
 'attack_Watford': 1.0249526479642235,
 'attack_West Brom': 1.1990549614542747,
 'defence_Birmingham': -0.72013189

In [None]:
# Reload the full history for the walk-forward validation below.
# NOTE(review): this rebinds leagues, season_list, df_ls and main_df, replacing the
# 500-row frame prepared earlier; unlike that cell, no additional_cols are requested here.
leagues = ['E0']

season_list = generate_seasons(2014, 2024)

df_ls = get_data(season_list, leagues)

# One frame containing every requested season.
main_df = pd.concat(df_ls)

In [None]:
# The Date strings are day-first (the data-preparation cell parses the same column with
# '%d/%m/%y'). Without a hint pandas may read ambiguous dates such as 03/04/17 as
# month-first, which would scramble the chronological order the validation relies on.
main_df['Date'] = pd.to_datetime(main_df['Date'], dayfirst=True)

# Sort chronologically and index by date; rebinding instead of inplace=True keeps the
# statement free of hidden mutation.
main_df = main_df.sort_values('Date').set_index('Date')

In [None]:
def _build_training_frame(data, train_start, train_end):
    """Return the model columns of data[train_start:train_end] plus a time_diff column
    holding each row's age in days relative to the latest date in that slice."""
    train = data.iloc[train_start:train_end]
    days_before_latest = (train.index.max() - train.index).days
    return train[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']].assign(time_diff=days_before_latest)


def _append_1x2_predictions(params, test_data):
    """Return test_data with its index reset and the model's home / away / draw
    probabilities appended as home_win_prob, away_win_prob and draw_win_prob."""
    scored = test_data.reset_index()
    probs = [get_1x2_probs(dixon_coles_simulate_match(params, home_team, away_team, max_goals=10))
             for home_team, away_team in zip(scored['HomeTeam'], scored['AwayTeam'])]
    scored['home_win_prob'] = [p['H'] for p in probs]
    scored['away_win_prob'] = [p['A'] for p in probs]
    scored['draw_win_prob'] = [p['D'] for p in probs]
    return scored


def walk_forward_validation(data, initial_train_size, test_size, num_iterations,
                            xi=0.00325, max_attempts=5):
    """Expanding-window walk-forward validation of the time-weighted model.

    Each step fits on rows [0, train_end) and predicts the next `test_size` rows, then
    folds those rows into the training set. If fitting or predicting fails, the window
    that could not be predicted is moved into the training set and the step is retried
    on the following window, so test rows never overlap the training rows.

    Parameters:
    - data: DataFrame sorted by and indexed on match date, with HomeTeam, AwayTeam,
      FTHG and FTAG columns.
    - initial_train_size: number of rows in the first training set.
    - test_size: number of rows in each test window.
    - num_iterations: maximum number of successfully predicted windows.
    - xi: decay rate passed to solve_parameters_decay.
    - max_attempts: maximum number of fits tried for a single step.

    Returns:
    - List with one DataFrame per predicted window (index reset, probability columns
      home_win_prob, away_win_prob and draw_win_prob added).
    """
    total_rows = len(data)
    train_start = 0
    train_end = initial_train_size

    results = []
    iteration = 0  # number of windows predicted so far
    while train_end + test_size <= total_rows and iteration < num_iterations:
        successful_fit = False
        for attempt in range(1, max_attempts + 1):
            test_end = train_end + test_size
            # Rebuild both frames on every attempt: time_diff must be measured from the
            # latest date of the *current* training slice, and the test frame must be
            # the untouched slice that follows it.
            train_data = _build_training_frame(data, train_start, train_end)
            test_data = data.iloc[train_end:test_end]
            try:
                params = solve_parameters_decay(train_data, xi=xi)
                results.append(_append_1x2_predictions(params, test_data))
                successful_fit = True
                break
            except Exception as e:
                # Deliberately broad: besides optimiser failures this covers a test-set team
                # that is missing from the fitted parameters (KeyError during prediction).
                print(f"Parameter fitting failed on attempt {attempt}: {e}. Trying with a larger training set.")
                train_end += test_size  # Expanding the training set window
                if train_end + test_size > total_rows:
                    print("Not enough data to expand the training set and perform another test. Stopping.")
                    return results
        if not successful_fit:
            print(f"Parameter fitting failed {max_attempts} times in a row. Stopping.")
            return results
        # Move the window forward: the window just predicted becomes training data.
        train_end = test_end
        iteration += 1
    return results

# Walk forward over main_df: 500-row initial training set, 10-row test windows, at most 200 windows.
res = walk_forward_validation(main_df, 500, 10, 200)