In [1]:
%cd /home/mathieu/Prose/Mathieu/Benter-Project

/home/mathieu/Prose/Mathieu/Benter-Project


In [2]:
%matplotlib inline

import datetime as dt
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm.notebook import tqdm
import re
from itertools import combinations
import tensorflow as tf
import functools
import itertools
from scipy.stats import rankdata
import scipy
import json

from utils import import_data
from winning_validation import errors
from winning_validation import r_squared
from winning_horse_models import sklearn
from winning_horse_models.logistic_regression import LogisticRegressionModel
from winning_horse_models.xgboost import XGBoostWinningModel
from winning_horse_models.lgbm import LGBMWinningModel
from training_procedures import sequential_training, flattened_training
from constants import Sources
from utils import preprocess
from winning_horse_models.baselines import RandomModel
from joblib import Parallel, delayed

from database.setup import create_sqlalchemy_session
from models.race import Race
from models.runner import Runner

In [3]:
SOURCE = Sources.UNIBET
N_FEATURES = preprocess.get_n_preprocessed_feature_columns(source=SOURCE)

In [5]:
sequential_sgd_regression = LogisticRegressionModel.load_model(prefix="48_col_")

In [5]:
alpha, beta = 0.5, 0.5

In [6]:
def _combine_odds_winning_model(model_proba, odds, alpha, beta):
    return alpha*np.log(model_proba)+beta*np.log(odds)

In [7]:
def compute_combined_r_squared(alpha_beta, source, winning_model, same_races_support, verbose):
    alpha, beta =alpha_beta
    min_horse, max_horse = import_data.get_min_max_horse(source=source)


    np.random.seed(42)
    r_squared_num, r_squared_deno=0,0
    for n_horses in range(max(1, min_horse), max_horse + 1):
        x_race, rank_race, race_dfs = import_data.get_races_per_horse_number(
            source=source,
            n_horses=n_horses,
            on_split="val",
            x_format="sequential_per_horse",
            y_format="rank",
            extra_features_func=None,
        )

        if x_race.size == 0:
            continue
        odds_race = np.stack(
            arrays=[race_df["odds"].values for race_df in race_dfs], axis=0
        )

        model_prediction = winning_model.predict(x=x_race)
        race_odds_notna_index = np.logical_not(np.isnan(odds_race)).all(axis=1)
        pari_mutual_proba = (1 / odds_race)[race_odds_notna_index]
        rank_race_ = rank_race

        if same_races_support:
            model_prediction = model_prediction[race_odds_notna_index]
            rank_race_ = rank_race_[race_odds_notna_index]

        # Ignore dead heat finish
        model_prediction = model_prediction[(rank_race_==1).sum(axis=1)==1, :]
        pari_mutual_proba = pari_mutual_proba[(rank_race_==1).sum(axis=1)==1, :]
        rank_race_=rank_race_[(rank_race_==1).sum(axis=1)==1, :]

        n_races = len(rank_race_)
        combined_winning_horse_proba=np.exp(alpha*np.log(model_prediction[rank_race_==1])+beta*np.log(pari_mutual_proba[rank_race_==1]))
        combined_winning_horse_proba/=np.exp(alpha*np.log(model_prediction)+beta*np.log(pari_mutual_proba)).sum(axis=1)
        r_squared_num_n_horses = np.log(combined_winning_horse_proba).sum()
        r_squared_deno_n_horses =-n_races* np.log(n_horses)
        if verbose:
            print(f'On {n_horses} horses, odds winning combined R²: {1-r_squared_num_n_horses/r_squared_deno_n_horses:.3}')
        r_squared_num+=r_squared_num_n_horses
        r_squared_deno+= r_squared_deno_n_horses

    return alpha, beta, 1-r_squared_num/r_squared_deno

In [8]:
simulate = functools.partial(compute_combined_r_squared, source=SOURCE, winning_model=sequential_sgd_regression, same_races_support=True, verbose=False)
grid=list(itertools.product(range(-1, 11, 1), repeat=2))
simulation_results=Parallel(n_jobs=-2)(delayed(simulate)(ab) for ab in tqdm(grid, total=len(grid)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=441.0), HTML(value='')))






In [22]:
sorted(simulation_results, key=lambda sim:sim[2], reverse=True)[:10]

[(0, 1, 0.18560668139504388),
 (1, 1, 0.15320517668224265),
 (-1, 1, 0.09400939359923899),
 (1, 0, 0.08930949575158098),
 (-1, 2, 0.08300052212338005),
 (0, 2, 0.07865300017542931),
 (2, 0, 0.050999214963541206),
 (2, 1, 0.043444325811888285),
 (1, 2, 0.00010602870425668875),
 (0, 0, 0.0)]

In [24]:
simulate = functools.partial(compute_combined_r_squared, source=SOURCE, winning_model=sequential_sgd_regression, same_races_support=True, verbose=False)
grid=list(itertools.product(np.arange(-1, 1.1, 0.1), repeat=2))
simulation_results=Parallel(n_jobs=-1)(delayed(simulate)(ab) for ab in tqdm(grid, total=len(grid)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=441.0), HTML(value='')))




In [25]:
sorted(simulation_results, key=lambda sim:sim[2], reverse=True)[:10]

[(0.2999999999999998, 0.8999999999999995, 0.187842263347205),
 (0.19999999999999973, 0.8999999999999995, 0.18770267440215915),
 (0.19999999999999973, 0.9999999999999996, 0.1874103284906503),
 (0.09999999999999964, 0.9999999999999996, 0.18710246826501153),
 (0.3999999999999997, 0.8999999999999995, 0.18683793355333234),
 (0.2999999999999998, 0.9999999999999996, 0.18658281533248577),
 (0.09999999999999964, 0.8999999999999995, 0.18636600591831454),
 (0.2999999999999998, 0.7999999999999996, 0.18620527165376322),
 (0.3999999999999997, 0.7999999999999996, 0.18620202941583397),
 (-2.220446049250313e-16, 0.9999999999999996, 0.18560668139504388)]

In [4]:
# Best result for 0.3 0.9
from winning_horse_models import OddsCombinedWinningModel

In [6]:
combined_model = OddsCombinedWinningModel(alpha=0.3, beta=0.8, winning_model=LogisticRegressionModel.load_model(prefix="48_col_"))

In [7]:
combined_model 

<winning_horse_models.OddsCombinedWinningModel at 0x7ff8741cacd0>