In [3]:
#%cd C:/Users/Mathieu/Desktop/Projets/Benter
%cd /home/mathieu/Prose/Mathieu/Benter-Project

/home/mathieu/Prose/Mathieu/Benter-Project


In [4]:
%matplotlib inline

import datetime as dt
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
import re
from itertools import combinations
import tensorflow as tf
import functools
from scipy.stats import rankdata
import scipy
import json

from utils import import_data
from winning_validation import errors
from winning_validation import r_squared
from winning_horse_models import sklearn
from winning_horse_models.dl_shared_layers import LogisticRegressionModel, DLSharedLayersModel, DLLayersGeneratorModel
from winning_horse_models.xgboost import XGBoostWinningModel
from winning_horse_models.catboost import CatboostWinningModel
from winning_horse_models.lgbm import LGBMWinningModel
from training_procedures import sequential_training, flattened_training
from constants import Sources
from utils import preprocess

from database.setup import create_sqlalchemy_session
from models.race import Race
from models.runner import Runner
tqdm.pandas()

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

  from pandas import Panel


In [5]:
# Data source used by every experiment in this notebook.
SOURCE = Sources.UNIBET
# Number of preprocessed feature columns for that source; passed to each model as `n_features`.
N_FEATURES = preprocess.get_n_preprocessed_feature_columns(source=SOURCE)

# XGBoost

In [5]:
%%time
%%capture --no-stdout

#xgboost_winning_model, training_history = flattened_training.train_per_n_horses_races(source=SOURCE, winning_model=XGBoostWinningModel(source=SOURCE), verbose=True)
#xgboost_winning_model.save_model(prefix="48_col_")

CPU times: user 618 µs, sys: 194 µs, total: 812 µs
Wall time: 818 µs


In [6]:
# Load the XGBoost winning model stored under the "48_col_" prefix.
xgboost_winning_model = XGBoostWinningModel.load_model(
    prefix="48_col_",
    source=SOURCE,
    n_features=N_FEATURES,
)

In [11]:
%%capture --no-stdout
%%time
# McFadden R² of the loaded XGBoost model on 10-horse races.
res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=xgboost_winning_model,
    n_horses=10,
    verbose=False,
)

CPU times: user 8.75 s, sys: 106 ms, total: 8.86 s
Wall time: 8.24 s


In [13]:
res['model_r_squared']

-0.06956215250069908

In [None]:
# Overfitting! The R² printed above is negative (about -0.07).

In [5]:
# Hand-picked XGBoost settings.
# NOTE(review): no visible cell references `hyperparameters` — confirm it is still needed.
hyperparameters = {
    'silent': False,
    'scale_pos_weight': 1,
    'learning_rate': 0.01,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'objective': 'binary:logistic',
    'n_estimators': 1000,
    'reg_alpha': 0.3,
    'max_depth': 4,
    'gamma': 10,
}

In [11]:
# https://medium.com/analytics-vidhya/hyperparameter-tuning-hyperopt-bayesian-optimization-for-xgboost-and-neural-network-8aedf278a1c9
# Hyperopt search space for the XGBoost winning model.
space = dict(
    max_depth=hp.quniform("max_depth", 2, 10, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.2, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=hp.quniform('n_estimators', 20, 180, 1),
)

In [12]:
   
# Classifier:
def hyperparameter_tuning(space):
    """Hyperopt objective for the XGBoost winning model on 10-horse races.

    Builds an ``XGBoostWinningModel`` from the sampled hyperparameters, trains it
    with ``flattened_training.train_on_n_horses_races`` and scores it with
    ``r_squared.compute_mcfadden_r_squared_on_n_horses``.

    Args:
        space: Mapping of hyperparameter name to the value sampled for this trial.
            It is copied before any cast, so the caller's dict is left untouched.

    Returns:
        dict with ``loss`` (the negated ``model_r_squared``, since hyperopt
        minimises the loss), ``status`` (``STATUS_OK``) and ``model`` (the
        trained model).
    """
    # Work on a copy: casting in place would silently rewrite the caller's dict.
    params = dict(space)

    # hp.quniform samples floats (e.g. 4.0); these parameters are cast to int.
    for name in ('max_depth', 'n_estimators', 'reg_alpha', 'min_child_weight'):
        if name in params:
            params[name] = int(params[name])

    for name in ('gamma', 'reg_lambda', 'colsample_bytree'):
        if name in params:
            params[name] = float(params[name])

    model = XGBoostWinningModel(source=SOURCE, n_features=N_FEATURES, hyperparameters=params)

    model, _ = flattened_training.train_on_n_horses_races(source=SOURCE, winning_model=model, n_horses=10, verbose=True)

    res = r_squared.compute_mcfadden_r_squared_on_n_horses(source=SOURCE, winning_model=model, n_horses=10, verbose=True)
    r_squared__score = res['model_r_squared']
    print (f"R²: {r_squared__score:.2}, {params}")
    # Negated because hyperopt minimises; change the metric here if needed.
    # NOTE(review): the trained model is returned in every trial result, so the
    # Trials object keeps one model per evaluation in memory.
    return {'loss': -r_squared__score, 'status': STATUS_OK, 'model': model}



In [13]:
%%time
%%capture --no-stdout
# TPE search over `space`; `trials` records every evaluation.
# NOTE(review): no `rstate` is passed to fmin, so the search is not seeded.
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=True,
)

print(best)

Training for 10 horses (13625 races): loss per horse: 0.218, val loss per horse: 0.220 Train Accuracy: 23.4%, Val Accuracy: 20.9%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 11.471% (Random: 9.999%, Odds: 20.823%)
On 2196 races with 10 horses,R² of winning model: 0.04, R² of odds: 0.19, [R² of random model: -0.13 (should be closed to 0)]
R²: 0.044, {'colsample_bytree': 0.867336526984638, 'gamma': 7.370804695882834, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 33, 'reg_alpha': 48, 'reg_lambda': 0.6757482952936152}
Training for 10 horses (13625 races): loss per horse: 0.228, val loss per horse: 0.229 Train Accuracy: 17.8%, Val Accuracy: 16.4%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)        
Mean Predicted probas of actual race result: 10.168% (Random: 9.841%, Odds: 20.823%)   
On 2196 races with 10 horses,R² of winning model: 0.01, R² of odds: 0.19, [R² of r

In [15]:
trials.argmin

{'colsample_bytree': 0.6868653078531703,
 'gamma': 1.3533016677310217,
 'max_depth': 4.0,
 'min_child_weight': 3.0,
 'n_estimators': 180.0,
 'reg_alpha': 40.0,
 'reg_lambda': 0.06703423674448061}

In [17]:
trials.best_trial

{'state': 2,
 'tid': 89,
 'spec': None,
 'result': {'loss': -0.0716297958198091,
  'status': 'ok',
  'model': <winning_horse_models.xgboost.XGBoostWinningModel at 0x7f305ef52950>},
 'misc': {'tid': 89,
  'cmd': ('domain_attachment', 'FMinIter_Domain'),
  'workdir': None,
  'idxs': {'colsample_bytree': [89],
   'gamma': [89],
   'max_depth': [89],
   'min_child_weight': [89],
   'n_estimators': [89],
   'reg_alpha': [89],
   'reg_lambda': [89]},
  'vals': {'colsample_bytree': [0.6868653078531703],
   'gamma': [1.3533016677310217],
   'max_depth': [4.0],
   'min_child_weight': [3.0],
   'n_estimators': [180.0],
   'reg_alpha': [40.0],
   'reg_lambda': [0.06703423674448061]}},
 'exp_key': None,
 'owner': None,
 'version': 0,
 'book_time': datetime.datetime(2021, 9, 15, 10, 57, 6, 171000),
 'refresh_time': datetime.datetime(2021, 9, 15, 10, 59, 35, 852000)}

# SKlearn

In [10]:
# Classifier:

# Hyperopt search space for sklearn.LogisticRegressionModel; only max_iter and C are sampled.
space = dict(
    penalty='l2',
    n_jobs=-1,
    max_iter=hp.quniform('max_iter', 50, 2000, 10),
    C=hp.uniform('C', 0.2, 5),
)
def hyperparameter_tuning(space):
    """Hyperopt objective for the scikit-learn logistic regression model on 10-horse races.

    Builds a ``sklearn.LogisticRegressionModel`` from the sampled hyperparameters,
    trains it with ``flattened_training.train_on_n_horses_races`` and scores it
    with ``r_squared.compute_mcfadden_r_squared_on_n_horses``.

    Args:
        space: Mapping of hyperparameter name to the value sampled for this trial.
            It is copied before any cast, so the caller's dict is left untouched.

    Returns:
        dict with ``loss`` (the negated ``model_r_squared``, since hyperopt
        minimises the loss), ``status`` (``STATUS_OK``) and ``model`` (the
        trained model).
    """
    # Work on a copy: casting in place would silently rewrite the caller's dict.
    params = dict(space)

    # hp.quniform samples floats; these parameters are cast to int.
    for name in ('max_iter', 'n_jobs'):
        if name in params:
            params[name] = int(params[name])

    for name in ('l1_ratio', 'C'):
        if name in params:
            params[name] = float(params[name])

    model = sklearn.LogisticRegressionModel(source=SOURCE, n_features=N_FEATURES, hyperparameters=params)

    model, _ = flattened_training.train_on_n_horses_races(source=SOURCE, winning_model=model, n_horses=10, verbose=True)

    res = r_squared.compute_mcfadden_r_squared_on_n_horses(source=SOURCE, winning_model=model, n_horses=10, verbose=True)
    r_squared__score = res['model_r_squared']
    print (f"R²: {r_squared__score:.2}, {params}")
    # Negated because hyperopt minimises; change the metric here if needed.
    return {'loss': -r_squared__score, 'status': STATUS_OK, 'model': model}


In [11]:
%%time
%%capture --no-stdout
# TPE search over `space`; `trials` records every evaluation.
# NOTE(review): no `rstate` is passed to fmin, so the search is not seeded.
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=True,
)

print(best)

Importing training data...                             
Importing validation data...                           
Training for 10 horses (13625 races): loss per horse: 0.189, val loss per horse: 0.230 Train Accuracy: 32.4%, Val Accuracy: 19.6%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 14.607% (Random: 10.038%, Odds: 20.823%)
On 2196 races with 10 horses,R² of winning model: 0.00, R² of odds: 0.19, [R² of random model: -0.13 (should be closed to 0)]
R²: 0.0014, {'C': 1.887674556317566, 'max_iter': 1130, 'n_jobs': -1, 'penalty': 'l2'}
Importing training data...                                                             
Importing validation data...                                                           
Training for 10 horses (13625 races): loss per horse: 0.190, val loss per horse: 0.227 Train Accuracy: 32.2%, Val Accuracy: 19.8%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in

In [8]:
# Classifier:

# Hyperopt search space for sklearn.SGDModel.
space = {
    'loss': 'modified_huber',  # not log, since log regression is already tested
    'penalty': 'elasticnet',
    'alpha': hp.uniform('alpha', 0, 0.1),
    'n_jobs': -1,
    'max_iter': hp.quniform('max_iter', 50, 2000, 10),
    # The hyperopt label must match the key: it was 'C', so `best` and
    # `trials` reported the sampled l1_ratio under the wrong name.
    'l1_ratio': hp.uniform('l1_ratio', 0, 1),
}
def hyperparameter_tuning(space):
    """Hyperopt objective for the scikit-learn SGD model on 10-horse races.

    Builds a ``sklearn.SGDModel`` from the sampled hyperparameters, trains it
    with ``flattened_training.train_on_n_horses_races`` and scores it with
    ``r_squared.compute_mcfadden_r_squared_on_n_horses``.

    Args:
        space: Mapping of hyperparameter name to the value sampled for this trial.
            It is copied before any cast, so the caller's dict is left untouched.

    Returns:
        dict with ``loss`` (the negated ``model_r_squared``, since hyperopt
        minimises the loss), ``status`` (``STATUS_OK``) and ``model`` (the
        trained model).
    """
    # Work on a copy: casting in place would silently rewrite the caller's dict.
    params = dict(space)

    # hp.quniform samples floats; these parameters are cast to int.
    for name in ('max_iter', 'n_jobs'):
        if name in params:
            params[name] = int(params[name])

    for name in ('l1_ratio', 'alpha'):
        if name in params:
            params[name] = float(params[name])

    model = sklearn.SGDModel(source=SOURCE, n_features=N_FEATURES, hyperparameters=params)

    model, _ = flattened_training.train_on_n_horses_races(source=SOURCE, winning_model=model, n_horses=10, verbose=True)

    res = r_squared.compute_mcfadden_r_squared_on_n_horses(source=SOURCE, winning_model=model, n_horses=10, verbose=True)
    r_squared__score = res['model_r_squared']
    print (f"R²: {r_squared__score:.2}, {params}")
    # Negated because hyperopt minimises; change the metric here if needed.
    return {'loss': -r_squared__score, 'status': STATUS_OK, 'model': model}


In [9]:
%%time
%%capture --no-stdout
# TPE search over `space`; `trials` records every evaluation.
# NOTE(review): no `rstate` is passed to fmin, so the search is not seeded.
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=True,
)

print(best)

Importing training data...                             
Importing validation data...                           
Training for 10 horses (13625 races): loss per horse: 0.340, val loss per horse: 2.355 Train Accuracy: 20.0%, Val Accuracy: 10.9%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 10.516% (Random: 9.777%, Odds: 20.823%)
On 2196 races with 10 horses,R² of winning model: -2.62, R² of odds: 0.19, [R² of random model: -0.14 (should be closed to 0)]
R²: -2.6, {'alpha': 0.09613999594129971, 'l1_ratio': 0.33825671685263525, 'loss': 'modified_huber', 'max_iter': 1120, 'n_jobs': -1, 'penalty': 'elasticnet'}
Importing training data...                                                         
Importing validation data...                                                       
Training for 10 horses (13625 races): loss per horse: 0.297, val loss per horse: 0.842 Train Accuracy: 23.0%, Val Accuracy: 18.5%

Comparing

# Catboost

In [6]:
# Hyperopt search space for the CatBoost winning model.
space = dict(
    depth=hp.quniform("depth", 1, 6, 1),
    border_count=hp.quniform('border_count', 32, 255, 1),
    learning_rate=hp.loguniform('learning_rate', -5.0, -2),
    l2_leaf_reg=hp.uniform('l2_leaf_reg', 3, 8),
)

In [7]:
def hyperparameter_tuning(space):
    """Hyperopt objective for the CatBoost winning model on 10-horse races.

    Builds a ``CatboostWinningModel`` from the sampled hyperparameters (plus
    ``verbose=0``), trains it with ``flattened_training.train_on_n_horses_races``
    and scores it with ``r_squared.compute_mcfadden_r_squared_on_n_horses``.

    Args:
        space: Mapping of hyperparameter name to the value sampled for this trial.
            It is copied before any change, so the caller's dict is neither
            re-cast nor given an extra ``verbose`` key.

    Returns:
        dict with ``loss`` (the negated ``model_r_squared``, since hyperopt
        minimises the loss), ``status`` (``STATUS_OK``) and ``model`` (the
        trained model).
    """
    # Work on a copy: casting in place would silently rewrite the caller's dict.
    params = dict(space)

    # hp.quniform samples floats; these parameters are cast to int.
    for name in ('depth', 'border_count'):
        if name in params:
            params[name] = int(params[name])

    for name in ('learning_rate', 'l2_leaf_reg'):
        if name in params:
            params[name] = float(params[name])

    params['verbose'] = 0
    model = CatboostWinningModel(source=SOURCE, n_features=N_FEATURES, hyperparameters=params)

    model, _ = flattened_training.train_on_n_horses_races(source=SOURCE, winning_model=model, n_horses=10, verbose=True)

    res = r_squared.compute_mcfadden_r_squared_on_n_horses(source=SOURCE, winning_model=model, n_horses=10, verbose=True)
    r_squared__score = res['model_r_squared']
    print (f"R²: {r_squared__score:.2}, {params}")
    # Negated because hyperopt minimises; change the metric here if needed.
    return {'loss': -r_squared__score, 'status': STATUS_OK, 'model': model}

In [8]:
%%time
%%capture --no-stdout
# TPE search over `space`; `trials` records every evaluation.
# NOTE(review): no `rstate` is passed to fmin, so the search is not seeded.
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=True,
)

print(best)

Importing training data...                             
Importing validation data...                           
Training for 10 horses (13625 races): loss per horse: 0.202, val loss per horse: 0.214 Train Accuracy: 31.2%, Val Accuracy: 22.2%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 12.959% (Random: 9.996%, Odds: 20.823%)
On 2196 races with 10 horses,R² of winning model: 0.07, R² of odds: 0.19, [R² of random model: -0.12 (should be closed to 0)]
R²: 0.07, {'border_count': 104, 'depth': 3, 'l2_leaf_reg': 3.4716400974296153, 'learning_rate': 0.029317995388420507, 'verbose': 0}
Importing training data...                                                           
Importing validation data...                                                         
Training for 10 horses (13625 races): loss per horse: 0.144, val loss per horse: 0.215 Train Accuracy: 67.0%, Val Accuracy: 21.7%

Comparing on same races w/ 10 h

# LGBM

In [12]:
# Hyperopt search space for the LightGBM winning model.
space = {
    # This piles on most of the tunable LightGBM parameters; some combinations
    # apparently do not make sense together, but it works for now.
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'boosting_type': hp.choice('boosting_type',
                               ['gbdt',
# Disabled per-booster option: 'subsample': hp.uniform('dart_subsample', 0.5, 1)
                                
                                'dart',
# Disabled per-booster option: 'subsample': hp.uniform('dart_subsample', 0.5, 1)
                                 
                                'goss']),
    'num_leaves': hp.quniform('num_leaves', 30, 150, 1),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)),
    'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
    # NOTE(review): no bagging_freq is set in this space; LightGBM presumably
    # ignores bagging_fraction unless bagging_freq > 0 — confirm.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),  # alias of "subsample"
    'min_data_in_leaf': hp.qloguniform('min_data_in_leaf', 0, 6, 1),
    # Either no regularisation (0) or a log-uniformly sampled positive value.
    'lambda_l1': hp.choice('lambda_l1', [0, hp.loguniform('lambda_l1_positive', -16, 2)]),
    'lambda_l2': hp.choice('lambda_l2', [0, hp.loguniform('lambda_l2_positive', -16, 2)]),
    'verbose': -1,
    # The LightGBM parameter docs list various aliases, and the implementation
    # seems to complain when an alias is shadowed by another parameter, so the
    # shadowed names below are set to None to try to silence those complaints.
    'subsample': None,  # overridden by bagging_fraction
    'reg_alpha': None,  # overridden by lambda_l1
    'reg_lambda': None,  # overridden by lambda_l2
    'min_sum_hessian_in_leaf': None,  # overrides min_child_weight
    'min_child_samples': None,  # overridden by min_data_in_leaf
    'colsample_bytree': None,  # overridden by feature_fraction
    # Disabled alternative: 'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
    'min_child_weight': hp.loguniform('min_child_weight', -16, 5),  # also aliases to min_sum_hessian
    # Disabled alternatives:
    #   'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
    #   'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
    #   'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
}

In [15]:
def hyperparameter_tuning(space):
    """Hyperopt objective for the LightGBM winning model on 10-horse races.

    Builds an ``LGBMWinningModel`` from the sampled hyperparameters (plus
    ``verbose=-1``), trains it with ``flattened_training.train_on_n_horses_races``
    and scores it with ``r_squared.compute_mcfadden_r_squared_on_n_horses``.

    Args:
        space: Mapping of hyperparameter name to the value sampled for this trial.
            It is copied before any change, so the caller's dict is left untouched.

    Returns:
        dict with ``loss`` (the negated ``model_r_squared``, since hyperopt
        minimises the loss), ``status`` (``STATUS_OK``) and ``model`` (the
        trained model).
    """
    # Work on a copy: casting in place would silently rewrite the caller's dict.
    params = dict(space)

    # hp.quniform / hp.qloguniform sample floats; these parameters are cast to int.
    for name in ('num_leaves', 'subsample_for_bin', 'min_data_in_leaf'):
        if name in params:
            params[name] = int(params[name])

    # Float-valued LightGBM parameters. The previous list held 'l2_leaf_reg',
    # a CatBoost name that never appears in this search space.
    for name in ('learning_rate', 'feature_fraction', 'bagging_fraction',
                 'lambda_l1', 'lambda_l2', 'min_child_weight'):
        if name in params:
            params[name] = float(params[name])

    params['verbose'] = -1
    model = LGBMWinningModel(source=SOURCE, n_features=N_FEATURES, hyperparameters=params)

    model, _ = flattened_training.train_on_n_horses_races(source=SOURCE, winning_model=model, n_horses=10, verbose=True)

    res = r_squared.compute_mcfadden_r_squared_on_n_horses(source=SOURCE, winning_model=model, n_horses=10, verbose=True)
    r_squared__score = res['model_r_squared']
    print (f"R²: {r_squared__score:.2}, {params}")
    # Negated because hyperopt minimises; change the metric here if needed.
    return {'loss': -r_squared__score, 'status': STATUS_OK, 'model': model}

In [16]:
%%time
%%capture --no-stdout
# TPE search over `space`; `trials` records every evaluation.
# NOTE(review): no `rstate` is passed to fmin, so the search is not seeded.
trials = Trials()
best = fmin(
    fn=hyperparameter_tuning,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    verbose=True,
)

print(best)

Importing training data...                             
Importing validation data...                           
Training for 10 horses (13625 races): loss per horse: 0.151, val loss per horse: 0.217 Train Accuracy: 85.5%, Val Accuracy: 20.9%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 12.215% (Random: 9.838%, Odds: 20.823%)
On 2196 races with 10 horses,R² of winning model: 0.06, R² of odds: 0.19, [R² of random model: -0.15 (should be closed to 0)]
R²: 0.056, {'bagging_fraction': 0.968330262444898, 'boosting_type': 'dart', 'class_weight': 'balanced', 'colsample_bytree': None, 'feature_fraction': 0.6381884687152157, 'lambda_l1': 0.006461225606134217, 'lambda_l2': 5.776546662706244e-06, 'learning_rate': 0.033964977946119974, 'min_child_samples': None, 'min_child_weight': 3.2848008843740284, 'min_data_in_leaf': 1, 'min_sum_hessian_in_leaf': None, 'num_leaves': 56, 'reg_alpha': None, 'reg_lambda': None, 'subsa

KeyboardInterrupt: 

CPU times: user 3h 4min 53s, sys: 47.8 s, total: 3h 5min 40s
Wall time: 47min 39s


# Deep learning: DLLayersGeneratorModel

In [12]:
%%time
%%capture --no-stdout
# Baseline architecture: a single Dense layer with one unit, n_epochs_per_n_horses=1.
space = {
    'layers': [
        {'type': 'Dense', 'n_units': 1},
    ],
}

model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='1',
)

model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Training for 10 horses (13625 races, val 3182 races): loss per horse: 0.216, val loss per horse: 0.211 Train Accuracy: 21.1%, Val Accuracy: 22.9%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 13.711% (Random: 10.089%, Odds: 20.823%)

On 2196 races with 10 horses,R² of winning model: 0.08, R² of odds: 0.19, [R² of random model: -0.12 (should be closed to 0)]
{'model_r_squared': 0.07945345387824787, 'odds_r_squared': 0.19096965205005967, 'random_r_squared': -0.11506439675810864, 'n_races': 2196, 'n_rejected_races': 0}
CPU times: user 43.6 s, sys: 555 ms, total: 44.2 s
Wall time: 43.8 s


In [4]:
%%time
%%capture --no-stdout
# Same single Dense(1) architecture, with n_epochs_per_n_horses=1000.
space = {
    'layers': [
        {'type': 'Dense', 'n_units': 1},
    ],
}

model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='1',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1000,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [11]:
%%time
%%capture --no-stdout
# Dropout(0.2) followed by Dense(1), n_epochs_per_n_horses=1.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1},
    ],
}

# The name now states the dropout rate actually used (0.2); it previously said 0.5.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_1',
)

model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Training for 10 horses (13625 races, val 3182 races): loss per horse: 0.219, val loss per horse: 0.210 Train Accuracy: 20.3%, Val Accuracy: 23.1%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 13.765% (Random: 9.867%, Odds: 20.823%)

On 2196 races with 10 horses,R² of winning model: 0.08, R² of odds: 0.19, [R² of random model: -0.13 (should be closed to 0)]
{'model_r_squared': 0.08033308883167667, 'odds_r_squared': 0.19096965205005967, 'random_r_squared': -0.12889582402376853, 'n_races': 2196, 'n_rejected_races': 0}
CPU times: user 43.3 s, sys: 527 ms, total: 43.9 s
Wall time: 43.3 s


In [5]:
%%time
%%capture --no-stdout
# Dropout(0.5) followed by Dense(1), with n_epochs_per_n_horses=1000.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.5},
        {'type': 'Dense', 'n_units': 1},
    ],
}

model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.5_dense_1',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1000,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)

Importing training data...
Importing validation data...
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
E

In [4]:
%%time
%%capture --no-stdout
# Dropout(0.2) followed by an L2-regularised Dense(1), n_epochs_per_n_horses=1.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1, 'kernel_regularizer': {'type': 'l2', 'l2': 0.001}},
    ],
}

# The name now describes this architecture; it previously reused
# 'dropout_0.5_dense_1' although the dropout rate is 0.2 and L2 is applied.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_1_l2_0.001',
)

model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Training for 10 horses (13625 races, val 3182 races): loss per horse: 0.220, val loss per horse: 0.211 Train Accuracy: 19.4%, Val Accuracy: 23.0%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 13.387% (Random: 10.043%, Odds: 20.823%)

On 2196 races with 10 horses,R² of winning model: 0.08, R² of odds: 0.19, [R² of random model: -0.12 (should be closed to 0)]
{'model_r_squared': 0.0765609141798872, 'odds_r_squared': 0.19096965205005967, 'random_r_squared': -0.12378432864096167, 'n_races': 2196, 'n_rejected_races': 0}
CPU times: user 43.6 s, sys: 901 ms, total: 44.5 s
Wall time: 43.9 s


In [7]:
%%time
%%capture --no-stdout
# Dropout(0.2) followed by an L2-regularised Dense(1), n_epochs_per_n_horses=20.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1, 'kernel_regularizer': {'type': 'l2', 'l2': 0.001}},
    ],
}

# The name now describes this architecture; it previously reused
# 'dropout_0.5_dense_1' although the dropout rate is 0.2 and L2 is applied.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_1_l2_0.001',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=20,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Training for 10 horses (13625 races, val 3182 races): loss per horse: 0.223, val loss per horse: 0.212 Train Accuracy: 18.5%, Val Accuracy: 23.0%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 13.486% (Random: 10.134%, Odds: 20.823%)

On 2196 races with 10 horses,R² of winning model: 0.08, R² of odds: 0.19, [R² of random model: -0.12 (should be closed to 0)]
{'model_r_squared': 0.07608162535533691, 'odds_r_squared': 0.19096965205005967, 'random_r_squared': -0.12266346827215302, 'n_races': 2196, 'n_rejected_races': 0}
CPU times: user 1min 1s, sys: 1.51 s, total: 1min 2s
Wall time: 52.1 s


In [9]:
%%time
%%capture --no-stdout
# Two stacked [Dropout(0.2), L2-regularised Dense(1)] pairs, n_epochs_per_n_horses=100.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1, 'kernel_regularizer': {'type': 'l2', 'l2': 0.001}},
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1, 'kernel_regularizer': {'type': 'l2', 'l2': 0.001}},
    ],
}

# The name now describes this architecture; it previously reused
# 'dropout_0.5_dense_1', which matches neither the dropout rate nor the depth.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_1_l2_0.001_x2',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=100,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoc

In [10]:
%%time
%%capture --no-stdout
# Two stacked [Dropout(0.2), Dense(1)] pairs without regularisation, n_epochs_per_n_horses=100.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1},
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1},
    ],
}

# The name now describes this architecture; it previously reused
# 'dropout_0.5_dense_1', which matches neither the dropout rate nor the depth.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_1_x2',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=100,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoc

In [11]:
%%time
%%capture --no-stdout
# Two stacked [Dropout(0.2), Dense(1)] pairs without regularisation, n_epochs_per_n_horses=1.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1},
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1},
    ],
}

# The name now describes this architecture; it previously reused
# 'dropout_0.5_dense_1', which matches neither the dropout rate nor the depth.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_1_x2',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Training for 10 horses (13625 races, val 3182 races): loss per horse: 0.223, val loss per horse: 0.214 Train Accuracy: 19.0%, Val Accuracy: 22.2%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 12.557% (Random: 10.003%, Odds: 20.823%)

On 2196 races with 10 horses,R² of winning model: 0.07, R² of odds: 0.19, [R² of random model: -0.11 (should be closed to 0)]
{'model_r_squared': 0.06880480118050591, 'odds_r_squared': 0.19096965205005967, 'random_r_squared': -0.11352155630920047, 'n_races': 2196, 'n_rejected_races': 0}
CPU times: user 55 s, sys: 521 ms, total: 55.5 s
Wall time: 55.3 s


In [12]:
%%time
%%capture --no-stdout
# Dropout(0.2) -> Dense(8) -> Dropout(0.2) -> Dense(1), n_epochs_per_n_horses=1.
space = {
    'layers': [
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 8},
        {'type': 'Dropout', 'rate': 0.2},
        {'type': 'Dense', 'n_units': 1},
    ],
}

# The name now describes this architecture; it previously reused
# 'dropout_0.5_dense_1', which matches neither the dropout rate nor the layers.
# NOTE(review): confirm nothing else looks this model up by its old name.
model = DLLayersGeneratorModel(
    source=SOURCE,
    n_features=N_FEATURES,
    hyperparameters=space,
    name='dropout_0.2_dense_8_dropout_0.2_dense_1',
)
model, _ = sequential_training.train_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    n_epochs=0,
    start_training_at=dt.datetime.now(),
    n_epochs_per_n_horses=1,
    verbose=True,
)

res = r_squared.compute_mcfadden_r_squared_on_n_horses(
    source=SOURCE,
    winning_model=model,
    n_horses=10,
    verbose=True,
)
print(res)

Importing training data...
Importing validation data...
Training for 10 horses (13625 races, val 3182 races): loss per horse: 0.227, val loss per horse: 0.211 Train Accuracy: 18.4%, Val Accuracy: 23.5%

Comparing on same races w/ 10 horses with odds 2196 races (3182 races in total)
Mean Predicted probas of actual race result: 13.213% (Random: 9.845%, Odds: 20.823%)

On 2196 races with 10 horses,R² of winning model: 0.07, R² of odds: 0.19, [R² of random model: -0.13 (should be closed to 0)]
{'model_r_squared': 0.0739364095442615, 'odds_r_squared': 0.19096965205005967, 'random_r_squared': -0.1301848714616345, 'n_races': 2196, 'n_rejected_races': 0}
CPU times: user 47.9 s, sys: 553 ms, total: 48.4 s
Wall time: 47.7 s
