In [1]:
import ipynb.fs.defs.hyperas as hyperas_nb
import numpy as np

Using TensorFlow backend.


In [2]:
x_train, y_train, x_val, y_val = hyperas_nb.data('data/E0_processed.csv')

In [3]:
from sklearn.model_selection import KFold
kfold = KFold(n_splits=3, random_state=42)

In [4]:
import xgboost

In [5]:
def evaluate(clfs, x_val, y_val, th=0.05, min_odds=1, agree=2, agree_odds=2):
    import numpy as np
    import pandas as pd
    
    profits = pd.DataFrame()
    profits.columns=[]
    r = None
    for i, model in enumerate(clfs):
        y_pred = model.predict_proba(x_val)
        odds = np.abs(y_val)
        sel = y_pred > (1 / odds + th) 
        if r is None:
            r = pd.DataFrame(sel).astype('int')
        else:
            r = r + sel
        profit = (sel & (odds > min_odds)) * (odds * y_val.clip(0,1) - 1)
                
        pr = profit.sum()        
        co = (profit != 0).sum()

        columns = list(profits.columns) + ['P_%02d' % i, 'C_%02d' % i]
        profits = pd.concat([profits, pr, co], axis=1, sort=False)
        profits.columns = columns

    sel = (r > agree) & (odds > agree_odds)
    profit = sel * (odds * y_val.clip(0,1) - 1)

    pr = profit.sum()        
    co = (profit != 0).sum()

    columns = list(profits.columns) + ['P', 'C']
    profits = pd.concat([profits, pr, co], axis=1, sort=False)
    profits.columns = columns

    total = profits[['P','C']].sum()
    if total.iloc[1] == 0:
        l = pd.DataFrame(list(total.values) + [ 0 ])
    else:
        l = pd.DataFrame(list(total.values) + [ total.iloc[0] / total.iloc[1] ])
    l.index = profits.index
    l.columns = ['Total']

    profits = pd.concat([profits, l], axis=1, sort=False)

    return profits

In [6]:
def train():
    from sklearn import svm
    from sklearn import neighbors
    from sklearn import ensemble
    from sklearn import tree
    clfs = []
    total = 0
    for i, (trainidx, testidx) in enumerate(list(kfold.split(x_train))):
        xx_train = x_train.iloc[trainidx]
        x_test = x_train.iloc[testidx]

        yy_train = y_train.iloc[trainidx]
        yy_labels = yy_train.clip(0,1).values.argmax(axis=1)

        y_test = y_train.iloc[testidx]
        y_labels = y_train.clip(0,1).values.argmax(axis=1)

        # clf = svm.SVC(gamma='scale', probability=True)
        # clf = svm.NuSVC(probability=True)
        # clf = neighbors.KNeighborsClassifier()
        clf = ensemble.GradientBoostingClassifier()
        # clf = tree.DecisionTreeClassifier()
        clf = xgboost.XGBClassifier(max_depth=2)
        clf.fit(xx_train, yy_labels)  

        clfs.append(clf)

        y_pred = clf.predict_proba(x_test)
        odds = np.abs(y_test)
        profit = ( y_pred > (1 / odds + 0.05)) * (odds * y_test.clip(0,1) - 1)
        acc = profit.sum().sum()
        total += acc
    return total, clfs

In [7]:
_, clfs = train()

In [8]:
evaluate(clfs, x_val, y_val)

Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,5.55,25,-0.52,32,-5.9,27,1.14,1,0.14
odds-draw,-8.66,24,6.58,20,7.35,27,-1.0,1,2.0
odds-away,-3.34,10,-6.34,11,1.07,32,0.0,0,0.07


In [9]:
import glob
leagues = glob.glob('data/*_processed.csv')

In [10]:
total = None
for league in leagues:
    print (league)
    x_train, y_train, x_val, y_val = hyperas_nb.data(league, drop_shots=False)
    _, clfs = train()
    d = evaluate(clfs, x_val, y_val, agree_odds=2, agree=2)
    if total is None:
        total = d
    else:
        total = total + d
    display(d)

data\D1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,11.06,37,3.37,35,3.81,18,6.14,8,14.43
odds-draw,6.29,15,-0.41,8,-6.82,14,0.0,0,14.0
odds-away,1.54,14,3.77,24,19.66,31,8.29,6,1.030714


data\E0_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,5.55,25,-0.52,32,-5.9,27,1.14,1,0.14
odds-draw,-8.66,24,6.58,20,7.35,27,-1.0,1,2.0
odds-away,-3.34,10,-6.34,11,1.07,32,0.0,0,0.07


data\F1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-4.07,30,-3.9,25,-2.82,33,4.64,4,-0.26
odds-draw,-6.73,22,-5.22,29,8.57,44,-1.9,5,12.0
odds-away,-8.68,26,-8.9,12,3.5,8,-3.0,3,-0.021667


data\I1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-1.96,28,-10.48,26,-2.99,25,0.0,0,3.54
odds-draw,5.74,17,3.69,18,-1.48,27,2.2,1,8.0
odds-away,-3.61,43,-4.18,35,7.98,36,1.34,7,0.4425


data\SC0_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-6.9,24,-4.4,10,0.84,23,-1.0,1,-2.0
odds-draw,3.29,7,-10.0,10,6.08,4,0.0,0,2.0
odds-away,-7.83,15,-8.28,27,-1.67,17,-1.0,1,-1.0


data\SP1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-5.71,41,-0.24,28,10.14,34,-0.9,4,4.35
odds-draw,-4.89,33,8.27,40,13.0,18,6.25,4,9.0
odds-away,-2.38,9,-4.75,7,-8.0,26,-1.0,1,0.483333


In [11]:
total

Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-2.03,185,-16.17,156,3.08,160,10.02,18,20.2
odds-draw,-4.96,118,2.91,125,26.7,134,5.55,11,47.0
odds-away,-24.3,117,-28.68,116,22.54,150,4.63,18,1.004881


In [12]:
total.P.sum() / total.C.sum() * 100

42.97872340425533

In [13]:
from hyperopt import Trials, STATUS_OK, tpe
from hyperas import optim
from hyperas.distributions import choice, uniform

In [14]:
def data():
    # empty because we read it within the train function
    return None, None, None, None
    
def cross_train(x_train, y_train, x_val, y_val):
    import glob
    import xgboost

    from sklearn.model_selection import KFold
    from IPython.display import clear_output

    kfold = KFold(n_splits=3, random_state=42)    
    
    leagues = glob.glob('data/*_processed.csv')
    
    clear_output()

    
    total = 0
    all_clfs = []

    for league in leagues:
        x_train, y_train, x_val, y_val = hyperas_nb.data(league)

        sub_total = 0
        clfs = []
        for i, (trainidx, testidx) in enumerate(list(kfold.split(x_train))):
            xx_train = x_train.iloc[trainidx]
            x_test = x_train.iloc[testidx]

            yy_train = y_train.iloc[trainidx]
            yy_labels = yy_train.clip(0,1).values.argmax(axis=1)

            y_test = y_train.iloc[testidx]
            y_labels = y_train.clip(0,1).values.argmax(axis=1)

            clf = xgboost.XGBClassifier(base_score={{uniform(0,1)}},
                                        max_depth= 2,
                                        learning_rate={{uniform(0.01,0.3)}},
                                        n_estimators= {{choice([50,100,150,200,250,300])}},
                                        gamma = {{ uniform(0,1) }})
            clf.fit(xx_train, yy_labels)  

            clfs.append(clf)

            y_pred = clf.predict_proba(x_test)
            odds = np.abs(y_test)
            profit = ( y_pred > (1 / odds + 0.05)) * (odds * y_test.clip(0,1) - 1)
            acc = profit.sum().iloc[:2].sum()
            sub_total += acc
        print (league, sub_total)
            
        all_clfs.append(clfs)
        total = total + sub_total
    return {'loss': -total, 'status': STATUS_OK, 'model': all_clfs}

In [15]:
xgboost.XGBClassifier()

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [16]:
cross_trials = Trials()
best_run3, best_model3 = optim.minimize(model=cross_train,
                                        data=data,
                                        algo=tpe.suggest,
                                        max_evals=50,
                                        trials=cross_trials,
                                        notebook_name="scikit")

data\D1_processed.csv                                                                                                  
-77.93000000000006                                                                                                     
data\E0_processed.csv                                                                                                  
12.81999999999999                                                                                                      
data\F1_processed.csv                                                                                                  
27.11000000000002                                                                                                      
data\I1_processed.csv                                                                                                  
-19.749999999999982                                                                                                    
data\SC0_processed.csv                  

In [17]:
best_run3

{'base_score': 0.6108563941804438,
 'gamma': 0.9608812210263653,
 'learning_rate': 0.1414098890581527,
 'n_estimators': 4}

In [18]:
total = None
for league, model in zip(leagues, best_model3):
    print (league)
    x_train, y_train, x_val, y_val = hyperas_nb.data(league, drop_shots=False)
    
    d = evaluate(model, x_val, y_val)
    if total is None:
        total = d
    else:
        total = total + d
    display(d)

data\D1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,10.71,38,3.9,33,6.26,18,4.55,6,13.84
odds-draw,9.89,15,5.79,10,-8.82,16,0.0,0,11.0
odds-away,4.54,15,6.77,21,17.27,30,9.29,5,1.258182


data\E0_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-1.21,28,-2.22,33,-0.69,30,1.14,1,-0.86
odds-draw,-7.37,26,11.81,28,13.75,34,0.0,0,3.0
odds-away,-4.34,11,-4.84,13,-0.33,37,-2.0,2,-0.286667


data\F1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-2.5,29,-9.43,33,-0.68,35,5.64,3,-0.26
odds-draw,-4.53,23,-10.72,29,7.97,41,-2.9,6,12.0
odds-away,-10.68,26,-9.8,16,1.5,10,-3.0,3,-0.021667


data\I1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-6.96,33,-12.3,31,-4.8,23,-1.0,1,-3.61
odds-draw,2.04,18,6.48,19,5.02,24,0.0,0,6.0
odds-away,-5.85,41,-8.14,46,3.33,37,-2.61,5,-0.601667


data\SC0_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-9.1,28,-3.45,11,4.44,23,-2.0,2,-3.0
odds-draw,4.29,6,-8.0,8,3.58,3,0.0,0,3.0
odds-away,-5.83,13,-3.99,26,-3.67,19,-1.0,1,-1.0


data\SP1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-6.25,49,2.33,28,3.85,47,0.1,3,5.35
odds-draw,-1.94,34,3.02,42,18.74,24,7.25,3,8.0
odds-away,-5.09,15,-7.75,10,-8.56,28,-2.0,2,0.66875


In [19]:
total

Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-15.31,205,-21.17,169,8.38,176,8.43,16,11.46
odds-draw,2.38,122,8.38,136,40.24,142,4.35,9,43.0
odds-away,-27.25,121,-27.75,132,9.54,161,-1.32,18,0.016932


In [20]:
total.P.sum() / total.C.sum() * 100

26.651162790697676

In [21]:
def data():
    # empty because we read it within the train function
    return None, None, None, None
    
def cross_cross_train(x_train, y_train, x_val, y_val):
    import glob
    import xgboost

    from sklearn.model_selection import KFold
    from IPython.display import clear_output
    
    from ipynb.fs.defs.scikit import evaluate

    kfold = KFold(n_splits=3, random_state=42)    
    
    leagues = glob.glob('data/*_processed.csv')
    
    clear_output()
    
    total = 0
    all_clfs = []


    for league in leagues:
        x_train, y_train, x_val, y_val = hyperas_nb.data(league, drop_shots=False)

        sub_total = 0
        clfs = []
        for i, (trainidx, testidx) in enumerate(list(kfold.split(x_train))):
            xx_train = x_train.iloc[trainidx]
            x_test = x_train.iloc[testidx]

            yy_train = y_train.iloc[trainidx]
            yy_labels = yy_train.clip(0,1).values.argmax(axis=1)

            y_test = y_train.iloc[testidx]
            y_labels = y_train.clip(0,1).values.argmax(axis=1)
            
            base_score={{uniform(0,1)}}
            max_depth= {{choice([1,2,3,4,5,6])}}
            learning_rate={{uniform(0.01,0.3)}}
            n_estimators= {{choice([50,100,150,200,250,300])}}
            gamma = {{ uniform(0,1) }}
            
            iclfs = []
            for j, (ttrainidx, ttestidx) in enumerate(list(kfold.split(xx_train))):
                xxx_train = xx_train.iloc[ttrainidx]
                yyy_labels = yy_labels[ttrainidx]
                
                clf = xgboost.XGBClassifier(nthread=4,
                                            base_score=base_score,
                                            max_depth=max_depth,
                                            learning_rate=learning_rate,
                                            n_estimators= n_estimators,
                                            gamma = gamma)

                clf.fit(xxx_train, yyy_labels)
                iclfs.append(clf)
                
            d = evaluate(iclfs, x_test, y_test)
            # display(d)
            
            acc = d.P.sum()

            clf = xgboost.XGBClassifier(nthread=4,
                                        base_score=base_score,
                                        max_depth=max_depth,
                                        learning_rate=learning_rate,
                                        n_estimators= n_estimators,
                                        gamma = gamma)

            clf.fit(xx_train, yy_labels)  
            clfs.append(clf)

            sub_total += acc
        print (league, sub_total)
            
        all_clfs.append(clfs)
        total = total + sub_total
    return {'loss': -total, 'status': STATUS_OK, 'model': all_clfs}

In [22]:
cross_cross_trials = Trials()
best_run, best_model = optim.minimize(model=cross_cross_train,
                                        data=data,
                                        algo=tpe.suggest,
                                        max_evals=50,
                                        trials=cross_cross_trials,
                                        notebook_name="scikit")

data\D1_processed.csv                                                                                                  
32.17999999999999                                                                                                      
data\E0_processed.csv                                                                                                  
-20.43                                                                                                                 
data\F1_processed.csv                                                                                                  
-19.590000000000003                                                                                                    
data\I1_processed.csv                                                                                                  
-3.21                                                                                                                  
data\SC0_processed.csv                  

In [23]:
best_run

{'base_score': 0.8729022165378107,
 'gamma': 0.15655581891240672,
 'learning_rate': 0.0993798830584434,
 'max_depth': 1,
 'n_estimators': 4}

In [24]:
total = None
for league, model in zip(leagues, best_model):
    print (league)
    x_train, y_train, x_val, y_val = hyperas_nb.data(league, drop_shots=False)
    
    d = evaluate(model, x_val, y_val)
    if total is None:
        total = d
    else:
        total = total + d
    display(d)

data\D1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,15.06,43,7.0,41,4.91,28,6.59,10,14.88
odds-draw,-0.53,24,-0.73,25,-1.53,22,1.0,4,18.0
odds-away,1.32,25,5.42,30,3.51,44,7.29,4,0.826667


data\E0_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,2.23,40,-1.04,43,0.84,38,0.14,2,-5.86
odds-draw,-1.42,35,8.89,41,3.69,47,-3.0,3,8.0
odds-away,-1.1,21,-10.34,20,7.64,46,-3.0,3,-0.7325


data\F1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-4.5,50,-7.84,38,-3.57,49,-0.66,5,-4.06
odds-draw,-1.24,31,-1.23,30,16.53,56,0.6,7,16.0
odds-away,-10.82,41,-8.27,27,-0.34,18,-4.0,4,-0.25375


data\I1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-9.46,41,-14.21,49,-3.05,41,-3.0,3,2.51
odds-draw,3.63,26,13.16,29,3.91,35,2.7,4,16.0
odds-away,-1.89,60,-5.32,51,8.37,49,2.81,9,0.156875


data\SC0_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-9.38,31,-6.78,19,0.94,27,-2.0,2,-6.0
odds-draw,0.29,10,-4.46,13,4.04,6,0.0,0,6.0
odds-away,-11.75,22,-1.4,28,-4.26,32,-4.0,4,-1.0


data\SP1_processed.csv


Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-2.43,56,-2.66,62,-4.6,63,3.55,12,0.05
odds-draw,-5.54,44,-4.4,48,18.73,36,-2.75,6,21.0
odds-away,-7.5,25,-12.11,20,-17.54,34,-0.75,3,0.002381


In [25]:
total

Unnamed: 0,P_00,C_00,P_01,C_01,P_02,C_02,P,C,Total
odds-home,-8.48,261,-25.53,252,-4.53,246,4.62,34,1.52
odds-draw,-4.81,170,11.23,186,45.37,202,-1.45,24,85.0
odds-away,-31.74,194,-32.02,176,-2.62,223,-1.65,27,-1.000327


In [26]:
total.P.iloc[:2].sum() / total.C.iloc[:2].sum() * 100

5.465517241379311

In [27]:
total.P.sum() / total.C.sum() * 100

1.788235294117648