In [1]:
import numpy as np
import pandas as pd
import easydatascience as eds

full = pd.read_csv('merged.csv')
lines = pd.read_csv('lines_data.csv')

#"forms" represent "forms5"
forms = pd.read_csv('forms.csv')
forms4 = pd.read_csv('forms4.csv')
forms3 = pd.read_csv('forms3.csv')
forms6 = pd.read_csv('forms6.csv')

In [5]:
def transform_forms(forms, export=False):
    forms = forms.rename(columns={'WonH1':'WonH1past', 'WonQ1':'WonQ1past', 'Won':'Wonpast'})

    #Joining useful stats from "full" with "form" stats to make data for predictions
    full_use = full[['Season', 'Season_Half', 'Date', 'Game', 'Team', 'WonQ1', 'WonH1', 'Won', 
                     'Home/Away']]
    base = full_use.iloc[:, :4]
    base = base.iloc[::2]
    teams = full_use.iloc[:, 3:]

    away_teams = teams.loc[teams['Home/Away']=='Away'].drop('Home/Away', axis=1)
    home_teams = teams.loc[teams['Home/Away']=='Home'][['Game', 'Team']]
    away_teams.columns = [col+'_away' for col in away_teams.columns]
    home_teams.columns = [col+'_home' for col in home_teams.columns]

    base = pd.merge(base, away_teams, right_on='Game_away', left_on='Game', how='left')
    base = pd.merge(base, home_teams, right_on='Game_home', left_on='Game', how='left')
    base = base.drop(['Game_away', 'Game_home'], axis=1)
    
    #Adding stats
    data = pd.merge(base, forms, left_on=['Date', 'Team_away'], right_on=['Date', 'Team'],
                    how='left').drop('Team', axis=1)
    data = data.rename(columns={col: col+'_away' for col in data.columns[9:]})

    data = pd.merge(data, forms, left_on=['Date', 'Team_home'], right_on=['Date', 'Team'],
                    how='left').drop('Team', axis=1)
    data = data.rename(columns={col: col+'_home' for col in data.columns[121:]})

    #Deleting first few games of the season since they don't have any data because teams don't
    #have mesurable form yet
    data = data.loc[(data['FGM_team_away']!=0)&(data['FGM_team_home']!=0)].reset_index(drop=True)
    
    #Exporting "forms" data for analysis
    if export==True:
        file_name = input('Input export file name: ')
        if file_name[-4:]!='.csv':
            file_name += '.csv'
        data.to_csv(file_name, index=False)
        
    #Simple feature extraction
    data['Month'] = data['Date'].str[5:7].astype('int')

    data = data.rename(columns={'Season_Half': 'Season_Half_is2'})
    data['Season_Half_is2'] = data['Season_Half_is2']-1

    #Lines
    data = pd.merge(data, lines, how='left', left_on=['Team_away', 'Date'], right_on=['Team',
                                                                                 'Date'])
    data = data.drop('Team', axis=1)
    data = data.rename(columns={'Line': 'Line_away'})

    #Not useful for predictions
    data.set_index('Game', inplace=True)
    data = data.drop(['Season', 'Date', 'Team_away', 'Team_home'], axis=1)
    
    return data

data = transform_forms(forms)
data4 = transform_forms(forms4)
data3 = transform_forms(forms3)
data6 = transform_forms(forms6)
    
#For EDA
data.to_csv('eda_forms.csv')
    
print('---------------------')
print('Correlation of Vegas lines with the outcome:', 
data['Won_away'].corr(data['Line_away']).round(4))
print('---------------------')

---------------------
Correlation of Vegas lines with the outcome: -0.4275
---------------------


In [3]:
#Deciding what method should be used for machine learning, and if
#feature extraction should be used
from sklearn.model_selection import train_test_split

#Data used for evaluation
test_data = data[['Won_away', 'REB_team_away', 'REB_team_home',
                  'AST_team_away', 'AST_team_home', 'TOV_team_away', 'TOV_team_home', 
                  'FG%_team_away', 'FG%_team_home', 'FTM_team_away', 'FTM_team_home']]

#Shuffling to simplyfy train test split
test_data = test_data.sample(frac=1)

test_y = test_data['Won_away']
test_data = test_data.drop('Won_away', axis=1)
print('---------------------')
print('Test vanilla data columns skewness:\n')
print(test_data.skew(), '\n---------------------\n')

aways = test_data.iloc[:, ::2]
homes = test_data.iloc[:, 1::2]

#Difference data
test_data_diff = pd.DataFrame()

for idx in range(aways.shape[1]):
    curr_col = aways.columns[idx][:-5]
    test_data_diff[curr_col] = aways.iloc[:, idx]-homes.iloc[:, idx]

print('\n---------------------\nTest difference data columns skewness:\n')
print(test_data_diff.skew(), '\n---------------------\n')

print('It seems that there are no problems with the skewness of the data.')

---------------------
Test vanilla data columns skewness:

REB_team_away    0.187295
REB_team_home    0.218701
AST_team_away    0.290965
AST_team_home    0.266799
TOV_team_away    0.315879
TOV_team_home    0.321747
FG%_team_away    0.045115
FG%_team_home    0.024528
FTM_team_away    0.330271
FTM_team_home    0.291834
dtype: float64 
---------------------


---------------------
Test difference data columns skewness:

REB_team   -0.026904
AST_team    0.000288
TOV_team   -0.009176
FG%_team    0.025967
FTM_team    0.030083
dtype: float64 
---------------------

It seems that there are no problems with the skewness of the data.


In [4]:
#Testing data is prepared and now let's test it
import xgboost as xgb
import keras

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

#Train test split
test_y_train, test_y_test = test_y.iloc[:20000], test_y.iloc[20000:]
test_data_train, test_data_test = test_data.iloc[:20000], test_data.iloc[20000:]
test_data_diff_train, test_data_diff_test = test_data_diff.iloc[:20000], test_data_diff.iloc[20000:]

scaler = StandardScaler()

scaler.fit(test_data_train)
test_data_train,test_data_test = pd.DataFrame(scaler.transform(test_data_train)), pd.DataFrame(scaler.transform(test_data_test))
    
scaler.fit(test_data_diff_train)
test_data_diff_train, test_data_diff_test = pd.DataFrame(scaler.transform(test_data_diff_train)), pd.DataFrame(scaler.transform(test_data_diff_test))

print('---------------------\nAll scores represent accuracy.\n---------------------')
print('\n---------------------\nXGBoost, minimal model tweaking\n')

xgbc = xgb.XGBClassifier(objective='binary:logistic')

xgbc.fit(test_data_train, test_y_train)

print('Vanilla score:')
print(accuracy_score(test_y_test, xgbc.predict(test_data_test)).round(4))

xgbc.fit(test_data_diff_train, test_y_train)

print('Difference score:')
print(accuracy_score(test_y_test, xgbc.predict(test_data_diff_test)).round(4))
print('---------------------')

print('\n---------------------\nXGBoost, some model tweaking\n')

xgbc = xgb.XGBClassifier(objective='binary:logistic', max_depth=3,
                         n_estimators=1000, learning_rate = 0.01, n_jobs=-1)

xgbc.fit(test_data_train, test_y_train)

print('Vanilla score:')
print(accuracy_score(test_y_test, xgbc.predict(test_data_test)).round(4))

xgbc.fit(test_data_diff_train, test_y_train)

print('Difference score:')
print(accuracy_score(test_y_test, xgbc.predict(test_data_diff_test)).round(4))
print('---------------------')

print('\n---------------------\nNeural Net\n')

input = keras.layers.Input(shape=test_data_train.shape[1:])
hidden1 = keras.layers.Dense(60, activation='relu')(input)
hidden2 = keras.layers.Dense(30, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(10, activation='relu')(hidden2)
output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(test_data_train, test_y_train, epochs=5, verbose=0)

print('Vanilla score:')
print(accuracy_score(test_y_test, model.predict(test_data_test).round()).round(4))

input = keras.layers.Input(shape=test_data_diff.shape[1:])
hidden1 = keras.layers.Dense(60, activation='relu')(input)
hidden2 = keras.layers.Dense(30, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(10, activation='relu')(hidden2)
output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(test_data_diff_train, test_y_train, epochs=5, verbose=0)

print('Difference score:')
print(accuracy_score(test_y_test, model.predict(test_data_diff_test).round()).round(4))
print('---------------------')

print('\nAs we can see, the difference is not big but still, I thought that subtraction\nwould make models perform better, but that is not the case. As for the model,\nXGBoost can be a good default but adding more features could make NNs more viable.')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


---------------------
All scores represent accuracy.
---------------------

---------------------
XGBoost, minimal model tweaking

Vanilla score:
0.611
Difference score:
0.6106
---------------------

---------------------
XGBoost, some model tweaking

Vanilla score:
0.6106
Difference score:
0.6081
---------------------

---------------------
Neural Net

Vanilla score:
0.6106
Difference score:
0.6093
---------------------

As we can see, the difference is not big but still, I thought that subtraction
would make models perform better, but that is not the case. As for the model,
XGBoost can be a good default but adding more features could make NNs more viable.


In [5]:
def get_datas(data, verbose=False):
    #Prettifying the data for easier splitting
    move = data.iloc[:, :4]
    data = data.drop(move.columns, axis=1)
    data = pd.concat([data, move], axis=1)

    move = data[['Month', 'Season_Half_is2']]
    data = data.drop(['Month', 'Season_Half_is2'], axis=1)
    data = pd.concat([data, move], axis=1)

    #Data chunks
    aways = data.iloc[:, :112]
    homes = data.iloc[:, 112:224]
    lines = data[['Line_away']]
    ys = data.iloc[:, 225:228]
    other = data[['Month', 'Season_Half_is2']]

    #Check if data was split properly
    #data.equals(pd.concat([aways, homes, lines, ys, other], axis=1))

    #Forming dataframes that are of some interest
    vanilla_X = pd.concat([aways, homes, other], axis=1)
    vanilla_X_lines = pd.concat([aways, homes, other, lines], axis=1)

    diff_X = pd.DataFrame()
    for col in aways.columns:
        diff_X[col[:-5]] = aways[col]-homes[col[:-5]+'_home']
    diff_X_lines = pd.concat([diff_X, lines], axis=1)

    diffp_X = pd.concat([aways.iloc[:, :22], homes.iloc[:, :22]], axis=1)
    for col in aways.columns[22:]:
        diffp_X[col[:-5]] = aways[col]-homes[col[:-5]+'_home']
    diffp_X_lines = pd.concat([diffp_X, lines], axis=1)
    
    if verbose==True:
        print('---------------------\nCorrelation vanilla_X_lines:\n')
        eds.print_abv_corr(vanilla_X_lines, ys['Won_away'], threshold=0.1)
        print('---------------------')
        print('\n---------------------\nCorrelation diff_X_lines:\n')
        eds.print_abv_corr(diff_X_lines, ys['Won_away'], threshold=0.1)
        print('---------------------')
    
    return vanilla_X, vanilla_X_lines, diff_X, diff_X_lines, diffp_X, diffp_X_lines, lines, ys

#Splitting different train and test dataframes
def split(vanilla_X, vanilla_X_lines, diff_X, diff_X_lines, diffp_X, diffp_X_lines, ys):
    vanilla_X_train, vanilla_X_test = vanilla_X[:24000], vanilla_X[24000:]
    vanilla_X_lines_train, vanilla_X_lines_test = vanilla_X_lines[:24000], vanilla_X_lines[24000:]
    diff_X_train, diff_X_test = diff_X[:24000], diff_X[24000:]
    diff_X_lines_train, diff_X_lines_test = diff_X_lines[:24000], diff_X_lines[24000:]
    diffp_X_train, diffp_X_test = diffp_X[:24000], diffp_X[24000:]
    diffp_X_lines_train, diffp_X_lines_test = diffp_X_lines[:24000], diffp_X_lines[24000:]
    y_train, y_test = ys['Won_away'][:24000], ys['Won_away'][24000:]
    
    return vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train, diffp_X_lines_test, y_train, y_test

#Scaling dataframes
def scale(vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test,
          diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train,
          diffp_X_test, diffp_X_lines_train, diffp_X_lines_test):

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    scaler.fit(vanilla_X_train)
    vanilla_X_train, vanilla_X_test = pd.DataFrame(scaler.transform(vanilla_X_train)), pd.DataFrame(scaler.transform(vanilla_X_test))

    scaler.fit(vanilla_X_lines_train)
    vanilla_X_lines_train, vanilla_X_lines_test = pd.DataFrame(scaler.transform(vanilla_X_lines_train)), pd.DataFrame(scaler.transform(vanilla_X_lines_test))

    scaler.fit(diff_X_train)
    diff_X_train, diff_X_test = pd.DataFrame(scaler.transform(diff_X_train)), pd.DataFrame(scaler.transform(diff_X_test))

    scaler.fit(diff_X_lines_train)
    diff_X_lines_train, diff_X_lines_test = pd.DataFrame(scaler.transform(diff_X_lines_train)), pd.DataFrame(scaler.transform(diff_X_lines_test))

    scaler.fit(diffp_X_train)
    diffp_X_train, diffp_X_test = pd.DataFrame(scaler.transform(diffp_X_train)), pd.DataFrame(scaler.transform(diffp_X_test))

    scaler.fit(diffp_X_lines_train)
    diffp_X_lines_train, diffp_X_lines_test = pd.DataFrame(scaler.transform(diffp_X_lines_train)), pd.DataFrame(scaler.transform(diffp_X_lines_test))
    
    return vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train, diffp_X_lines_test


vanilla_X, vanilla_X_lines, diff_X, diff_X_lines, diffp_X, diffp_X_lines, lines, ys = get_datas(data, verbose=True)
vanilla4_X, vanilla4_X_lines, diff4_X, diff4_X_lines, diffp4_X, diffp4_X_lines, lines4, ys4 = get_datas(data4)
vanilla3_X, vanilla3_X_lines, diff3_X, diff3_X_lines, diffp3_X, diffp3_X_lines, lines3, ys3 = get_datas(data3)
vanilla6_X, vanilla6_X_lines, diff6_X, diff6_X_lines, diffp6_X, diffp6_X_lines, lines6, ys6 = get_datas(data6)

line_accuracy = lines
line_accuracy['Out'] = np.where(line_accuracy['Line_away']>=0, 0, 1)
print('\n---------------------\nThe accuracy of model if it used only lines:')
print(r'(The score to beat)')
print(accuracy_score(ys['Won_away'], line_accuracy['Out']).round(3)*100, '%')
print('---------------------')

---------------------
Correlation vanilla_X_lines:

Wonpast_away 0.1613
WonH1past_away 0.1201
Season_Wins_away 0.1172
WonQ1past_away 0.1093
WonQ1past_home -0.1053
Season_Wins_home -0.1357
WonH1past_home -0.1402
Wonpast_home -0.1808
Line_away -0.4275
---------------------

---------------------
Correlation diff_X_lines:

Season_Wins 0.3314
Wonpast 0.246
WonH1past 0.1873
WonQ1past 0.1545
PTS_player1 0.137
FGM_player1 0.1298
FG%_player1 0.1267
AST_team 0.1217
3PM_team 0.1165
FG%_team 0.1149
FTA_player1 0.1136
DREB_player1 0.1119
FTM_player1 0.1107
FGM_team 0.1053
FG%_player2 0.105
FGA_player1 0.1043
DREB_team 0.1026
PTS_player2 0.1015
Line_away -0.4275
---------------------

---------------------
The accuracy of model if it used only lines:
(The score to beat)
69.6 %
---------------------


In [6]:
#Splitting
vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train, diffp_X_lines_test, y_train, y_test = split(vanilla_X, vanilla_X_lines, diff_X, diff_X_lines, diffp_X, diffp_X_lines, ys)
vanilla4_X_train, vanilla4_X_test, vanilla4_X_lines_train, vanilla4_X_lines_test, diff4_X_train, diff4_X_test, diff4_X_lines_train, diff4_X_lines_test, diffp4_X_train, diffp4_X_test, diffp4_X_lines_train, diffp4_X_lines_test, y4_train, y4_test = split(vanilla4_X, vanilla4_X_lines, diff4_X, diff4_X_lines, diffp4_X, diffp4_X_lines, ys4)
vanilla3_X_train, vanilla3_X_test, vanilla3_X_lines_train, vanilla3_X_lines_test, diff3_X_train, diff3_X_test, diff3_X_lines_train, diff3_X_lines_test, diffp3_X_train, diffp3_X_test, diffp3_X_lines_train, diffp3_X_lines_test, y3_train, y3_test = split(vanilla3_X, vanilla3_X_lines, diff3_X, diff3_X_lines, diffp3_X, diffp3_X_lines, ys3)
vanilla6_X_train, vanilla6_X_test, vanilla6_X_lines_train, vanilla6_X_lines_test, diff6_X_train, diff6_X_test, diff6_X_lines_train, diff6_X_lines_test, diffp6_X_train, diffp6_X_test, diffp6_X_lines_train, diffp6_X_lines_test, y6_train, y6_test = split(vanilla6_X, vanilla6_X_lines, diff6_X, diff6_X_lines, diffp6_X, diffp6_X_lines, ys3)

#Checking correlation between every feature within selected datasets
print('Feature correlations within datasets: ')
for df in [[vanilla_X_lines_train, 'vanilla_X_lines_train'],
           [diff_X_lines_train, 'diff_X_lines_train']]:
    print('---------------------\n', df[1], '\n')
    eds.print_abv_ft_corr(df[0], threshold=0.8)
    print('---------------------\n')

#Scaling
vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train, diffp_X_lines_test = scale(vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train, diffp_X_lines_test)
vanilla4_X_train, vanilla4_X_test, vanilla4_X_lines_train, vanilla4_X_lines_test, diff4_X_train, diff4_X_test, diff4_X_lines_train, diff4_X_lines_test, diffp4_X_train, diffp4_X_test, diffp4_X_lines_train, diffp4_X_lines_test = scale(vanilla4_X_train, vanilla4_X_test, vanilla4_X_lines_train, vanilla4_X_lines_test, diff4_X_train, diff4_X_test, diff4_X_lines_train, diff4_X_lines_test, diffp4_X_train, diffp4_X_test, diffp4_X_lines_train, diffp4_X_lines_test)
vanilla3_X_train, vanilla3_X_test, vanilla3_X_lines_train, vanilla3_X_lines_test, diff3_X_train, diff3_X_test, diff3_X_lines_train, diff3_X_lines_test, diffp3_X_train, diffp3_X_test, diffp3_X_lines_train, diffp3_X_lines_test = scale(vanilla3_X_train, vanilla3_X_test, vanilla3_X_lines_train, vanilla3_X_lines_test, diff3_X_train, diff3_X_test, diff3_X_lines_train, diff3_X_lines_test, diffp3_X_train, diffp3_X_test, diffp3_X_lines_train, diffp3_X_lines_test)
vanilla6_X_train, vanilla6_X_test, vanilla6_X_lines_train, vanilla6_X_lines_test, diff6_X_train, diff6_X_test, diff6_X_lines_train, diff6_X_lines_test, diffp6_X_train, diffp6_X_test, diffp6_X_lines_train, diffp6_X_lines_test = scale(vanilla6_X_train, vanilla6_X_test, vanilla6_X_lines_train, vanilla6_X_lines_test, diff6_X_train, diff6_X_test, diff6_X_lines_train, diff6_X_lines_test, diffp6_X_train, diffp6_X_test, diffp6_X_lines_train, diffp6_X_lines_test)

Feature correlations within datasets: 
---------------------
 vanilla_X_lines_train 

Season_Wins_away
PlayedHome_away
WonH1past_away
WonQ1past_away
Wonpast_away
FGM_team_away
FGA_team_away
3PM_team_away
3PA_team_away 0.910474061957493
3PA_team_away
3PM_team_away 0.910474061957493
FTM_team_away
FTA_team_away 0.9259875257106862
FTA_team_away
FTM_team_away 0.9259875257106862
OREB_team_away
DREB_team_away
REB_team_away
AST_team_away
TOV_team_away
STL_team_away
BLK_team_away
PF_team_away
FG%_team_away
3P%_team_away
FT%_team_away
FGM_player1_away
FGA_player1_away 0.9402368197493919
FG%_player1_away 0.8396077664527684
PTS_player1_away 0.9658302388508684
FGA_player1_away
FGM_player1_away 0.9402368197493918
PTS_player1_away 0.9334489584744095
FG%_player1_away
FGM_player1_away 0.8396077664527682
PTS_player1_away 0.815206655466191
3PM_player1_away
3PA_player1_away 0.922011492869058
3PA_player1_away
3PM_player1_away 0.922011492869058
3P%_player1_away
FTM_player1_away
FTA_player1_away 0.9666892122

3PM_player3_home 0.9315010610333647
3P%_player3_home
3PM_player3_home 0.834106098533625
FTM_player3_home
FTA_player3_home 0.9515690878602049
FTA_player3_home
FTM_player3_home 0.9515690878602049
FT%_player3_home
OREB_player3_home
REB_player3_home 0.8626193643213989
DREB_player3_home
REB_player3_home 0.9678507710952601
REB_player3_home
OREB_player3_home 0.8626193643213989
DREB_player3_home 0.9678507710952602
AST_player3_home
STL_player3_home
BLK_player3_home
TOV_player3_home
PF_player3_home
PTS_player3_home
FGM_player3_home 0.969922608339001
FGA_player3_home 0.9308542763698462
FG%_player3_home 0.805258457597829
FGM_player4_home
FGA_player4_home 0.9300987038382262
FG%_player4_home 0.8117157586331721
PTS_player4_home 0.9714467569316528
FGA_player4_home
FGM_player4_home 0.9300987038382262
PTS_player4_home 0.928337784215865
FG%_player4_home
FGM_player4_home 0.8117157586331721
3PM_player4_home
3PA_player4_home 0.9318317358476774
3P%_player4_home 0.8553509222896623
3PA_player4_home
3PM_player4

In [16]:
#Function to evaluate datasets
def get_performance(vanilla_X_train, vanilla_X_test, vanilla_X_lines_train,
                    vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train,
                    diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train,
                    diffp_X_lines_test, y_train, y_test):
    print('---------------------')
    print('Testing to see how different models realtively perform on different datasets.')
    print('---------------------')

    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression

    lr = LinearRegression()
    print('\n---------------------\nLinear regression:\n')

    print('Vanilla data')
    lr.fit(vanilla_X_train, y_train)
    print(accuracy_score(y_test, lr.predict(vanilla_X_test).round()).round(4))

    print('Vanilla data with lines')
    lr.fit(vanilla_X_lines_train, y_train)
    print(accuracy_score(y_test, lr.predict(vanilla_X_lines_test).round()).round(4))

    print('Difference data')
    lr.fit(diff_X_train, y_train)
    print(accuracy_score(y_test, lr.predict(diff_X_test).round()).round(4))

    print('Difference data with lines')
    lr.fit(diff_X_lines_train, y_train)
    print(accuracy_score(y_test, lr.predict(diff_X_lines_test).round()).round(4))

    print('Player difference data')
    lr.fit(diffp_X_train, y_train)
    print(accuracy_score(y_test, lr.predict(diffp_X_test).round()).round(4))

    print('Player difference data with lines')
    lr.fit(diffp_X_lines_train, y_train)
    print(accuracy_score(y_test, lr.predict(diffp_X_lines_test).round()).round(4))
    print('---------------------\n')


    logr = LogisticRegression(max_iter=2000, C=0.01)
    print('---------------------\nLogistic regression:\n')

    print('Vanilla data')
    logr.fit(vanilla_X_train, y_train)
    print(accuracy_score(y_test, logr.predict(vanilla_X_test)).round(4))

    print('Vanilla data with lines')
    logr.fit(vanilla_X_lines_train, y_train)
    print(accuracy_score(y_test, logr.predict(vanilla_X_lines_test)).round(4))

    print('Difference data')
    logr.fit(diff_X_train, y_train)
    print(accuracy_score(y_test, logr.predict(diff_X_test)).round(4))

    print('Difference data with lines')
    logr.fit(diff_X_lines_train, y_train)
    print(accuracy_score(y_test, logr.predict(diff_X_lines_test)).round(4))

    print('Player difference data')
    logr.fit(diffp_X_train, y_train)
    print(accuracy_score(y_test, logr.predict(diffp_X_test)).round(4))

    print('Player difference data with lines')
    logr.fit(diffp_X_lines_train, y_train)
    print(accuracy_score(y_test, logr.predict(diffp_X_lines_test)).round(4))
    print('---------------------\n')

    xgbc = xgb.XGBClassifier(objective='binary:logistic')
    print('---------------------\nXGBoost:\n')

    print('Vanilla data')
    xgbc.fit(vanilla_X_train, y_train)
    print(accuracy_score(y_test, xgbc.predict(vanilla_X_test)).round(4))

    print('Vanilla data with lines')
    xgbc.fit(vanilla_X_lines_train, y_train)
    print(accuracy_score(y_test, xgbc.predict(vanilla_X_lines_test)).round(4))

    print('Difference data')
    xgbc.fit(diff_X_train, y_train)
    print(accuracy_score(y_test, xgbc.predict(diff_X_test)).round(4))

    print('Difference data with lines')
    xgbc.fit(diff_X_lines_train, y_train)
    print(accuracy_score(y_test, xgbc.predict(diff_X_lines_test)).round(4))

    print('Player difference data')
    xgbc.fit(diffp_X_train, y_train)
    print(accuracy_score(y_test, xgbc.predict(diffp_X_test)).round(4))

    print('Player difference data with lines')
    xgbc.fit(diffp_X_lines_train, y_train)
    print(accuracy_score(y_test, xgbc.predict(diffp_X_lines_test)).round(4))
    print('---------------------\n')


    print('---------------------\nNeural Net:\n')

    input = keras.layers.Input(shape=vanilla_X_train.shape[1:])
    hidden1 = keras.layers.Dense(400, activation='relu')(input)
    hidden2 = keras.layers.Dense(200, activation='relu')(hidden1)
    hidden3 = keras.layers.Dense(100, activation='relu')(hidden2)
    hidden4 = keras.layers.Dense(30, activation='relu')(hidden3)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden4)
    model = keras.models.Model(inputs=[input], outputs=[output])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(vanilla_X_train, y_train, epochs=5, verbose=0)

    print('Vanilla data:')
    print(accuracy_score(y_test, model.predict(vanilla_X_test).round()).round(4))

    input = keras.layers.Input(shape=vanilla_X_lines_train.shape[1:])
    hidden1 = keras.layers.Dense(400, activation='relu')(input)
    hidden2 = keras.layers.Dense(200, activation='relu')(hidden1)
    hidden3 = keras.layers.Dense(100, activation='relu')(hidden2)
    hidden4 = keras.layers.Dense(30, activation='relu')(hidden3)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden4)
    model = keras.models.Model(inputs=[input], outputs=[output])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(vanilla_X_lines_train, y_train, epochs=5, verbose=0)

    print('Vanilla data with lines:')
    print(accuracy_score(y_test, model.predict(vanilla_X_lines_test).round()).round(4))

    input = keras.layers.Input(shape=diff_X_train.shape[1:])
    hidden1 = keras.layers.Dense(300, activation='relu')(input)
    hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
    hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
    model = keras.models.Model(inputs=[input], outputs=[output])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(diff_X_train, y_train, epochs=5, verbose=0)

    print('Difference data:')
    print(accuracy_score(y_test, model.predict(diff_X_test).round()).round(4))

    input = keras.layers.Input(shape=diff_X_lines_train.shape[1:])
    hidden1 = keras.layers.Dense(300, activation='relu')(input)
    hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
    hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
    model = keras.models.Model(inputs=[input], outputs=[output])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(diff_X_lines_train, y_train, epochs=5, verbose=0)

    print('Difference data with lines:')
    print(accuracy_score(y_test, model.predict(diff_X_lines_test).round()).round(4))

    input = keras.layers.Input(shape=diffp_X_train.shape[1:])
    hidden1 = keras.layers.Dense(300, activation='relu')(input)
    hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
    hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
    model = keras.models.Model(inputs=[input], outputs=[output])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(diffp_X_train, y_train, epochs=5, verbose=0)

    print('Player difference data:')
    print(accuracy_score(y_test, model.predict(diffp_X_test).round()).round(4))

    input = keras.layers.Input(shape=diffp_X_lines_train.shape[1:])
    hidden1 = keras.layers.Dense(300, activation='relu')(input)
    hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
    hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
    model = keras.models.Model(inputs=[input], outputs=[output])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(diffp_X_lines_train, y_train, epochs=5, verbose=0)

    print('Player difference data with lines:')
    print(accuracy_score(y_test, model.predict(diffp_X_lines_test).round()).round(4))
    print('---------------------')
    
get_performance(vanilla_X_train, vanilla_X_test, vanilla_X_lines_train, vanilla_X_lines_test, diff_X_train, diff_X_test, diff_X_lines_train, diff_X_lines_test, diffp_X_train, diffp_X_test, diffp_X_lines_train, diffp_X_lines_test, y_train, y_test)

---------------------
Testing to see how different models realtively perform on different datasets.
---------------------

---------------------
Linear regression:

Vanilla data
0.6596
Vanilla data with lines
0.6843
Difference data
0.6571
Difference data with lines
0.6819
Player difference data
0.6603
Player difference data with lines
0.6812
---------------------

---------------------
Logistic regression:

Vanilla data
0.6627
Vanilla data with lines
0.6843
Difference data
0.6592
Difference data with lines
0.6812
Player difference data
0.662
Player difference data with lines
0.6815
---------------------

---------------------
XGBoost:

Vanilla data
0.6495
Vanilla data with lines
0.6854
Difference data
0.6505
Difference data with lines
0.6861
Player difference data
0.6477
Player difference data with lines
0.6878
---------------------

---------------------
Neural Net:

Vanilla data:
0.6324
Vanilla data with lines:
0.6659
Difference data:
0.6442
Difference data with lines:
0.6788
Player 

In [8]:
print('---------------------\nChecking just how more biased the lines make predictions to be.')
print('\nI took 3 samples, first one is a clear looser, the second one has a high likelyhood')
print('of winning and the third one is undecided:')
print('Line of the first sample: ', diff_X_lines.iloc[24000:]['Line_away'].iloc[0])
print('Line of the second sample: ', diff_X_lines.iloc[24000:]['Line_away'].iloc[1])
print('Line of the third sample: ', diff_X_lines.iloc[24000:]['Line_away'].iloc[5])
print('---------------------')

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
lr = LinearRegression()
logr = LogisticRegression(max_iter=2000, C=0.01)
xgbc = xgb.XGBClassifier(objective='binary:logistic')

print('\n---------------------\nWithout lines:\n')

lr.fit(vanilla_X_train, y_train)
print('Linear regression:')
print(np.round(lr.predict(vanilla_X_test.iloc[[0,1,5]]), 2))

logr.fit(vanilla_X_train, y_train)
print('Logistic regression:')
print(np.round(logr.predict_proba(vanilla_X_test.iloc[[0,1,5]])[:, 1], 2))

xgbc = xgb.XGBClassifier(objective='binary:logistic')

xgbc.fit(vanilla_X_train, y_train)
print('XGBoost:')
print(np.round(xgbc.predict_proba(vanilla_X_test.iloc[[0,1,5]])[:, 1], 2))

input = keras.layers.Input(shape=vanilla_X_train.shape[1:])
hidden1 = keras.layers.Dense(300, activation='relu')(input)
hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(vanilla_X_train, y_train, epochs=5, verbose=0)
print('Neural Net:')
print(np.round(model.predict(vanilla_X_test.iloc[[0,1,5]]), 2))
print('---------------------')

print('\n---------------------\nWith lines:\n')

lr.fit(vanilla_X_lines_train, y_train)
print('Linear regression:')
print(np.round(lr.predict(vanilla_X_lines_test.iloc[[0,1,5]]), 2))

logr.fit(vanilla_X_lines_train, y_train)
print('Logistic regression:')
print(np.round(logr.predict_proba(vanilla_X_lines_test.iloc[[0,1,5]])[:, 1], 2))

xgbc.fit(vanilla_X_lines_train, y_train)
print('XGBoost:')
print(np.round(xgbc.predict_proba(vanilla_X_lines_test.iloc[[0,1,5]])[:, 1], 2))

input = keras.layers.Input(shape=vanilla_X_lines_train.shape[1:])
hidden1 = keras.layers.Dense(300, activation='relu')(input)
hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
output = keras.layers.Dense(1, activation='sigmoid')(hidden3)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(vanilla_X_lines_train, y_train, epochs=5, verbose=0)
print('Neural Net:')
print(np.round(model.predict(vanilla_X_lines_test.iloc[[0,1,5]]), 2))
print('---------------------')

---------------------
Checking just how more biased the lines make predictions to be.

I took 3 samples, first one is a clear looser, the second one has a high likelyhood
of winning and the third one is undecided:
Line of the first sample:  17.0
Line of the second sample:  -8.5
Line of the third sample:  -2.0
---------------------

---------------------
Without lines:

Linear regression:
[0.06 0.68 0.7 ]
Logistic regression:
[0.1  0.7  0.71]
XGBoost:
[0.13 0.7  0.64]
Neural Net:
[[0.02]
 [0.7 ]
 [0.72]]
---------------------

---------------------
With lines:

Linear regression:
[-0.11  0.82  0.6 ]
Logistic regression:
[0.05 0.82 0.64]
XGBoost:
[0.02 0.8  0.63]
Neural Net:
[[0.01]
 [0.66]
 [0.65]]
---------------------


In [9]:
from sklearn.metrics import mean_squared_error

vanilla_X_train, vanilla_X_test = vanilla_X[:24000], vanilla_X[24000:]
diff_X_train, diff_X_test = diff_X[:24000], diff_X[24000:]
diffp_X_train, diffp_X_test = diffp_X[:24000], diffp_X[24000:]

lines_train, lines_test = lines[['Line_away']][:24000], lines[['Line_away']][24000:]

scaler.fit(vanilla_X_train)
vanilla_X_train, vanilla_X_test = pd.DataFrame(scaler.transform(vanilla_X_train)), pd.DataFrame(scaler.transform(vanilla_X_test))

scaler.fit(diff_X_train)
diff_X_train, diff_X_test = pd.DataFrame(scaler.transform(diff_X_train)), pd.DataFrame(scaler.transform(diff_X_test))
    
scaler.fit(diffp_X_train)
diffp_X_train, diffp_X_test = pd.DataFrame(scaler.transform(diffp_X_train)), pd.DataFrame(scaler.transform(diffp_X_test))
print('---------------------')
print('Predicting the Vegas lines.\nAll scores represent RMSE.\nMight come in handy later.')
print('---------------------')


lr = LinearRegression()
print('\n---------------------\nLinear regression:\n')

print('Vanilla data')
lr.fit(vanilla_X_train, lines_train)
print(np.sqrt(mean_squared_error(lines_test, lr.predict(vanilla_X_test))).round(4))

print('Difference data')
lr.fit(diff_X_train, lines_train)
print(np.sqrt(mean_squared_error(lines_test, lr.predict(diff_X_test))).round(4))

print('Player difference data')
lr.fit(diffp_X_train, lines_train)
print(np.sqrt(mean_squared_error(lines_test, lr.predict(diffp_X_test))).round(4))
print('---------------------\n')


xgbr = xgb.XGBRegressor(objective='reg:squarederror')
print('---------------------\nXGBoost:\n')

print('Vanilla data')
xgbr.fit(vanilla_X_train, lines_train)
print(np.sqrt(mean_squared_error(lines_test, xgbr.predict(vanilla_X_test))).round(4))

print('Difference data')
xgbr.fit(diff_X_train, lines_train)
print(np.sqrt(mean_squared_error(lines_test, xgbr.predict(diff_X_test))).round(4))

print('Player difference data')
xgbr.fit(diffp_X_train, lines_train)
print(np.sqrt(mean_squared_error(lines_test, xgbr.predict(diffp_X_test))).round(4))
print('---------------------\n')


print('---------------------\nNeural Net:\n')

input = keras.layers.Input(shape=vanilla_X_train.shape[1:])
hidden1 = keras.layers.Dense(400, activation='relu')(input)
hidden2 = keras.layers.Dense(200, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(100, activation='relu')(hidden2)
hidden4 = keras.layers.Dense(30, activation='relu')(hidden3)
output = keras.layers.Dense(1)(hidden4)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(vanilla_X_train, lines_train, epochs=5, verbose=0)

print('Vanilla data:')
print(np.sqrt(mean_squared_error(lines_test, model.predict(vanilla_X_test))).round(4))

input = keras.layers.Input(shape=diff_X_train.shape[1:])
hidden1 = keras.layers.Dense(300, activation='relu')(input)
hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
output = keras.layers.Dense(1)(hidden3)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(diff_X_train, lines_train, epochs=5, verbose=0)

print('Difference data:')
print(np.sqrt(mean_squared_error(lines_test, model.predict(diff_X_test))).round(4))

input = keras.layers.Input(shape=diffp_X_train.shape[1:])
hidden1 = keras.layers.Dense(300, activation='relu')(input)
hidden2 = keras.layers.Dense(150, activation='relu')(hidden1)
hidden3 = keras.layers.Dense(30, activation='relu')(hidden2)
output = keras.layers.Dense(1)(hidden3)
model = keras.models.Model(inputs=[input], outputs=[output])

model.compile(optimizer='adam', loss='mean_squared_error')
history = model.fit(diffp_X_train, lines_train, epochs=5, verbose=0)

print('Difference data:')
print(np.sqrt(mean_squared_error(lines_test, model.predict(diffp_X_test))).round(4))
print('---------------------')

print('\nXGB predictions trained on player difference data:')
model_example = pd.concat([lines_test.reset_index(drop=True),
                           pd.DataFrame(xgbr.predict(diffp_X_test).round(2)),
                           pd.DataFrame(y_test).reset_index(drop=True)], axis=1)
model_example = model_example.rename(columns={0: 'Predicted', 'Line_away': 'Real'})
display(model_example.sample(20))

---------------------
Predicting the Vegas lines.
All scores represent RMSE.
---------------------

---------------------
Linear regression:

Vanilla data
3.4256
Difference data
3.4598
Player difference data
3.4188
---------------------

---------------------
XGBoost:

Vanilla data
3.533
Difference data
3.492
Player difference data
3.457
---------------------

---------------------
Neural Net:

Vanilla data:
3.4743
Difference data:
3.3929
Difference data:
3.5805
---------------------

XGB predictions trained on player difference data:


Unnamed: 0,Real,Predicted,Won_away
2794,-5.0,-4.87,0
169,2.5,4.33,1
2722,-2.5,-0.94,1
862,0.0,-0.82,1
2364,16.5,11.7,0
1725,8.5,2.5,1
2210,5.0,9.17,1
2393,10.0,9.85,1
2192,6.5,3.41,0
2681,3.5,4.22,0


In [10]:
#Checking if predicted Vegas lines can be used as a viable feature
def count_line_success(example):
    pts_real = 0
    pts_predicted = 0
    sup_real = 0
    sup_predicted = 0

    for row in range(example.shape[0]):
        curr_row = example.iloc[row]
        real = curr_row['Real']
        predicted = curr_row['Predicted']
        outcome = curr_row['Won_away']

        if real>0.5 and predicted>0.5:
            if outcome==1:
                if real-predicted>=5:
                    pts_real += 1
                    sup_real += 1
                elif predicted-real>=5:
                    pts_predicted += 1
                    sup_predicted += 1
                else:
                    pts_real += 1
                    pts_predicted += 1
        elif real>0.5 and predicted<0.5:
            if outcome==1:
                pts_real += 1
                sup_real += 1
            else:
                pts_predicted += 1
                sup_predicted += 1
        elif real<0.5 and predicted>0.5:
            if outcome==1:
                pts_predicted += 1
                sup_predicted += 1
            else:
                pts_real += 1
                sup_predicted += 1
        else:
            if outcome==0:
                if predicted-real>=5:
                    pts_real += 1
                    sup_real += 1
                elif real-predicted>=5:
                    pts_predicted += 1
                    sup_predicted += 1
                else:
                    pts_real += 1
                    pts_predicted += 1
            
    print('Points real: ', pts_real, '\nPoints predicted: ', pts_predicted)
    print('Dominant points real: ', sup_real, '\nDominant points predicted: ', sup_predicted)
    
print('XGBR:')
count_line_success(model_example)

XGBR:
Points real:  903 
Points predicted:  972
Dominant points real:  86 
Dominant points predicted:  401




In [11]:
#Trying other datasets
print('Forms 4')
get_performance(vanilla4_X_train, vanilla4_X_test, vanilla4_X_lines_train, vanilla4_X_lines_test, diff4_X_train, diff4_X_test, diff4_X_lines_train, diff4_X_lines_test, diffp4_X_train, diffp4_X_test, diffp4_X_lines_train, diffp4_X_lines_test, y4_train, y4_test)

Forms 4
---------------------
Testing to see how different models realtively perform on different datasets.
---------------------

---------------------
Linear regression:

Vanilla data
0.6568
Vanilla data with lines
0.6805
Difference data
0.6603
Difference data with lines
0.6822
Player difference data
0.6547
Player difference data with lines
0.6829
---------------------

---------------------
Logistic regression:

Vanilla data
0.6582
Vanilla data with lines
0.6815
Difference data
0.6585
Difference data with lines
0.6802
Player difference data
0.6571
Player difference data with lines
0.6819
---------------------

---------------------
XGBoost:

Vanilla data
0.6467
Vanilla data with lines
0.6822
Difference data
0.6509
Difference data with lines
0.6854
Player difference data
0.6488
Player difference data with lines
0.6871
---------------------

---------------------
Neural Net:

Vanilla data:
0.6219
Vanilla data with lines:
0.6725
Difference data:
0.6456
Difference data with lines:
0.671

In [12]:
print('Forms 3')
get_performance(vanilla3_X_train, vanilla3_X_test, vanilla3_X_lines_train, vanilla3_X_lines_test, diff3_X_train, diff3_X_test, diff3_X_lines_train, diff3_X_lines_test, diffp3_X_train, diffp3_X_test, diffp3_X_lines_train, diffp3_X_lines_test, y3_train, y3_test)

Forms 3
---------------------
Testing to see how different models realtively perform on different datasets.
---------------------

---------------------
Linear regression:

Vanilla data
0.655
Vanilla data with lines
0.6826
Difference data
0.6564
Difference data with lines
0.6774
Player difference data
0.6519
Player difference data with lines
0.6802
---------------------

---------------------
Logistic regression:

Vanilla data
0.6536
Vanilla data with lines
0.6829
Difference data
0.6543
Difference data with lines
0.6777
Player difference data
0.655
Player difference data with lines
0.6781
---------------------

---------------------
XGBoost:

Vanilla data
0.6463
Vanilla data with lines
0.6857
Difference data
0.6536
Difference data with lines
0.6812
Player difference data
0.6481
Player difference data with lines
0.6819
---------------------

---------------------
Neural Net:

Vanilla data:
0.631
Vanilla data with lines:
0.6578
Difference data:
0.6428
Difference data with lines:
0.6714
P

In [13]:
print('Forms 6')
get_performance(vanilla6_X_train, vanilla6_X_test, vanilla6_X_lines_train, vanilla6_X_lines_test, diff6_X_train, diff6_X_test, diff6_X_lines_train, diff6_X_lines_test, diffp6_X_train, diffp6_X_test, diffp6_X_lines_train, diffp6_X_lines_test, y6_train, y6_test)

Forms 6
---------------------
Testing to see how different models realtively perform on different datasets.
---------------------

---------------------
Linear regression:

Vanilla data
0.6568
Vanilla data with lines
0.6854
Difference data
0.661
Difference data with lines
0.6815
Player difference data
0.6557
Player difference data with lines
0.6833
---------------------

---------------------
Logistic regression:

Vanilla data
0.6638
Vanilla data with lines
0.6864
Difference data
0.661
Difference data with lines
0.6812
Player difference data
0.6585
Player difference data with lines
0.6815
---------------------

---------------------
XGBoost:

Vanilla data
0.6564
Vanilla data with lines
0.6878
Difference data
0.6495
Difference data with lines
0.6843
Player difference data
0.6502
Player difference data with lines
0.6833
---------------------

---------------------
Neural Net:

Vanilla data:
0.6271
Vanilla data with lines:
0.669
Difference data:
0.6407
Difference data with lines:
0.6767
P

In [17]:
#Trying voting ensemble
from mlxtend.classifier import EnsembleVoteClassifier
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

logr = LogisticRegression(max_iter=2000, C=0.01)
logr.fit(diffp_X_lines_train, y_train)
print('Logistic Regression: ', accuracy_score(y_test, logr.predict(diffp_X_lines_test)).round(3))

xgbc = XGBClassifier(objective='binary:logistic', max_depth=2,colsample_bytree=1,
                       eta=0.01, colsample_bylevel=1)
xgbc.fit(diffp_X_lines_train, y_train)
print('XGB Classifier: ', accuracy_score(y_test, xgbc.predict(diffp_X_lines_test)).round(3))

def keras_model():
    input = keras.layers.Input(shape=diffp_X_lines_train.shape[1:])
    hidden1 = keras.layers.Dense(300, activation='relu')(input)
    hidden2 = keras.layers.Dense(200, activation='relu')(hidden1)
    dropout1 = keras.layers.Dropout(0.5)(hidden2)
    hidden3 = keras.layers.Dense(100, activation='relu')(dropout1)
    dropout2 = keras.layers.Dropout(0.4)(hidden3)
    hidden4 = keras.layers.Dense(30, activation='relu')(dropout2)
    output = keras.layers.Dense(1, activation='sigmoid')(hidden4)
    model = keras.models.Model(inputs=[input], outputs=[output])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model
model = KerasClassifier(build_fn=keras_model, epochs=5, verbose=0)
model.fit(diffp_X_lines_train, y_train)
print('Neural Net: ', accuracy_score(y_test, model.predict(diffp_X_lines_test)).round(3))

voting = EnsembleVoteClassifier(clfs=[logr, xgbc, model], voting='soft', refit=False)
voting.fit(diffp_X_lines_train, y_train)
print('Voting ensemble: ', accuracy_score(y_test, voting.predict(diffp_X_lines_test)).round(3))

Logistic Regression:  0.682
XGB Classifier:  0.688
Neural Net:  0.678
Voting ensemble:  0.686


In [96]:
print('Best accuracy with lines: ',round(accuracy_score(y_test, xgbc.predict(diffp_X_lines_test).round()), 3))
print('Accuracy using only lines: ', accuracy_score(ys['Won_away'], line_accuracy['Out']).round(3) )

Best accuracy with lines:  0.688
Accuracy using only lines:  0.696


## Conclusion

As we can see, and as we could have seen throughout the whole notebook, it takes a state-of-the-art feature (lines in this case, could have been odds, probability, etc.) to only come close to the state-of-the-art performance (bookies probability - 69.5%-70%). I hoped that I could somehow enhance the performace of the state-of-the-art with my own scraped data but naturally I couldn't do it because it was already taken into account. Nevertheless, it was a fun project and even if ML models didn't yield any useful results, the scraped data is quite useful and the application for it could be found.