In [148]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from statsmodels.tools import add_constant
from statsmodels.discrete.discrete_model import Logit
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score,recall_score
from sklearn.pipeline import Pipeline

In [155]:
games = pd.read_csv('games_cleaned.csv')

In [156]:
games.head()

Unnamed: 0.1,Unnamed: 0,season,week,Date,Home Team,Away Team,home_team_score,visit_team_score,game_over_under,line,...,temperature,wind_speed,wind_bearing,name,month,start,over_hit,under_hit,favorite_covered,underdog_covered
0,0,1983,1,1983-09-03 00:00:00,SF,PHI,17,22,40.0,-4.0,...,73.32,6.12,282.0,PHI @ SF,September,Night,0,1,0,1
1,1,1983,1,1983-09-04 00:00:00,NO,ARI,28,17,37.0,-3.0,...,70.0,0.0,0.0,SLC @ NO,September,Day,1,0,1,0
2,2,1983,1,1983-09-04 00:00:00,BUF,MIA,0,12,36.0,4.0,...,78.95,10.67,194.0,MIA @ BUF,September,Day,0,1,0,1
3,3,1983,1,1983-09-04 00:00:00,CHI,ATL,17,20,38.0,-1.5,...,86.66,11.04,196.0,ATL @ CHI,September,Day,0,1,0,1
4,4,1983,1,1983-09-04 00:00:00,CIN,OAK,10,20,42.0,-2.0,...,82.18,7.9,194.0,LAD @ CIN,September,Day,0,1,0,1


In [157]:
games.columns

Index(['Unnamed: 0', 'season', 'week', 'Date', 'Home Team', 'Away Team',
       'home_team_score', 'visit_team_score', 'game_over_under', 'line',
       'surface', 'weather_icon', 'temperature', 'wind_speed', 'wind_bearing',
       'name', 'month', 'start', 'over_hit', 'under_hit', 'favorite_covered',
       'underdog_covered'],
      dtype='object')

#### Deleting Features that are Leakage 

In [158]:
games.drop( ['Unnamed: 0',  'Date', 
       'home_team_score', 'visit_team_score', 
       'name',  'under_hit', 'favorite_covered',
       'underdog_covered'], axis = 1, inplace = True)

In [159]:
games.head()

Unnamed: 0,season,week,Home Team,Away Team,game_over_under,line,surface,weather_icon,temperature,wind_speed,wind_bearing,month,start,over_hit
0,1983,1,SF,PHI,40.0,-4.0,Grass,Clear Day,73.32,6.12,282.0,September,Night,0
1,1983,1,NO,ARI,37.0,-3.0,Dome,Dome,70.0,0.0,0.0,September,Day,1
2,1983,1,BUF,MIA,36.0,4.0,Turf,Partly Cloudy Day,78.95,10.67,194.0,September,Day,0
3,1983,1,CHI,ATL,38.0,-1.5,Grass,Clear Day,86.66,11.04,196.0,September,Day,0
4,1983,1,CIN,OAK,42.0,-2.0,Turf,Clear Day,82.18,7.9,194.0,September,Day,0


#### Without Dummies

In [227]:
def logreg_scores(X_train,X_test,y_train,y_test):
    logreg = LogisticRegression(solver = 'lbfgs')
    logreg.fit(X_train,y_train)
    y_pred = logreg.predict(X_test)
    a = accuracy_score(y_test,y_pred)
    r = recall_score(y_test,y_pred)
    p = precision_score(y_test,y_pred)
    roc = roc_auc_score(y_test,y_pred)
    return a,r,p,roc

In [161]:
y = games.over_hit.values
X = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].values

In [162]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [166]:
logreg = LogisticRegression (solver='lbfgs')

In [164]:
logreg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [165]:
y_pred = logreg.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))

0.5179063360881543
0.26124197002141325
0.5687645687645687
0.5256266603796056


In [172]:
cross_val_score(logreg,X_train, y_train, cv = 10, scoring = 'accuracy')

array([0.54607978, 0.4869326 , 0.53232462, 0.49311295, 0.50550964,
       0.5462069 , 0.50068966, 0.51862069, 0.52137931, 0.52137931])

In [168]:
steps = [('scaler', StandardScaler()),
        ('logreg', LogisticRegression())]
pipeline = Pipeline(steps)
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)
logreg_scaled = pipeline.fit(X_train,y_train)



In [169]:
logreg_scaled.score(X_test,y_test)

0.5344352617079889

In [170]:
X_const = add_constant(X,prepend =True)
logit_model = Logit(y,X_const).fit()

Optimization terminated successfully.
         Current function value: 0.690791
         Iterations 4


In [171]:
logit_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,9073.0
Model:,Logit,Df Residuals:,9067.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 25 Aug 2020",Pseudo R-squ.:,0.00269
Time:,22:12:10,Log-Likelihood:,-6267.5
converged:,True,LL-Null:,-6284.4
Covariance Type:,nonrobust,LLR p-value:,2.601e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.8655,0.224,3.862,0.000,0.426,1.305
x1,-0.0024,0.004,-0.661,0.509,-0.009,0.005
x2,-0.0192,0.005,-4.138,0.000,-0.028,-0.010
x3,-0.0010,0.001,-0.658,0.511,-0.004,0.002
x4,-0.0236,0.005,-4.860,0.000,-0.033,-0.014
x5,0.0003,0.000,1.677,0.094,-5.72e-05,0.001


#### With Dummies

In [173]:
dummies = pd.get_dummies(games['surface'],prefix ='surface')

In [174]:
dummies

Unnamed: 0,surface_Dome,surface_Grass,surface_Turf
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1
...,...,...,...
9068,0,0,1
9069,1,0,0
9070,1,0,0
9071,0,1,0


In [175]:
X2 = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies.loc[:,'surface_Grass':]).values

In [180]:
X2_train, X2_test,y_train, y_test = train_test_split(X2,y, test_size = 0.2)

In [181]:
logreg = LogisticRegression(solver = 'lbfgs')

In [269]:
cross_val_score(logreg,X2_train, y_train, cv = 10, scoring = 'accuracy').mean()



0.5209354976180691

In [183]:
logreg = LogisticRegression(solver = 'lbfgs')
logreg.fit(X2_train,y_train)
y_pred = logreg.predict(X2_test)
print(accuracy_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))

0.5134986225895317
0.4193168433451119
0.4772117962466488
0.5077950676352888




In [188]:
steps = [('scaler', StandardScaler()),
        ('logreg', LogisticRegression())]
pipeline = Pipeline(steps)
logreg_scaled = pipeline.fit(X2_train,y_train)




In [189]:
logreg_scaled.score(X2_test,y_test)

0.5190082644628099

In [190]:
X_const = add_constant(X2,prepend =True)
logit_model = Logit(y,X_const).fit()

Optimization terminated successfully.
         Current function value: 0.690618
         Iterations 4


In [191]:
logit_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,9073.0
Model:,Logit,Df Residuals:,9065.0
Method:,MLE,Df Model:,7.0
Date:,"Tue, 25 Aug 2020",Pseudo R-squ.:,0.002939
Time:,22:43:33,Log-Likelihood:,-6266.0
converged:,True,LL-Null:,-6284.4
Covariance Type:,nonrobust,LLR p-value:,4.825e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.8280,0.229,3.613,0.000,0.379,1.277
x1,-0.0022,0.004,-0.625,0.532,-0.009,0.005
x2,-0.0191,0.005,-4.104,0.000,-0.028,-0.010
x3,-0.0005,0.001,-0.343,0.731,-0.003,0.002
x4,-0.0246,0.005,-4.716,0.000,-0.035,-0.014
x5,0.0003,0.000,1.329,0.184,-0.000,0.001
x6,-0.0147,0.075,-0.196,0.844,-0.162,0.132
x7,0.0790,0.086,0.916,0.360,-0.090,0.248


In [224]:
X4 = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies2.loc[:,'weather_Clear Night':]).values

In [225]:
X4_train, X4_test, y_train, y_test = train_test_split(X4, y, test_size = 0.2)

In [228]:
a,r,p,roc = logreg_scores(X4_train,X4_test,y_train,y_test)
print(a)
print(r)
print(p)
print(roc)

0.5118457300275482
0.2168141592920354
0.5240641711229946
0.5107122388117696




In [229]:
steps = [('scaler', StandardScaler()),
        ('logreg', LogisticRegression())]
pipeline = Pipeline(steps)
logreg_scaled = pipeline.fit(X4_train,y_train)




In [231]:
logreg_scaled.score(X4_test,y_test)

0.5267217630853994

In [232]:
X_const = add_constant(X4,prepend =True)
logit_model = Logit(y,X_const).fit()

         Current function value: 0.689870
         Iterations: 35




In [233]:
logit_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,9073.0
Model:,Logit,Df Residuals:,9057.0
Method:,MLE,Df Model:,15.0
Date:,"Tue, 25 Aug 2020",Pseudo R-squ.:,0.004019
Time:,23:33:12,Log-Likelihood:,-6259.2
converged:,False,LL-Null:,-6284.4
Covariance Type:,nonrobust,LLR p-value:,9.913e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.9112,0.237,3.840,0.000,0.446,1.376
x1,-0.0022,0.004,-0.619,0.536,-0.009,0.005
x2,-0.0187,0.005,-3.985,0.000,-0.028,-0.010
x3,-0.0020,0.002,-1.265,0.206,-0.005,0.001
x4,-0.0235,0.006,-4.010,0.000,-0.035,-0.012
x5,0.0003,0.000,1.011,0.312,-0.000,0.001
x6,-0.1642,0.176,-0.931,0.352,-0.510,0.182
x7,-0.0070,0.076,-0.092,0.927,-0.156,0.142
x8,0.0026,0.085,0.030,0.976,-0.164,0.169


In [263]:
logreg = LogisticRegression(solver = 'lbfgs')
cross_val_score(logreg,X4_train, y_train, cv = 10, scoring = 'accuracy').mean()



0.5085399449035812

#### with 2 dummies

In [220]:
dummies2 = pd.get_dummies(games['weather_icon'],prefix ='weather')


In [195]:
dummies2.columns

Index(['weather_Clear Day', 'weather_Clear Night', 'weather_Cloudy',
       'weather_Dome', 'weather_Fog', 'weather_Partly Cloudy Day',
       'weather_Partly Cloudy Night', 'weather_Rain', 'weather_Sleet',
       'weather_Snow', 'weather_Wind'],
      dtype='object')

In [197]:
X2_ = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies.loc[:,'surface_Grass':])

In [198]:
X3 = X2_.join(dummies2.loc[:,'weather_Clear Night':]).values

In [199]:
X3_train, X3_test,y_train, y_test = train_test_split(X3,y, test_size = 0.2)

In [200]:
logreg = LogisticRegression(solver = 'lbfgs')

In [264]:
cross_val_score(logreg,X3_train, y_train, cv = 10, scoring = 'accuracy').mean()



0.5122669436707608

In [203]:
logreg = LogisticRegression(solver = 'lbfgs')
logreg.fit(X3_train,y_train)
y_pred = logreg.predict(X3_test)
print(accuracy_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))

0.531129476584022
0.2751445086705202
0.5151515151515151
0.5196775174931548




#### Decsion Tree

In [210]:
X_dt = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].values
y = games.over_hit.values




#### without dummies

In [211]:
Xdt_train, Xdt_test, y_train, y_test = train_test_split(X_dt, y, test_size = 0.2)

In [212]:
dt = DecisionTreeClassifier()
dt.fit(Xdt_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [214]:
y_pred = dt.predict(Xdt_test)
print(accuracy_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))

0.5074380165289256
0.46440677966101696
0.49458483754512633
0.5063969382176052


In [265]:
dt = DecisionTreeClassifier()
cross_val_score(dt,Xdt_train,ydt_train, cv = 10, scoring = 'accuracy').mean()

0.5024724898406465

#### With Dummies

In [238]:
def model_scores(model, X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)

    a = accuracy_score(y_test,y_pred)
    r = recall_score(y_test,y_pred)
    p = precision_score(y_test,y_pred)
    roc = roc_auc_score(y_test,y_pred)
    return a,r,p,roc

#### surface dummies

In [217]:
Xdt2 = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies.loc[:,'surface_Grass':]).values

In [254]:
Xdt2_train, Xdt2_test, y_train, y_test = train_test_split(Xdt2, y, test_size= 0.2)

In [255]:
dt = DecisionTreeClassifier()
dt.fit(Xdt2_train,y_train)
y_pred = dt.predict(Xdt2_test)
print(accuracy_score(y_test,y_pred))
print(recall_score(y_test,y_pred))
print(precision_score(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))

0.49641873278236914
0.4490022172949002
0.4927007299270073
0.49613309112280607


In [266]:
dt = DecisionTreeClassifier()
cross_val_score(dt,Xdt2_train,y_train, cv = 10, scoring = 'accuracy').mean()

0.4936623898452125

#### weather dummies

In [239]:
Xdt3 =  games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies2.loc[:,'weather_Clear Night':]).values

In [257]:
Xdt3_train, Xdt3_test, y_train, y_test = train_test_split(Xdt3, y, test_size = 0.2)

In [258]:
dt = DecisionTreeClassifier()
a, r, p, roc = model_scores(dt,Xdt3_train, Xdt3_test,y_train,y_test)
print(a)
print(r)
print(p)
print(roc)

0.49696969696969695
0.4452054794520548
0.4773561811505508
0.49523319765999974


In [267]:
dt = DecisionTreeClassifier()
cross_val_score(dt,Xdt3_train,y_train, cv = 10, scoring = 'accuracy').mean()

0.5073014169824818

#### both dummies

In [250]:
X2_ = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies.loc[:,'surface_Grass':])
Xdt4 = X2_.join(dummies2.loc[:,'weather_Clear Night':]).values

In [260]:
Xdt4_train, Xdt4_test, y_train, y_test = train_test_split(Xdt4, y, test_size = 0.2)

In [261]:
dt = DecisionTreeClassifier()
a, r, p, roc = model_scores(dt,Xdt4_train, Xdt4_test,y_train,y_test)
print(a)
print(r)
print(p)
print(roc)

0.4997245179063361
0.4553672316384181
0.48612786489746684
0.49865135775469294


In [268]:
dt = DecisionTreeClassifier()
cross_val_score(dt,Xdt4_train,y_train, cv = 10, scoring = 'accuracy').mean()

0.5010988843736207