In [144]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from statsmodels.tools import add_constant
from statsmodels.discrete.discrete_model import Logit
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score,recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [46]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
with_ou = pd.read_csv('games_and_odds.csv')

In [121]:
with_ou2 = pd.read_csv('games_and_odds2.csv')

In [3]:
with_ou.head()

Unnamed: 0.1,Unnamed: 0,season,week,Date,Home Team,Away Team,home_team_score,visit_team_score,game_over_under,line,...,month,start,over_hit,under_hit,favorite_covered,underdog_covered,Total Score Over Close,Total Score Under Close,Total Score Over Odds Range,Total Score Under Odds Range
0,0,1983,1,1983-09-03,SF,PHI,17,22,40.0,-4.0,...,September,Night,0,1,0,1,1.936783,1.93928,0.070516,0.031869
1,1,1983,1,1983-09-04,NO,ARI,28,17,37.0,-3.0,...,September,Day,1,0,1,0,1.936783,1.93928,0.070516,0.031869
2,2,1983,1,1983-09-04,BUF,MIA,0,12,36.0,4.0,...,September,Day,0,1,0,1,1.936783,1.93928,0.070516,0.031869
3,3,1983,1,1983-09-04,CHI,ATL,17,20,38.0,-1.5,...,September,Day,0,1,0,1,1.936783,1.93928,0.070516,0.031869
4,4,1983,1,1983-09-04,CIN,OAK,10,20,42.0,-2.0,...,September,Day,0,1,0,1,1.936783,1.93928,0.070516,0.031869


In [9]:
with_ou.shape

(9073, 26)

In [10]:
with_ou.drop(with_ou[(with_ou.over_hit == 0) & (with_ou.under_hit == 0)].index,inplace = True)

In [123]:
with_ou2.drop(with_ou2[(with_ou2.over_hit == 0) & (with_ou2.under_hit == 0)].index,inplace = True)

In [12]:
with_ou.shape

(8873, 26)

In [13]:
with_ou.drop( ['Unnamed: 0',  'Date', 
       'home_team_score', 'visit_team_score', 
       'name',  'under_hit', 'favorite_covered',
       'underdog_covered'], axis = 1, inplace = True)

In [124]:
with_ou2.drop( ['Unnamed: 0',  'Date', 
       'home_team_score', 'visit_team_score', 
       'name',  'under_hit', 'favorite_covered',
       'underdog_covered'], axis = 1, inplace = True)

In [14]:
with_ou.head()

Unnamed: 0,season,week,Home Team,Away Team,game_over_under,line,surface,weather_icon,temperature,wind_speed,wind_bearing,month,start,over_hit,Total Score Over Close,Total Score Under Close,Total Score Over Odds Range,Total Score Under Odds Range
0,1983,1,SF,PHI,40.0,-4.0,Grass,Clear Day,73.32,6.12,282.0,September,Night,0,1.936783,1.93928,0.070516,0.031869
1,1983,1,NO,ARI,37.0,-3.0,Dome,Dome,70.0,0.0,0.0,September,Day,1,1.936783,1.93928,0.070516,0.031869
2,1983,1,BUF,MIA,36.0,4.0,Turf,Partly Cloudy Day,78.95,10.67,194.0,September,Day,0,1.936783,1.93928,0.070516,0.031869
3,1983,1,CHI,ATL,38.0,-1.5,Grass,Clear Day,86.66,11.04,196.0,September,Day,0,1.936783,1.93928,0.070516,0.031869
4,1983,1,CIN,OAK,42.0,-2.0,Turf,Clear Day,82.18,7.9,194.0,September,Day,0,1.936783,1.93928,0.070516,0.031869


#### without dummies

In [15]:
X = with_ou.drop(['over_hit','season','week','Home Team', 'Away Team','surface','weather_icon','month','start'], axis = 1)
y = with_ou.over_hit

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

#### Standardized 

In [17]:
def print_cv_scores(model,X_train,y_train ):
    accuracy = cross_val_score(model,X_train,y_train, cv=10, scoring = 'accuracy').mean()
    f1 = cross_val_score(model, X_train,y_train, cv = 10, scoring ='f1').mean()
    print(f'Accuracy Score : {accuracy}')
    print(f'F1 Score : {f1}')


In [18]:
def model_prediction_scores(model,X_train,X_test,y_train, y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f'Accuracy Score : {accuracy}')
    print(f'F1 Score : {f1}')


In [19]:
X_train_stand = X_train.copy()
X_test_stand = X_test.copy()
scale = StandardScaler().fit(X_train_stand)
X_train_stand = scale.transform(X_train_stand)
X_test_stand = scale.transform(X_test_stand)

In [20]:
logreg = LogisticRegression(solver = 'lbfgs')
print_cv_scores(logreg, X_train_stand, y_train)

Accuracy Score : 0.5160533847741566
F1 Score : 0.5220544002628682


In [21]:
logreg = LogisticRegression(solver = 'lbfgs')
model_prediction_scores(logreg, X_train_stand, X_test_stand,y_train, y_test)

Accuracy Score : 0.5154929577464789
F1 Score : 0.5141242937853108


In [22]:
X_const = add_constant(X,prepend =True)
logit_model = Logit(y,X_const).fit()

Optimization terminated successfully.
         Current function value: 0.690905
         Iterations 4


  return ptp(axis=axis, out=out, **kwargs)


In [23]:
logit_model.summary()

0,1,2,3
Dep. Variable:,over_hit,No. Observations:,8873.0
Model:,Logit,Df Residuals:,8863.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 26 Aug 2020",Pseudo R-squ.:,0.003169
Time:,12:40:33,Log-Likelihood:,-6130.4
converged:,True,LL-Null:,-6149.9
Covariance Type:,nonrobust,LLR p-value:,1.164e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,7.3262,6.316,1.160,0.246,-5.054,19.706
game_over_under,-0.0206,0.005,-4.399,0.000,-0.030,-0.011
line,-0.0027,0.004,-0.752,0.452,-0.010,0.004
temperature,-0.0011,0.001,-0.773,0.440,-0.004,0.002
wind_speed,-0.0239,0.005,-4.859,0.000,-0.034,-0.014
wind_bearing,0.0003,0.000,1.603,0.109,-7.3e-05,0.001
Total Score Over Close,-1.1042,1.819,-0.607,0.544,-4.669,2.461
Total Score Under Close,-2.2448,1.790,-1.254,0.210,-5.754,1.264
Total Score Over Odds Range,1.2549,1.100,1.141,0.254,-0.901,3.411


#### Not Standardized

In [24]:
logreg = LogisticRegression(solver = 'lbfgs')
print_cv_scores(logreg, X_train,y_train)

Accuracy Score : 0.5164753240260289
F1 Score : 0.5215939944834513


In [25]:
logreg = LogisticRegression(solver = 'lbfgs')
model_prediction_scores(logreg, X_train, X_test, y_train, y_test)

Accuracy Score : 0.5154929577464789
F1 Score : 0.5119182746878547


#### With Dummies

In [26]:
dummies1 = pd.get_dummies(with_ou['surface'],prefix ='surface')
dummies2 = pd.get_dummies(with_ou['weather_icon'], prefix = 'weather')
dummies3 = pd.get_dummies(with_ou['start'], prefix = 'start')

In [27]:
X_dummies = with_ou[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies1.loc[:,'surface_Grass':])
X_dummies = X_dummies.join(dummies2.loc[:, 'weather_Clear Night':])
X_dummies = X_dummies.join(dummies3.loc[:,'start_Late':])

In [28]:
X_dummies_train, X_dummies_test, y_train, y_test = train_test_split(X_dummies,y, test_size = 0.20)

In [29]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
print_cv_scores(logreg, X_dummies_train,y_train)

Accuracy Score : 0.5276199714738573
F1 Score : 0.5332090538053944


In [30]:
logreg =  LogisticRegression(solver = 'lbfgs', max_iter = 1000)
model_prediction_scores(logreg, X_dummies_train, X_dummies_test, y_train, y_test)

Accuracy Score : 0.5138028169014085
F1 Score : 0.5208217656857301


#### Logistic Regression Model With More Columns

In [128]:
X2 = with_ou2.drop(['over_hit','season','week','Home Team', 'Away Team','surface','weather_icon','month','start'], axis = 1)
y2 = with_ou2.over_hit

In [129]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2,y2,test_size = 0.2)

In [133]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter = 2000)
print_cv_scores(logreg, X2_train, y2_train)

Accuracy Score : 0.5223995490601849
F1 Score : 0.5121309913915781


In [149]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter = 2000)
model_prediction_scores(logreg,X2_train, X2_test, y2_train, y2_test )

Accuracy Score : 0.5194366197183099
F1 Score : 0.518351214003388


In [125]:
dummies1_ = pd.get_dummies(with_ou2['surface'],prefix ='surface')
dummies2_ = pd.get_dummies(with_ou2['weather_icon'], prefix = 'weather')
dummies3_ = pd.get_dummies(with_ou2['start'], prefix = 'start')

X_dummies_ = with_ou2[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies1_.loc[:,'surface_Grass':])
X_dummies_ = X_dummies_.join(dummies2_.loc[:, 'weather_Clear Night':])
X_dummies_ = X_dummies_.join(dummies3_.loc[:,'start_Late':])

In [126]:
X2_dummies_train, X2_dummies_test, y2_train, y2_test = train_test_split(X_dummies_,y, test_size = 0.20)

In [127]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter = 1000)
print_cv_scores(logreg, X2_dummies_train,y2_train)

Accuracy Score : 0.5300031784501082
F1 Score : 0.5380698596949576


In [148]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter =1000)
model_prediction_scores(logreg, X2_dummies_train, X2_dummies_test, y2_train, y2_test)

Accuracy Score : 0.5076056338028169
F1 Score : 0.47222222222222227


#### Decision Tree

#### without dummies

In [55]:
Xdt = with_ou.drop(['over_hit','season','week','Home Team', 'Away Team','surface',
                    'weather_icon','month','start'], axis = 1)
ydt = with_ou.over_hit
Xdt_train, Xdt_test, ydt_train, ydt_test = train_test_split(Xdt,ydt, test_size = 0.20 )

In [56]:
dt = DecisionTreeClassifier()
print_cv_scores(dt, Xdt_train, ydt_train)

Accuracy Score : 0.49506585155738325
F1 Score : 0.4862948555251914


In [57]:
dt= DecisionTreeClassifier()
model_prediction_scores(dt, Xdt_train, Xdt_test, ydt_train,ydt_test)

Accuracy Score : 0.49295774647887325
F1 Score : 0.4833524684270953


#### with dummies

In [62]:
X_dummies_dt_train, X_dummies_dt_test, y_dummies_dt_train, y_dummies_dt_test = train_test_split(X_dummies,y, test_size = 0.20)

In [63]:
dt = DecisionTreeClassifier()
print_cv_scores(dt, X_dummies_dt_train, y_dummies_dt_train)

Accuracy Score : 0.49871751382168983
F1 Score : 0.48486562348855483


In [64]:
dt= DecisionTreeClassifier()
model_prediction_scores(dt, X_dummies_dt_train, X_dummies_dt_test, y_dummies_dt_train,y_dummies_dt_test)

Accuracy Score : 0.5036619718309859
F1 Score : 0.4928036845135291


#### with more columns

In [134]:
Xdt2 = with_ou2.drop(['over_hit','season','week','Home Team', 'Away Team','surface',
                    'weather_icon','month','start'], axis = 1)
ydt2 = with_ou.over_hit
Xdt2_train, Xdt2_test, ydt2_train, ydt2_test = train_test_split(Xdt2,ydt2, test_size = 0.20 )

In [135]:
dt = DecisionTreeClassifier()
print_cv_scores(dt,Xdt2_train, ydt2_train)

Accuracy Score : 0.5056447259451524
F1 Score : 0.49365711921570077


In [136]:
X_dummies_dt2_train, X_dummies_dt2_test, y_dummies_dt2_train, y_dummies_dt2_test = train_test_split(X_dummies_,y, test_size = 0.20)

In [137]:
dt = DecisionTreeClassifier()
print_cv_scores(dt, X_dummies_dt2_train,y_dummies_dt2_train)

Accuracy Score : 0.5033818709151949
F1 Score : 0.48415695416179183


#### Hyperparamter Tuning


In [82]:
from scipy.stats import randint
param_dist = {'max_depth':[3,10], 'max_features': randint(1,9), 'min_samples_leaf': randint(1,9), 'criterion':['gini','entropy']}

In [83]:
tree = DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(tree, param_dist, cv = 10)

In [84]:
tree_cv.fit(Xdt_train,ydt_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=DecisionTreeClassifier(class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort=False,
                                                    random_state=None,
                                                    splitter='best'

In [100]:
tree_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 3,
 'min_samples_leaf': 2}

In [101]:
tree_cv.best_score_

0.5139475908706678

In [138]:
tree = DecisionTreeClassifier()
tree_cv = RandomizedSearchCV(tree, param_dist, cv = 10)
tree_cv.fit(X_dummies_dt_train,y_dummies_dt_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=DecisionTreeClassifier(class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features=None,
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    presort=False,
                                                    random_state=None,
                                                    splitter='best'

In [140]:
tree_cv.best_params_

{'criterion': 'entropy',
 'max_depth': 3,
 'max_features': 1,
 'min_samples_leaf': 6}

In [141]:
tree_cv.best_score_

0.5156382079459002

#### Random Forest

#### without dummies

In [112]:
Xrf = with_ou.drop(['over_hit','season','week','Home Team', 'Away Team','surface',
                    'weather_icon','month','start'], axis = 1)
yrf = with_ou.over_hit
Xrf_train, Xrf_test, yrf_train, yrf_test = train_test_split(Xrf,yrf, test_size = 0.20 )

In [53]:
rf = RandomForestClassifier()
print_cv_scores(rf, Xrf_train, y_train)

Accuracy Score : 0.49549711940385965
F1 Score : 0.43076412604066083


#### with dummies

In [65]:
X_dummies_rf_train, X_dummies_rf_test, y_dummies_rf_train, y_dummies_rf_test = train_test_split(X_dummies,y, test_size = 0.20)

In [66]:
rf= RandomForestClassifier ()
print_cv_scores(rf,X_dummies_train,y_train)

Accuracy Score : 0.5043630304677745
F1 Score : 0.41319630619907155


#### Hyperparameter Tuning

#### without dummies

In [113]:
import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

In [114]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [117]:
rf = RandomForestClassifier()
rf_cv = RandomizedSearchCV(rf, random_grid, n_iter = 100,cv = 3)
rf_cv.fit(Xrf_train, yrf_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [118]:
rf_cv.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 100,
 'bootstrap': True}

In [119]:
rf_cv.best_score_

0.52465483234714

#### AdaBoost

In [67]:
Xa = with_ou.drop(['over_hit','season','week','Home Team', 'Away Team','surface',
                    'weather_icon','month','start'], axis = 1)
ya = with_ou.over_hit
Xa_train, Xa_test, ya_train, ya_test = train_test_split(Xa,ya, test_size = 0.20 )

In [68]:
dt = DecisionTreeClassifier()
ada = AdaBoostClassifier(base_estimator = dt)
print_cv_scores(ada, Xa_train, ya_train)

Accuracy Score : 0.4938030138781425
F1 Score : 0.4932422338339437


#### with dummies

In [69]:
X_dummies_a_train, X_dummies_a_test, y_dummies_a_train, y_dummies_a_test = train_test_split(X_dummies,y, test_size = 0.20)
dt = DecisionTreeClassifier()
ada = AdaBoostClassifier(base_estimator = dt)
print_cv_scores(ada, X_dummies_a_train, y_dummies_a_train)


Accuracy Score : 0.5021132773243262
F1 Score : 0.4832311123093212


#### Gradient Boost

#### without dummies

In [70]:
Xgb = with_ou.drop(['over_hit','season','week','Home Team', 'Away Team','surface',
                    'weather_icon','month','start'], axis = 1)
ygb= with_ou.over_hit
Xgb_train, Xgb_test, ygb_train, ygb_test = train_test_split(Xgb,ygb, test_size = 0.20 )

In [71]:
gb = GradientBoostingClassifier()
print_cv_scores(gb, Xgb_train, ygb_train)

Accuracy Score : 0.5121188404502146
F1 Score : 0.5160929790118493


#### with dummies


In [72]:
X_dummies_gb_train, X_dummies_gb_test, y_dummies_gb_train, y_dummies_gb_test = train_test_split(X_dummies,y, test_size = 0.20)

In [73]:
gb = GradientBoostingClassifier()
print_cv_scores(gb, X_dummies_gb_train, y_dummies_gb_train)

Accuracy Score : 0.5121214114855429
F1 Score : 0.4795231998021744


#### Hyperparameter Tuning

In [145]:
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
                          'max_depth': [2, 4, 6],
                          'min_samples_leaf': [1, 2, 5, 10],
                          'max_features': [1.0, 0.3, 0.1],
                          'n_estimators': [250, 500,750,1000]}

gb = GradientBoostingClassifier()
gb_cv = GridSearchCV(gb,param_grid, cv = 10)
gb_cv.fit(Xgb_train,ygb_train)


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_n...
                                                  random_state=None,

In [146]:
gb_cv.best_params_

{'learning_rate': 0.05,
 'max_depth': 4,
 'max_features': 0.1,
 'min_samples_leaf': 5,
 'n_estimators': 1000}

In [147]:
gb_cv.best_score_

0.5295857988165681