In [321]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from statsmodels.tools import add_constant
from statsmodels.discrete.discrete_model import Logit
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier 
from sklearn.metrics import roc_auc_score,accuracy_score, precision_score,recall_score, f1_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [155]:
games = pd.read_csv('games_cleaned.csv')

In [156]:
games.head()

Unnamed: 0.1,Unnamed: 0,season,week,Date,Home Team,Away Team,home_team_score,visit_team_score,game_over_under,line,...,temperature,wind_speed,wind_bearing,name,month,start,over_hit,under_hit,favorite_covered,underdog_covered
0,0,1983,1,1983-09-03 00:00:00,SF,PHI,17,22,40.0,-4.0,...,73.32,6.12,282.0,PHI @ SF,September,Night,0,1,0,1
1,1,1983,1,1983-09-04 00:00:00,NO,ARI,28,17,37.0,-3.0,...,70.0,0.0,0.0,SLC @ NO,September,Day,1,0,1,0
2,2,1983,1,1983-09-04 00:00:00,BUF,MIA,0,12,36.0,4.0,...,78.95,10.67,194.0,MIA @ BUF,September,Day,0,1,0,1
3,3,1983,1,1983-09-04 00:00:00,CHI,ATL,17,20,38.0,-1.5,...,86.66,11.04,196.0,ATL @ CHI,September,Day,0,1,0,1
4,4,1983,1,1983-09-04 00:00:00,CIN,OAK,10,20,42.0,-2.0,...,82.18,7.9,194.0,LAD @ CIN,September,Day,0,1,0,1


In [157]:
games.columns

Index(['Unnamed: 0', 'season', 'week', 'Date', 'Home Team', 'Away Team',
       'home_team_score', 'visit_team_score', 'game_over_under', 'line',
       'surface', 'weather_icon', 'temperature', 'wind_speed', 'wind_bearing',
       'name', 'month', 'start', 'over_hit', 'under_hit', 'favorite_covered',
       'underdog_covered'],
      dtype='object')

#### Delete Unusable Features 

In [158]:
games.drop( ['Unnamed: 0',  'Date', 
       'home_team_score', 'visit_team_score', 
       'name',  'under_hit', 'favorite_covered',
       'underdog_covered'], axis = 1, inplace = True)

In [159]:
games.head()

Unnamed: 0,season,week,Home Team,Away Team,game_over_under,line,surface,weather_icon,temperature,wind_speed,wind_bearing,month,start,over_hit
0,1983,1,SF,PHI,40.0,-4.0,Grass,Clear Day,73.32,6.12,282.0,September,Night,0
1,1983,1,NO,ARI,37.0,-3.0,Dome,Dome,70.0,0.0,0.0,September,Day,1
2,1983,1,BUF,MIA,36.0,4.0,Turf,Partly Cloudy Day,78.95,10.67,194.0,September,Day,0
3,1983,1,CHI,ATL,38.0,-1.5,Grass,Clear Day,86.66,11.04,196.0,September,Day,0
4,1983,1,CIN,OAK,42.0,-2.0,Turf,Clear Day,82.18,7.9,194.0,September,Day,0


In [319]:
games.columns

Index(['season', 'week', 'Home Team', 'Away Team', 'game_over_under', 'line',
       'surface', 'weather_icon', 'temperature', 'wind_speed', 'wind_bearing',
       'month', 'start', 'over_hit'],
      dtype='object')

In [326]:
def print_cv_scores(model,X_train,y_train ):
    f1= cross_val_score(model, X_train,y_train, cv = 10, scoring ='f1')
    precision = cross_val_score(model,X_train, y_train, cv = 10, scoring = 'precision')
    recall = cross_val_score(model,X_train,y_train, cv = 10, scoring = 'recall')
    print(f'Average Recall Score : {recall.mean()}')
    print(f'Average Precision Score : {precision.mean()}')   
    print(f'Average F1 Score : {f1.mean()}') 
    print(f'F1 Scores:{f1}')


#### Without Dummies

In [271]:
y = games.over_hit.values
X = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']]

In [272]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [327]:
logreg = LogisticRegression(solver = 'lbfgs')
print_cv_scores(logreg,X_train, y_train)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Average Recall Score : 0.009116809116809118
Average Precision Score : 0.35804761904761906
Average F1 Score : 0.01734794996570036
F1 Scores:[0.         0.00561798 0.         0.         0.01133144 0.
 0.05913978 0.01133144 0.02754821 0.05851064]


In [281]:
logreg = LogisticRegression(solver = 'lbfgs')
model_prediction_scores(logreg, X_train, X_test, y_train,y_test)

Accuracy Score : 0.528374655647383
F1 Score : 0.3929078014184397


#### Scaled 

In [283]:
X_train_stand = X_train.copy()
X_test_stand = X_test.copy()
scale = StandardScaler().fit(X_train_stand)
X_train_stand = scale.transform(X_train_stand)
X_test_stand = scale.transform(X_test_stand)

In [284]:
logreg = LogisticRegression(solver = 'lbfgs')
print_cv_scores(logreg, X_train_stand, y_train)

Accuracy Score : 0.5217745004033302
F1 Score : 0.3869385094645105


In [285]:
logreg = LogisticRegression(solver = 'lbfgs')
model_prediction_scores(logreg, X_train_stand, X_test_stand,y_train, y_test)

Accuracy Score : 0.528374655647383
F1 Score : 0.3929078014184397


In [170]:
X_const = add_constant(X,prepend =True)
logit_model = Logit(y,X_const).fit()

Optimization terminated successfully.
         Current function value: 0.690791
         Iterations 4


In [171]:
logit_model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,9073.0
Model:,Logit,Df Residuals:,9067.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 25 Aug 2020",Pseudo R-squ.:,0.00269
Time:,22:12:10,Log-Likelihood:,-6267.5
converged:,True,LL-Null:,-6284.4
Covariance Type:,nonrobust,LLR p-value:,2.601e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0.8655,0.224,3.862,0.000,0.426,1.305
x1,-0.0024,0.004,-0.661,0.509,-0.009,0.005
x2,-0.0192,0.005,-4.138,0.000,-0.028,-0.010
x3,-0.0010,0.001,-0.658,0.511,-0.004,0.002
x4,-0.0236,0.005,-4.860,0.000,-0.033,-0.014
x5,0.0003,0.000,1.677,0.094,-5.72e-05,0.001


#### With Dummies

In [286]:
dummies1 = pd.get_dummies(games['surface'],prefix ='surface')

In [296]:
X_dummies = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies1.loc[:,'surface_Grass':])

In [288]:
X_dummies_train, X_dummies_test,y_dummies_train, y_dummies_test = train_test_split(X_dummies,y, test_size = 0.20)

In [291]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter = 200)
print_cv_scores(logreg, X_dummies_train, y_dummies_train)

Accuracy Score : 0.5180545308984076
F1 Score : 0.41844052570391765


In [292]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter = 200)
model_prediction_scores(logreg,X_dummies_train, X_dummies_test, y_dummies_train, y_dummies_test)

Accuracy Score : 0.5333333333333333
F1 Score : 0.43268586738111187


#### with 2 dummies

In [294]:
dummies2 = pd.get_dummies(games['weather_icon'],prefix ='weather')


In [297]:
X2_dummies= X_dummies.join(dummies2.loc[:,'weather_Clear Night':])

In [298]:
X2_dummies_train, X2_dummies_test,y2_dummies_train, y2_dummies_test = train_test_split(X2_dummies,y, test_size = 0.2)

In [302]:
logreg = LogisticRegression(solver = 'lbfgs', max_iter =400)
print_cv_scores(logreg, X2_dummies_train, y2_dummies_train)

Accuracy Score : 0.5235543109230225
F1 Score : 0.3893795894033939


#### Decsion Tree

In [303]:
X_dt = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']]
y = games.over_hit.values




#### without dummies

In [305]:
Xdt_train, Xdt_test, ydt_train, ydt_test = train_test_split(X_dt, y, test_size = 0.2)

In [306]:
dt = DecisionTreeClassifier()
print_cv_scores(dt, Xdt_train,ydt_train)

Accuracy Score : 0.5039918307923464
F1 Score : 0.49131471846269414


In [307]:
dt= DecisionTreeClassifier()
model_prediction_scores(dt, Xdt_train, Xdt_test, ydt_train, ydt_test)

Accuracy Score : 0.5013774104683195
F1 Score : 0.4654459539279386


#### With Dummies

#### surface dummies

In [308]:
Xdt_dummies = games[['line', 'game_over_under',
        'temperature', 'wind_speed', 'wind_bearing']].join(dummies1.loc[:,'surface_Grass':])

In [309]:
Xdt_dummies_train, Xdt_dummies_test, ydt_dummies_train, ydt_dummies_test = train_test_split(Xdt_dummies, y, test_size= 0.2)

In [310]:
dt = DecisionTreeClassifier()
print_cv_scores(dt, Xdt_dummies_train, ydt_dummies_train)

Accuracy Score : 0.49297310629417074
F1 Score : 0.468394089808913


In [311]:
dt = DecisionTreeClassifier()
model_prediction_scores(dt, Xdt_dummies_train, Xdt_dummies_test, ydt_dummies_train, ydt_dummies_test)

Accuracy Score : 0.5068870523415978
F1 Score : 0.4888635065676756


#### both dummies

In [313]:
Xdt2_dummies = X2_dummies

In [314]:
Xdt2_dummies_train, Xdt2_dummies_test, ydt2_dummies_train, ydt2_dummies_test = train_test_split(Xdt2_dummies, y, test_size = 0.2)

In [317]:
dt = DecisionTreeClassifier()
print_cv_scores(dt, Xdt2_dummies_train, ydt2_dummies_train)

Accuracy Score : 0.5017862714714465
F1 Score : 0.4681219149519277


In [318]:
dt = DecisionTreeClassifier()
model_prediction_scores(dt,Xdt2_dummies_train, Xdt2_dummies_test, ydt2_dummies_train, ydt2_dummies_test  )

Accuracy Score : 0.5046831955922865
F1 Score : 0.4764123471170646
