In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
import sportsdataverse.nfl as sdv_nfl
# preprocessing tools
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, PowerTransformer, MaxAbsScaler, LabelEncoder
from sklearn.pipeline import Pipeline
# import potential models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import SGDClassifier
# import tools to optimize our models
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SequentialFeatureSelector
from multiprocessing import Pool

In [28]:
# Load the two training tables. Both files are read with the same dtype mapping,
# so it is written once and reused for each read.
_master_csv_dtypes = {
    'date': object,
    'season': 'float',
    'neutral': object,
    'playoff': object,
    'home_team': object,
    'away_team': object,
    'qb_away': object,
    'qb_home': object,
}
master_df_with_moneyline = pd.read_csv('Data/final_master_for_ml.csv', dtype=_master_csv_dtypes)
master_df = pd.read_csv('Data/final_master.csv', dtype=_master_csv_dtypes)
#Define our training features
y_features = ['over','covered','winner_h_or_a']

# Every rolling feature column is named '<stat>-rolling-<window>-<side>'. The names are
# generated from one stat list instead of being typed out per window (and then copied
# again for the moneyline list) so the two feature lists cannot drift apart.
_ROLLING_STATS = [
    'total-first-downs',
    'rushing-first-downs',
    'passing-first-downs',
    'penalty-first-downs',
    'net-yards',
    'net-rushing-yds',
    'rushing-plays',
    'avg-gain-rushing',
    'net-passing-yds',
    'gross-passing-yds',
    'yds-per-att',
    'blocked-kicks-allowed',
    'total-plays',
    'avg-gain-per-play',
    'passing-attempts',
    'completions',
    'int-thrown',
    'interceptions',
    'int-return-yards',
    'fumbles',
    'fumbles-lost',
    'fga',
    'fgm',
    '3rd-down-convs',
    '3rd-downs',
    '3rd-down-conv-rate',
    'punts',
    'yards-per-punt',
    'penalties',
    'penalty-yards',
    'sacks_allowed',
    'sack-yds-lost',
    'punts-returned',
    'punt-return-yds',
    'kicks-returned',
    'kick-return-yds',
]
_ROLLING_WINDOWS = (4, 8, 16)
# The hand-written feature list had no 4-game columns for these two stats; that
# omission is preserved here so the generated lists match it exactly.
# NOTE(review): confirm whether the 4-game columns are genuinely absent from the data.
_STATS_EXCLUDED_BY_WINDOW = {4: ('interceptions', 'int-return-yards')}

# Order: window (4, 8, 16) -> stat -> home then away.
_rolling_features = [
    f'{stat}-rolling-{window}-{side}'
    for window in _ROLLING_WINDOWS
    for stat in _ROLLING_STATS
    if stat not in _STATS_EXCLUDED_BY_WINDOW.get(window, ())
    for side in ('home', 'away')
]

# Features used for the over/under and spread targets.
x_features = ['spread_favorite', 'over_under_line', *_rolling_features, 'home_favorite']

# Moneyline models use the same features plus the opening/closing odds columns.
x_features_ml = [
    *x_features,
    'Home Close Odds',
    'Home Open Odds',
    'Road Close Odds',
    'Road Open Odds',
]

CV Testing

In [29]:
def get_cv_scores(model,y_feat,x_features,calibrated=False,threshold=0):
    '''
    Runs cross validation testing on a model for the given x and y features.

    Params:
        model: the classifier to evaluate
        y_feat: name of the target column. 'winner_h_or_a' is trained on
            master_df_with_moneyline, every other target on master_df
        x_features: names of the feature columns (any sequence, e.g. a list or the
            numpy array returned by a feature selector)
        calibrated: when truthy, wrap the model in CalibratedClassifierCV (cv=3, ensemble=True)
        threshold: variance threshold passed to VarianceThreshold
    Returns:
        The dict produced by cross_validate: fit/score times plus 'test_accuracy',
        'test_log loss', 'test_f1' and 'test_precision' arrays (one entry per fold).
    Raises:
        Any error raised while fitting or scoring a fold (error_score='raise').
    '''
    # A numpy array of names does not append with `+ [y_feat]` (numpy applies `+`
    # element-wise), so normalise to a plain list before building the column list.
    x_features = list(x_features)
    source_df = master_df_with_moneyline if y_feat == 'winner_h_or_a' else master_df
    training_data = source_df[x_features + [y_feat]].dropna()
    x_dict = training_data[x_features].to_dict(orient="records")
    y_train = training_data[y_feat]
    # Only the final step differs between the calibrated and uncalibrated pipelines.
    if calibrated:
        clf = CalibratedClassifierCV(estimator=model,n_jobs=-1,cv=3,ensemble=True)
    else:
        clf = model
    pipe = Pipeline([
        ("vec",DictVectorizer(sparse=False)),
        ("scaler",MaxAbsScaler()),
        ("threshold",VarianceThreshold(threshold)),
        ("clf",clf),
    ])
    scoring_metrics = {"accuracy":"accuracy","log loss":"neg_log_loss","f1":"f1_micro","precision":"precision_micro"}
    return cross_validate(pipe,x_dict,y_train,scoring=scoring_metrics,error_score='raise')

def test_model(model):
    '''
    Cross-validates one model against each of the three targets.

    'winner_h_or_a' is scored on x_features_ml with probability calibration; 'over' and
    'covered' are scored on the current notebook-level selected_features.

    Returns:
        A list of (model, y_feature, cross_validate result) tuples, one per target.
    '''
    scores = []
    for y_feature in ('over','covered','winner_h_or_a'):
        if y_feature == 'winner_h_or_a':
            score = get_cv_scores(model=model, y_feat=y_feature,x_features=x_features_ml,calibrated=True)
        else:
            # selected_features comes from get_feature_names_out (a numpy array);
            # pass a plain list, as the other call sites in this notebook do.
            score = get_cv_scores(model=model, y_feat=y_feature,x_features=list(selected_features))
        scores.append((model,y_feature,score))
    return scores

def test_models(models,**kwargs):
    '''
    Cross-validates several models with the same settings.

    Params:
        models: iterable of classifiers to evaluate
        **kwargs: forwarded unchanged to get_cv_scores; must supply its required
            y_feat and x_features, and may supply calibrated / threshold
    Returns:
        A list with one get_cv_scores result per model, in input order.
    '''
    # Previously the keyword arguments were dropped (so get_cv_scores was called without
    # its required arguments) and the collected scores were never returned.
    return [get_cv_scores(model=model, **kwargs) for model in models]

Feature Selection

In [51]:
# Forward sequential feature selection for the 'over' target.
# NOTE(review): precision_score and SelectFromModel are not referenced in this cell, and
# cross_val_score is already imported in the first cell.
from sklearn.metrics import precision_score
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
# Keep only rows where every candidate feature and the target are present.
features = x_features + ['over']
training_data = master_df[features].dropna()
# NOTE(review): x_dict is built here but not used anywhere else in this cell.
x_dict = training_data[x_features].to_dict(orient='records')
# Estimator scored at each selection step: max-abs scaling followed by a random forest.
# NOTE(review): no random_state is set, so the selected subset can differ between runs.
pipe = Pipeline([("scaler",MaxAbsScaler()),("clf",RandomForestClassifier())])
# Greedily add features one at a time until 50 have been chosen.
feat_selector = SequentialFeatureSelector(pipe,direction='forward',n_features_to_select=50)
feat_selector.fit(pd.get_dummies(training_data[x_features]),training_data['over'])
# NOTE(review): the selector is fitted on the get_dummies output but asked for names using
# x_features; this presumably assumes get_dummies left the columns unchanged - confirm.
selected_features = feat_selector.get_feature_names_out(x_features)
display(selected_features)

In [49]:
# Cross-validate a calibrated, class-balanced SVC on the 'covered' target,
# restricted to the features chosen by the selector above.
get_cv_scores(
    model=SVC(C=.9, probability=True, class_weight='balanced'),
    y_feat='covered',
    x_features=list(selected_features),
    calibrated=True,
    threshold=.001,
)

{'fit_time': array([62.43572569, 61.40059328, 61.11831999, 59.63539052, 71.93465495]),
 'score_time': array([13.98583961, 14.3356154 , 14.26403093, 14.11977935, 15.40330529]),
 'test_accuracy': array([0.50451977, 0.50508475, 0.49802148, 0.50480497, 0.50141323]),
 'test_log loss': array([-0.8006341 , -0.80024484, -0.79846074, -0.80061843, -0.80052028]),
 'test_f1': array([0.50451977, 0.50508475, 0.49802148, 0.50480497, 0.50141323]),
 'test_precision': array([0.50451977, 0.50508475, 0.49802148, 0.50480497, 0.50141323])}

In [50]:
# Show the selected feature names as a plain Python list.
[feature_name for feature_name in selected_features]

['spread_favorite',
 'over_under_line',
 'total-first-downs-rolling-4-home',
 'total-first-downs-rolling-4-away',
 'rushing-first-downs-rolling-4-home',
 'rushing-first-downs-rolling-4-away',
 'passing-first-downs-rolling-4-home',
 'passing-first-downs-rolling-4-away',
 'penalty-first-downs-rolling-4-home',
 'penalty-first-downs-rolling-4-away',
 'net-yards-rolling-4-home',
 'net-yards-rolling-4-away',
 'net-rushing-yds-rolling-4-home',
 'net-rushing-yds-rolling-4-away',
 'rushing-plays-rolling-4-home',
 'rushing-plays-rolling-4-away',
 'avg-gain-rushing-rolling-4-home',
 'avg-gain-rushing-rolling-4-away',
 'net-passing-yds-rolling-4-home',
 'net-passing-yds-rolling-4-away',
 'gross-passing-yds-rolling-4-home',
 'gross-passing-yds-rolling-4-away',
 'yds-per-att-rolling-4-home',
 'yds-per-att-rolling-4-away',
 'blocked-kicks-allowed-rolling-4-home',
 'blocked-kicks-allowed-rolling-4-away',
 'total-plays-rolling-4-home',
 'total-plays-rolling-4-away',
 'avg-gain-per-play-rolling-4-away',

Model selection and parameter tuning

In [22]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
def search_params(model,params,x_features=selected_features,y_feature='over',scoring_metric="precision_micro"):
    '''
    Runs a successive-halving grid search over params for the given model.

    Params:
        model: estimator (or pipeline) to tune; it is fitted on a list of per-row
            feature dicts, so it has to accept that input (e.g. start with a DictVectorizer)
        params: parameter grid passed to HalvingGridSearchCV
        x_features: names of the feature columns (any sequence). The default is the
            value selected_features had when this function was defined
        y_feature: name of the target column in master_df
        scoring_metric: scoring name used to rank candidates
    Returns:
        (best_score, best_estimator) from the fitted search.
    Raises:
        Any error raised while fitting a candidate (error_score='raise').
    '''
    # selected_features is a numpy array; `array + [y_feature]` would not append
    # (numpy applies `+` element-wise), so work with a plain list.
    x_features = list(x_features)
    training_data = master_df[x_features + [y_feature]].dropna()
    x_dict = training_data[x_features].to_dict(orient='records')
    grid = HalvingGridSearchCV(estimator=model,param_grid=params,scoring=scoring_metric,error_score='raise')
    grid.fit(x_dict,training_data[y_feature])
    return grid.best_score_, grid.best_estimator_

In [25]:
# Pipeline handed to the parameter search: vectorise the feature dicts, scale, classify.
pipe = Pipeline([('vec',DictVectorizer()),('scaler',MaxAbsScaler()),('clf',RandomForestClassifier())])
# NOTE(review): the `clf__estimator__*` keys address a nested `estimator` parameter of the
# `clf` step, but the only candidate listed for `clf` is RandomForestClassifier. Confirm
# whether a wrapper such as CalibratedClassifierCV(estimator=SVC()) was intended here.
params = {'vec':[DictVectorizer()],'scaler':[MaxAbsScaler()],'clf':[RandomForestClassifier()],'clf__estimator':[SVC()],'clf__estimator__probability':[True],'clf__estimator__class_weight':['balanced'],'clf__estimator__decision_function_shape':['ovo','ovr'],'clf__estimator__gamma':['scale','auto'],'clf__estimator__C':[1,.9,.75]}
# Uses search_params' defaults: the selected features, the 'over' target, precision_micro.
score, estimator = search_params(pipe,params=params)

Feature Selection

In [26]:
# Best score returned by the search_params call above (the search's best_score_).
score

0.5036738813144137

Profitability Testing

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def place_test_bet(predicted,actual,odds,bet):
    """Settle one simulated bet.

    Returns ``odds * bet + bet`` when the prediction matches the actual outcome,
    ``bet`` when the actual outcome is "Push", and ``-bet`` otherwise.
    The match check comes first, so a correctly predicted "Push" is settled as a win.
    """
    if predicted == actual:
        return odds * bet + bet
    return bet if actual == "Push" else -bet

def size_kelly_bet(bankroll, win_prob,odds):
    """Size a bet according to the Kelly criterion.

    Params:
        bankroll: current bankroll
        win_prob: estimated probability of winning the bet
        odds: net odds, i.e. profit per unit staked on a win (must be non-zero)
    Returns:
        bankroll * (win_prob - (1 - win_prob) / odds), floored at 0.
    Raises:
        ZeroDivisionError: if odds is 0 (for plain Python numbers).
    """
    kelly_fraction = win_prob - (1 - win_prob) / odds
    # A negative Kelly fraction means the bet has no edge; the stake is then zero.
    # Returning the negative value let callers "place" negative-sized bets.
    return max(0, bankroll * kelly_fraction)

def calc_ev(bet_size,win_prob,odds):
    """Expected value of a bet: the probability-weighted win minus the probability-weighted loss.

    The win pays ``bet_size * odds``; the loss costs ``bet_size``.
    """
    expected_win = bet_size * odds * win_prob
    expected_loss = bet_size*(1-win_prob)
    return expected_win - expected_loss

def _american_to_net_odds(american_odds):
    # Convert non-zero American moneyline odds to net odds (profit per unit staked).
    if american_odds > 0:
        return american_odds / 100
    return 100 / abs(american_odds)


def test_profitability(x_features,y_feature,probability_threshold,bankroll,test_model,max_bet):
    '''
    Tests the profitability of a bet type by retroactively betting on a held-out 20% of past games.
    Params:
        x_features: X columns to train the model on. For 'winner_h_or_a' they must include
            'Home Close Odds' and 'Road Close Odds', which are used to price each bet.
        y_feature: y feature that is being predicted: 'over', 'covered' or 'winner_h_or_a'
        probability_threshold: The probability threshold has a different meaning depending on the feature being predicted.
            If we are predicting the moneyline, it is subtracted from the predicted probability before the
            expected value is computed, so a bet is only placed when the prediction beats the probability
            implied by the odds by at least that margin.
            If we are predicting the over or the spread, it is the value the predicted probability of
            the predicted outcome must exceed before a bet is considered.
        bankroll: Our starting bankroll
        test_model: The model we are testing (wrapped in an isotonic CalibratedClassifierCV)
        max_bet: Cap on a single bet, as a fraction of the current bankroll
    Returns:
        A tuple of:
        A dictionary containing our profit, our models true accuracy, our models adjusted accuracy over the bets
        actually placed (nan if none were placed), the number of winning bets, the number of placed bets that
        did not win (a push is counted here, as before), and the number of bets we did not place
        A list that tracks our profit after each game considered (starts with 0)
        A list that tracks the ev of each bet that was sized
        A list that tracks the size of each bet (0 when no bet was placed)
        A list that tracks the net result of each bet (winnings, 0 for a push or no bet, -stake for a loss)
    Raises:
        ValueError: if y_feature is not one of the three supported targets.
    '''
    if y_feature == "over" or y_feature == "covered":
        source_df = master_df
    # I could only get historical moneylines from 2007-present so we have to use a different dataset for predicting outright winners
    elif y_feature == "winner_h_or_a":
        source_df = master_df_with_moneyline
    else:
        # Validated up front; this message used to sit behind an UnboundLocalError.
        raise ValueError(f"You entered y_feature = {y_feature}, please select either 'over', 'covered', or 'winner_h_or_a'")
    x_features = list(x_features)
    training_data = source_df[x_features + [y_feature]].dropna()
    # hold out 20% of the games to bet on
    x_training_data,x_test_data,y_train,y_test = train_test_split(training_data[x_features],training_data[y_feature],test_size=.2)
    x_dict_train = x_training_data.to_dict(orient="records")
    x_dict_test = x_test_data.to_dict(orient="records")
    # one hot encode categorical variables and scale, fitting on the training data only
    vec = DictVectorizer(sparse=False)
    scaler = MaxAbsScaler()
    x_train_sc = scaler.fit_transform(vec.fit_transform(x_dict_train))
    # Use the calibrated classifier to calibrate the probabilities of our model
    model = CalibratedClassifierCV(estimator=test_model,method='isotonic')
    model.fit(x_train_sc,y_train)
    # The test games go through the transformers fitted on the training data. Re-fitting
    # them on the test split (as before) gives the model differently encoded/scaled inputs.
    x_test_sc = scaler.transform(vec.transform(x_dict_test))
    y_pred_prob = model.predict_proba(x_test_sc)
    y_pred_lst = list(model.predict(x_test_sc))
    y_test_lst = list(y_test)
    # predict_proba columns follow model.classes_; look the column up by label instead of
    # hard-coding positions (the moneyline branch previously read the opposite column).
    class_column = {label: column for column, label in enumerate(model.classes_)}
    accuracy = accuracy_score(y_pred=y_pred_lst, y_true=y_test_lst)

    # Only these predicted labels can be bet on; any other prediction is skipped.
    bettable_labels = ('Over','Under') if y_feature == 'over' else ('Home','Away')
    standard_net_odds = .91  # net payout used for over/under and spread bets
    starting_bankroll = bankroll  # the caller's bankroll is honoured (it used to be reset to 1000)
    bets = []
    profits_over_time = [0]
    bet_evs = []
    bet_results = []
    hits = 0
    misses = 0
    no_bet = 0
    #simulate betting on past games based off of our test predictions
    for i, (pred, actual) in enumerate(zip(y_pred_lst, y_test_lst)):
        if pred not in bettable_labels:
            continue
        predicted_prob = y_pred_prob[i][class_column[pred]]
        if y_feature == 'winner_h_or_a':
            # Closing odds of the side we picked, read positionally from the test split so
            # they belong to the same game as the prediction.
            odds_column = 'Home Close Odds' if pred == "Home" else 'Road Close Odds'
            american_odds = x_test_data[odds_column].iloc[i]
            if american_odds == 0:
                # 0 is not a valid American price; nothing to bet on.
                continue
            net_odds = _american_to_net_odds(american_odds)
            # Kelly sizing and EV both take net odds (profit per unit staked).
            bet = size_kelly_bet(bankroll=bankroll,win_prob=predicted_prob,odds=net_odds)
            bet = min(bet, max_bet * bankroll)
            ev = calc_ev(bet_size=bet,win_prob=predicted_prob - probability_threshold,odds=net_odds)
            bet_evs.append(ev)
        else:
            net_odds = standard_net_odds
            if predicted_prob > probability_threshold:
                # NOTE(review): as before, 'over' sizes the bet with the test-set accuracy while
                # 'covered' sizes it with the predicted probability, and both score EV with the
                # test-set accuracy (which uses information from the games being bet on).
                # Confirm which probability is intended.
                sizing_prob = accuracy if y_feature == 'over' else predicted_prob
                bet = size_kelly_bet(bankroll=bankroll,win_prob=sizing_prob,odds=net_odds)
                bet = min(bet, max_bet * bankroll)
                ev = calc_ev(bet_size=bet,win_prob=accuracy,odds=net_odds)
                bet_evs.append(ev)
            else:
                bet = 0
                ev = 0

        # Only positive-EV bets with a positive stake are placed.
        if ev > 0 and bet > 0:
            payout = place_test_bet(predicted=pred,actual=actual,odds=net_odds,bet=bet)
            # place_test_bet returns stake + winnings for a win and the stake for a push.
            # The stake never left the bankroll, so only the net amount is added back.
            net_result = payout - bet if payout > 0 else payout
            bets.append(bet)
            if net_result > 0:
                hits += 1
            else:
                misses += 1
        else:
            net_result = 0
            bets.append(0)
            no_bet += 1
        bet_results.append(net_result)
        bankroll += net_result
        profits_over_time.append(bankroll - starting_bankroll)

    # Computed after the loop so it is defined even when no game was processed.
    if hits + misses != 0:
        adjusted_accuracy = hits / (hits + misses)
    else:
        adjusted_accuracy = np.nan
    return ({"profit": bankroll-starting_bankroll, "pure accuracy": accuracy , "adjusted accuracy": adjusted_accuracy,"hits":hits,"misses":misses,"num skipped bets":no_bet},profits_over_time,bet_evs,bets,bet_results)

In [7]:
# One result list per (bet type, model) combination. Lists whose runs are commented out
# in the loop below stay empty.
over_results_SGD = []
over_results_LR = []
over_results_GB = []
over_results_SVC = []
spread_results_SVC = []
spread_results_KN = []
moneyline_results_SGD = []
moneyline_results_LR = []
moneyline_results_GB = []

# Repeat every simulation 10 times; each call draws a fresh random train/test split.
for i in range(10):
    over_results_SGD.append(test_profitability(x_features=x_features,y_feature='over',probability_threshold=.58,bankroll=1000,test_model=SGDClassifier(average=True, class_weight='balanced', loss='log_loss',max_iter=2500),max_bet=.05))
    #over_results_LR.append(test_profitability(x_features=x_features,y_feature='over',probability_threshold=.53,bankroll=1000,test_model=LogisticRegression(max_iter=2500),max_bet=.05))
    #over_results_GB.append(test_profitability(x_features=x_features,y_feature='over',probability_threshold=.53,bankroll=1000,test_model=GradientBoostingClassifier(),max_bet=.05))
    over_results_SVC.append(test_profitability(x_features=x_features,y_feature='over',probability_threshold=.58,bankroll=1000,test_model=SVC(probability=True,class_weight='balanced'),max_bet=.05))
    spread_results_SVC.append(test_profitability(x_features=x_features,y_feature='covered',probability_threshold=.55,bankroll=1000,test_model=SVC(probability=True,class_weight='balanced',C=.8),max_bet=.05))
    #spread_results_KN.append(test_profitability(x_features=x_features,y_feature='covered',probability_threshold=.55,bankroll=1000,test_model=SVC(probability=True,class_weight='balanced',C=.8),max_bet=.05))
    # NOTE(review): this list is named *_SGD but the model passed here is KNeighborsClassifier.
    moneyline_results_SGD.append(test_profitability(x_features=x_features_ml,y_feature='winner_h_or_a',probability_threshold=.05,bankroll=1000,test_model=KNeighborsClassifier(n_neighbors=100),max_bet=.05))
    #moneyline_results_LR.append(test_profitability(x_features=x_features_ml,y_feature='winner_h_or_a',probability_threshold=.0,bankroll=1000,test_model=LogisticRegression(max_iter=3000,class_weight='balanced',C=1),max_bet=.05))
    #moneyline_results_GB.append(test_profitability(x_features=x_features_ml,y_feature='winner_h_or_a',probability_threshold=.0,bankroll=1000,test_model=GradientBoostingClassifier(),max_bet=.05))
    

KeyError: "['total-first-downs-rolling-4-home', 'total-first-downs-rolling-4-away', 'rushing-first-downs-rolling-4-home', 'rushing-first-downs-rolling-4-away', 'passing-first-downs-rolling-4-home', 'passing-first-downs-rolling-4-away', 'penalty-first-downs-rolling-4-home', 'penalty-first-downs-rolling-4-away', 'net-yards-rolling-4-home', 'net-yards-rolling-4-away', 'net-rushing-yds-rolling-4-home', 'net-rushing-yds-rolling-4-away', 'rushing-plays-rolling-4-home', 'rushing-plays-rolling-4-away', 'avg-gain-rushing-rolling-4-home', 'avg-gain-rushing-rolling-4-away', 'net-passing-yds-rolling-4-home', 'net-passing-yds-rolling-4-away', 'gross-passing-yds-rolling-4-home', 'gross-passing-yds-rolling-4-away', 'yds-per-att-rolling-4-home', 'yds-per-att-rolling-4-away', 'blocked-kicks-allowed-rolling-4-home', 'blocked-kicks-allowed-rolling-4-away', 'total-plays-rolling-4-home', 'total-plays-rolling-4-away', 'avg-gain-per-play-rolling-4-home', 'avg-gain-per-play-rolling-4-away', 'passing-attempts-rolling-4-home', 'passing-attempts-rolling-4-away', 'completions-rolling-4-home', 'completions-rolling-4-away', 'int-thrown-rolling-4-home', 'int-thrown-rolling-4-away', 'fumbles-rolling-4-home', 'fumbles-rolling-4-away', 'fumbles-lost-rolling-4-home', 'fumbles-lost-rolling-4-away', 'fga-rolling-4-home', 'fga-rolling-4-away', 'fgm-rolling-4-home', 'fgm-rolling-4-away', '3rd-down-convs-rolling-4-home', '3rd-down-convs-rolling-4-away', '3rd-downs-rolling-4-home', '3rd-downs-rolling-4-away', '3rd-down-conv-rate-rolling-4-home', '3rd-down-conv-rate-rolling-4-away', 'punts-rolling-4-home', 'punts-rolling-4-away', 'yards-per-punt-rolling-4-home', 'yards-per-punt-rolling-4-away', 'penalties-rolling-4-home', 'penalties-rolling-4-away', 'penalty-yards-rolling-4-home', 'penalty-yards-rolling-4-away', 'sacks_allowed-rolling-4-home', 'sacks_allowed-rolling-4-away', 'sack-yds-lost-rolling-4-home', 'sack-yds-lost-rolling-4-away', 'punts-returned-rolling-4-home', 'punts-returned-rolling-4-away', 
'punt-return-yds-rolling-4-home', 'punt-return-yds-rolling-4-away', 'kicks-returned-rolling-4-home', 'kicks-returned-rolling-4-away', 'kick-return-yds-rolling-4-home', 'kick-return-yds-rolling-4-away', 'total-first-downs-rolling-8-home', 'total-first-downs-rolling-8-away', 'rushing-first-downs-rolling-8-home', 'rushing-first-downs-rolling-8-away', 'passing-first-downs-rolling-8-home', 'passing-first-downs-rolling-8-away', 'penalty-first-downs-rolling-8-home', 'penalty-first-downs-rolling-8-away', 'net-yards-rolling-8-home', 'net-yards-rolling-8-away', 'net-rushing-yds-rolling-8-home', 'net-rushing-yds-rolling-8-away', 'rushing-plays-rolling-8-home', 'rushing-plays-rolling-8-away', 'avg-gain-rushing-rolling-8-home', 'avg-gain-rushing-rolling-8-away', 'net-passing-yds-rolling-8-home', 'net-passing-yds-rolling-8-away', 'gross-passing-yds-rolling-8-home', 'gross-passing-yds-rolling-8-away', 'yds-per-att-rolling-8-home', 'yds-per-att-rolling-8-away', 'blocked-kicks-allowed-rolling-8-home', 'blocked-kicks-allowed-rolling-8-away', 'total-plays-rolling-8-home', 'total-plays-rolling-8-away', 'avg-gain-per-play-rolling-8-home', 'avg-gain-per-play-rolling-8-away', 'passing-attempts-rolling-8-home', 'passing-attempts-rolling-8-away', 'completions-rolling-8-home', 'completions-rolling-8-away', 'int-thrown-rolling-8-home', 'int-thrown-rolling-8-away', 'interceptions-rolling-8-home', 'interceptions-rolling-8-away', 'int-return-yards-rolling-8-home', 'int-return-yards-rolling-8-away', 'fumbles-rolling-8-home', 'fumbles-rolling-8-away', 'fumbles-lost-rolling-8-home', 'fumbles-lost-rolling-8-away', 'fga-rolling-8-home', 'fga-rolling-8-away', 'fgm-rolling-8-home', 'fgm-rolling-8-away', '3rd-down-convs-rolling-8-home', '3rd-down-convs-rolling-8-away', '3rd-downs-rolling-8-home', '3rd-downs-rolling-8-away', '3rd-down-conv-rate-rolling-8-home', '3rd-down-conv-rate-rolling-8-away', 'punts-rolling-8-home', 'punts-rolling-8-away', 'yards-per-punt-rolling-8-home', 
'yards-per-punt-rolling-8-away', 'penalties-rolling-8-home', 'penalties-rolling-8-away', 'penalty-yards-rolling-8-home', 'penalty-yards-rolling-8-away', 'sacks_allowed-rolling-8-home', 'sacks_allowed-rolling-8-away', 'sack-yds-lost-rolling-8-home', 'sack-yds-lost-rolling-8-away', 'punts-returned-rolling-8-home', 'punts-returned-rolling-8-away', 'punt-return-yds-rolling-8-home', 'punt-return-yds-rolling-8-away', 'kicks-returned-rolling-8-home', 'kicks-returned-rolling-8-away', 'kick-return-yds-rolling-8-home', 'kick-return-yds-rolling-8-away', 'total-first-downs-rolling-16-home', 'total-first-downs-rolling-16-away', 'rushing-first-downs-rolling-16-home', 'rushing-first-downs-rolling-16-away', 'passing-first-downs-rolling-16-home', 'passing-first-downs-rolling-16-away', 'penalty-first-downs-rolling-16-home', 'penalty-first-downs-rolling-16-away', 'net-yards-rolling-16-home', 'net-yards-rolling-16-away', 'net-rushing-yds-rolling-16-home', 'net-rushing-yds-rolling-16-away', 'rushing-plays-rolling-16-home', 'rushing-plays-rolling-16-away', 'avg-gain-rushing-rolling-16-home', 'avg-gain-rushing-rolling-16-away', 'net-passing-yds-rolling-16-home', 'net-passing-yds-rolling-16-away', 'gross-passing-yds-rolling-16-home', 'gross-passing-yds-rolling-16-away', 'yds-per-att-rolling-16-home', 'yds-per-att-rolling-16-away', 'blocked-kicks-allowed-rolling-16-home', 'blocked-kicks-allowed-rolling-16-away', 'total-plays-rolling-16-home', 'total-plays-rolling-16-away', 'avg-gain-per-play-rolling-16-home', 'avg-gain-per-play-rolling-16-away', 'passing-attempts-rolling-16-home', 'passing-attempts-rolling-16-away', 'completions-rolling-16-home', 'completions-rolling-16-away', 'int-thrown-rolling-16-home', 'int-thrown-rolling-16-away', 'interceptions-rolling-16-home', 'interceptions-rolling-16-away', 'int-return-yards-rolling-16-home', 'int-return-yards-rolling-16-away', 'fumbles-rolling-16-home', 'fumbles-rolling-16-away', 'fumbles-lost-rolling-16-home', 'fumbles-lost-rolling-16-away', 
'fga-rolling-16-home', 'fga-rolling-16-away', 'fgm-rolling-16-home', 'fgm-rolling-16-away', '3rd-down-convs-rolling-16-home', '3rd-down-convs-rolling-16-away', '3rd-downs-rolling-16-home', '3rd-downs-rolling-16-away', '3rd-down-conv-rate-rolling-16-home', '3rd-down-conv-rate-rolling-16-away', 'punts-rolling-16-home', 'punts-rolling-16-away', 'yards-per-punt-rolling-16-home', 'yards-per-punt-rolling-16-away', 'penalties-rolling-16-home', 'penalties-rolling-16-away', 'penalty-yards-rolling-16-home', 'penalty-yards-rolling-16-away', 'sacks_allowed-rolling-16-home', 'sacks_allowed-rolling-16-away', 'sack-yds-lost-rolling-16-home', 'sack-yds-lost-rolling-16-away', 'punts-returned-rolling-16-home', 'punts-returned-rolling-16-away', 'punt-return-yds-rolling-16-home', 'punt-return-yds-rolling-16-away', 'kicks-returned-rolling-16-home', 'kicks-returned-rolling-16-away', 'kick-return-yds-rolling-16-home', 'kick-return-yds-rolling-16-away'] not in index"

In [None]:
def get_results(results):
    """Split a collection of result records into five parallel lists.

    Each record in ``results`` must support integer indexing for positions
    0 through 4; any further positions are ignored.

    Parameters
    ----------
    results : iterable
        Records to split. It is traversed exactly once.

    Returns
    -------
    tuple of five lists
        In order: scores (index 0), profits (index 1), evs (index 2),
        bet_sizes (index 3) and bet_results (index 4). The i-th entry of
        every list comes from the i-th record.
    """
    # One accumulator per record position, kept in return order:
    # scores, profits, evs, bet_sizes, bet_results.
    columns = ([], [], [], [], [])
    for record in results:
        for position, column in enumerate(columns):
            column.append(record[position])
    return columns

# Unpack every result collection into its five parallel lists
# (scores, profits, evs, bet sizes, bet results) using get_results.
# The *_results_* inputs are not defined in this cell, so they must
# already exist in the kernel before it runs.
(
    SGD_over_scores,
    SGD_over_profits,
    SGD_over_evs,
    SGD_over_bet_sizes,
    SGD_over_bet_results,
) = get_results(over_results_SGD)

(
    SVC_over_scores,
    SVC_over_profits,
    SVC_over_evs,
    SVC_over_bet_sizes,
    SVC_over_bet_results,
) = get_results(over_results_SVC)

(
    SVC_spread_scores,
    SVC_spread_profits,
    SVC_spread_evs,
    SVC_spread_bet_sizes,
    SVC_spread_bet_results,
) = get_results(spread_results_SVC)

(
    SGD_moneyline_scores,
    SGD_moneyline_profits,
    SGD_moneyline_evs,
    SGD_moneyline_bet_sizes,
    SGD_moneyline_bet_results,
) = get_results(moneyline_results_SGD)