# NFL Against the Spread Classification

- Building a classification model to predict if an NFL team will beat, push, or lose to the spread
- We will test multiple classification models and parameters to find the best model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# scikit-learn

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer

# models

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# metrics

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# pipeline and grid search

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline


In [2]:
# load in csv file

df = pd.read_csv("../Data/nfl_game_data.csv")

df.head()

Unnamed: 0,game_id,season,team,opponent,spread_line,coach,roof,ats_win,location,epa,...,rush_yds_per_attempt,int,fumbles_lost,penalty_yards,def_epa,def_pass_yds_per_attempt,def_rush_yds_per_attempt,def_int,forced_fumbles,def_penlty_yards
0,2001_01_CHI_BAL,2001,BAL,CHI,10.5,Brian Billick,outdoors,win,home,-0.077838,...,1.8,0,2,15,-0.247703,5.75,2.0,2,0,16
1,2001_01_NO_BUF,2001,BUF,NO,-1.5,Gregg Williams,outdoors,lose,home,-0.340673,...,4.464286,3,0,15,0.040729,11.611111,4.192308,0,0,0
2,2001_01_NE_CIN,2001,CIN,NE,0.0,Dick LeBeau,outdoors,win,home,0.065117,...,4.757576,0,1,0,0.004959,10.954545,3.238095,0,0,0
3,2001_01_SEA_CLE,2001,CLE,SEA,-4.0,Butch Davis,outdoors,win,home,-0.137322,...,3.6,1,0,0,-0.118707,8.9,3.740741,2,0,15
4,2001_01_TB_DAL,2001,DAL,TB,-9.0,Dave Campo,outdoors,win,home,-0.429948,...,4.304348,2,1,10,-0.029573,7.8,2.151515,1,1,0


In [3]:
# drop unnecessary columns (game_id, team, )
# convert season column from integer to category
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10326 entries, 0 to 10325
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   game_id                   10326 non-null  object 
 1   season                    10326 non-null  int64  
 2   team                      10326 non-null  object 
 3   opponent                  10326 non-null  object 
 4   spread_line               10326 non-null  float64
 5   coach                     10326 non-null  object 
 6   roof                      10326 non-null  object 
 7   ats_win                   10326 non-null  object 
 8   location                  10326 non-null  object 
 9   epa                       10326 non-null  float64
 10  pass_yds_per_attempt      10326 non-null  float64
 11  rush_yds_per_attempt      10326 non-null  float64
 12  int                       10326 non-null  int64  
 13  fumbles_lost              10326 non-null  int64  
 14  penalt

In [4]:
df = df.drop(columns=["game_id", "team"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10326 entries, 0 to 10325
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   season                    10326 non-null  int64  
 1   opponent                  10326 non-null  object 
 2   spread_line               10326 non-null  float64
 3   coach                     10326 non-null  object 
 4   roof                      10326 non-null  object 
 5   ats_win                   10326 non-null  object 
 6   location                  10326 non-null  object 
 7   epa                       10326 non-null  float64
 8   pass_yds_per_attempt      10326 non-null  float64
 9   rush_yds_per_attempt      10326 non-null  float64
 10  int                       10326 non-null  int64  
 11  fumbles_lost              10326 non-null  int64  
 12  penalty_yards             10326 non-null  int64  
 13  def_epa                   10326 non-null  float64
 14  def_pa

In [11]:
# split data into x and y and then train and test splits

X = df.drop(columns="ats_win")
y = df['ats_win']

Xtrain, xtest, ytrain, ytest = train_test_split(X, y, random_state=1, test_size=0.3)

# split data into numeric categorical

categorical_features = ["season", "opponent", "coach", "roof", "location"]

numerical_features = ["spread_line", "epa", "pass_yds_per_attempt", "rush_yds_per_attempt", "int", "fumbles_lost", "penalty_yards", "def_epa",
                         "def_pass_yds_per_attempt", "def_rush_yds_per_attempt", "def_int", "forced_fumbles", "def_penlty_yards"]

In [14]:
# Make Pipeline
# Using column transformer scales numerical variables and one hot encode categorical variables

preprocess = make_column_transformer((numerical_features, make_pipeline(SimpleImputer(), StandardScaler())),
                                    (categorical_features, OneHotEncoder()))

preprocess.fit_transform(X)

TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '['spread_line', 'epa', 'pass_yds_per_attempt', 'rush_yds_per_attempt', 'int', 'fumbles_lost', 'penalty_yards', 'def_epa', 'def_pass_yds_per_attempt', 'def_rush_yds_per_attempt', 'def_int', 'forced_fumbles', 'def_penlty_yards']' (type <class 'list'>) doesn't.

In [None]:
# random forest

rf = make_pipeline(preprocess, LogisticRegression())