# NFL Against the Spread Classification

- Building a classification model to predict if an NFL team will beat, push, or lose to the spread
- We will test multiple classification models and parameters to find the best model

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# scikit-learn

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector as selector
# models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# metrics

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score

# pipeline and grid search

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV



In [2]:
# load in csv file

df = pd.read_csv("../Data/nfl_game_data.csv")

df.head()

Unnamed: 0,game_id,season,team,opponent,spread_line,coach,roof,ats_win,location,epa,...,rush_yds_per_attempt,int,fumbles_lost,penalty_yards,def_epa,def_pass_yds_per_attempt,def_rush_yds_per_attempt,def_int,forced_fumbles,def_penlty_yards
0,2001_01_CHI_BAL,2001,BAL,CHI,10.5,Brian Billick,outdoors,win,home,-0.077838,...,1.8,0,2,15,-0.247703,5.75,2.0,2,0,16
1,2001_01_NO_BUF,2001,BUF,NO,-1.5,Gregg Williams,outdoors,lose,home,-0.340673,...,4.464286,3,0,15,0.040729,11.611111,4.192308,0,0,0
2,2001_01_NE_CIN,2001,CIN,NE,0.0,Dick LeBeau,outdoors,win,home,0.065117,...,4.757576,0,1,0,0.004959,10.954545,3.238095,0,0,0
3,2001_01_SEA_CLE,2001,CLE,SEA,-4.0,Butch Davis,outdoors,win,home,-0.137322,...,3.6,1,0,0,-0.118707,8.9,3.740741,2,0,15
4,2001_01_TB_DAL,2001,DAL,TB,-9.0,Dave Campo,outdoors,win,home,-0.429948,...,4.304348,2,1,10,-0.029573,7.8,2.151515,1,1,0


In [3]:
# drop unnecessary columns (game_id, team, )
# convert season column from integer to category
df["season"] = df.season.astype('category')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10326 entries, 0 to 10325
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   game_id                   10326 non-null  object 
 1   season                    10326 non-null  int64  
 2   team                      10326 non-null  object 
 3   opponent                  10326 non-null  object 
 4   spread_line               10326 non-null  float64
 5   coach                     10326 non-null  object 
 6   roof                      10326 non-null  object 
 7   ats_win                   10326 non-null  object 
 8   location                  10326 non-null  object 
 9   epa                       10326 non-null  float64
 10  pass_yds_per_attempt      10326 non-null  float64
 11  rush_yds_per_attempt      10326 non-null  float64
 12  int                       10326 non-null  int64  
 13  fumbles_lost              10326 non-null  int64  
 14  penalt

In [4]:
df = df.drop(columns=["game_id", "team"])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10326 entries, 0 to 10325
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype   
---  ------                    --------------  -----   
 0   season                    10326 non-null  category
 1   opponent                  10326 non-null  object  
 2   spread_line               10326 non-null  float64 
 3   coach                     10326 non-null  object  
 4   roof                      10326 non-null  object  
 5   ats_win                   10326 non-null  object  
 6   location                  10326 non-null  object  
 7   epa                       10326 non-null  float64 
 8   pass_yds_per_attempt      10326 non-null  float64 
 9   rush_yds_per_attempt      10326 non-null  float64 
 10  int                       10326 non-null  int64   
 11  fumbles_lost              10326 non-null  int64   
 12  penalty_yards             10326 non-null  int64   
 13  def_epa                   10326 non-null  floa

In [5]:
# split data into x and y 

X = df.drop("ats_win", axis=1)
y = df['ats_win']



In [6]:
numeric_features = ["spread_line", "epa", "pass_yds_per_attempt", "rush_yds_per_attempt", "int", "fumbles_lost", "penalty_yards", "def_epa",
                         "def_pass_yds_per_attempt", "def_rush_yds_per_attempt", "def_int", "forced_fumbles", "def_penlty_yards"]

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["season", "opponent", "coach", "roof", "location"]
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [8]:
# Logistic Regresssion Model

log_regression_model = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=10000))]

)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

log_regression_model.fit(X_train, y_train)

print(log_regression_model.score(X_test, y_test))

0.7989025177533893


In [9]:
prds = log_regression_model.predict(X_test)

confusion_matrix(y_test, prds)



array([[1265,    0,  267],
       [  44,    0,   39],
       [ 273,    0, 1210]], dtype=int64)

In [11]:
print(classification_report(y_test, prds))

              precision    recall  f1-score   support

        lose       0.80      0.83      0.81      1532
        push       0.00      0.00      0.00        83
         win       0.80      0.82      0.81      1483

    accuracy                           0.80      3098
   macro avg       0.53      0.55      0.54      3098
weighted avg       0.78      0.80      0.79      3098



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:


preprocessor2 = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="object")),
        ("cat", categorical_transformer, selector(dtype_include="category")),
    ]

)

model2 = Pipeline(
    steps=[("preprocessor", preprocessor2), ("classifier", LogisticRegression(max_iter=10000))]

)

model2.fit(X_train, y_train)

print(model2.score(X_test, y_test))



0.790187217559716


In [18]:
preds2 = model2.predict(X_test)

print(classification_report(y_test, preds2))

              precision    recall  f1-score   support

        lose       0.79      0.82      0.80      1532
        push       0.00      0.00      0.00        83
         win       0.79      0.80      0.80      1483

    accuracy                           0.79      3098
   macro avg       0.53      0.54      0.53      3098
weighted avg       0.77      0.79      0.78      3098



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
model3 = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier())]

)

model3.fit(X_train, y_train)

model3.score(X_test, y_test)

0.7963202065848934

In [21]:
preds3 = model3.predict(X_test)

print(classification_report(y_test, preds3))

              precision    recall  f1-score   support

        lose       0.80      0.82      0.81      1532
        push       0.00      0.00      0.00        83
         win       0.79      0.82      0.80      1483

    accuracy                           0.80      3098
   macro avg       0.53      0.55      0.54      3098
weighted avg       0.77      0.80      0.79      3098



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
probs = model3.predict_proba(X_test)

In [23]:
probs

array([[0.43, 0.04, 0.53],
       [0.82, 0.02, 0.16],
       [0.3 , 0.08, 0.62],
       ...,
       [0.81, 0.01, 0.18],
       [0.91, 0.  , 0.09],
       [0.8 , 0.03, 0.17]])

In [37]:
predictions = pd.DataFrame(probs)

In [27]:
preds3

array(['win', 'lose', 'win', ..., 'lose', 'lose', 'lose'], dtype=object)

In [38]:
predictions

Unnamed: 0,0,1,2
0,0.43,0.04,0.53
1,0.82,0.02,0.16
2,0.30,0.08,0.62
3,0.39,0.02,0.59
4,0.58,0.03,0.39
...,...,...,...
3093,0.19,0.04,0.77
3094,0.66,0.02,0.32
3095,0.81,0.01,0.18
3096,0.91,0.00,0.09
