In [1]:
#import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from itertools import combinations
from scipy import stats as st
import numpy as np
from numpy import argmax

In [2]:
# Machine learning model libraries 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler, SMOTE, BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from imblearn.ensemble import BalancedRandomForestClassifier

In [3]:
# Import Dataset
df = pd.read_csv("../Resources/sb_champion_stats.csv", index_col = 1)
df.head()

Unnamed: 0_level_0,Unnamed: 0,total_tackles_def,sacks_def,safties_def,passes_defended_def,ints_def,fumbles_lost_fum,td_passes_pass,ints_pass,sacks_pass,...,year,team,fg_success_rate,extra_success_rate,pass_success_rate,avg_yards_pass,avg_yards_rush,kicks_blocked,passes_per_rushing_plays,Champion
team_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
arizona_cardinals_2001,0,670,5,0,29,13,7.0,0.0,0.0,0.0,...,2001,arizona_cardinals,0.833333,1.0,0.0,10.715232,3.546939,0.0,0.004082,0.0
arizona_cardinals_2002,1,547,12,0,24,7,1.0,0.0,2.0,5.0,...,2002,arizona_cardinals,0.0,0.0,0.388889,11.470588,4.25641,0.0,0.461538,0.0
arizona_cardinals_2003,2,548,15,0,35,8,7.0,5.0,7.0,25.0,...,2003,arizona_cardinals,0.75,1.0,0.568862,12.957831,4.943396,0.0,3.150943,0.0
arizona_cardinals_2004,3,511,16,0,43,7,7.0,13.0,18.0,39.0,...,2004,arizona_cardinals,0.758621,1.0,0.56015,11.369231,3.585034,0.0,3.619048,0.0
arizona_cardinals_2005,4,483,19,0,27,9,11.0,21.0,21.0,45.0,...,2005,arizona_cardinals,0.955556,1.0,0.625373,12.421053,3.541436,0.0,3.701657,0.0


In [4]:
# Drop Columns 
df = df.drop(["year", "team", "Unnamed: 0", "extra_success_rate", "fg_success_rate", "kicks_blocked", 
             "safties_def"], axis = 1)
df.shape

(511, 15)

In [5]:
X = df.drop("Champion", axis = 1)
y = df["Champion"]

# Scaling Features

In [6]:
# Generate Scaler instance and scale the data
X_scaled = StandardScaler().fit_transform(X)

# Adressing Target Variable Imbalance

## Target variable imbalance needs to be addressed to avoid overfitting

In [7]:
# Value counts for target variable shows a minority:majority class ratio of about 1:30
y.value_counts()

0.0    495
1.0     16
Name: Champion, dtype: int64

In [8]:
# Prior to addressing the issue, we must split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1)

## Naive random over and under sampling

In [9]:
# Naive under sampling will randomly reduce the amount of observations in the majority class to balance outcomes
naive_rus = RandomUnderSampler(random_state=1)
X_naive_rus, y_naive_rus = naive_rus.fit_resample(X_train, y_train)

In [10]:
# Show results
y_naive_rus.value_counts()

0.0    10
1.0    10
Name: Champion, dtype: int64

In [11]:
# Naive over sampling will randomly generate new minority class observations to balance outcomes
naive_ros = RandomOverSampler(random_state=1)
X_naive_ros, y_naive_ros = naive_ros.fit_resample(X_train, y_train)

In [12]:
# Show results
y_naive_ros.value_counts()

0.0    373
1.0    373
Name: Champion, dtype: int64

In [13]:
# We may also apply both sequentially and set the sampling strategy parameter to balance out the outcomes
naive_ros_2 = RandomOverSampler(random_state=1, sampling_strategy=.5)
X_naive_ros_2, y_naive_ros_2 = naive_ros_2.fit_resample(X_train, y_train)

naive_rus_2 = RandomUnderSampler(random_state=1)
X_naive_ros_rus, y_naive_ros_rus = naive_rus_2.fit_resample(X_naive_ros_2, y_naive_ros_2)

In [14]:
# Show Results
y_naive_ros_rus.value_counts()

0.0    186
1.0    186
Name: Champion, dtype: int64

## SMOTE oversampling

In [15]:
# Smote oversampling will create synthetic new observations similar to the minority class to balance outcomes
smote = SMOTE(random_state=1)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [16]:
# Show results
y_smote.value_counts()

0.0    373
1.0    373
Name: Champion, dtype: int64

## Combination sampling: SMOTEENN

In [17]:
# Smoteenn will balance outcomes by utilizing both over and under sampling methods
smoteenn = SMOTEENN(random_state=1)
X_smoteenn, y_smoteenn = smoteenn.fit_resample(X_train, y_train)

In [18]:
# Show results
y_smoteenn.value_counts()

1.0    368
0.0    264
Name: Champion, dtype: int64

# Supervised Learning Models: Fourth Iteration (reduced features; threshold moving)

## Logistic Regression

In [32]:
# Generate model instance
lr_model = LogisticRegression(solver='lbfgs', random_state=1)

In [33]:
# Define thresholds
thresholds = [.5, .55, .6, .65, .7, .75, .8, .85, .9]

In [34]:
# Fit on resampled features and predict

## Naive random undersample
lr_model.fit(X_naive_rus, y_naive_rus)
y_lr_naive_rus = lr_model.predict(X_test)
y_lr_naive_rus_probs = lr_model.predict_proba(X_test)[:, 1]

## Naive random oversample
lr_model.fit(X_naive_ros, y_naive_ros)
y_lr_naive_ros = lr_model.predict(X_test)
y_lr_naive_ros_probs = lr_model.predict_proba(X_test)[:, 1]

## Naive ros rus
lr_model.fit(X_naive_ros_rus, y_naive_ros_rus)
y_lr_naive_ros_rus = lr_model.predict(X_test)
y_lr_naive_ros_rus_probs = lr_model.predict_proba(X_test)[:, 1]

## SMOTE
lr_model.fit(X_smote, y_smote)
y_lr_smote = lr_model.predict(X_test)
y_lr_smote_probs = lr_model.predict_proba(X_test)[:, 1]

## SMOTEENN
lr_model.fit(X_smoteenn, y_smoteenn)
y_lr_smoteenn = lr_model.predict(X_test)
y_lr_smoteenn_probs = lr_model.predict_proba(X_test)[:, 1]

In [22]:
# Display results

## Confusion Matrix, Accuracy Score and Classification Report
for t in thresholds:
    for x in [y_lr_naive_rus_probs, y_lr_naive_ros_probs, y_lr_naive_ros_rus_probs, y_lr_smote_probs, y_lr_smoteenn_probs]:
        print(f"Model: Logistic Regression; Threshold: {t}")
        print("Accuracy Score:")
        print(accuracy_score(y_test, (x >= t).astype(int)))
        print("Confusion Matrix")
        print(confusion_matrix(y_test, (x >= t).astype(int)))
        print("Classification Report")
        print(classification_report(y_test, (x >= t).astype(int)))
        print(" ")

Model: Logistic Regression; Threshold: 0.5
Accuracy Score:
0.515625
Confusion Matrix
[[62 60]
 [ 2  4]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.97      0.51      0.67       122
         1.0       0.06      0.67      0.11         6

    accuracy                           0.52       128
   macro avg       0.52      0.59      0.39       128
weighted avg       0.93      0.52      0.64       128

 
Model: Logistic Regression; Threshold: 0.5
Accuracy Score:
0.78125
Confusion Matrix
[[98 24]
 [ 4  2]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.96      0.80      0.88       122
         1.0       0.08      0.33      0.12         6

    accuracy                           0.78       128
   macro avg       0.52      0.57      0.50       128
weighted avg       0.92      0.78      0.84       128

 
Model: Logistic Regression; Threshold: 0.5
Accuracy Score:
0.765625
Confusion Matrix
[[96 2

 [ 4  2]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.96      0.80      0.88       122
         1.0       0.08      0.33      0.12         6

    accuracy                           0.78       128
   macro avg       0.52      0.57      0.50       128
weighted avg       0.92      0.78      0.84       128

 
Model: Logistic Regression; Threshold: 0.7
Accuracy Score:
0.75
Confusion Matrix
[[95 27]
 [ 5  1]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.95      0.78      0.86       122
         1.0       0.04      0.17      0.06         6

    accuracy                           0.75       128
   macro avg       0.49      0.47      0.46       128
weighted avg       0.91      0.75      0.82       128

 
Model: Logistic Regression; Threshold: 0.7
Accuracy Score:
0.8984375
Confusion Matrix
[[113   9]
 [  4   2]]
Classification Report
              precision    recall  f1-score   support

  

## Random Forest Classifier

In [23]:
# Generate model instance
rf_model = RandomForestClassifier(n_estimators=400, random_state=1)

In [24]:
# Fit on resampled features and predict

## Naive random undersample
rf_model.fit(X_naive_rus, y_naive_rus)
y_rf_naive_rus = rf_model.predict(X_test)
y_rf_naive_rus_probs = rf_model.predict_proba(X_test)[:, 1]

## Naive random oversample
rf_model.fit(X_naive_ros, y_naive_ros)
y_rf_naive_ros = rf_model.predict(X_test)
y_rf_naive_ros_probs = rf_model.predict_proba(X_test)[:, 1]

## Naive ros rus
rf_model.fit(X_naive_ros_rus, y_naive_ros_rus)
y_rf_naive_ros_rus = rf_model.predict(X_test)
y_rf_naive_ros_rus_probs = rf_model.predict_proba(X_test)[:, 1]

## SMOTE
rf_model.fit(X_smote, y_smote)
y_rf_smote = rf_model.predict(X_test)
y_rf_smote_probs = rf_model.predict_proba(X_test)[:, 1]

## SMOTEENN
rf_model.fit(X_smoteenn, y_smoteenn)
y_rf_smoteenn = rf_model.predict(X_test)
y_rf_smoteenn_probs = rf_model.predict_proba(X_test)[:, 1]

In [25]:
# Display results

## Confusion Matrix, Accuracy Score and Classification Report
for t in thresholds:
    for x in [y_rf_naive_rus_probs, y_rf_naive_ros_probs, y_rf_naive_ros_rus_probs, y_rf_smote_probs, y_rf_smoteenn_probs]:
        print(f"Model: Random Forest Classifier; Threshold: {t}")
        print("Accuracy Score:")
        print(accuracy_score(y_test, (x >= t).astype(int)))
        print("Confusion Matrix")
        print(confusion_matrix(y_test, (x >= t).astype(int)))
        print("Classification Report")
        print(classification_report(y_test, (x >= t).astype(int)))
        print(" ")

Model: Random Forest Classifier; Threshold: 0.5
Accuracy Score:
0.421875
Confusion Matrix
[[50 72]
 [ 2  4]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.96      0.41      0.57       122
         1.0       0.05      0.67      0.10         6

    accuracy                           0.42       128
   macro avg       0.51      0.54      0.34       128
weighted avg       0.92      0.42      0.55       128

 
Model: Random Forest Classifier; Threshold: 0.5
Accuracy Score:
0.953125
Confusion Matrix
[[122   0]
 [  6   0]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98       122
         1.0       0.00      0.00      0.00         6

    accuracy                           0.95       128
   macro avg       0.48      0.50      0.49       128
weighted avg       0.91      0.95      0.93       128

 
Model: Random Forest Classifier; Threshold: 0.5
Accuracy Score:
0.953125
Con

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98       122
         1.0       0.00      0.00      0.00         6

    accuracy                           0.95       128
   macro avg       0.48      0.50      0.49       128
weighted avg       0.91      0.95      0.93       128

 
Model: Random Forest Classifier; Threshold: 0.7
Accuracy Score:
0.953125
Confusion Matrix
[[122   0]
 [  6   0]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98       122
         1.0       0.00      0.00      0.00         6

    accuracy                           0.95       128
   macro avg       0.48      0.50      0.49       128
weighted avg       0.91      0.95      0.93       128

 
Model: Random Forest Classifier; Threshold: 0.7
Accuracy Score:
0.953125
Confusion Matrix
[[122   0]
 [  6   0]]
Classification Report
              precision    recall  f1-score   support

         0.0     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98       122
         1.0       0.00      0.00      0.00         6

    accuracy                           0.95       128
   macro avg       0.48      0.50      0.49       128
weighted avg       0.91      0.95      0.93       128

 
Model: Random Forest Classifier; Threshold: 0.85
Accuracy Score:
0.953125
Confusion Matrix
[[122   0]
 [  6   0]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.98       122
         1.0       0.00      0.00      0.00         6

    accuracy                           0.95       128
   macro avg       0.48      0.50      0.49       128
weighted avg       0.91      0.95      0.93       128

 
Model: Random Forest Classifier; Threshold: 0.9
Accuracy Score:
0.953125
Confusion Matrix
[[122   0]
 [  6   0]]
Classification Report
              precision    recall  f1-score   support

         0.0    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Balanced Random Forest Classifier

In [26]:
# Generate Imbalanced Random Forest Classier instance
irf_model = BalancedRandomForestClassifier(n_estimators=250, random_state=1)

In [27]:
# Fit on features and predict
irf_model.fit(X_train, y_train)
y_irf = irf_model.predict(X_test)
y_irf_probs = irf_model.predict_proba(X_test)[:, 1]

In [28]:
# Display results
for t in thresholds:
    print(f"Model: Balanced Random Forest Classifier; Threshold: {t}")
    print("Accuracy Score:")
    print(accuracy_score(y_test, (y_irf_probs >= t).astype(int)))
    print("Confusion Matrix")
    print(confusion_matrix(y_test, (y_irf_probs >= t).astype(int)))
    print("Classification Report")
    print(classification_report(y_test, (y_irf_probs >= t).astype(int)))
    print(" ")

Model: Balanced Random Forest Classifier; Threshold: 0.5
Accuracy Score:
0.5625
Confusion Matrix
[[68 54]
 [ 2  4]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.97      0.56      0.71       122
         1.0       0.07      0.67      0.12         6

    accuracy                           0.56       128
   macro avg       0.52      0.61      0.42       128
weighted avg       0.93      0.56      0.68       128

 
Model: Balanced Random Forest Classifier; Threshold: 0.55
Accuracy Score:
0.6953125
Confusion Matrix
[[85 37]
 [ 2  4]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.98      0.70      0.81       122
         1.0       0.10      0.67      0.17         6

    accuracy                           0.70       128
   macro avg       0.54      0.68      0.49       128
weighted avg       0.94      0.70      0.78       128

 
Model: Balanced Random Forest Classifier; Threshold: 0.6
Accur

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Analyzing threshold values between .7-.8 in logistic regression

In [35]:
# Generate model instance
lr_model = LogisticRegression(solver='lbfgs', random_state=1)

In [36]:
# Define thresholds
thresholds = [.7, .71, .72, .73, .74, .75, .76, .77, .78, .79]

In [37]:
# Display results

## Confusion Matrix, Accuracy Score and Classification Report
for t in thresholds:
    for x in [y_lr_naive_ros_probs, y_lr_naive_ros_rus_probs, y_lr_smote_probs]:
        print(f"Model: Logistic Regression; Threshold: {t}")
        print("Accuracy Score:")
        print(accuracy_score(y_test, (x >= t).astype(int)))
        print("Confusion Matrix")
        print(confusion_matrix(y_test, (x >= t).astype(int)))
        print("Classification Report")
        print(classification_report(y_test, (x >= t).astype(int)))
        print(" ")

Model: Logistic Regression; Threshold: 0.7
Accuracy Score:
0.8984375
Confusion Matrix
[[113   9]
 [  4   2]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95       122
         1.0       0.18      0.33      0.24         6

    accuracy                           0.90       128
   macro avg       0.57      0.63      0.59       128
weighted avg       0.93      0.90      0.91       128

 
Model: Logistic Regression; Threshold: 0.7
Accuracy Score:
0.8515625
Confusion Matrix
[[107  15]
 [  4   2]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.96      0.88      0.92       122
         1.0       0.12      0.33      0.17         6

    accuracy                           0.85       128
   macro avg       0.54      0.61      0.55       128
weighted avg       0.92      0.85      0.88       128

 
Model: Logistic Regression; Threshold: 0.7
Accuracy Score:
0.890625
Confusion Ma

              precision    recall  f1-score   support

         0.0       0.97      0.93      0.95       122
         1.0       0.20      0.33      0.25         6

    accuracy                           0.91       128
   macro avg       0.58      0.63      0.60       128
weighted avg       0.93      0.91      0.92       128

 
Model: Logistic Regression; Threshold: 0.77
Accuracy Score:
0.90625
Confusion Matrix
[[115   7]
 [  5   1]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.96      0.94      0.95       122
         1.0       0.12      0.17      0.14         6

    accuracy                           0.91       128
   macro avg       0.54      0.55      0.55       128
weighted avg       0.92      0.91      0.91       128

 
Model: Logistic Regression; Threshold: 0.77
Accuracy Score:
0.8828125
Confusion Matrix
[[112  10]
 [  5   1]]
Classification Report
              precision    recall  f1-score   support

         0.0       0.96  

# Best Performing Model: Logistic Regression with a .75 decision threshold and utilizing Naive Random Oversampling