# **Week 2**

Lasso, Ridge, and Elastic Regression

#### **Package Imports**

In [53]:
# Standard Libraries
import os
import time
import math
import io
import zipfile
import requests
from urllib.parse import urlparse
from itertools import chain, combinations

# Data Science Libraries
import numpy as np
import pandas as pd
import seaborn as sns

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.ticker as mticker  # Optional: Format y-axis labels as dollars
import seaborn as sns
import matplotlib.pyplot as plt



# Scikit-learn (Machine Learning)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    RepeatedKFold
)

from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, root_mean_squared_error, accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector, f_regression, SelectKBest
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_predict, KFold
# Progress Tracking

from tqdm import tqdm

# =============================
# Global Variables
# =============================
random_state = 42




#### **Dataset Imports**

In [2]:
BDB_All_Plays_Model_Ready = pd.read_csv("../../AFL_Final_Project/BDB_All_Plays_Model_Ready.csv") # Big Data Bowl Dataset

#### **Getting Started: Decisions Made Thus Far**

The 'Big Data Bowl' dataset is the most difficult of the three football datasets to work with, but it is also the most feature rich. The following is a collection of observations and datapoints made thus far. 

#### **1. Numeric Data Correlation with the target Variable: *Inj_Occurred***

Bi and multivariate analysis in Semester 2 showed that none of the numeric fields had a strong correlation with the target variable, aside from the foulID columns. These were later encoded into Foul Flag columns (six seperate foul ID columns were present, each containing redundant information)

#### **2. Categorical Relation to target Variable**

The following columns have low p-vlaues meaning their relationship with the target variable is considered not due to chance. 

#### **3. VIF Analysis**

The following features were dropped because they had extremely high VIF values, even after multicollinearity drops in dummie variable creation were done. 

- pff_cassCoverage - VIF scores so high it was litterally off the charts
- personnelO - VIF scores so high it was literally off the charts
- personnelD - Highly correlated with other defensive stats. 
- prePenaltyPlayResult - similar to play result
- defendersInBox -High VIF but not highly correlated with target
- quarter - Highly correlated with other time variables but not with target

___

#### **Function Definitions**

Function to take a provided dataframe and split that dataframe into feature and target columns. 

In [3]:
# ===========================================================================================
# Function taken from Module 3 Final Project
# https://github.com/LeeMcFarling/Final_Project_Writeup/blob/main/Final_Project_Report.ipynb
# ===========================================================================================

def train_test_split_data(df, target_col):
    X = df.drop(columns=target_col)
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

___

#### **Lists to split Data into Numeric and Categorical Data**

Because we already made one hot encoded variables here are lists to seperate numeric and categorical data. 

In [None]:
numeric_columns = [
 'down',
 'yardsToGo',
 'yardlineNumber',
 'preSnapHomeScore',
 'preSnapVisitorScore',
 'penaltyYards',
 'playResult',
 'absoluteYardlineNumber',
 'frac_quarter_elapsed']

categorical_columns = [
 'possessionTeam_ATL',
 'possessionTeam_BAL',
 'possessionTeam_BUF',
 'possessionTeam_CAR',
 'possessionTeam_CHI',
 'possessionTeam_CIN',
 'possessionTeam_CLE',
 'possessionTeam_DAL',
 'possessionTeam_DEN',
 'possessionTeam_DET',
 'possessionTeam_GB',
 'possessionTeam_HOU',
 'possessionTeam_IND',
 'possessionTeam_JAX',
 'possessionTeam_KC',
 'possessionTeam_LA',
 'possessionTeam_LAC',
 'possessionTeam_LV',
 'possessionTeam_MIA',
 'possessionTeam_MIN',
 'possessionTeam_NE',
 'possessionTeam_NO',
 'possessionTeam_NYG',
 'possessionTeam_NYJ',
 'possessionTeam_PHI',
 'possessionTeam_PIT',
 'possessionTeam_SEA',
 'possessionTeam_SF',
 'possessionTeam_TB',
 'possessionTeam_TEN',
 'possessionTeam_WAS',
 'defensiveTeam_ATL',
 'defensiveTeam_BAL',
 'defensiveTeam_BUF',
 'defensiveTeam_CAR',
 'defensiveTeam_CHI',
 'defensiveTeam_CIN',
 'defensiveTeam_CLE',
 'defensiveTeam_DAL',
 'defensiveTeam_DEN',
 'defensiveTeam_DET',
 'defensiveTeam_GB',
 'defensiveTeam_HOU',
 'defensiveTeam_IND',
 'defensiveTeam_JAX',
 'defensiveTeam_KC',
 'defensiveTeam_LA',
 'defensiveTeam_LAC',
 'defensiveTeam_LV',
 'defensiveTeam_MIA',
 'defensiveTeam_MIN',
 'defensiveTeam_NE',
 'defensiveTeam_NO',
 'defensiveTeam_NYG',
 'defensiveTeam_NYJ',
 'defensiveTeam_PHI',
 'defensiveTeam_PIT',
 'defensiveTeam_SEA',
 'defensiveTeam_SF',
 'defensiveTeam_TB',
 'defensiveTeam_TEN',
 'defensiveTeam_WAS',
 'passResult_I',
 'passResult_IN',
 'passResult_R',
 'passResult_S',
 'offenseFormation_I_FORM',
 'offenseFormation_JUMBO',
 'offenseFormation_PISTOL',
 'offenseFormation_SHOTGUN',
 'offenseFormation_SINGLEBACK',
 'offenseFormation_WILDCAT',
 'dropBackType_DESIGNED_ROLLOUT_RIGHT',
 'dropBackType_DESIGNED_RUN',
 'dropBackType_SCRAMBLE',
 'dropBackType_SCRAMBLE_ROLLOUT_LEFT',
 'dropBackType_SCRAMBLE_ROLLOUT_RIGHT',
 'dropBackType_UNKNOWN',
 'pff_passCoverageType_Other',
 'pff_passCoverageType_Zone',
 'pff_playAction',
 'Inj_Occured',
 'foul_on_play',
]

#### **Standardization Function**

In [None]:
# =============================================================================================
# Standardize Numeric Features
# 
# Taken from Mod 3 final project found here: 
# https://github.com/LeeMcFarling/Final_Project_Writeup/blob/main/Final_Project_Report.ipynb
#
# Note: some errors were calling so I just hard coded it. 
# =============================================================================================

def standardize_features(df, target_column=None, debug=False, return_scaler=False):
    df_scaled = df.copy()
    numeric_cols = numeric_columns = [
                        'down',
                        'yardsToGo',
                        'yardlineNumber',
                        'preSnapHomeScore',
                        'preSnapVisitorScore',
                        'penaltyYards',
                        'playResult',
                        'absoluteYardlineNumber',
                        'frac_quarter_elapsed']

    if target_column in numeric_cols:
        numeric_cols.remove(target_column)

    # Step 4: Further exclude binary (0/1) columns
    numeric_cols_to_scale = [
        col for col in numeric_cols
        if df[col].nunique(dropna=True) > 2
    ]

    # DEBUG
    if debug:
        print("Numeric columns before filtering:", numeric_cols)
        print("Numeric columns after filtering:", numeric_cols_to_scale)

    scaler = StandardScaler()
    df_scaled[numeric_cols_to_scale] = scaler.fit_transform(df_scaled[numeric_cols_to_scale])

    if return_scaler:
        return df_scaled, scaler

    return df_scaled

#### **Run Model Classifier** 


In [None]:
# =============================================================================================
# Taken from Mod 3 Week 8:
# https://github.com/waysnyder/Module-3-Assignments/blob/main/Homework_08.ipynb
# =============================================================================================

def run_model_classifier(model, X_train, y_train, X_test, y_test, n_repeats=10, n_jobs=-1, **model_params):

    # Remove extra key used to store error metric, if it was added to the parameter dictionary
    
    if 'accuracy_found' in model_params:
        model_params = model_params.copy()
        model_params.pop('accuracy_found', None)  
        
    # Instantiate the model if a class is provided
    if isinstance(model, type):
        model = model(**model_params)
    else:                                    
        model.set_params(**model_params)    

    # Use RepeatedStratifiedKFold for classification to preserve class distribution
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=n_repeats, random_state=42)
    
    # Perform 5-fold cross-validation using accuracy as the scoring metric
    cv_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=n_jobs)
    
    mean_cv_accuracy = np.mean(cv_scores)
    std_cv_accuracy  = np.std(cv_scores)
    
    # Fit the model on the full training set
    model.fit(X_train, y_train)
    
    # Compute training and testing accuracy
    train_preds    = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_preds)
    test_preds     = model.predict(X_test)
    test_accuracy  = accuracy_score(y_test, test_preds)
    
    return mean_cv_accuracy, std_cv_accuracy, train_accuracy, test_accuracy

____

#### **Prepare Data**

In [45]:
# Non Standardized Data
X_train, X_test, y_train, y_test = train_test_split_data(BDB_All_Plays_Model_Ready, 'Inj_Occured')


# Standardized Numeric Data
BDB_All_Plays_Standardized = standardize_features(BDB_All_Plays_Model_Ready, target_column='Inj_Occured')
X_train_Scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split_data(BDB_All_Plays_Standardized, 'Inj_Occured')

____

# **Modeling**

NOTE: As previously discussed in Semester 2, the primary goal of this analysis and modeling excercise is to classify whether a particular play will result in an injury, and to determine the factors that are most likely to cause this injury. Furthermore, as was *also* previously discussed, this dataset has some extreme imbalance issues (injury occurance < 2%), and as such high performance on these baseline models is NOT expected. 

## **Baseline Logistic Regression**

In [42]:
# =============================================================================================
#
# Parameters chosen from SK Learn Documentation and picked for imbalance. 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# =============================================================================================

params_ = {
    'class_weight' : 'balanced',     # attempt to balance dataset
    'solver': 'saga',                # Doc said that this solver is better for larger datasets
    'penalty': 'l2',                 # default 
    'max_iter' : 10000,
    'fit_intercept': True,
    'random_state' : 42
}
mean_cv_accuracy, std_cv_accuracy, train_accuracy, test_accuracy = run_model_classifier(LogisticRegression, X_train, y_train, X_test, y_test, n_repeats=5, n_jobs=-1, **params_)


print(f"Mean CV Acc:  {mean_cv_accuracy*100:.2f}%")
print(f"Std CV Acc:   {std_cv_accuracy:.4f}")
print(f"Train Acc:    {train_accuracy*100:.2f}%")
print(f"Test Acc:     {test_accuracy*100:.2f}%")



Mean CV Acc:  66.06%
Std CV Acc:   0.1612
Train Acc:    59.34%
Test Acc:     57.95%




^ On our first runs, there were a ton of convergence warnings, so the max_iter was cranked up to 50 and tolerance was set at 1e-2 

In [None]:
# =============================================================================================
#
# Parameters chosen from SK Learn Documentation and picked for imbalance. 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# =============================================================================================

params_ = {
    'class_weight' : 'balanced',        # attempt to balance dataset
    'solver': 'saga',                   # Doc said that this solver is better for larger datasets
    'penalty': 'l2',                    # default
    'max_iter' : 50000,                 # Iteratively increased this until Convergence Warnings went away
    'tol': 1e-2,                        # Another convergence warning measure
    'random_state' : 42
}
mean_cv_accuracy, std_cv_accuracy, train_accuracy, test_accuracy = run_model_classifier(LogisticRegression,
                                                                                         X_train, 
                                                                                         y_train, 
                                                                                         X_test, 
                                                                                         y_test,
                                                                                         n_repeats=5, 
                                                                                         n_jobs=-1, 
                                                                                         **params_
                                                                                         )


print(f"Mean CV Acc:  {mean_cv_accuracy*100:.2f}%")
print(f"Std CV Acc:   {std_cv_accuracy:.4f}")
print(f"Train Acc:    {train_accuracy*100:.2f}%")
print(f"Test Acc:     {test_accuracy*100:.2f}%")

Mean CV Acc:  64.08%
Std CV Acc:   0.0821
Train Acc:    63.27%
Test Acc:     62.57%


Noice, we fixed the convergence issue -- now let's see how using the standardized data effects this. 

In [46]:
# =============================================================================================
#
# Parameters chosen from SK Learn Documentation and picked for imbalance. 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# =============================================================================================

params_ = {
    'class_weight' : 'balanced',
    'solver': 'saga',
    'penalty': 'l2',
    'max_iter' : 50000,
    'tol': 1e-2,
    'random_state' : 42
}
mean_cv_accuracy, std_cv_accuracy, train_accuracy, test_accuracy = run_model_classifier(LogisticRegression,
                                                                                         X_train_Scaled, 
                                                                                         y_train_scaled,
                                                                                         X_test_scaled,  
                                                                                         y_test_scaled,
                                                                                         n_repeats=5, 
                                                                                         n_jobs=-1, 
                                                                                         **params_
                                                                                         )


print(f"Mean CV Acc:  {mean_cv_accuracy*100:.2f}%")
print(f"Std CV Acc:   {std_cv_accuracy:.4f}")
print(f"Train Acc:    {train_accuracy*100:.2f}%")
print(f"Test Acc:     {test_accuracy*100:.2f}%")



Mean CV Acc:  69.84%
Std CV Acc:   0.0173
Train Acc:    69.73%
Test Acc:     68.19%


Those are some moderate improvement gains there. We can still use a different scoring accuracy metric to see if that will help even more. 

Below is the same code as was in cell [39], just with 'balanced_accuracy' substituted in the cross validation scoring

In [84]:
# =============================================================================================
# Taken from Mod 3 Week 8:
# https://github.com/waysnyder/Module-3-Assignments/blob/main/Homework_08.ipynb
# =============================================================================================

def run_model_classifier(model, X_train, y_train, X_test, y_test, n_repeats=10, n_jobs=-1, run_comment=None, return_model=False, concat_results=False, **model_params):

    global combined_results
    # Remove extra key used to store error metric, if it was added to the parameter dictionary
    if 'accuracy_found' in model_params:
        model_params = model_params.copy()
        model_params.pop('accuracy_found', None)  
        
    # Instantiate the model if a class is provided
    if isinstance(model, type):
        model = model(**model_params)
    else:                                    
        model.set_params(**model_params)    

    model_name = model.__name__ if isinstance(model, type) else model.__class__.__name__


    # Use RepeatedStratifiedKFold for classification to preserve class distribution
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=n_repeats, random_state=42)
    
    # Perform 5-fold cross-validation using accuracy as the scoring metric
    cv_scores = cross_val_score(model, X_train, y_train, scoring='balanced_accuracy', cv=cv, n_jobs=n_jobs)
    
    mean_cv_accuracy = np.mean(cv_scores)
    std_cv_accuracy  = np.std(cv_scores)
    
    # Fit the model on the full training set
    model.fit(X_train, y_train)
    
    # Compute training and testing accuracy
    train_preds    = model.predict(X_train)
    test_preds     = model.predict(X_test)

    # Normal Accuracy 
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy  = accuracy_score(y_test, test_preds)

    # Balanced Accuracy Metrics
    balanced_train_accuracy = balanced_accuracy_score(y_train, train_preds)
    balanced_test_accuracy = balanced_accuracy_score(y_test, test_preds)

    results_df = pd.DataFrame([{
        'model': model_name, 
        'model_params': model.get_params(),
        'mean_cv_accuracy': mean_cv_accuracy,
        'std_cv_accuracy': std_cv_accuracy,
        'train_accuracy': train_accuracy, 
        'test_accuracy': test_accuracy,
        'run_comment': run_comment
    }])
    
    if concat_results:
        try:
            combined_results = pd.concat([combined_results, results_df], ignore_index=True)
        except NameError:
            combined_results = results_df
            
    return (results_df, model) if return_model else results_df

And re-running the same cell as [46]

In [51]:
# =============================================================================================
# Parameters chosen from SK Learn Documentation and picked for imbalance. 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# =============================================================================================

params_ = {
    'class_weight' : 'balanced',
    'solver': 'saga',
    'penalty': 'l2',
    'max_iter' : 60000,
    'tol': 1e-2,
    'random_state' : 42
}
mean_cv_accuracy, std_cv_accuracy, train_accuracy, test_accuracy = run_model_classifier(LogisticRegression,
                                                                                         X_train_Scaled, 
                                                                                         y_train_scaled,
                                                                                         X_test_scaled,  
                                                                                         y_test_scaled,
                                                                                         n_repeats=5, 
                                                                                         n_jobs=-1, 
                                                                                         **params_
                                                                                         )


print(f"Mean CV Acc:  {mean_cv_accuracy*100:.2f}%")
print(f"Std CV Acc:   {std_cv_accuracy:.4f}")
print(f"Train Acc:    {train_accuracy*100:.2f}%")
print(f"Test Acc:     {test_accuracy*100:.2f}%")



Mean CV Acc:  57.15%
Std CV Acc:   0.0320
Train Acc:    72.95%
Test Acc:     50.77%


____

#### **Baseline Results**

_____

## **Lasso Classification**

So there isn't a lasso classification class in SK-Learn. But I saw tutorials for how to use rounding to force it to exhibit classifier-like behavior here: 

- https://saturncloud.io/blog/python-classification-with-lasso-how-to-predict-classes/#2

And then used ChatGPT to help with syntax to build a class that I could pass into our cross validation function here: 

- https://chatgpt.com/share/68cf6cfd-ff4c-800f-a7ab-ef0d5d769bf9


In [110]:
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.base import BaseEstimator, ClassifierMixin

class LassoClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, 
                alpha=1.0,
                max_iter=1000,
                tol=1e-4,
                random_state=42,
                threshold=0.5, 
                **lasso_params):
        """
        Wraps Lasso regression to behave like a classifier.
        
        alpha: regularization strength for Lasso
        threshold: value above which prediction is class 1, else 0
        lasso_params: additional parameters passed to sklearn's Lasso
        """
        self.alpha = alpha
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.threshold = threshold
        self.lasso_params = lasso_params
        self.model = None

    def fit(self, X, y):
        # Instantiate and fit underlying Lasso
        self.model = Lasso(alpha=self.alpha, **self.lasso_params)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        # Get continuous predictions and threshold them
        y_pred_cont = self.model.predict(X)
        return (y_pred_cont >= self.threshold).astype(int)

    def predict_proba(self, X):
        """
        Not strictly meaningful for regression, but we can return
        probabilities as scaled regression outputs between 0 and 1.
        """
        y_pred_cont = self.model.predict(X)
        # clip to [0, 1] range so it behaves like probabilities
        y_proba = np.clip(y_pred_cont, 0, 1)
        return np.column_stack([1 - y_proba, y_proba])

    @property
    def coef_(self):
        return self.model.coef_

    @property
    def intercept_(self):
        return self.model.intercept_


In [None]:
# =============================================================================================
# Parameters chosen from SK Learn Documentation and picked for imbalance. 
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# =============================================================================================

params_ = {
    'alpha' : 0.1,
    'max_iter' : 60000,
    'tol': 1e-2,
    'random_state' : 42
}
results_df, Fitted_Lasso = run_model_classifier(
    LassoClassifier,
    X_train_Scaled, 
    y_train_scaled,
    X_test_scaled,  
    y_test_scaled,
    n_repeats=5, 
    n_jobs=-1, 
    run_comment=None, 
    return_model=True,
    concat_results=False,
    **params_
    )

results_df


# print(f"Mean CV Acc:  {mean_cv_accuracy*100:.2f}%")
# print(f"Std CV Acc:   {std_cv_accuracy:.4f}")
# print(f"Train Acc:    {train_accuracy*100:.2f}%")
# print(f"Test Acc:     {test_accuracy*100:.2f}%")

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -0. -0. -0.  0. -0.  0.
  0. -0.  0. -0. -0.  0. -0. -0. -0.  0. -0. -0. -0.  0.  0. -0. -0. -0.
  0. -0.  0. -0.  0.  0.  0.  0. -0. -0. -0. -0.  0. -0.  0. -0.  0.  0.
 -0. -0. -0. -0.  0.  0. -0.  0.  0. -0. -0.  0. -0. -0.  0.  0.  0.  0.
 -0.  0. -0. -0. -0. -0. -0.  0. -0. -0. -0.  0.  0. -0. -0. -0.  0.  0.
  0. -0. -0.  0. -0. -0. -0. -0.  0.  0.  0. -0.  0.  0.  0. -0. -0.  0.
 -0.  0. -0. -0.  0.  0.  0. -0. -0.  0.  0. -0.  0. -0.  0.]
0.024711215089925426


In [115]:
coef_df = pd.DataFrame({
    'feature' : X_train_Scaled.columns,
    'coef' : Fitted_Lasso.coef_
})

coef_df

Unnamed: 0,feature,coef
0,down,0.0
1,yardsToGo,0.0
2,yardlineNumber,0.0
3,preSnapHomeScore,0.0
4,preSnapVisitorScore,0.0
...,...,...
118,dropBackType_SCRAMBLE_ROLLOUT_LEFT,0.0
119,dropBackType_SCRAMBLE_ROLLOUT_RIGHT,-0.0
120,dropBackType_UNKNOWN,0.0
121,pff_passCoverageType_Other,-0.0
