# **Capstone Development Week 8: K-Nearest Neighbors (KNN)**

In this week we will investigate the effect that continuous numeric data has on injuries within our NFL datasets using KNN.

Because we previously investigated using Decision trees and ensemble methods to no avail, it is useful to consider the numberic (continuous) data alone to see if this 'moves the needle' on our classification strategy. 

#### **Useful Imports**

In [50]:
# Data Science Libraries
import numpy as np
import pandas as pd
import seaborn as sns

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.ticker as mticker  # Optional: Format y-axis labels as dollars
import seaborn as sns
import matplotlib.pyplot as plt

# sk-learn stuff
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


# Scikit-learn (Machine Learning)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    RandomizedSearchCV,
    RepeatedStratifiedKFold,
    RepeatedKFold
)

from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, root_mean_squared_error, accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.feature_selection import SequentialFeatureSelector, f_regression, SelectKBest
from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier, ElasticNet
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.ensemble        import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
# from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier, RUSBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier


# Progress Tracking
from tqdm import tqdm

#### **Dataset Imports**

In [85]:
# # Base Datasets
# BDB_All_Plays_Model_Ready = pd.read_csv("../../BDB_All_Plays_Model_Ready.csv") # Big Data Bowl Dataset
# PDA_Model_Ready = pd.read_csv("../../PDA_Model_Ready.csv") # Punt Data Analytics
# FNF_Model_Ready = pd.read_csv("../../FNF_Model_Ready.csv") # First and Future
# bdb_all_plays_clean_numeric = pd.read_csv(".../.../Feature_Subsets/BDB_all_plays_clean_numeric")

# # PCA and Standardized Datasets
# PDA_PCA_Features = pd.read_csv('../../Feature_Subsets/PDA_PCA_Features.csv')
# FNF_PCA_Features = pd.read_csv('../../Feature_Subsets/FNF_PCA_Features.csv')
# BDB_PCA_Features = pd.read_csv('../../Feature_Subsets/BDB_PCA_Features.csv')


bdb_all_plays_clean_numeric = pd.read_csv("../../Feature_Subsets/BDB_all_plays_clean_numeric.csv")
fnf_all_plays_clean_numeric = pd.read_csv("../../Feature_Subsets/FNF_All_Plays_Numeric.csv")
pda_all_plays_clean_numeric = pd.read_csv("../../Feature_Subsets/PDA_Numeric.csv")


#### **Useful Functions**

In [5]:
def categorify(df, target_column: str ='Inj_Occured'):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Remove target if it’s numeric
    if target_column in numeric_cols:
        numeric_cols.remove(target_column)

    # Identify binary (0/1) columns among numeric ones
    def is_binary_01(s: pd.Series) -> bool:
        x = s.dropna().unique()
        if len(x) == 0:
            return False
        return set(pd.Series(x).astype(float)).issubset({0.0, 1.0}) and s.nunique(dropna=True) <= 2

    one_hot_cols = [c for c in numeric_cols if is_binary_01(df[c])]
    numeric_cols_to_scale = [c for c in numeric_cols if c not in one_hot_cols]

    # Seperate out categorical columns
    exclude = set(numeric_cols_to_scale + [target_column])
    categorical_cols = [c for c in df.columns if c not in exclude]

    return categorical_cols
    

Modified to get continuous Numeric Data out on already-one-hot-encoded DFs

In [32]:
def get_numeric_non_onehot(df, target_column='Inj_Occured'):
    # 1. Numeric columns
    numeric_cols = df.select_dtypes(include='number').columns.tolist()
    
    # 2. Remove target if present
    if target_column in numeric_cols:
        numeric_cols.remove(target_column)

    # 3. Keep only columns with more than 2 unique values → not one-hot
    numeric_non_onehot = [col for col in numeric_cols 
                          if df[col].nunique(dropna=True) > 2]

    return numeric_non_onehot

And re-using our run_model_classifier from previous weeks

In [49]:
# =============================================================================================
# Taken from Mod 3 Week 8:
# https://github.com/waysnyder/Module-3-Assignments/blob/main/Homework_08.ipynb
# 
# Global dataframe logic taken from mod 3 final project: 
# https://github.com/LeeMcFarling/Final_Project_Writeup/blob/main/Final_Project_Report.ipynb
# 
# Final Function was developed in Week 2 of this Module
# =============================================================================================

def run_model_classifier(model, X_train, y_train, X_test, y_test, n_repeats=10, n_jobs=-1, run_comment=None, return_model=False, concat_results=False, **model_params):

    global combined_results
    # Remove extra key used to store error metric, if it was added to the parameter dictionary
    if 'accuracy_found' in model_params:
        model_params = model_params.copy()
        model_params.pop('accuracy_found', None)  
        
    # Instantiate the model if a class is provided
    if isinstance(model, type):
        model = model(**model_params)
    else:                                    
        model.set_params(**model_params)    

    model_name = model.__name__ if isinstance(model, type) else model.__class__.__name__ # Added because 


    # Use RepeatedStratifiedKFold for classification to preserve class distribution
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=n_repeats, random_state=42)
    
    # Perform 5-fold cross-validation using accuracy as the scoring metric
    cv_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=n_jobs)
    
    mean_cv_accuracy = np.mean(cv_scores)
    std_cv_accuracy  = np.std(cv_scores)
    
    # Fit the model on the full training set
    model.fit(X_train, y_train)
    
    # Compute training and testing accuracy
    train_preds    = model.predict(X_train)
    test_preds     = model.predict(X_test)

    # Normal Accuracy 
    train_accuracy = accuracy_score(y_train, train_preds)
    test_accuracy  = accuracy_score(y_test, test_preds)

    # Balanced Accuracy Metrics
    balanced_train_accuracy = balanced_accuracy_score(y_train, train_preds)
    balanced_test_accuracy = balanced_accuracy_score(y_test, test_preds)

    results_df = pd.DataFrame([{
        'model': model_name, 
        'model_params': model.get_params(),
        'mean_cv_accuracy': mean_cv_accuracy,
        'std_cv_accuracy': std_cv_accuracy,
        'train_accuracy': train_accuracy, 
        'test_accuracy': test_accuracy,
        'balanced_train_accuracy' : balanced_train_accuracy,
        'balanced_test_accuracy': balanced_test_accuracy,
        'run_comment': run_comment
    }])
    
    if concat_results:
        try:
            combined_results = pd.concat([combined_results, results_df], ignore_index=True)
        except NameError:
            combined_results = results_df

    return (results_df, model) if return_model else results_df

#### **Big Data Bowl**

Before we get started, it is useful to note that KNN is mostly for numeric data regarding continuous physical pehnomena. In this sense, we will isolate continuous variables with value in the physical world (playResult, yardline number, etc.)

And we will investigate Euclidean and MInkowski distance metrics from there. 

In [None]:
bdb_all_plays_clean_numeric.drop(columns='Unnamed: 0', inplace=True)

In [34]:
BDB_numeric_cols = get_numeric_non_onehot(bdb_all_plays_clean_numeric)
bdb_numeric = bdb_all_plays_clean_numeric[BDB_numeric_cols]
bdb_numeric 

Unnamed: 0,quarter,down,yardsToGo,yardlineNumber,preSnapHomeScore,preSnapVisitorScore,penaltyYards,prePenaltyPlayResult,playResult,absoluteYardlineNumber,defendersInBox,frac_quarter_elapsed
0,1,3,2,33,0,0,0.0,0,0,43.0,6.0,0.10
1,1,2,6,34,0,0,0.0,5,5,76.0,6.0,0.17
2,1,1,10,39,0,0,0.0,0,0,49.0,6.0,0.34
3,1,3,15,44,0,0,0.0,0,0,54.0,7.0,0.35
4,1,2,5,11,0,0,0.0,10,10,21.0,6.0,0.41
...,...,...,...,...,...,...,...,...,...,...,...,...
8544,4,3,8,8,17,17,0.0,-8,-8,18.0,4.0,0.87
8545,4,1,10,25,20,17,0.0,3,3,35.0,5.0,0.93
8546,4,2,7,28,20,17,0.0,-8,-8,38.0,6.0,0.93
8547,4,3,15,20,20,17,0.0,0,0,30.0,5.0,0.96


Dropping the followning columns because they don't have the same positional ordinal meaning 

In [36]:
bdb_numeric = bdb_numeric.drop(columns=['preSnapHomeScore', 'preSnapVisitorScore'])

In [37]:
bdb_numeric.columns

Index(['quarter', 'down', 'yardsToGo', 'yardlineNumber', 'penaltyYards',
       'prePenaltyPlayResult', 'playResult', 'absoluteYardlineNumber',
       'defendersInBox', 'frac_quarter_elapsed'],
      dtype='object')

Train Test Split

In [46]:
numeric_cols = [
    'quarter',
    'down',
    'yardsToGo',
    'yardlineNumber',
    'penaltyYards',
    'prePenaltyPlayResult',
    'playResult',
    'absoluteYardlineNumber',
    'defendersInBox',
    'frac_quarter_elapsed'
]

target_column = 'Inj_Occured'

X = bdb_numeric[numeric_cols]
y = bdb_all_plays_clean_numeric[target_column].astype(int)

# (A second pass through cleaning in case we missed something)
X = X.replace([np.inf, -np.inf], np.nan)
mask = X.notna().all(axis=1)
X = X[mask]
y = y[mask]


bdb_X_train, bdb_X_test, bdb_y_train, bdb_y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

#### **KNN Pipeline**


In [47]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

knn_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

#### **Baseline KNN**

In [51]:
params_knn = dict(
    knn__n_neighbors=15,
    knn__weights='distance',
    knn__metric='minkowski',
    knn__p=2   # Euclidean
)



baseline_results_KNN = run_model_classifier(
    model=knn_pipe,           # Pipeline(scaler + knn)
    X_train=bdb_X_train,
    y_train=bdb_y_train,
    X_test=bdb_X_test,
    y_test=bdb_y_test,
    n_repeats=5,
    n_jobs=-1,
    run_comment='KNN Baseline: k=15, p=2, distance',
    return_model=False,
    concat_results=False,
    **params_knn
)

baseline_results_KNN

Unnamed: 0,model,model_params,mean_cv_accuracy,std_cv_accuracy,train_accuracy,test_accuracy,balanced_train_accuracy,balanced_test_accuracy,run_comment
0,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975581,0.000355,1.0,0.975439,1.0,0.5,"KNN Baseline: k=15, p=2, distance"


#### **KNN Parameter Sweep**

In [None]:
combined_results = pd.DataFrame()

for k in [5, 11, 21, 31]: # 'K' closest neighbors
    for p in [1, 2, 3]: # Manhatten / Euclidean / and Chebyshev distances
        for w in ['uniform', 'distance']:
            
            params_knn = dict(
                knn__n_neighbors=k,
                knn__weights=w,
                knn__metric='minkowski',
                knn__p=p
            )

            run_model_classifier(
                model=knn_pipe,
                X_train=bdb_X_train, 
                y_train=bdb_y_train,
                X_test=bdb_X_test,
                y_test=bdb_y_test,
                n_repeats=5,
                n_jobs=-1,
                run_comment=f'KNN sweep: k={k}, w={w}, p={p}',
                concat_results=True,
                return_model=False,
                **params_knn
            )

In [54]:
combined_results

Unnamed: 0,model,model_params,mean_cv_accuracy,std_cv_accuracy,train_accuracy,test_accuracy,balanced_train_accuracy,balanced_test_accuracy,run_comment
0,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975581,0.000355,0.975581,0.975439,0.5,0.5,"KNN sweep: k=5, w=uniform, p=1"
1,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.974675,0.000922,1.0,0.973684,1.0,0.499101,"KNN sweep: k=5, w=distance, p=1"
2,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975523,0.000363,0.975581,0.975439,0.5,0.5,"KNN sweep: k=5, w=uniform, p=2"
3,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.974645,0.000845,1.0,0.974269,1.0,0.4994,"KNN sweep: k=5, w=distance, p=2"
4,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975523,0.000363,0.975581,0.975439,0.5,0.5,"KNN sweep: k=5, w=uniform, p=3"
5,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.974587,0.000835,1.0,0.974269,1.0,0.4994,"KNN sweep: k=5, w=distance, p=3"
6,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975581,0.000355,0.975581,0.975439,0.5,0.5,"KNN sweep: k=11, w=uniform, p=1"
7,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975435,0.000412,1.0,0.974854,1.0,0.4997,"KNN sweep: k=11, w=distance, p=1"
8,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975581,0.000355,0.975581,0.975439,0.5,0.5,"KNN sweep: k=11, w=uniform, p=2"
9,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.975435,0.000412,1.0,0.975439,1.0,0.5,"KNN sweep: k=11, w=distance, p=2"


so as evidenced in our balanced test accuracy, it looks like the model is just learning to guess 'no injury' every time. Again... Let's 

In [56]:
results_knn, knn_model = run_model_classifier(
    model=knn_pipe,
    X_train=bdb_X_train,
    y_train=bdb_y_train,
    X_test=bdb_X_test,
    y_test=bdb_y_test,
    n_repeats=3,
    n_jobs=-1,
    run_comment='KNN check: k=15, distance, p=2',
    return_model=True,
    concat_results=False,
    knn__n_neighbors=15,
    knn__weights='distance',
    knn__metric='minkowski',
    knn__p=2
)

from sklearn.metrics import confusion_matrix, classification_report

y_pred = knn_model.predict(bdb_X_test)

print(confusion_matrix(bdb_y_test, y_pred))
print(classification_report(bdb_y_test, y_pred, digits=4))

[[1668    0]
 [  42    0]]
              precision    recall  f1-score   support

           0     0.9754    1.0000    0.9876      1668
           1     0.0000    0.0000    0.0000        42

    accuracy                         0.9754      1710
   macro avg     0.4877    0.5000    0.4938      1710
weighted avg     0.9515    0.9754    0.9633      1710



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


So as we expected, KNN cannot cleanly predict the classes, as there is not enough of a 'signal' for the model to correctly classify, and the classes are way too imbalanced. Shucks :(

KNN predicted none of them correctly, resulting in TP = 0 and FN = 42. The classifier effectively defaulted to predicting the majority class. Although this produced a superficially high accuracy (97.5%), it came entirely from predicting the dominant non-injury class. Balanced accuracy was 0.50, confirming that the classifier performed no better than random guessing at identifying injuries.

This demonstrates that game-state numeric features do not produce a neighborhood structure in which injury plays are meaningfully close to one another. KNN cannot learn minority-class signal under these conditions, especially given the extreme class imbalance and the absence of biomechanical variables such as velocity, deceleration, or relative contact angle. The model overfits in training (balanced training accuracy = 1.0) but fails to generalize (balanced test accuracy = 0.50), a known behavior for distance-based classifiers on highly imbalanced data.




## **First and Future**

Now let's investigate the first and future dataset to see if this is any better. 

In [68]:
fnf_all_plays_clean_numeric.drop(columns='Unnamed: 0', inplace=True)

In [70]:
fnf_all_plays_clean_numeric.columns

Index(['PlayerDay', 'PlayerGame', 'Temperature', 'PlayerGamePlay', 'x', 'y',
       'direction', 'speed', 'distance', 'time', 'Inj_Occured'],
      dtype='object')

In [72]:
fnf_all_plays_clean_numeric.drop(columns=['PlayerDay', 'PlayerGame', 'PlayerGamePlay'], inplace=True)

In [73]:
fnf_all_plays_clean_numeric

Unnamed: 0,Temperature,x,y,direction,speed,distance,time,Inj_Occured
0,63,87.665753,28.221104,186.148361,0.535753,0.056288,14.90,0
1,63,86.616462,29.144077,211.949423,0.849692,0.090077,12.95,0
2,63,79.677264,28.900203,194.134122,0.331014,0.036081,14.75,0
3,63,72.469449,26.680157,163.762205,0.523701,0.054016,6.30,0
4,63,65.407846,24.865577,222.120577,0.970385,0.099769,12.95,0
...,...,...,...,...,...,...,...,...
267001,33,41.700296,18.853989,155.077709,1.229191,0.124582,18.50,0
267002,33,53.980678,18.102780,169.963559,1.796203,0.180780,14.70,0
267003,33,57.170638,10.130479,181.141330,1.806649,0.182394,9.35,0
267004,33,54.662095,18.973687,166.529609,2.147682,0.215279,17.85,0


In [78]:
fnf_all_plays_clean_numeric.columns

Index(['Temperature', 'x', 'y', 'direction', 'speed', 'distance', 'time',
       'Inj_Occured'],
      dtype='object')

Train test split

In [81]:
def train_test_split_df(df, numeric_cols, target_column):
    numeric_cols = numeric_cols
    target_column = target_column

    X = df[numeric_cols]
    y = df[target_column].astype(int)

    # (A second pass through cleaning in case we missed something)
    X = X.replace([np.inf, -np.inf], np.nan)
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]


    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y
    )
    return X_train, X_test, y_train, y_test

In [82]:
numeric_cols = [
            'Temperature', 
            'x', 
            'y', 
            'direction', 
            'speed', 
            'distance', 
            'time'
        ]

target_column = 'Inj_Occured'


fnf_X_train, fnf_X_test, fnf_y_train, fnf_y_test = train_test_split_df(fnf_all_plays_clean_numeric, numeric_cols, target_column)

#### **Baseline**

In [83]:
params_knn = dict(
    knn__n_neighbors=15,
    knn__weights='distance',
    knn__metric='minkowski',
    knn__p=2   # Euclidean
)



fnf_baseline_results_KNN = run_model_classifier(
    model=knn_pipe,           # Pipeline(scaler + knn)
    X_train=fnf_X_train,
    y_train=fnf_y_train,
    X_test=fnf_X_test,
    y_test=fnf_y_test,
    n_repeats=5,
    n_jobs=-1,
    run_comment='fnf KNN Baseline: k=15, p=2, distance',
    return_model=False,
    concat_results=False,
    **params_knn
)

fnf_baseline_results_KNN

Unnamed: 0,model,model_params,mean_cv_accuracy,std_cv_accuracy,train_accuracy,test_accuracy,balanced_train_accuracy,balanced_test_accuracy,run_comment
0,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN Baseline: k=15, p=2, distance"


In [84]:
combined_results = pd.DataFrame()

for k in [5, 11, 21, 31]:
    for p in [1, 2, 3]:
        for w in ['uniform', 'distance']:
            
            params_knn = dict(
                knn__n_neighbors=k,
                knn__weights=w,
                knn__metric='minkowski',
                knn__p=p
            )

            run_model_classifier(
                model=knn_pipe,
                X_train=fnf_X_train, 
                y_train=fnf_y_train,
                X_test=fnf_X_test,
                y_test=fnf_y_test,
                n_repeats=5,
                n_jobs=-1,
                run_comment=f' fnf KNN sweep: k={k}, w={w}, p={p}',
                concat_results=True,
                return_model=False,
                **params_knn
            )

In [92]:
combined_results[combined_results['run_comment'].str.contains('fnf')].sort_values(by='balanced_test_accuracy', ascending=False)

Unnamed: 0,model,model_params,mean_cv_accuracy,std_cv_accuracy,train_accuracy,test_accuracy,balanced_train_accuracy,balanced_test_accuracy,run_comment
23,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=31, w=distance, p=3"
3,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=5, w=distance, p=2"
21,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=31, w=distance, p=2"
5,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=5, w=distance, p=3"
7,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=11, w=distance, p=1"
19,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=31, w=distance, p=1"
9,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=11, w=distance, p=2"
11,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=11, w=distance, p=3"
1,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=5, w=distance, p=1"
13,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"fnf KNN sweep: k=21, w=distance, p=1"


Yup, still terrible. 

The model basically isn't able to learn the dataset. Especially for this one, this makes total sense as the signal is exceedingly small and, like the other dataset that we investigated, did not have a strong correlation between the numeric data and the target variable (this was well documented in Week 1).

### **Punt Data Analytics**

In [94]:
pda_all_plays_clean_numeric.drop(columns='Season_Year', inplace=True)

In [95]:
pda_all_plays_clean_numeric

Unnamed: 0,Week,Temperature,Quarter,Inj_Occured,home_score,away_score,frac_quarter_elapsed,yardline_100
0,2,79.0,1,0,7,0,0.17,47
1,2,79.0,2,0,21,7,0.19,29
2,2,79.0,2,0,21,7,0.33,18
3,2,79.0,2,0,24,7,0.98,54
4,2,79.0,3,0,24,14,0.30,15
...,...,...,...,...,...,...,...,...
6676,3,47.0,2,0,7,14,0.19,33
6677,3,47.0,2,0,7,14,0.37,44
6678,3,47.0,2,0,7,21,0.96,47
6679,3,47.0,4,0,7,38,0.35,45


In [97]:
pda_all_plays_clean_numeric.columns

Index(['Week', 'Temperature', 'Quarter', 'Inj_Occured', 'home_score',
       'away_score', 'frac_quarter_elapsed', 'yardline_100'],
      dtype='object')

In [99]:
numeric_cols = [
            'Week', 
            'Temperature', 
            'Quarter',
            'home_score', 
            'away_score', 
            'frac_quarter_elapsed',
            'yardline_100'
        ]

target_column = 'Inj_Occured'


pda_X_train, pda_X_test, pda_y_train, pda_y_test = train_test_split_df(pda_all_plays_clean_numeric, numeric_cols, target_column)

In [100]:
params_knn = dict(
    knn__n_neighbors=15,
    knn__weights='distance',
    knn__metric='minkowski',
    knn__p=2   # Euclidean
)



pda_baseline_results_KNN = run_model_classifier(
    model=knn_pipe,           # Pipeline(scaler + knn)
    X_train=fnf_X_train,
    y_train=fnf_y_train,
    X_test=fnf_X_test,
    y_test=fnf_y_test,
    n_repeats=5,
    n_jobs=-1,
    run_comment='pda KNN Baseline: k=15, p=2, distance',
    return_model=False,
    concat_results=False,
    **params_knn
)

pda_baseline_results_KNN

Unnamed: 0,model,model_params,mean_cv_accuracy,std_cv_accuracy,train_accuracy,test_accuracy,balanced_train_accuracy,balanced_test_accuracy,run_comment
0,Pipeline,"{'memory': None, 'steps': [('scaler', Standard...",0.99971,1.1e-05,1.0,0.999738,1.0,0.533333,"pda KNN Baseline: k=15, p=2, distance"


And it's the same for punt data analytics.