In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV

In [3]:
# Load the play-by-play table used by every later cell.
# The path is relative to the notebook's working directory.
pbp_csv_path = '../../Data/play-by-play/pbp_exp12.csv'
pbp = pd.read_csv(pbp_csv_path)

In [None]:
# Columns that are converted to integer codes before modelling.
categorical_cols = [
    'OffenseTeam', 'DefenseTeam', 'play_type', 'side_of_field', 'stadium',
    'play_type_nfl', 'roof', 'surface_type', 'home_team', 'away_team',
    'season_type', 'offense_formation', 'temperature_grade', 'temperature_bucket',
]

# One LabelEncoder per column, keyed by column name, so the integer codes can
# later be mapped back to the original labels via label_encoders[col].classes_.
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on the column cast to str (missing values therefore become
# a string label of their own) and overwrite the column with the codes.
# NOTE: this replaces the columns of `pbp` in place. Running the cell a second
# time without reloading `pbp` would fit the encoders on the digit strings of
# the previous codes instead of on the original labels.
for col, le in label_encoders.items():
    pbp[col] = le.fit_transform(pbp[col].astype(str))

In [5]:
# Model inputs: every column handed to the classifier, grouped loosely by theme.
# NOTE(review): 'epa' is presumably only known after the play has been run —
# confirm it is legitimate as a predictor and not target leakage.
feature_columns = [
    # game / play context
    'week', 'OffenseTeam', 'DefenseTeam', 'Down', 'YardsToGo', 'yardline_100', 'play_type',
    'Quarter', 'game_seconds_remaining', 'game_half', 'drive', 'series', 'score_differential',
    # venue and scheduling
    'stadium', 'roof', 'surface_type', 'HOME_day_since_last_game', 'AWAY_day_since_last_game',
    'total_play_count', 'home_team_on_offense', 'home_team_on_defense', 'season_progression',
    # formation and weather
    'defenders_in_box', 'offense_formation', 'Temperature', 'Precipitation', 'goal_line_situation',
    'pass_count', 'run_count', 'current_defence_rank', 'temperature_grade', 'epa',
    # derived flags
    'late_season', 'is_close_game', 'poor_field_condition', 'rest_differential', 'blitz_situation',
    'short_rest', 'pass_run_ratio', 'offensive_predictability', 'fourth_down_attempt', 'third_and_long',
]

X = pbp[feature_columns]

# Binary target: 1 when a player was injured on the play, 0 otherwise.
y = pbp['Player-Injured-On-Play']

In [None]:
# --- Experiment configuration ---
k = 53                    # number of stratified folds
SAMPLES_PER_CLASS = 1619  # rows drawn per class for both the training and the evaluation split
RANDOM_STATE = 42         # shared seed for fold shuffling, resampling and the tree


def balance_classes(features, labels, n_per_class=SAMPLES_PER_CLASS, seed=RANDOM_STATE):
    """Return a class-balanced ``(features, labels)`` pair.

    Rows with label 1 are drawn WITH replacement and rows with label 0 WITHOUT
    replacement, ``n_per_class`` rows each. The positive rows come first in the
    returned objects, followed by the negative rows. Because positives are
    drawn with replacement, the returned index may contain duplicate labels.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows, indexed identically to ``labels``.
    labels : pandas.Series
        Binary target (0 / 1).
    n_per_class : int
        Number of rows to draw for each class.
    seed : int
        ``random_state`` passed to both ``resample`` calls.

    Raises
    ------
    ValueError
        Raised by ``sklearn.utils.resample`` when fewer than ``n_per_class``
        negative rows are available (negatives are sampled without replacement).
    """
    is_pos = labels == 1
    is_neg = labels == 0
    pos_features, pos_labels = resample(
        features[is_pos], labels[is_pos],
        replace=True, n_samples=n_per_class, random_state=seed,
    )
    neg_features, neg_labels = resample(
        features[is_neg], labels[is_neg],
        replace=False, n_samples=n_per_class, random_state=seed,
    )
    return (
        pd.concat([pos_features, neg_features]),
        pd.concat([pos_labels, neg_labels]),
    )


def build_sample_weights(features):
    """Return one integer training weight per row of ``features``.

    Every row starts at 1; each of the first three rules adds 1 and the last
    rule subtracts 1, so weights range from 0 to 4. A row that matches only
    the last rule ends up with weight 0.

    The weights are computed positionally on NumPy arrays. ``features`` can
    carry duplicate index labels after resampling with replacement, and
    label-based boolean assignment on such an index is fragile.

    NOTE: ``play_type`` is compared as an integer code, so that rule depends
    on how the column was encoded upstream.
    """
    values = np.ones(len(features), dtype=np.int64)
    values += np.asarray(features['defenders_in_box'] > 2.5, dtype=bool)
    values += np.asarray(features['epa'] <= 1, dtype=bool)
    values += np.asarray(features['play_type'] > 7.5, dtype=bool)
    values -= np.asarray(features['game_seconds_remaining'] > 2300, dtype=bool)
    return pd.Series(values, index=features.index)


skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)

# Per-fold metrics. The remaining containers are initialised here but are not
# filled by this cell.
accuracy_scores, precision_scores, recall_scores = [], [], []
false_positive_cases, false_negative_cases = [], []
fold_models = {}


# --- Cross-Validation Loop ---
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Balance both splits to SAMPLES_PER_CLASS rows per class. The evaluation
    # split is resampled as well, so the metrics below describe a 50/50 class
    # mix rather than the class ratio of the raw fold.
    X_train_resampled, y_train_resampled = balance_classes(X_train, y_train)
    X_test_resampled, y_test_resampled = balance_classes(X_test, y_test)

    weights = build_sample_weights(X_train_resampled)

    # Train Decision Tree
    dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE, max_depth=6)
    dt_model.fit(X_train_resampled, y_train_resampled, sample_weight=weights)

    # Predict & Evaluate
    y_pred = dt_model.predict(X_test_resampled)
    accuracy_scores.append(accuracy_score(y_test_resampled, y_pred))
    precision_scores.append(precision_score(y_test_resampled, y_pred))
    recall_scores.append(recall_score(y_test_resampled, y_pred))


In [8]:
# Summarise each cross-validation metric as mean / min / max over all folds.
# The means remain available as mean_accuracy, mean_precision and mean_recall.

mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(
    f"Mean Accuracy: {mean_accuracy:.4f}",
    f"Min Accuracy: {min(accuracy_scores):.4f}",
    f"Max Accuracy: {max(accuracy_scores):.4f}",
    sep="\n",
)

mean_precision = sum(precision_scores) / len(precision_scores)
print(
    f"Mean Precision: {mean_precision:.4f}",
    f"Min Precision: {min(precision_scores):.4f}",
    f"Max Precision: {max(precision_scores):.4f}",
    sep="\n",
)

mean_recall = sum(recall_scores) / len(recall_scores)
print(
    f"Mean Recall: {mean_recall:.4f}",
    f"Min Recall: {min(recall_scores):.4f}",
    f"Max Recall: {max(recall_scores):.4f}",
    sep="\n",
)


Mean Accuracy: 0.5807
Min Accuracy: 0.4833
Max Accuracy: 0.6640
Mean Precision: 0.5628
Min Precision: 0.4844
Max Precision: 0.6374
Mean Recall: 0.7110
Min Recall: 0.4466
Max Recall: 0.8814


**Mean Accuracy:** 0.5696  
**Mean Precision:** 0.5543  
**Mean Recall:** 0.6853