In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample

In [10]:
# Load the play-by-play table; the cells below read their features and target from it.
pbp = pd.read_csv(
    '../../Data/play-by-play/pbp_exp12.csv',
)

## Python Feature Importance - RF

In [None]:
# Categorical feature columns that are replaced in `pbp` by integer codes.
categorical_cols = [
    'OffenseTeam', 'stadium'
]

# Column name -> fitted LabelEncoder. An existing dict is reused so that
# re-running this cell does not throw away the encoder of a column that is
# skipped below because it has already been encoded.
if 'label_encoders' not in globals():
    label_encoders = {}

for col in categorical_cols:
    # LabelEncoder emits integer codes, so an integer-typed column means it was
    # already encoded. Encoding it again would fit on the stringified codes
    # ('0', '1', '10', ...), reshuffle them lexicographically and lose the
    # mapping back to the original labels.
    # NOTE(review): this assumes the raw CSV columns are not integer-typed.
    if pd.api.types.is_integer_dtype(pbp[col]):
        continue

    le = LabelEncoder()
    # Cast to str so missing values and mixed types become ordinary labels.
    pbp[col] = le.fit_transform(pbp[col].astype(str))
    label_encoders[col] = le

In [30]:
# Feature matrix for this experiment (column order is preserved as listed).
X = pbp.loc[:, [
    'defenders_in_box',
    'epa',
    'game_seconds_remaining',
    'yardline_100',
    'score_differential',
    'OffenseTeam',
    'stadium',
    'offensive_predictability',
    'current_defence_rank',
    'home_team_on_offense',
]]

# Target column; the cross-validation loop compares it against 1 and 0.
y = pbp.loc[:, 'Player-Injured-On-Play']

In [31]:
# Number of stratified cross-validation folds.
k = 53
# Rows drawn per class when balancing each fold's train and test partitions.
N_PER_CLASS = 1619
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metric values, one entry appended per fold.
accuracy_scores, precision_scores, recall_scores = [], [], []
# Per-fold DataFrames holding the misclassified rows of the evaluated test set.
false_positive_cases, false_negative_cases = [], []
# fold number -> [fitted DecisionTreeClassifier, evaluated test DataFrame]
fold_models = {}
# Sum of the per-fold tree importances; turned into a mean after the loop.
feature_importances = np.zeros(X.shape[1])


# --- Cross-Validation Loop ---
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Separate injured (1) and non-injured (0) plays
    train_is_pos, train_is_neg = y_train == 1, y_train == 0
    X_train_pos, y_train_pos = X_train[train_is_pos], y_train[train_is_pos]
    X_train_neg, y_train_neg = X_train[train_is_neg], y_train[train_is_neg]

    test_is_pos, test_is_neg = y_test == 1, y_test == 0
    X_test_pos, y_test_pos = X_test[test_is_pos], y_test[test_is_pos]
    X_test_neg, y_test_neg = X_test[test_is_neg], y_test[test_is_neg]

    # Balance both partitions to N_PER_CLASS rows per class. Positives are
    # drawn with replacement; negatives are drawn without replacement, which
    # makes resample raise ValueError if a partition holds fewer than
    # N_PER_CLASS negatives.
    X_train_pos_resampled, y_train_pos_resampled = resample(
        X_train_pos, y_train_pos, replace=True, n_samples=N_PER_CLASS, random_state=42)
    X_train_neg_resampled, y_train_neg_resampled = resample(
        X_train_neg, y_train_neg, replace=False, n_samples=N_PER_CLASS, random_state=42)

    # NOTE(review): the test fold is rebalanced as well, so the metrics below
    # describe a 50/50 class mix rather than the natural class ratio of the fold.
    X_test_pos_resampled, y_test_pos_resampled = resample(
        X_test_pos, y_test_pos, replace=True, n_samples=N_PER_CLASS, random_state=42)
    X_test_neg_resampled, y_test_neg_resampled = resample(
        X_test_neg, y_test_neg, replace=False, n_samples=N_PER_CLASS, random_state=42)

    # Combine resampled data (positives first, then negatives)
    X_train_resampled = pd.concat([X_train_pos_resampled, X_train_neg_resampled])
    y_train_resampled = pd.concat([y_train_pos_resampled, y_train_neg_resampled])

    X_test_resampled = pd.concat([X_test_pos_resampled, X_test_neg_resampled])
    y_test_resampled = pd.concat([y_test_pos_resampled, y_test_neg_resampled])

    # Train Decision Tree
    dt_model = DecisionTreeClassifier(random_state=42, max_depth=6)
    dt_model.fit(X_train_resampled, y_train_resampled)
    feature_importances += dt_model.feature_importances_

    # Predict & Evaluate
    y_pred = dt_model.predict(X_test_resampled)
    accuracy_scores.append(accuracy_score(y_test_resampled, y_pred))
    precision_scores.append(precision_score(y_test_resampled, y_pred))
    recall_scores.append(recall_score(y_test_resampled, y_pred))

    # Attach predictions and labels to the evaluated rows. The label column is
    # assigned positionally: the resampled index contains duplicates, so
    # label-based alignment would only work while both indexes are identical.
    test_df = X_test_resampled.copy()
    test_df['Predicted'] = y_pred
    test_df['Actual'] = y_test_resampled.to_numpy()
    test_df['Fold'] = fold

    # Identify misclassified cases
    predicted_pos = test_df['Predicted'] == 1
    actual_pos = test_df['Actual'] == 1
    false_positive_cases.append(test_df[predicted_pos & ~actual_pos])
    false_negative_cases.append(test_df[~predicted_pos & actual_pos])

    # Store the trained model
    fold_models[fold] = [dt_model, test_df]

# Convert the accumulated sum into the mean importance per feature across folds.
feature_importances /= k



In [32]:
# Average every per-fold metric over the folds collected by the CV loop.
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)

# Report the averages to four decimal places. The per-fold spread can be
# inspected with min()/max() on the corresponding score list.
print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")


Mean Accuracy: 0.5670
Mean Precision: 0.5525
Mean Recall: 0.6787


## Our Feature Importance - RF

In [None]:
# Categorical feature columns that are replaced in `pbp` by integer codes.
categorical_cols = [
    'stadium', 'surface_type', 'offense_formation'
]

# Column name -> fitted LabelEncoder. An existing dict is reused so that the
# encoder of a column that is skipped below (because it has already been
# encoded) stays available.
if 'label_encoders' not in globals():
    label_encoders = {}

for col in categorical_cols:
    # LabelEncoder emits integer codes, so an integer-typed column means it was
    # already encoded (for example 'stadium', which the earlier encoding cell
    # also lists). Encoding it again would fit on the stringified codes
    # ('0', '1', '10', ...), reshuffle them lexicographically and lose the
    # mapping back to the original labels.
    # NOTE(review): this assumes the raw CSV columns are not integer-typed.
    if pd.api.types.is_integer_dtype(pbp[col]):
        continue

    le = LabelEncoder()
    # Cast to str so missing values and mixed types become ordinary labels.
    pbp[col] = le.fit_transform(pbp[col].astype(str))
    label_encoders[col] = le

In [4]:
# Feature matrix for this experiment (column order is preserved as listed).
X = pbp.loc[:, [
    'yardline_100',
    'stadium',
    'game_seconds_remaining',
    'surface_type',
    'HOME_day_since_last_game',
    'home_team_on_defense',
    'defenders_in_box',
    'offense_formation',
    'run_count',
    'epa',
]]

# Target column; the cross-validation loop compares it against 1 and 0.
y = pbp.loc[:, 'Player-Injured-On-Play']

In [6]:
# Number of stratified cross-validation folds.
k = 53
# Rows drawn per class when balancing each fold's train and test partitions.
N_PER_CLASS = 1619
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Per-fold metric values, one entry appended per fold.
accuracy_scores, precision_scores, recall_scores = [], [], []
# Per-fold DataFrames holding the misclassified rows of the evaluated test set.
false_positive_cases, false_negative_cases = [], []
# fold number -> [fitted DecisionTreeClassifier, evaluated test DataFrame]
fold_models = {}


# --- Cross-Validation Loop ---
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Separate injured (1) and non-injured (0) plays
    train_is_pos, train_is_neg = y_train == 1, y_train == 0
    X_train_pos, y_train_pos = X_train[train_is_pos], y_train[train_is_pos]
    X_train_neg, y_train_neg = X_train[train_is_neg], y_train[train_is_neg]

    test_is_pos, test_is_neg = y_test == 1, y_test == 0
    X_test_pos, y_test_pos = X_test[test_is_pos], y_test[test_is_pos]
    X_test_neg, y_test_neg = X_test[test_is_neg], y_test[test_is_neg]

    # Balance both partitions to N_PER_CLASS rows per class. Positives are
    # drawn with replacement; negatives are drawn without replacement, which
    # makes resample raise ValueError if a partition holds fewer than
    # N_PER_CLASS negatives.
    X_train_pos_resampled, y_train_pos_resampled = resample(
        X_train_pos, y_train_pos, replace=True, n_samples=N_PER_CLASS, random_state=42)
    X_train_neg_resampled, y_train_neg_resampled = resample(
        X_train_neg, y_train_neg, replace=False, n_samples=N_PER_CLASS, random_state=42)

    # NOTE(review): the test fold is rebalanced as well, so the metrics below
    # describe a 50/50 class mix rather than the natural class ratio of the fold.
    X_test_pos_resampled, y_test_pos_resampled = resample(
        X_test_pos, y_test_pos, replace=True, n_samples=N_PER_CLASS, random_state=42)
    X_test_neg_resampled, y_test_neg_resampled = resample(
        X_test_neg, y_test_neg, replace=False, n_samples=N_PER_CLASS, random_state=42)

    # Combine resampled data (positives first, then negatives)
    X_train_resampled = pd.concat([X_train_pos_resampled, X_train_neg_resampled])
    y_train_resampled = pd.concat([y_train_pos_resampled, y_train_neg_resampled])

    X_test_resampled = pd.concat([X_test_pos_resampled, X_test_neg_resampled])
    y_test_resampled = pd.concat([y_test_pos_resampled, y_test_neg_resampled])

    # Train Decision Tree
    dt_model = DecisionTreeClassifier(random_state=42, max_depth=6)
    dt_model.fit(X_train_resampled, y_train_resampled)

    # Predict & Evaluate
    y_pred = dt_model.predict(X_test_resampled)
    accuracy_scores.append(accuracy_score(y_test_resampled, y_pred))
    precision_scores.append(precision_score(y_test_resampled, y_pred))
    recall_scores.append(recall_score(y_test_resampled, y_pred))

    # Attach predictions and labels to the evaluated rows. The label column is
    # assigned positionally: the resampled index contains duplicates, so
    # label-based alignment would only work while both indexes are identical.
    test_df = X_test_resampled.copy()
    test_df['Predicted'] = y_pred
    test_df['Actual'] = y_test_resampled.to_numpy()
    test_df['Fold'] = fold

    # Identify misclassified cases
    predicted_pos = test_df['Predicted'] == 1
    actual_pos = test_df['Actual'] == 1
    false_positive_cases.append(test_df[predicted_pos & ~actual_pos])
    false_negative_cases.append(test_df[~predicted_pos & actual_pos])

    # Store the trained model
    fold_models[fold] = [dt_model, test_df]

In [8]:
# Average every per-fold metric over the folds collected by the CV loop.
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)

# Report the averages to four decimal places. The per-fold spread can be
# inspected with min()/max() on the corresponding score list.
print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")

Mean Accuracy: 0.5745
Mean Precision: 0.5582
Mean Recall: 0.6954
