In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import resample

In [2]:
# Load the play-by-play table that every later cell works on.
# The path is relative, so it resolves against the kernel's working directory.
pbp = pd.read_csv('../../Data/play-by-play/pbp_exp12.csv')

In [None]:
# Columns holding categorical values that are replaced by integer codes below
categorical_cols = [
    'OffenseTeam', 'DefenseTeam', 'play_type', 'side_of_field', 'stadium', 
    'play_type_nfl', 'roof', 'surface_type', 'home_team', 'away_team', 
    'season_type', 'offense_formation', 'temperature_grade', 'temperature_bucket'
]

# One encoder per column, keyed by column name, so the code -> label mapping
# of each column stays available after the values have been overwritten
label_encoders = {col: LabelEncoder() for col in categorical_cols}

# Fit each encoder on its column and overwrite the column with the integer codes.
# Values are cast to str first so the encoder always sees a single (string) type.
for col, encoder in label_encoders.items():
    pbp[col] = encoder.fit_transform(pbp[col].astype(str))

In [4]:
# Print, column by column, which integer code each original label received
for col in label_encoders:
    print(f"Column: {col}")
    mapping_lines = (
        f"  {class_label} -> {i}"
        for i, class_label in enumerate(label_encoders[col].classes_)
    )
    for mapping_line in mapping_lines:
        print(mapping_line)
    print("\n" + "-"*30 + "\n")

Column: OffenseTeam
  ARI -> 0
  ATL -> 1
  BAL -> 2
  BUF -> 3
  CAR -> 4
  CHI -> 5
  CIN -> 6
  CLE -> 7
  DAL -> 8
  DEN -> 9
  DET -> 10
  GB -> 11
  HOU -> 12
  IND -> 13
  JAX -> 14
  KC -> 15
  LA -> 16
  LAC -> 17
  LV -> 18
  MIA -> 19
  MIN -> 20
  NE -> 21
  NO -> 22
  NYG -> 23
  NYJ -> 24
  PHI -> 25
  PIT -> 26
  SEA -> 27
  SF -> 28
  TB -> 29
  TEN -> 30
  WAS -> 31

------------------------------

Column: DefenseTeam
  ARI -> 0
  ATL -> 1
  BAL -> 2
  BUF -> 3
  CAR -> 4
  CHI -> 5
  CIN -> 6
  CLE -> 7
  DAL -> 8
  DEN -> 9
  DET -> 10
  GB -> 11
  HOU -> 12
  IND -> 13
  JAX -> 14
  KC -> 15
  LA -> 16
  LAC -> 17
  LV -> 18
  MIA -> 19
  MIN -> 20
  NE -> 21
  NO -> 22
  NYG -> 23
  NYJ -> 24
  PHI -> 25
  PIT -> 26
  SEA -> 27
  SF -> 28
  TB -> 29
  TEN -> 30
  WAS -> 31

------------------------------

Column: play_type
  extra_point -> 0
  field_goal -> 1
  kickoff -> 2
  no_play -> 3
  pass -> 4
  punt -> 5
  qb_kneel -> 6
  qb_spike -> 7
  run -> 8

---------

In [5]:
# Columns of `pbp` that are fed to the model as features
feature_columns = [
    'week', 'OffenseTeam', 'DefenseTeam', 'Down', 'YardsToGo', 'yardline_100', 'play_type',
    'Quarter', 'game_seconds_remaining', 'game_half', 'drive', 'series', 'score_differential',
    'stadium', 'roof', 'surface_type', 'HOME_day_since_last_game', 'AWAY_day_since_last_game',
    'total_play_count', 'home_team_on_offense', 'home_team_on_defense', 'season_progression',
    'defenders_in_box', 'offense_formation', 'Temperature', 'Precipitation', 'goal_line_situation',
    'pass_count', 'run_count', 'current_defence_rank', 'temperature_grade', 'epa',
    'late_season', 'is_close_game', 'poor_field_condition', 'rest_differential', 'blitz_situation',
    'short_rest', 'pass_run_ratio', 'offensive_predictability', 'fourth_down_attempt', 'third_and_long',
]

# Feature matrix
X = pbp[feature_columns]

# Target column; later cells split it on the values 1 and 0
y = pbp['Player-Injured-On-Play']

In [6]:
def extract_rules(tree, feature_names):
    """Convert a decision tree into numbered, human-readable IF/THEN rules.

    Every leaf becomes one rule. Rules are numbered 0, 1, 2, ... in
    depth-first order, always descending into the left (``<=``) child before
    the right (``>``) child.

    Parameters
    ----------
    tree : object
        Exposes ``children_left``, ``children_right``, ``feature``,
        ``threshold`` and ``value``, each indexable by node id, with node 0
        as the root and ``children_left[node] == -1`` marking a leaf
        (callers in this notebook pass a fitted model's ``tree_``).
    feature_names : sequence of str
        ``feature_names[tree.feature[node]]`` is the name printed for the
        split at ``node``.

    Returns
    -------
    rules : dict[int, str]
        Rule number -> ``"IF <cond> AND <cond> ... THEN Injury = <class>"``.
        A tree that consists of a single leaf yields ``"IF TRUE THEN ..."``
        instead of an empty condition list.
    leaf_to_rule : dict[int, int]
        Leaf node id -> rule number.
    """
    rules = {}
    leaf_to_rule = {}  # Maps leaf node index to sequential rule number

    def traverse(node, conditions):
        if tree.children_left[node] == -1:  # Leaf node
            # Rules are numbered in the order leaves are reached
            rule_number = len(rules)
            leaf_to_rule[int(node)] = rule_number
            # argmax over the leaf's value array gives the *position* of the
            # winning class; that equals the class label only when labels are
            # 0..n-1 (this notebook filters the target on 0 and 1).
            class_pred = int(np.argmax(tree.value[node]))
            # A root that is itself a leaf has no path conditions
            antecedent = ' AND '.join(conditions) if conditions else 'TRUE'
            rules[rule_number] = f"IF {antecedent} THEN Injury = {class_pred}"
            return

        feature = feature_names[tree.feature[node]]
        threshold = tree.threshold[node]

        # Thresholds are shown rounded to two decimals in the rule text only
        traverse(tree.children_left[node], conditions + [f"{feature} <= {threshold:.2f}"])
        traverse(tree.children_right[node], conditions + [f"{feature} > {threshold:.2f}"])

    traverse(0, [])
    return rules, leaf_to_rule  # Return both rules and mapping

In [7]:
# --- Function to Assign a Rule Number to Each Instance Using Leaf Mapping ---
def get_rule_number(tree, sample, feature_names, leaf_to_rule):
    """Route one sample through the tree and return the rule number of its leaf.

    Starting at node 0, the sample moves to the left child when its value for
    the node's split feature is ``<=`` the node's threshold, otherwise to the
    right child, until a node whose ``children_left`` entry is -1 is reached.

    Parameters
    ----------
    tree : object exposing ``children_left``, ``children_right``, ``feature``
        and ``threshold``, each indexable by node id.
    sample : mapping-like, indexed by feature name (e.g. a DataFrame row).
    feature_names : sequence of str, indexed by the values in ``tree.feature``.
    leaf_to_rule : dict mapping leaf node id to rule number.

    Returns
    -------
    The rule number of the reached leaf, or -1 if that leaf is not in
    ``leaf_to_rule``.
    """
    current = 0  # root
    while True:
        left_child = tree.children_left[current]
        if left_child == -1:
            # Reached a leaf: stop descending
            break
        split_feature = feature_names[tree.feature[current]]
        if sample[split_feature] <= tree.threshold[current]:
            current = left_child
        else:
            current = tree.children_right[current]

    return leaf_to_rule.get(current, -1)

In [13]:
k = 53  # number of stratified folds
# Rows drawn per class whenever a split is balanced.
# NOTE(review): presumably chosen to match the number of injury plays in one
# training split - confirm against the data before changing k or the dataset.
SAMPLES_PER_CLASS = 1619
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

# Initialize lists for storing metrics and models
accuracy_scores, precision_scores, recall_scores = [], [], []
false_positive_cases, false_negative_cases = [], []
fold_models = {}  # fold number -> [fitted model, evaluated test DataFrame]
feature_names = list(X.columns)


def _balanced_resample(features, labels, n_per_class=SAMPLES_PER_CLASS, seed=42):
    """Return a class-balanced (features, labels) pair with `n_per_class` rows per class.

    Rows labelled 1 are drawn WITH replacement and rows labelled 0 WITHOUT
    replacement, both with the same seed; the positive rows come first in the
    result. `resample` raises ValueError when fewer than `n_per_class`
    rows labelled 0 are available.
    """
    is_pos = labels == 1
    is_neg = labels == 0
    pos_features, pos_labels = resample(
        features[is_pos], labels[is_pos], replace=True, n_samples=n_per_class, random_state=seed
    )
    neg_features, neg_labels = resample(
        features[is_neg], labels[is_neg], replace=False, n_samples=n_per_class, random_state=seed
    )
    return pd.concat([pos_features, neg_features]), pd.concat([pos_labels, neg_labels])


# --- Cross-Validation Loop ---
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Balance both splits. The test split is balanced as well, so the metrics
    # below are computed on a resampled test set in which positive rows repeat.
    X_train_resampled, y_train_resampled = _balanced_resample(X_train, y_train)
    X_test_resampled, y_test_resampled = _balanced_resample(X_test, y_test)

    # Train Decision Tree
    dt_model = DecisionTreeClassifier(random_state=42, max_depth=6)
    dt_model.fit(X_train_resampled, y_train_resampled)

    # Extract rules and leaf node mapping for this fold
    rules, leaf_to_rule = extract_rules(dt_model.tree_, feature_names)

    # Predict & Evaluate
    y_pred = dt_model.predict(X_test_resampled)
    accuracy_scores.append(accuracy_score(y_test_resampled, y_pred))
    precision_scores.append(precision_score(y_test_resampled, y_pred))
    recall_scores.append(recall_score(y_test_resampled, y_pred))

    # Collect features, prediction, ground truth and fold number per test row
    test_df = X_test_resampled.copy()
    test_df['Predicted'] = y_pred
    # Assign by position: the resampled index contains duplicate labels, so
    # label-based alignment must not be relied on here.
    test_df['Actual'] = y_test_resampled.to_numpy()
    test_df['Fold'] = fold

    # Ask the fitted model itself which leaf each row lands in, so the rule
    # number always refers to the leaf that produced `Predicted`.
    leaf_ids = dt_model.apply(X_test_resampled)
    test_df['Rule_Number'] = [leaf_to_rule.get(leaf_id, -1) for leaf_id in leaf_ids]

    # Store the trained model
    fold_models[fold] = [dt_model, test_df]

## Mean Accuracy fold - Rulebase Analysis

In [None]:
# --- Identify Mean Fold ---
# The "best" fold in this section is the one whose accuracy is closest to the
# mean accuracy over all folds.
mean_accuracy = sum(accuracy_scores) / len(accuracy_scores)
distance_to_mean = np.abs(np.array(accuracy_scores) - mean_accuracy)
best_fold_index = np.argmin(distance_to_mean)

print(f"Mean Accuracy: {mean_accuracy:.4f}")
print(accuracy_scores)
print(best_fold_index)

# --- Extract Rules for the Mean Fold ---
# Each entry of fold_models is [fitted model, evaluated test DataFrame]
best_fold_model, best_fold_df = fold_models[best_fold_index]
best_tree_rules, best_leaf_to_rule = extract_rules(best_fold_model.tree_, list(X.columns))

Mean Accuracy: 0.5696
[0.6000617665225447, 0.541383570105003, 0.5216182828906732, 0.46973440395305743, 0.6111797405806053, 0.6164298949969117, 0.5781346510191476, 0.5466337245213094, 0.47992588017294624, 0.5562075355157504, 0.5046324891908586, 0.5728844966028412, 0.6238418777022854, 0.5778258184064237, 0.535206917850525, 0.6084002470660902, 0.6012970969734404, 0.4845583693638048, 0.5707226683137739, 0.5194564546016059, 0.5904879555281037, 0.519147621988882, 0.6402100061766522, 0.6037677578752316, 0.592958616429895, 0.5030883261272391, 0.5491043854231007, 0.5679431747992588, 0.6179740580605312, 0.6624459542927733, 0.5716491661519456, 0.6170475602223595, 0.5815318097591106, 0.5605311920938851, 0.6331068560840025, 0.5342804200123533, 0.5799876466954911, 0.6031500926497838, 0.5982087708462014, 0.4576899320568252, 0.6065472513897467, 0.5799876466954911, 0.6479308214947499, 0.5052501544163064, 0.6077825818406424, 0.5401482396541075, 0.5392217418159357, 0.5957381099444101, 0.5753551575046325,

In [None]:
# --- Save Rules to File ---
# One "Rule <n>: <text>" line per rule of the mean-accuracy fold
rule_lines = [
    f"Rule {rule_number}: {rule_text}\n"
    for rule_number, rule_text in best_tree_rules.items()
]
with open("../../Data/explainability/mean_acc_fold/mean_rulebase.txt", "w") as f:
    f.writelines(rule_lines)

# Write the fold's evaluated rows (features, Predicted, Actual, Fold, Rule_Number)
best_fold_df.to_csv('../../Data/explainability/mean_acc_fold/mean_classifications_with_rules.csv', index=False)

In [17]:
# Read back the per-row classifications written for the mean-accuracy fold
class_df = pd.read_csv('../../Data/explainability/mean_acc_fold/mean_classifications_with_rules.csv')

In [18]:
# Split rows into correctly and incorrectly classified ones
is_correct = class_df['Actual'] == class_df['Predicted']
true_df = class_df[is_correct]
false_df = class_df[~is_correct]
print(true_df.shape, false_df.shape)

(1848, 46) (1390, 46)


In [19]:
# Per-rule summary for the mean-accuracy fold: how often each rule fired,
# what fraction of those rows it classified correctly, and what it predicted.
rule_records = []
# sort=False keeps rules in order of first appearance, which is the order
# used to break ties in the sort below.
for rule, rule_df in class_df.groupby('Rule_Number', sort=False):
    rule_count = len(rule_df)
    rule_hits = (rule_df['Actual'] == rule_df['Predicted']).sum()
    rule_records.append({
        'Rule': rule,
        'Accuracy': rule_hits / rule_count,
        '#Times_Used': rule_count,
        # The prediction of the first row is taken as the rule's decision
        'Rule Decision': rule_df['Predicted'].iloc[0],
    })

# Build the frame once; explicit columns keep the sort valid even with no rows
rule_acc_df = pd.DataFrame(rule_records, columns=['Rule', 'Accuracy', '#Times_Used', 'Rule Decision'])
# Least accurate (then least used) rules first
rule_acc_df = rule_acc_df.sort_values(by=['Accuracy', '#Times_Used'], ascending=[True, True], ignore_index=True)

rule_acc_df.to_csv('../../Data/explainability/mean_acc_fold/mean_rule_results.csv', index=False)

## Max Accuracy fold - Rulebase Analysis

In [None]:
# --- Identify Max Fold  ---
# Fold with the highest accuracy across the cross-validation run
max_fold_index = np.argmax(accuracy_scores)
max_accuracy = max(accuracy_scores)

print(f"Max Accuracy: {max_accuracy:.4f}")
print(accuracy_scores)
print(max_fold_index)

# --- Extract Rules for the Max Fold ---
# Each entry of fold_models is [fitted model, evaluated test DataFrame]
max_fold_model, max_fold_df = fold_models[max_fold_index]
max_tree_rules, max_leaf_to_rule = extract_rules(max_fold_model.tree_, list(X.columns))

Max Accuracy: 0.6624
[0.6000617665225447, 0.541383570105003, 0.5216182828906732, 0.46973440395305743, 0.6111797405806053, 0.6164298949969117, 0.5781346510191476, 0.5466337245213094, 0.47992588017294624, 0.5562075355157504, 0.5046324891908586, 0.5728844966028412, 0.6238418777022854, 0.5778258184064237, 0.535206917850525, 0.6084002470660902, 0.6012970969734404, 0.4845583693638048, 0.5707226683137739, 0.5194564546016059, 0.5904879555281037, 0.519147621988882, 0.6402100061766522, 0.6037677578752316, 0.592958616429895, 0.5030883261272391, 0.5491043854231007, 0.5679431747992588, 0.6179740580605312, 0.6624459542927733, 0.5716491661519456, 0.6170475602223595, 0.5815318097591106, 0.5605311920938851, 0.6331068560840025, 0.5342804200123533, 0.5799876466954911, 0.6031500926497838, 0.5982087708462014, 0.4576899320568252, 0.6065472513897467, 0.5799876466954911, 0.6479308214947499, 0.5052501544163064, 0.6077825818406424, 0.5401482396541075, 0.5392217418159357, 0.5957381099444101, 0.5753551575046325, 

In [None]:
# --- Save Rules to File  ---
with open("../../Data/explainability/max_acc_fold/max_rulebase.txt", "w") as f:
    for rule_number, rule_text in max_tree_rules.items():
        f.write(f"Rule {rule_number}: {rule_text}\n")


# Save all instances with corrected rule numbers
max_fold_df.to_csv('../../Data/explainability/max_acc_fold/max_classifications_with_rules.csv', index=False)

In [22]:
# Read back the per-row classifications written for the max-accuracy fold
max_class_df = pd.read_csv('../../Data/explainability/max_acc_fold/max_classifications_with_rules.csv')

In [23]:
# Split the max-accuracy fold's rows by whether the prediction matched the label
max_is_correct = max_class_df['Actual'] == max_class_df['Predicted']
max_true_df = max_class_df[max_is_correct]
max_false_df = max_class_df[~max_is_correct]
print(max_true_df.shape, max_false_df.shape)

(2145, 46) (1093, 46)


In [24]:
# Per-rule summary for the max-accuracy fold: how often each rule fired,
# what fraction of those rows it classified correctly, and what it predicted.
max_rule_records = []
# sort=False keeps rules in order of first appearance, which is the order
# used to break ties in the sort below.
for rule, max_rule_df in max_class_df.groupby('Rule_Number', sort=False):
    max_rule_count = len(max_rule_df)
    max_rule_hits = (max_rule_df['Actual'] == max_rule_df['Predicted']).sum()
    max_rule_records.append({
        'Rule': rule,
        'Accuracy': max_rule_hits / max_rule_count,
        '#Times_Used': max_rule_count,
        # The prediction of the first row is taken as the rule's decision
        'Rule Decision': max_rule_df['Predicted'].iloc[0],
    })

# Build the frame once; explicit columns keep the sort valid even with no rows
max_rule_acc_df = pd.DataFrame(max_rule_records, columns=['Rule', 'Accuracy', '#Times_Used', 'Rule Decision'])
# Least accurate (then least used) rules first
max_rule_acc_df = max_rule_acc_df.sort_values(by=['Accuracy', '#Times_Used'], ascending=[True, True], ignore_index=True)

max_rule_acc_df.to_csv('../../Data/explainability/max_acc_fold/max_rule_results.csv', index=False)

## Min Accuracy fold - Rulebase Analysis

In [None]:
# --- Identify Min Fold  ---
# Fold with the lowest accuracy across the cross-validation run
min_accuracy = min(accuracy_scores)
min_fold_index = np.argmin(accuracy_scores)

# Report the score, the full per-fold list and the chosen fold, one per line
for reported in (f"Min Accuracy: {min_accuracy:.4f}", accuracy_scores, min_fold_index):
    print(reported)

# --- Extract Rules for the Min Fold ---
# Each entry of fold_models is [fitted model, evaluated test DataFrame]
min_fold_entry = fold_models[min_fold_index]
min_fold_model = min_fold_entry[0]
min_fold_df = min_fold_entry[1]
min_tree_rules, min_leaf_to_rule = extract_rules(min_fold_model.tree_, list(X.columns))

Min Accuracy: 0.4577
[0.6000617665225447, 0.541383570105003, 0.5216182828906732, 0.46973440395305743, 0.6111797405806053, 0.6164298949969117, 0.5781346510191476, 0.5466337245213094, 0.47992588017294624, 0.5562075355157504, 0.5046324891908586, 0.5728844966028412, 0.6238418777022854, 0.5778258184064237, 0.535206917850525, 0.6084002470660902, 0.6012970969734404, 0.4845583693638048, 0.5707226683137739, 0.5194564546016059, 0.5904879555281037, 0.519147621988882, 0.6402100061766522, 0.6037677578752316, 0.592958616429895, 0.5030883261272391, 0.5491043854231007, 0.5679431747992588, 0.6179740580605312, 0.6624459542927733, 0.5716491661519456, 0.6170475602223595, 0.5815318097591106, 0.5605311920938851, 0.6331068560840025, 0.5342804200123533, 0.5799876466954911, 0.6031500926497838, 0.5982087708462014, 0.4576899320568252, 0.6065472513897467, 0.5799876466954911, 0.6479308214947499, 0.5052501544163064, 0.6077825818406424, 0.5401482396541075, 0.5392217418159357, 0.5957381099444101, 0.5753551575046325, 

In [None]:
# --- Save Rules to File  ---
with open("../../Data/explainability/min_acc_fold/min_rulebase.txt", "w") as f:
    for rule_number, rule_text in min_tree_rules.items():
        f.write(f"Rule {rule_number}: {rule_text}\n")


# Save all instances with corrected rule numbers
min_fold_df.to_csv('../../Data/explainability/min_acc_fold/min_classifications_with_rules.csv', index=False)

In [27]:
# Read back the per-row classifications written for the min-accuracy fold
min_class_df = pd.read_csv('../../Data/explainability/min_acc_fold/min_classifications_with_rules.csv')

In [28]:
# Split the min-accuracy fold's rows by whether the prediction matched the label
min_is_correct = min_class_df['Actual'].eq(min_class_df['Predicted'])
min_true_df = min_class_df[min_is_correct]
min_false_df = min_class_df[~min_is_correct]
print(min_true_df.shape, min_false_df.shape)

(1482, 46) (1756, 46)


In [29]:
# Per-rule summary for the min-accuracy fold: how often each rule fired,
# what fraction of those rows it classified correctly, and what it predicted.
min_rule_records = []
# sort=False keeps rules in order of first appearance, which is the order
# used to break ties in the sort below.
for rule, min_rule_df in min_class_df.groupby('Rule_Number', sort=False):
    min_rule_count = len(min_rule_df)
    min_rule_hits = (min_rule_df['Actual'] == min_rule_df['Predicted']).sum()
    min_rule_records.append({
        'Rule': rule,
        'Accuracy': min_rule_hits / min_rule_count,
        '#Times_Used': min_rule_count,
        # The prediction of the first row is taken as the rule's decision
        'Rule Decision': min_rule_df['Predicted'].iloc[0],
    })

# Build the frame once; explicit columns keep the sort valid even with no rows
min_rule_acc_df = pd.DataFrame(min_rule_records, columns=['Rule', 'Accuracy', '#Times_Used', 'Rule Decision'])
# Least accurate (then least used) rules first
min_rule_acc_df = min_rule_acc_df.sort_values(by=['Accuracy', '#Times_Used'], ascending=[True, True], ignore_index=True)

min_rule_acc_df.to_csv('../../Data/explainability/min_acc_fold/min_rule_results.csv', index=False)