In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2


In [2]:
import copy
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

import clean as clean

In [3]:
# Helper function to calculate False Negative Rate
def evaluate_fnr(y_true, y_pred, pos_label=1):
    """Return the false negative rate, FN / (FN + TP), of a set of predictions.

    The rate is the fraction of samples whose true label is ``pos_label`` that
    were predicted as anything other than ``pos_label``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same number of elements as ``y_true``.
    pos_label : scalar, default 1
        Label treated as the positive class. The default matches 0/1
        (or 0.0/1.0, False/True) encoded targets.

    Returns
    -------
    float
        The false negative rate, or ``nan`` when ``y_true`` contains no
        positive samples (the rate is undefined in that case).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()
    if true_labels.shape != pred_labels.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {true_labels.size} and {pred_labels.size}"
        )

    # Count directly instead of indexing a confusion matrix: the matrix shrinks
    # to 1x1 when only one class is present, which made cm[1, 0] raise IndexError.
    is_positive = true_labels == pos_label
    n_positive = int(is_positive.sum())
    if n_positive == 0:
        # FN + TP == 0: report "undefined" explicitly rather than dividing 0 by 0.
        return float("nan")

    n_missed = int((is_positive & (pred_labels != pos_label)).sum())
    return n_missed / n_positive

In [4]:
# Read the CSV as-is into dt_raw, and separately build dt_clean by passing the
# same file path to the project's clean.clean_data with the options below.
raw_csv_path = '../data/TBI PUD 10-08-2013.csv'
dt_raw = pd.read_csv(raw_csv_path)
dt_clean = clean.clean_data(
    raw_csv_path,
    remove_feats_after_ct=True,
    remove_TBI_rows_with_nan=True,
    threshold=0.5,
    rm_feats=True,
    remove_GCS_total_mismatch=True,
)

In [5]:
# Hold out 20% of the cleaned rows as the test set (fixed seed so the split is
# repeatable), then separate the "PosIntFinal" target from the feature columns.
train_df, test_df = train_test_split(dt_clean, test_size=0.2, random_state=42)

target_col = "PosIntFinal"
X_train, y_train = train_df.drop(columns=[target_col]), train_df[target_col]
X_test, y_test = test_df.drop(columns=[target_col]), test_df[target_col]

In [6]:
# Replace missing feature values with column means. The imputer is fitted on
# the training features only; the same fitted means are then applied to both sets.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

# Create the (not yet fitted) LDA classifier
lda_model = LinearDiscriminantAnalysis()

In [7]:
# Train LDA model on the imputed training features
lda_model.fit(X_train, y_train)

# Predictions for LDA on the held-out test set
y_pred_lda = lda_model.predict(X_test)
# Column 1 of predict_proba is the second entry of lda_model.classes_; this is
# taken as the positive class (assumes the target is encoded 0/1 -- confirm).
y_proba_lda = lda_model.predict_proba(X_test)[:, 1]

# Save the test-set positive-class probabilities (not the fitted model) in the
# results folder. Create the folder first so np.save does not fail with
# FileNotFoundError when it does not exist yet.
lda_proba_path = '../results/lda.npy'
os.makedirs(os.path.dirname(lda_proba_path), exist_ok=True)
np.save(lda_proba_path, y_proba_lda)

In [8]:
# Evaluation Metrics for LDA: compute everything first, then print the report.
# The hard predictions feed the confusion matrix and classification report;
# the positive-class probabilities feed the ROC AUC.
lda_confusion = confusion_matrix(y_test, y_pred_lda)
lda_report = classification_report(y_test, y_pred_lda)
lda_auc = roc_auc_score(y_test, y_proba_lda)

print("=== LDA Evaluation ===")
print("Confusion Matrix:", lda_confusion, sep="\n")
print("\nClassification Report:", lda_report, sep="\n")
print("\nROC AUC Score:", lda_auc, sep="\n")

=== LDA Evaluation ===
Confusion Matrix:
[[8186   68]
 [  83   71]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99      8254
         1.0       0.51      0.46      0.48       154

    accuracy                           0.98      8408
   macro avg       0.75      0.73      0.74      8408
weighted avg       0.98      0.98      0.98      8408


ROC AUC Score:
0.9385831033517004


In [9]:
# False negative rate for LDA: FN / (FN + TP) as computed by evaluate_fnr,
# i.e. the share of truly positive test rows that LDA predicted as negative.
fnr_lda = evaluate_fnr(y_test, y_pred_lda)
print(f'False Negative Rate for LDA: {fnr_lda}')

False Negative Rate for LDA: 0.538961038961039
