In [19]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import matplotlib.pyplot as plt
import clean as clean
from sklearn.impute import SimpleImputer
from sklearn.utils import resample

In [21]:
# Helper function to calculate False Negative Rate
def evaluate_fnr(y_true, y_pred, pos_label=1):
    """Return the false negative rate, FN / (FN + TP), for the positive class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    pos_label : scalar, default 1
        Label value treated as the positive class.

    Returns
    -------
    float
        Fraction of positive samples that were not predicted as positive.
        ``nan`` when ``y_true`` contains no positive samples, because the
        rate is undefined in that case.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    # Count directly instead of indexing a confusion matrix: the matrix
    # collapses to 1x1 when only one class is present, which breaks [1, 0].
    is_positive = y_true == pos_label
    n_positive = int(is_positive.sum())  # FN + TP
    if n_positive == 0:
        return float("nan")

    fn = int(np.sum(is_positive & (y_pred != pos_label)))
    return fn / n_positive

In [22]:
# Read the TBI CSV unmodified into a DataFrame
dt_raw = pd.read_csv('../data/TBI PUD 10-08-2013.csv')

# Build the cleaned frame from the same CSV through the project's `clean` module
dt_clean = clean.clean_data(
    '../data/TBI PUD 10-08-2013.csv',
    remove_feats_after_ct=True,
    remove_TBI_rows_with_nan=True,
    threshold=0.5,
    rm_feats=True,
    remove_GCS_total_mismatch=True,
)

In [23]:
# Target is the PosIntFinal column; every other cleaned column is a feature
y = dt_clean["PosIntFinal"]
X = dt_clean.drop(columns=["PosIntFinal"])

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fill remaining NaNs with per-column means learned from the training rows only,
# then apply those same means to both partitions
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [24]:
# Oversample the minority class to simulate class weighting.
# Set save = True if you also want to write the balanced training set and the
# test set to ../train_test/.
save = False

# Combine X_train and y_train into one array so that features and label are
# resampled together (the label becomes the last column)
train_data = np.hstack((X_train, y_train.values.reshape(-1, 1)))

# Separate majority (label 0) and minority (label 1) rows
majority = train_data[y_train.values == 0]
minority = train_data[y_train.values == 1]

# Oversample the minority class with replacement up to the majority size
minority_oversampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)

# Combine back into a balanced dataset
train_balanced = np.vstack((majority, minority_oversampled))
# Shuffle rows in place with a seeded generator so the row order (and any
# saved .npy files) is identical on every run
np.random.default_rng(42).shuffle(train_balanced)

# Split features and target
X_train_balanced = train_balanced[:, :-1]
y_train_balanced = train_balanced[:, -1]

if save:
    # save x_train_balanced, y_train_balanced, x_test, y_test
    np.save('../train_test/X_train_balanced.npy', X_train_balanced)
    np.save('../train_test/y_train_balanced.npy', y_train_balanced)
    np.save('../train_test/X_test.npy', X_test)
    np.save('../train_test/y_test.npy', y_test)

In [25]:
# Linear Discriminant Analysis
# Fit on the balanced training set (fit() returns the fitted estimator)
lda_model = LinearDiscriminantAnalysis().fit(X_train_balanced, y_train_balanced)

# Hard class predictions on the test set
y_pred_lda = lda_model.predict(X_test)

# Positive-class probability: second column of predict_proba
lda_test_proba = lda_model.predict_proba(X_test)
y_proba_lda = lda_test_proba[:, 1]

# Persist the positive-class probabilities
np.save('../results/lda.npy', y_proba_lda)

In [26]:
# Fraction of positive test cases that the LDA predictions missed
fnr_lda = evaluate_fnr(y_true=y_test, y_pred=y_pred_lda)
print(f'False Negative Rate for LDA: {fnr_lda}')

False Negative Rate for LDA: 0.16233766233766234


In [27]:
# Test-set report for LDA: confusion matrix, per-class metrics, ROC AUC
print("=== LDA Evaluation ===")

print("Confusion Matrix:")
lda_confusion = confusion_matrix(y_test, y_pred_lda)
print(lda_confusion)

print("\nClassification Report:")
lda_report = classification_report(y_test, y_pred_lda)
print(lda_report)

# AUC is computed from the positive-class probabilities, not the hard labels
print("\nROC AUC Score:")
lda_auc = roc_auc_score(y_test, y_proba_lda)
print(lda_auc)

=== LDA Evaluation ===
Confusion Matrix:
[[7339  915]
 [  25  129]]

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.89      0.94      8254
         1.0       0.12      0.84      0.22       154

    accuracy                           0.89      8408
   macro avg       0.56      0.86      0.58      8408
weighted avg       0.98      0.89      0.93      8408


ROC AUC Score:
0.9380143118330665
