In [1]:
import pandas as pd
import os
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

#  Define the file path
cleaned_file = "D:/MIMIC-IV-Data-Pipeline/processed_data/mimic_cleaned_v8.csv.gz"

#  Load the dataset
df = pd.read_csv(cleaned_file, compression="gzip")

#  Confirm successful load
print(" Cleaned dataset loaded successfully.")
print(f"Dataset Shape: {df.shape}")  # Check rows & columns
print("\n🔍 Data Preview:")
print(df.head())  # Display first 5 rows

# ✅ Check data types and missing values
print("\n🔍 Data Info:")
print(df.info())


 Cleaned dataset loaded successfully.
Dataset Shape: (546028, 40)

🔍 Data Preview:
   subject_id   hadm_id  admission_type      admission_location  \
0    10000032  22595853          URGENT  TRANSFER FROM HOSPITAL   
1    10000032  22841357        EW EMER.          EMERGENCY ROOM   
2    10000032  25742920        EW EMER.          EMERGENCY ROOM   
3    10000032  29079034        EW EMER.          EMERGENCY ROOM   
4    10000068  25022803  EU OBSERVATION          EMERGENCY ROOM   

  discharge_location insurance marital_status   race  ed_time_spent  los_hosp  \
0               HOME  Medicaid        WIDOWED  WHITE          253.0  0.786111   
1               HOME  Medicaid        WIDOWED  WHITE          337.0  1.015278   
2            HOSPICE  Medicaid        WIDOWED  WHITE          286.0  1.754167   
3               HOME  Medicaid        WIDOWED  WHITE          486.0  2.222222   
4            UNKNOWN   UNKNOWN         SINGLE  WHITE          511.0  0.298611   

   ... icu_airway_flag  icu

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

#  Define Target Variable
target = "delirium"

#  Separate Features (X) and Target (y)
X = df.drop(columns=[target])
y = df[target]

#  Identify categorical columns
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
print("🔍 Categorical Variables in Data:", categorical_cols)

#  Check if Logistic Regression can handle them directly
print("🔍 Data Types of Features:")
print(X.dtypes)

🔍 Categorical Variables in Data: ['admission_type', 'admission_location', 'discharge_location', 'insurance', 'marital_status', 'race', 'gender', 'primary_diagnosis', 'drug', 'age_group']
🔍 Data Types of Features:
subject_id                     int64
hadm_id                        int64
admission_type                object
admission_location            object
discharge_location            object
insurance                     object
marital_status                object
race                          object
ed_time_spent                float64
los_hosp                     float64
gender                        object
anchor_age                     int64
anchor_year                    int64
primary_diagnosis             object
palliative_care_flag           int64
cognitive_impairment_flag      int64
num_comorbidities              int64
prior_icu_admissions           int64
high_risk_med_flag           float64
unique_high_risk_med         float64
high_risk_med_count          float64
drug      

🔍 Categorical Features That Need Encoding
Your dataset has 10 categorical features that need conversion:

Nominal (unordered categories) → One-Hot Encoding (OHE)
admission_type
admission_location
discharge_location
insurance
marital_status
race
gender
primary_diagnosis
drug
age_group

#this didnt work as high cardinality features gave an error: 
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Define categorical columns
categorical_cols = [
    "admission_type", "admission_location", "discharge_location",
    "insurance", "marital_status", "race", "gender",
    "primary_diagnosis", "drug", "age_group"
]

#  Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)  # Drop first category to avoid multicollinearity

#  Confirm new dataset shape
print(" One-Hot Encoding applied. New dataset shape:", df_encoded.shape)


high cardinality 📊   Use Target or Frequency Encoding for High-Cardinality Features
Instead of One-Hot Encoding, we will: ✅ Use Frequency Encoding for primary_diagnosis & drug (replace each category with its occurrence rate).
✅ Use One-Hot Encoding (OHE) for other categorical variables (small category sets).

In [9]:
from sklearn.preprocessing import OneHotEncoder

# ✅ Step 1: Frequency Encoding for High-Cardinality Features
for col in ["primary_diagnosis", "drug"]:
    freq_map = df[col].value_counts(normalize=True)  # Compute frequency of each category
    df.loc[:, col] = df[col].map(freq_map)  # Replace categories with their frequency

# ✅ Step 2: One-Hot Encoding for Remaining Categorical Variables
low_cardinality_cols = [
    "admission_type", "admission_location", "discharge_location",
    "insurance", "marital_status", "race", "gender", "age_group"
]

df_encoded = pd.get_dummies(df, columns=low_cardinality_cols, drop_first=True)  # One-Hot Encode low-cardinality features

# ✅ Confirm dataset shape
print("✅ Encoding applied. New dataset shape:", df_encoded.shape)


✅ Encoding applied. New dataset shape: (546028, 108)


Train & Evaluate Baseline Logistic Regression Model

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

#  Step 1: Define Features (X) and Target (y)
target = "delirium"
X = df_encoded.drop(columns=[target])
y = df_encoded[target]

#  Step 2: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#  Step 3: Standardize Numeric Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  Step 4: Train Logistic Regression Model
model = LogisticRegression(max_iter=500, solver="lbfgs")
model.fit(X_train_scaled, y_train)

#  Step 5: Make Predictions
y_pred = model.predict(X_test_scaled)


#  Step 6: Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=True)

#  Step7: Evaluate Model Performance
print(" Logistic Regression Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred))


 Logistic Regression Model Performance:
Accuracy: 0.9818965990879622

🔍 Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    107248
           1       0.42      0.03      0.05      1958

    accuracy                           0.98    109206
   macro avg       0.70      0.51      0.52    109206
weighted avg       0.97      0.98      0.97    109206



baseline Logistic Regression model shows:

✅ High overall accuracy (98.19%)
⚠ Severe class imbalance (Delirium cases are rare, making the model biased towards predicting "No Delirium").
⚠ Poor recall & F1-score for delirium = 1 (very low sensitivity for identifying delirium cases).  

🔍 Key Issues & Insights
1️⃣ Class Imbalance Problem:

Delirium cases (1) are much fewer than non-delirium (0).
Model predicts "No Delirium" nearly all the time, leading to high accuracy but poor recall for delirium = 1.
2️⃣ Low Recall for delirium = 1 (3%)

This means most actual delirium cases are not being predicted correctly.
3️⃣ Potential Overfitting on Majority Class (delirium = 0)

Since 98% of cases are non-delirium, the model learns to default to "No Delirium" to maximize accuracy.

In [35]:
import json
import os

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Convert confusion matrix to a list format for JSON storage
conf_matrix_list = conf_matrix.tolist()


# Define performance metrics
# Define performance metrics with confusion matrix
performance_metrics = {
    "Model": "LogisticReg_Base",
    "Accuracy": accuracy,
    "Precision (Delirium = 1)": report["1"]["precision"],
    "Recall (Delirium = 1)": report["1"]["recall"],
    "F1-Score (Delirium = 1)": report["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_list  # Store confusion matrix
}



# Compute ROC AUC Score
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]  # Get probability estimates
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Generate ROC Curve data
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f"ROC Curve (AUC = {roc_auc:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # Diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - LogisticReg_Base")
plt.legend()

roc_plot_path = "D:/MIMIC-IV-Data-Pipeline/ROC_LogisticReg_Base.png"
plt.savefig(roc_plot_path)
plt.close()

# Add ROC AUC score and plot path to performance metrics
performance_metrics["ROC AUC Score"] = roc_auc
performance_metrics["ROC Curve Path"] = roc_plot_path


# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)


Address Class Imbalance & Improve Model Performance
To improve performance, we can:

🚀 Choose a Strategy to Handle Class Imbalance: 1️⃣ Use Class Weights (balanced mode in Logistic Regression).

This makes the model give more importance to delirium cases.
2️⃣ Apply Oversampling (SMOTE) or Undersampling.

SMOTE (Synthetic Minority Over-sampling Technique) generates synthetic delirium cases to balance the dataset. 

In [37]:
# Retrain:  Balanced Logistic Regression Model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#  Step 1: Train Logistic Regression with Class Weights
model_balanced = LogisticRegression(max_iter=500, solver="lbfgs", class_weight="balanced")
model_balanced.fit(X_train_scaled, y_train)

#  Step 2: Make Predictions
y_pred_balanced = model_balanced.predict(X_test_scaled)

#  Step 3: Evaluate Model Performance
# Evaluate Balanced Logistic Regression Model Performance
accuracy_balanced = accuracy_score(y_test, y_pred_balanced)
report_balanced = classification_report(y_test, y_pred_balanced, output_dict=True)

conf_matrix_balanced = confusion_matrix(y_test, y_pred_balanced).tolist()

# Compute ROC AUC Score
y_pred_proba_balanced = model_balanced.predict_proba(X_test_scaled)[:, 1]
roc_auc_balanced = roc_auc_score(y_test, y_pred_proba_balanced)
fpr_balanced, tpr_balanced, _ = roc_curve(y_test, y_pred_proba_balanced)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_balanced, tpr_balanced, label=f"ROC Curve (AUC = {roc_auc_balanced:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # Diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - LogisticReg_Balanced")
plt.legend()

roc_plot_path_balanced = "D:/MIMIC-IV-Data-Pipeline/ROC_LogisticReg_Balanced.png"
plt.savefig(roc_plot_path_balanced)
plt.close()

# Define Balanced Logistic Regression performance metrics
performance_metrics_LR_balanced = {
    "Model": "LogisticReg_Balanced",
    "Accuracy": accuracy_balanced,
    "Precision (Delirium = 1)": report_balanced["1"]["precision"],
    "Recall (Delirium = 1)": report_balanced["1"]["recall"],
    "F1-Score (Delirium = 1)": report_balanced["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_balanced,
    "ROC AUC Score": roc_auc_balanced,
    "ROC Curve Path": roc_plot_path_balanced
}

# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics_LR_balanced)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("📊 Balanced Logistic Regression model performance saved successfully.")

print("✅ Balanced Logistic Regression Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_balanced))
print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred_balanced))


📊 Balanced Logistic Regression model performance saved successfully.
✅ Balanced Logistic Regression Model Performance:
Accuracy: 0.8157610387707636

🔍 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90    107248
           1       0.08      0.84      0.14      1958

    accuracy                           0.82    109206
   macro avg       0.54      0.83      0.52    109206
weighted avg       0.98      0.82      0.88    109206



Evaluating Balanced Logistic Regression Model
Your balanced Logistic Regression model shows:

✅ Drastic improvement in recall for delirium = 1 (84%) → Now the model correctly identifies most delirium cases.
⚠ Significant drop in accuracy (81.6%) → Model is misclassifying more non-delirium cases (delirium = 0).
⚠ Low precision for delirium = 1 (8%) → Many predicted delirium cases are false positives.

📊 Observations
The model now catches most delirium cases (high recall) but at the cost of lowering precision and overall accuracy.
False positives have increased, meaning more non-delirium patients are incorrectly classified as delirium.


In [39]:
#SMOTE : Apply SMOTE Oversampling to Balance the Dataset 
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#  Step 1: Apply SMOTE to Balance Dataset
smote = SMOTE(sampling_strategy="auto", random_state=42)  # Auto balances the dataset
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

#  Step 2: Train Logistic Regression on SMOTE Data
model_smote = LogisticRegression(max_iter=500, solver="lbfgs")
model_smote.fit(X_train_smote, y_train_smote)

#  Step 3: Make Predictions
y_pred_smote = model_smote.predict(X_test_scaled)


#  Step 4: Evaluate SMOTE Model Performance
accuracy_smote = accuracy_score(y_test, y_pred_smote)
report_smote = classification_report(y_test, y_pred_smote, output_dict=True)
conf_matrix_smote = confusion_matrix(y_test, y_pred_smote).tolist()

# Compute ROC AUC Score
y_pred_proba_smote = model_smote.predict_proba(X_test_scaled)[:, 1]
roc_auc_smote = roc_auc_score(y_test, y_pred_proba_smote)
fpr_smote, tpr_smote, _ = roc_curve(y_test, y_pred_proba_smote)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_smote, tpr_smote, label=f"ROC Curve (AUC = {roc_auc_smote:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - LogisticReg_SMOTE")
plt.legend()

roc_plot_path_smote = "D:/MIMIC-IV-Data-Pipeline/ROC_LogisticReg_SMOTE.png"
plt.savefig(roc_plot_path_smote)
plt.close()

# Define SMOTE performance metrics
performance_metrics_smote = {
    "Model": "LogisticReg_SMOTE",
    "Accuracy": accuracy_smote,
    "Precision (Delirium = 1)": report_smote["1"]["precision"],
    "Recall (Delirium = 1)": report_smote["1"]["recall"],
    "F1-Score (Delirium = 1)": report_smote["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_smote,
    "ROC AUC Score": roc_auc_smote,
    "ROC Curve Path": roc_plot_path_smote
}

# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics_smote)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)


# ✅ Step 5: Evaluate Model Performance
print("✅ SMOTE-Enhanced Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_smote))
print("\n🔍 Classification Report:")
print(classification_report(y_test, y_pred_smote))


✅ SMOTE-Enhanced Logistic Regression Performance:
Accuracy: 0.8147629251140047

🔍 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.90    107248
           1       0.08      0.84      0.14      1958

    accuracy                           0.81    109206
   macro avg       0.54      0.83      0.52    109206
weighted avg       0.98      0.81      0.88    109206



SMOTE-enhanced model shows similar behavior to the balanced class-weight model:

✅ High recall for delirium = 1 (84%) → The model correctly identifies most delirium cases.
⚠ Very low precision for delirium = 1 (8%) → A high number of false positives.
⚠ Overall accuracy dropped to 81.5% → Non-delirium cases are misclassified more often.
📊 Observations & Comparison
Model	Accuracy	Precision (Delirium = 1)	Recall (Delirium = 1)	F1-Score (Delirium = 1)
Baseline Logistic Regression	98.2%	42%	3%	5%
Class-Weighted Logistic Regression	81.6%	8%	84%	14%
SMOTE Logistic Regression	81.5%	8%	84%	14%
Key Findings:
Both Class-Weighted and SMOTE models increased recall but drastically reduced precision.
Too many false positives make these models unreliable for real-world use.


In [None]:
#Set Up Feature Selection with RFE

In [41]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


In [44]:
# Example: Keep top 30 features
n_features_to_keep = 30

# Create a base logistic regression model 
# Note: you can also specify solver='liblinear' or 'saga' if you want to use L1 penalty
base_estimator = LogisticRegression(max_iter=500, solver='lbfgs')

# Wrap the logistic regression with RFE
rfe_selector = RFE(estimator=base_estimator, 
                   n_features_to_select=n_features_to_keep, 
                   step=1)

# Fit the RFE object on your scaled training data
rfe_selector.fit(X_train_scaled, y_train)

# Identify which features are kept (True) or dropped (False)
feature_support = rfe_selector.support_
selected_features = [col for col, keep in zip(X_train.columns, feature_support) if keep]

print("Selected features using RFE:\n", selected_features)


Selected features using RFE:
 ['ed_time_spent', 'los_hosp', 'anchor_age', 'cognitive_impairment_flag', 'num_comorbidities', 'prior_icu_admissions', 'high_risk_med_flag', 'unique_high_risk_med', 'high_risk_med_count', 'drug', 'icu_vent_mode_flag', 'icu_base_excess_flag', 'icu_lactate_flag', 'admission_type_DIRECT OBSERVATION', 'admission_type_EU OBSERVATION', 'admission_type_EW EMER.', 'admission_type_OBSERVATION ADMIT', 'admission_type_SURGICAL SAME DAY ADMISSION', 'admission_type_URGENT', 'admission_location_EMERGENCY ROOM', 'admission_location_PACU', 'admission_location_PROCEDURE SITE', 'discharge_location_CHRONIC/LONG TERM ACUTE CARE', 'discharge_location_HOME', 'discharge_location_PSYCH FACILITY', 'discharge_location_REHAB', 'discharge_location_SKILLED NURSING FACILITY', 'discharge_location_UNKNOWN', 'gender_M', 'age_group_<30']


In [45]:
X_train_rfe = rfe_selector.transform(X_train_scaled)
X_test_rfe = rfe_selector.transform(X_test_scaled)


In [46]:
# Train a new logistic regression on the selected features
model_rfe = LogisticRegression(max_iter=500, solver='lbfgs')
model_rfe.fit(X_train_rfe, y_train)

# Predict
y_pred_rfe = model_rfe.predict(X_test_rfe)

# Evaluate
accuracy_rfe = accuracy_score(y_test, y_pred_rfe)
report_rfe = classification_report(y_test, y_pred_rfe)
print("RFE-Enhanced Logistic Regression Performance:")
print("Accuracy:", accuracy_rfe)
print("\nClassification Report:\n", report_rfe)


RFE-Enhanced Logistic Regression Performance:
Accuracy: 0.9818965990879622

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99    107248
           1       0.42      0.02      0.05      1958

    accuracy                           0.98    109206
   macro avg       0.70      0.51      0.52    109206
weighted avg       0.97      0.98      0.97    109206



In [47]:
import json
import os
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve

# 1. Calculate confusion matrix
conf_matrix_rfe = confusion_matrix(y_test, y_pred_rfe).tolist()

# 2. Compute ROC AUC Score (optional)
y_pred_proba_rfe = model_rfe.predict_proba(X_test_rfe)[:, 1]
roc_auc_rfe = roc_auc_score(y_test, y_pred_proba_rfe)

# 3. Generate ROC Curve data and save plot (optional)
fpr_rfe, tpr_rfe, _ = roc_curve(y_test, y_pred_proba_rfe)
plt.figure(figsize=(8, 6))
plt.plot(fpr_rfe, tpr_rfe, label=f"ROC Curve (AUC = {roc_auc_rfe:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - LogisticReg_RFE")
plt.legend()

roc_plot_path_rfe = "D:/MIMIC-IV-Data-Pipeline/ROC_LogisticReg_RFE.png"
plt.savefig(roc_plot_path_rfe)
plt.close()

# 4. Create dictionary for performance metrics
performance_metrics_rfe = {
    "Model": "LogisticReg_RFE",
    "Accuracy": accuracy_rfe,
    "Precision (Delirium = 1)": report_rfe["1"]["precision"],
    "Recall (Delirium = 1)": report_rfe["1"]["recall"],
    "F1-Score (Delirium = 1)": report_rfe["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_rfe,
    "ROC AUC Score": roc_auc_rfe,
    "ROC Curve Path": roc_plot_path_rfe,
    # Optional: Store which features RFE selected
    "Selected Features": selected_features  
}

# 5. Append and save to model_performance.json
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

model_performance.append(performance_metrics_rfe)

with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("✅ RFE-Enhanced Logistic Regression performance saved successfully.")
print("Selected Features:", selected_features)


TypeError: string indices must be integers, not 'str'

In [49]:
report_rfe = classification_report(y_test, y_pred_rfe)


In [51]:
report_rfe = classification_report(y_test, y_pred_rfe, output_dict=True)


In [53]:
print(report_rfe)


{'0': {'precision': 0.9824916812569323, 'recall': 0.999375279725496, 'f1-score': 0.9908615644890658, 'support': 107248.0}, '1': {'precision': 0.41739130434782606, 'recall': 0.024514811031664963, 'f1-score': 0.04630969609261939, 'support': 1958.0}, 'accuracy': 0.9818965990879622, 'macro avg': {'precision': 0.6999414928023792, 'recall': 0.5119450453785804, 'f1-score': 0.5185856302908426, 'support': 109206.0}, 'weighted avg': {'precision': 0.9723597605017721, 'recall': 0.9818965990879622, 'f1-score': 0.9739262994091229, 'support': 109206.0}}


In [55]:
from sklearn.metrics import classification_report

# 1. Generate a classification report as a dictionary
report_rfe = classification_report(y_test, y_pred_rfe, output_dict=True)

# 2. Check the keys
print("Classification Report Dictionary Keys:", report_rfe.keys())

# 3. Access the relevant class label
#    Assuming your positive class is labeled '1'
precision_1 = report_rfe["1"]["precision"]
recall_1 = report_rfe["1"]["recall"]
f1_1 = report_rfe["1"]["f1-score"]

print("Precision (Delirium=1):", precision_1)
print("Recall (Delirium=1):", recall_1)
print("F1-Score (Delirium=1):", f1_1)


Classification Report Dictionary Keys: dict_keys(['0', '1', 'accuracy', 'macro avg', 'weighted avg'])
Precision (Delirium=1): 0.41739130434782606
Recall (Delirium=1): 0.024514811031664963
F1-Score (Delirium=1): 0.04630969609261939


In [57]:
import os
import json

# Example dictionary holding performance metrics
performance_metrics_rfe = {
    "Model": "LogisticReg_RFE",
    "Accuracy": 0.92,  # example
    "Precision (Delirium = 1)": 0.78,
    "Recall (Delirium = 1)": 0.65,
    "F1-Score (Delirium = 1)": 0.71,
    "Confusion Matrix": [[100, 5], [10, 20]],  # example
    "ROC AUC Score": 0.85,
    "ROC Curve Path": "D:/MIMIC-IV-Data-Pipeline/ROC_LogisticReg_RFE.png",
    # etc...
}

# Path to your JSON file
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# 1) Check if file exists, load it if it does, otherwise create empty list
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# 2) Append new performance metrics to the list
model_performance.append(performance_metrics_rfe)

# 3) Write updated list back to the JSON file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("✅ Performance metrics appended to JSON successfully.")


✅ Performance metrics appended to JSON successfully.
