In [25]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
import pandas as pd
import os
import json

#  Define the file path
cleaned_file = "D:/MIMIC-IV-Data-Pipeline/processed_data/mimic_cleaned_v8.csv.gz"

#  Load the dataset
df = pd.read_csv(cleaned_file, compression="gzip")

#  Confirm successful load
print(" Cleaned dataset loaded successfully.")
print(f"Dataset Shape: {df.shape}")  # Check rows & columns
print("\n🔍 Data Preview:")
print(df.head())  # Display first 5 rows

# ✅ Check data types and missing values
print("\n🔍 Data Info:")
print(df.info())


 Cleaned dataset loaded successfully.
Dataset Shape: (546028, 40)

🔍 Data Preview:
   subject_id   hadm_id  admission_type      admission_location  \
0    10000032  22595853          URGENT  TRANSFER FROM HOSPITAL   
1    10000032  22841357        EW EMER.          EMERGENCY ROOM   
2    10000032  25742920        EW EMER.          EMERGENCY ROOM   
3    10000032  29079034        EW EMER.          EMERGENCY ROOM   
4    10000068  25022803  EU OBSERVATION          EMERGENCY ROOM   

  discharge_location insurance marital_status   race  ed_time_spent  los_hosp  \
0               HOME  Medicaid        WIDOWED  WHITE          253.0  0.786111   
1               HOME  Medicaid        WIDOWED  WHITE          337.0  1.015278   
2            HOSPICE  Medicaid        WIDOWED  WHITE          286.0  1.754167   
3               HOME  Medicaid        WIDOWED  WHITE          486.0  2.222222   
4            UNKNOWN   UNKNOWN         SINGLE  WHITE          511.0  0.298611   

   ... icu_airway_flag  icu

Prepare Data for Random Forest/XGBoost
Now that the cleaned dataset has been loaded, the next step is to prepare the data for training. This includes:

Encoding categorical variables
Scaling numerical features (if needed)
Splitting the data into training and test sets

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

# Define the target variable
target = "delirium"

# Separate features (X) and target (y)
X = df.drop(columns=[target])
y = df[target]

# Identify categorical variables
categorical_cols = ["admission_type", "admission_location", "discharge_location",
                    "insurance", "marital_status", "race", "gender", "age_group"]

# Use One-Hot Encoding for categorical variables with low cardinality
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Apply frequency encoding for high-cardinality categorical features
for col in ["primary_diagnosis", "drug"]:
    freq_map = X[col].value_counts(normalize=True)
    X[col] = X[col].map(freq_map)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Confirm dataset shapes
print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")


Training data shape: (436822, 107)
Test data shape: (109206, 107)


Train a Random Forest Classifier
Now that the dataset is prepared, we will:

Train a Random Forest Classifier using the preprocessed training data.
Evaluate the model on the test set.
Report accuracy, precision, recall, and F1-score to assess performance.

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define the Random Forest model with class weights to handle class imbalance
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")

# Train the model
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred_rf = rf_model.predict(X_test_scaled)


#  Step 3: Evaluate Random Forest Model Performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf).tolist()

# Compute ROC AUC Score
y_pred_proba_rf = rf_model.predict_proba(X_test_scaled)[:, 1]
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf, tpr_rf, label=f"ROC Curve (AUC = {roc_auc_rf:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - RandomForest_Balanced")
plt.legend()

roc_plot_path_rf = "D:/MIMIC-IV-Data-Pipeline/ROC_RandomForest_Balanced.png"
plt.savefig(roc_plot_path_rf)
plt.close()

# Define Random Forest performance metrics
performance_metrics_rf = {
    "Model": "RandomForest_Balanced",
    "Accuracy": accuracy_rf,
    "Precision (Delirium = 1)": report_rf["1"]["precision"],
    "Recall (Delirium = 1)": report_rf["1"]["recall"],
    "F1-Score (Delirium = 1)": report_rf["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_rf,
    "ROC AUC Score": roc_auc_rf,
    "ROC Curve Path": roc_plot_path_rf
}

# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results 
model_performance.append(performance_metrics_rf)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

# Evaluate model performance
print("📊 Random Forest Model Performance saved successfully.")

# Evaluate model performance
print("Random Forest Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))


📊 Random Forest Model Performance saved successfully.
Random Forest Model Performance:
Accuracy: 0.9820522681903925

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    107248
           1       0.44      0.00      0.01      1958

    accuracy                           0.98    109206
   macro avg       0.71      0.50      0.50    109206
weighted avg       0.97      0.98      0.97    109206



Hyperparameters to Tune in RF
n_estimators (Number of trees) – More trees improve stability but increase computation time.
max_depth (Tree depth) – Prevents overly complex trees that overfit.
min_samples_split – Controls when a node splits; higher values prevent small splits.
min_samples_leaf – Ensures each leaf node has a minimum number of samples.
class_weight='balanced' – Adjusts the importance of minority classes.

In [28]:
#Tune hyperparameters in Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define hyperparameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "class_weight": ["balanced"]
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# Perform Randomized Search
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=10,  # Number of random settings to try
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available CPU cores
)

# Fit Randomized Search
rf_random.fit(X_train_scaled, y_train)

# Get best parameters
print("Best parameters found:", rf_random.best_params_)

# Train best model
best_rf = rf_random.best_estimator_
y_pred_rf_tuned = best_rf.predict(X_test_scaled)

# Evaluate model performance
from sklearn.metrics import accuracy_score, classification_report

# Step: Evaluate Tuned Random Forest Model Performance
accuracy_rf_tuned = accuracy_score(y_test, y_pred_rf_tuned)
report_rf_tuned = classification_report(y_test, y_pred_rf_tuned, output_dict=True)
 
conf_matrix_rf_tuned = confusion_matrix(y_test, y_pred_rf_tuned).tolist()

# Compute ROC AUC Score
y_pred_proba_rf_tuned = best_rf.predict_proba(X_test_scaled)[:, 1]
roc_auc_rf_tuned = roc_auc_score(y_test, y_pred_proba_rf_tuned)
fpr_rf_tuned, tpr_rf_tuned, _ = roc_curve(y_test, y_pred_proba_rf_tuned)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf_tuned, tpr_rf_tuned, label=f"ROC Curve (AUC = {roc_auc_rf_tuned:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - RandomForest_Tuned")
plt.legend()

roc_plot_path_rf_tuned = "D:/MIMIC-IV-Data-Pipeline/ROC_RandomForest_Tuned.png"
plt.savefig(roc_plot_path_rf_tuned)
plt.close()

# Define Tuned Random Forest performance metrics
performance_metrics_rf_tuned = {
    "Model": "RandomForest_Tuned",
    "Best Parameters": rf_random.best_params_,
    "Accuracy": accuracy_rf_tuned,
    "Precision (Delirium = 1)": report_rf_tuned["1"]["precision"],
    "Recall (Delirium = 1)": report_rf_tuned["1"]["recall"],
    "F1-Score (Delirium = 1)": report_rf_tuned["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_rf_tuned,
    "ROC AUC Score": roc_auc_rf_tuned,
    "ROC Curve Path": roc_plot_path_rf_tuned
}

# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics_rf_tuned)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("📊 Tuned Random Forest model performance saved successfully.")


print("Tuned Random Forest Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf_tuned))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf_tuned))


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'class_weight': 'balanced'}
📊 Tuned Random Forest model performance saved successfully.
Tuned Random Forest Model Performance:
Accuracy: 0.982098053220519

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    107248
           1       0.60      0.00      0.01      1958

    accuracy                           0.98    109206
   macro avg       0.79      0.50      0.50    109206
weighted avg       0.98      0.98      0.97    109206



Apply SMOTE Before Training Random Forest
Since hyperparameter tuning did not sufficiently improve recall for delirium = 1, we will:

Apply SMOTE (Synthetic Minority Over-sampling Technique) to balance the training dataset.
Train Random Forest on the SMOTE-balanced data to ensure the model learns patterns for minority class cases.
Evaluate if SMOTE improves recall while maintaining precision.

In [29]:
# SMOTE on RF 
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Apply SMOTE to balance training data
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Train Random Forest on SMOTE-balanced data
rf_smote = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5, 
                                  min_samples_leaf=2, class_weight="balanced", random_state=42)
rf_smote.fit(X_train_smote, y_train_smote)

# Make predictions on test set
y_pred_rf_smote = rf_smote.predict(X_test_scaled)

# Step 1: Apply SMOTE to Balance Dataset
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_train_smote_rf, y_train_smote_rf = smote.fit_resample(X_train_scaled, y_train)

# Step 2: Train Random Forest on SMOTE Data
rf_smote = RandomForestClassifier(n_estimators=100, random_state=42, class_weight="balanced")
rf_smote.fit(X_train_smote_rf, y_train_smote_rf)

# Step 3: Make Predictions
y_pred_rf_smote = rf_smote.predict(X_test_scaled)

#  Step 4: Evaluate SMOTE Random Forest Model Performance
accuracy_rf_smote = accuracy_score(y_test, y_pred_rf_smote)
report_rf_smote = classification_report(y_test, y_pred_rf_smote, output_dict=True)
conf_matrix_rf_smote = confusion_matrix(y_test, y_pred_rf_smote).tolist()

# Compute ROC AUC Score
y_pred_proba_rf_smote = rf_smote.predict_proba(X_test_scaled)[:, 1]
roc_auc_rf_smote = roc_auc_score(y_test, y_pred_proba_rf_smote)
fpr_rf_smote, tpr_rf_smote, _ = roc_curve(y_test, y_pred_proba_rf_smote)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf_smote, tpr_rf_smote, label=f"ROC Curve (AUC = {roc_auc_rf_smote:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - RandomForest_SMOTE")
plt.legend()

roc_plot_path_rf_smote = "D:/MIMIC-IV-Data-Pipeline/ROC_RandomForest_SMOTE.png"
plt.savefig(roc_plot_path_rf_smote)
plt.close()

# Define SMOTE Random Forest performance metrics
performance_metrics_rf_smote = {
    "Model": "RandomForest_SMOTE",
    "Accuracy": accuracy_rf_smote,
    "Precision (Delirium = 1)": report_rf_smote["1"]["precision"],
    "Recall (Delirium = 1)": report_rf_smote["1"]["recall"],
    "F1-Score (Delirium = 1)": report_rf_smote["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_rf_smote,
    "ROC AUC Score": roc_auc_rf_smote,
    "ROC Curve Path": roc_plot_path_rf_smote
}

# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics_rf_smote)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("📊 SMOTE Random Forest model performance saved successfully.")
 
# Evaluate model performance
print("Random Forest with SMOTE Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf_smote))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf_smote))


📊 SMOTE Random Forest model performance saved successfully.
Random Forest with SMOTE Performance:
Accuracy: 0.9800651978829003

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    107248
           1       0.28      0.07      0.11      1958

    accuracy                           0.98    109206
   macro avg       0.63      0.53      0.55    109206
weighted avg       0.97      0.98      0.97    109206



In [None]:
# RFE on RF 
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
# Number of features you want to keep
n_features_to_select = 20

# Create a base Random Forest model
# (You can specify any hyperparameters you want here)
base_rf = RandomForestClassifier(
    n_estimators=100, 
    random_state=42,
    class_weight="balanced"
)

# Wrap the Random Forest with RFE
rfe_selector = RFE(
    estimator=base_rf,
    n_features_to_select=n_features_to_select,
    step=1
)

# Fit RFE on your training data (already scaled)
rfe_selector.fit(X_train_scaled, y_train)

# Identify which features are kept (True) or dropped (False)
feature_support = rfe_selector.support_

# OPTIONAL: If you need the actual column names that were selected
selected_features = [col for col, keep in zip(X_train.columns, feature_support) if keep]
print("Selected features from RFE:\n", selected_features)


In [None]:
import json
import os
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Evaluate RFE-based Random Forest
accuracy_rf_rfe = accuracy_score(y_test, y_pred_rf_rfe)
report_rf_rfe = classification_report(y_test, y_pred_rf_rfe, output_dict=True)
conf_matrix_rf_rfe = confusion_matrix(y_test, y_pred_rf_rfe).tolist()

# Compute ROC AUC Score
y_pred_proba_rf_rfe = rf_rfe_model.predict_proba(X_test_rfe)[:, 1]
roc_auc_rf_rfe = roc_auc_score(y_test, y_pred_proba_rf_rfe)
fpr_rf_rfe, tpr_rf_rfe, _ = roc_curve(y_test, y_pred_proba_rf_rfe)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf_rfe, tpr_rf_rfe, label=f"ROC Curve (AUC = {roc_auc_rf_rfe:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - RandomForest_RFE")
plt.legend()

roc_plot_path_rf_rfe = "D:/MIMIC-IV-Data-Pipeline/ROC_RandomForest_RFE.png"
plt.savefig(roc_plot_path_rf_rfe)
plt.close()

# Define performance metrics dictionary
performance_metrics_rf_rfe = {
    "Model": "RandomForest_RFE",
    "n_features_selected": n_features_to_select,
    "Selected Features": selected_features,  # OPTIONAL: list of selected features
    "Accuracy": accuracy_rf_rfe,
    "Precision (Delirium = 1)": report_rf_rfe["1"]["precision"],
    "Recall (Delirium = 1)": report_rf_rfe["1"]["recall"],
    "F1-Score (Delirium = 1)": report_rf_rfe["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_rf_rfe,
    "ROC AUC Score": roc_auc_rf_rfe,
    "ROC Curve Path": roc_plot_path_rf_rfe
}

# Path to your performance JSON
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new RFE-based results
model_performance.append(performance_metrics_rf_rfe)

# Save to JSON
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("✅ RFE-based Random Forest performance saved successfully.")
print("Random Forest RFE Model Performance:")
print("Accuracy:", accuracy_rf_rfe)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf_rfe))


In [None]:
# Optional Alternatives "SelectFromModel" with a random forest 
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(base_rf, threshold="median")  # or any threshold
sfm.fit(X_train_scaled, y_train)
X_train_sfm = sfm.transform(X_train_scaled)
X_test_sfm = sfm.transform(X_test_scaled)

