In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os

import pandas as pd

# Define the file path
cleaned_file = "D:/MIMIC-IV-Data-Pipeline/processed_data/mimic_cleaned_v8.csv.gz"

# Load the dataset
df = pd.read_csv(cleaned_file, compression="gzip")

# Confirm successful load
print("Cleaned dataset loaded successfully.")
print(f"Dataset Shape: {df.shape}")  # Check rows & columns
print("\nData Preview:")
print(df.head())  # Display first 5 rows

# Check data types and missing values
print("\nData Info:")
print(df.info())


Cleaned dataset loaded successfully.
Dataset Shape: (546028, 40)

Data Preview:
   subject_id   hadm_id  admission_type      admission_location  \
0    10000032  22595853          URGENT  TRANSFER FROM HOSPITAL   
1    10000032  22841357        EW EMER.          EMERGENCY ROOM   
2    10000032  25742920        EW EMER.          EMERGENCY ROOM   
3    10000032  29079034        EW EMER.          EMERGENCY ROOM   
4    10000068  25022803  EU OBSERVATION          EMERGENCY ROOM   

  discharge_location insurance marital_status   race  ed_time_spent  los_hosp  \
0               HOME  Medicaid        WIDOWED  WHITE          253.0  0.786111   
1               HOME  Medicaid        WIDOWED  WHITE          337.0  1.015278   
2            HOSPICE  Medicaid        WIDOWED  WHITE          286.0  1.754167   
3               HOME  Medicaid        WIDOWED  WHITE          486.0  2.222222   
4            UNKNOWN   UNKNOWN         SINGLE  WHITE          511.0  0.298611   

   ... icu_airway_flag  icu_ve

Prepare Data for XGBoost
Encode categorical variables (One-Hot Encoding for low-cardinality, Frequency Encoding for high-cardinality).
Scale numerical features (optional for XGBoost, but recommended for consistency).
Split data into training and test sets.

In [3]:
# Encode Categorical Variables and Split Data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define the target variable
target = "delirium"

# Separate features (X) and target (y)
X = df.drop(columns=[target])
y = df[target]

# Identify categorical variables
low_cardinality_cols = [
    "admission_type", "admission_location", "discharge_location",
    "insurance", "marital_status", "race", "gender", "age_group"
]

high_cardinality_cols = ["primary_diagnosis", "drug"]

# Apply One-Hot Encoding for low-cardinality categorical features
X = pd.get_dummies(X, columns=low_cardinality_cols, drop_first=True)

# Apply Frequency Encoding for high-cardinality categorical features
for col in high_cardinality_cols:
    freq_map = X[col].value_counts(normalize=True)
    X[col] = X[col].map(freq_map)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling is not strictly necessary for XGBoost but improves stability
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Confirm dataset shapes
print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")


Training data shape: (436822, 107)
Test data shape: (109206, 107)


In [13]:
#Train and Evaluate XGBoost Model

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define the XGBoost model with class weights
xgb_model = XGBClassifier(
    n_estimators=200, 
    max_depth=6, 
    learning_rate=0.05, 
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),  # Handles class imbalance
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

# Train the model
xgb_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test_scaled)
# Evaluate model performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
report_xgb = classification_report(y_test, y_pred_xgb, output_dict=True)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb).tolist()

# Compute ROC AUC Score
y_pred_proba_xgb = xgb_model.predict_proba(X_test_scaled)[:, 1]
roc_auc_xgb = roc_auc_score(y_test, y_pred_proba_xgb)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)

# Save ROC curve plot
plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb, tpr_xgb, label=f"ROC Curve (AUC = {roc_auc_xgb:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - XGBoost")
plt.legend()

roc_plot_path_xgb = "D:/MIMIC-IV-Data-Pipeline/ROC_XGBoost.png"
plt.savefig(roc_plot_path_xgb)
plt.close()

# Define XGBoost performance metrics
performance_metrics_xgb = {
    "Model": "XGBoost",
    "Accuracy": accuracy_xgb,
    "Precision (Delirium = 1)": report_xgb["1"]["precision"],
    "Recall (Delirium = 1)": report_xgb["1"]["recall"],
    "F1-Score (Delirium = 1)": report_xgb["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_xgb,
    "ROC AUC Score": roc_auc_xgb,
    "ROC Curve Path": roc_plot_path_xgb
}

# Define file path
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics_xgb)

# Save to file
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("📊 XGBoost model performance saved successfully.")
# Evaluate model performance
print("XGBoost Model Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



📊 XGBoost model performance saved successfully.
XGBoost Model Performance:
Accuracy: 0.8252751680310606

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.82      0.90    107248
           1       0.08      0.85      0.15      1958

    accuracy                           0.83    109206
   macro avg       0.54      0.84      0.53    109206
weighted avg       0.98      0.83      0.89    109206



In [15]:
# "SelectFromModel" with XGBoost
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# 1. Train an initial XGBoost model on ALL features
xgb_base = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]), 
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

xgb_base.fit(X_train_scaled, y_train)

# Inspect feature importances if you want
importances = xgb_base.feature_importances_


Parameters: { "use_label_encoder" } are not used.



In [17]:
from sklearn.feature_selection import SelectFromModel

# 2. Create a SelectFromModel object using the trained xgb_base
# threshold="median" keeps only the features with importance >= median importance
selector = SelectFromModel(xgb_base, threshold="median", prefit=True)

# 3. Transform the training & test sets
X_train_sfm = selector.transform(X_train_scaled)
X_test_sfm = selector.transform(X_test_scaled)

# OPTIONAL: If you want the indices of the selected features:
selected_indices = selector.get_support(indices=True)

# If you want to see which columns were selected:
all_cols = X_train.columns.tolist()  # your original feature names
selected_feature_names = [all_cols[i] for i in selected_indices]
print("Selected Features using XGB + SelectFromModel:", selected_feature_names)


Selected Features using XGB + SelectFromModel: ['ed_time_spent', 'los_hosp', 'anchor_age', 'primary_diagnosis', 'palliative_care_flag', 'cognitive_impairment_flag', 'num_comorbidities', 'prior_icu_admissions', 'unique_high_risk_med', 'high_risk_med_count', 'drug', 'icu_map_flag', 'icu_pao2_flag', 'icu_paco2_flag', 'icu_peep_flag', 'icu_ph_flag', 'icu_fio2_flag', 'icu_vent_mode_flag', 'icu_base_excess_flag', 'icu_lactate_flag', 'ed_time_missing', 'admission_type_DIRECT EMER.', 'admission_type_DIRECT OBSERVATION', 'admission_type_ELECTIVE', 'admission_type_EU OBSERVATION', 'admission_type_EW EMER.', 'admission_type_OBSERVATION ADMIT', 'admission_type_SURGICAL SAME DAY ADMISSION', 'admission_location_CLINIC REFERRAL', 'admission_location_EMERGENCY ROOM', 'admission_location_PHYSICIAN REFERRAL', 'admission_location_TRANSFER FROM SKILLED NURSING FACILITY', 'discharge_location_DIED', 'discharge_location_HOME', 'discharge_location_HOME HEALTH CARE', 'discharge_location_PSYCH FACILITY', 'disch

In [19]:
# 4. Retrain a new XGBoost model on the reduced feature set
xgb_sfm_model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

xgb_sfm_model.fit(X_train_sfm, y_train)

# 5. Predict on test set (transformed)
y_pred_xgb_sfm = xgb_sfm_model.predict(X_test_sfm)


Parameters: { "use_label_encoder" } are not used.



In [21]:
import json, os
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# Evaluate
accuracy_xgb_sfm = accuracy_score(y_test, y_pred_xgb_sfm)
report_xgb_sfm = classification_report(y_test, y_pred_xgb_sfm, output_dict=True)
conf_matrix_xgb_sfm = confusion_matrix(y_test, y_pred_xgb_sfm).tolist()

# ROC AUC
y_pred_proba_xgb_sfm = xgb_sfm_model.predict_proba(X_test_sfm)[:, 1]
roc_auc_xgb_sfm = roc_auc_score(y_test, y_pred_proba_xgb_sfm)
fpr_xgb_sfm, tpr_xgb_sfm, _ = roc_curve(y_test, y_pred_proba_xgb_sfm)

# Plot ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb_sfm, tpr_xgb_sfm, label=f"ROC Curve (AUC = {roc_auc_xgb_sfm:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - XGBoost_SelectFromModel")
plt.legend()

roc_plot_path_xgb_sfm = "D:/MIMIC-IV-Data-Pipeline/ROC_XGB_SelectFromModel.png"
plt.savefig(roc_plot_path_xgb_sfm)
plt.close()

# Create metrics dict
performance_metrics_xgb_sfm = {
    "Model": "XGBoost_SelectFromModel",
    "Features Selected": len(selected_indices),
    "Selected Feature Names": selected_feature_names,  # optional
    "Accuracy": accuracy_xgb_sfm,
    "Precision (Delirium = 1)": report_xgb_sfm["1"]["precision"],
    "Recall (Delirium = 1)": report_xgb_sfm["1"]["recall"],
    "F1-Score (Delirium = 1)": report_xgb_sfm["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_xgb_sfm,
    "ROC AUC Score": roc_auc_xgb_sfm,
    "ROC Curve Path": roc_plot_path_xgb_sfm
}

# Append to JSON
performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing data if available
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append new results
model_performance.append(performance_metrics_xgb_sfm)

# Save JSON
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("✅ XGBoost with SelectFromModel performance saved successfully.")
print("Accuracy:", accuracy_xgb_sfm)


✅ XGBoost with SelectFromModel performance saved successfully.
Accuracy: 0.8240389722176438


In [23]:
# RFE with XGBoost 
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

# Choose how many features you want to keep
n_features_to_select = 20

# Base XGBoost model
xgb_estimator = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

# Wrap XGBoost with RFE
rfe_xgb = RFE(
    estimator=xgb_estimator,
    n_features_to_select=n_features_to_select,
    step=1
)

# Fit on your scaled training data
rfe_xgb.fit(X_train_scaled, y_train)



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [24]:
# Transform both training and test sets to selected features
X_train_rfe = rfe_xgb.transform(X_train_scaled)
X_test_rfe = rfe_xgb.transform(X_test_scaled)

# (Optional) Identify chosen feature indices & names
feature_mask = rfe_xgb.support_
all_columns = X_train.columns.tolist()
selected_features = [col for col, keep in zip(all_columns, feature_mask) if keep]
print("RFE Selected Features:", selected_features)


RFE Selected Features: ['los_hosp', 'anchor_age', 'palliative_care_flag', 'cognitive_impairment_flag', 'num_comorbidities', 'unique_high_risk_med', 'high_risk_med_count', 'icu_pao2_flag', 'icu_paco2_flag', 'icu_peep_flag', 'icu_ph_flag', 'icu_fio2_flag', 'icu_base_excess_flag', 'icu_lactate_flag', 'admission_type_EW EMER.', 'discharge_location_HOME', 'discharge_location_HOME HEALTH CARE', 'discharge_location_PSYCH FACILITY', 'discharge_location_SKILLED NURSING FACILITY', 'discharge_location_UNKNOWN']


In [25]:
# Train new XGB on the reduced feature set
xgb_rfe_model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1]),
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

xgb_rfe_model.fit(X_train_rfe, y_train)

# Predict on RFE test set
y_pred_xgb_rfe = xgb_rfe_model.predict(X_test_rfe)


Parameters: { "use_label_encoder" } are not used.



In [26]:
# Evaluate
accuracy_xgb_rfe = accuracy_score(y_test, y_pred_xgb_rfe)
report_xgb_rfe = classification_report(y_test, y_pred_xgb_rfe, output_dict=True)
conf_matrix_xgb_rfe = confusion_matrix(y_test, y_pred_xgb_rfe).tolist()

# ROC AUC
y_pred_proba_xgb_rfe = xgb_rfe_model.predict_proba(X_test_rfe)[:, 1]
roc_auc_xgb_rfe = roc_auc_score(y_test, y_pred_proba_xgb_rfe)
fpr_xgb_rfe, tpr_xgb_rfe, _ = roc_curve(y_test, y_pred_proba_xgb_rfe)

# Plot ROC
plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb_rfe, tpr_xgb_rfe, label=f"ROC Curve (AUC = {roc_auc_xgb_rfe:.4f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - XGBoost_RFE")
plt.legend()

roc_plot_path_xgb_rfe = "D:/MIMIC-IV-Data-Pipeline/ROC_XGB_RFE.png"
plt.savefig(roc_plot_path_xgb_rfe)
plt.close()

# Prepare dictionary
performance_metrics_xgb_rfe = {
    "Model": "XGBoost_RFE",
    "n_features_selected": n_features_to_select,
    "Selected Feature Names": selected_features,
    "Accuracy": accuracy_xgb_rfe,
    "Precision (Delirium = 1)": report_xgb_rfe["1"]["precision"],
    "Recall (Delirium = 1)": report_xgb_rfe["1"]["recall"],
    "F1-Score (Delirium = 1)": report_xgb_rfe["1"]["f1-score"],
    "Confusion Matrix": conf_matrix_xgb_rfe,
    "ROC AUC Score": roc_auc_xgb_rfe,
    "ROC Curve Path": roc_plot_path_xgb_rfe
}

performance_file = "D:/MIMIC-IV-Data-Pipeline/model_performance.json"

# Load existing
if os.path.exists(performance_file):
    with open(performance_file, "r") as file:
        model_performance = json.load(file)
else:
    model_performance = []

# Append
model_performance.append(performance_metrics_xgb_rfe)

# Save
with open(performance_file, "w") as file:
    json.dump(model_performance, file, indent=4)

print("✅ XGBoost with RFE performance saved successfully.")
print("Selected Features:", selected_features)
print("Accuracy:", accuracy_xgb_rfe)


✅ XGBoost with RFE performance saved successfully.
Selected Features: ['los_hosp', 'anchor_age', 'palliative_care_flag', 'cognitive_impairment_flag', 'num_comorbidities', 'unique_high_risk_med', 'high_risk_med_count', 'icu_pao2_flag', 'icu_paco2_flag', 'icu_peep_flag', 'icu_ph_flag', 'icu_fio2_flag', 'icu_base_excess_flag', 'icu_lactate_flag', 'admission_type_EW EMER.', 'discharge_location_HOME', 'discharge_location_HOME HEALTH CARE', 'discharge_location_PSYCH FACILITY', 'discharge_location_SKILLED NURSING FACILITY', 'discharge_location_UNKNOWN']
Accuracy: 0.8161914180539531
