## **Deliruim Data Cohort from MIMIC-IV-3.2 **
Data Downloaded February 8th , 2025

Phase 1 Data Extraction
https://colab.research.google.com/drive/1IIYkR_CSGwakQJ5g47gtY1TnQIs91T02#scrollTo=71f2964d-361e-4a8a-b59f-429624b6e1ce

# Phase 2 Extracted Dataset Mounted  
Nth Attempt Feb 22, 24
#Phase 3 Modeling 
Model 1 LR
Model 2 NB
Model 3 RF


In [None]:
import pandas as pd
#define file path
file_path = "D:/MIMIC-IV-Data-Pipeline/processed_data/delirium_prediction_data_v3.csv.gz"

# Load dataset
df = pd.read_csv(file_path, compression="gzip", low_memory=False)

# creating a new variable 
df["past_icu_stay"] = df.groupby("subject_id")["stay_id"].shift(1)
df["past_icu_stay"] = df["past_icu_stay"].fillna("No Previous ICU Stay")

print("✅ Past ICU Stay Feature Added!")

# Select categorical columns
categorical_cols = ["admission_type", "admission_location", "discharge_location", 
                    "insurance", "race", "gender", "marital_status", "last_careunit","past_icu_stay"]

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("✅ Categorical variables encoded!")


In [5]:
print("✅ Data Loaded! Shape:", df.shape)
print(df.head())  # Show first 5 rows


✅ Data Loaded! Shape: (555244, 98)
   subject_id   hadm_id  ed_time_spent  anchor_age  anchor_year     stay_id  \
0    10000032  22595853          253.0          52         2180         NaN   
1    10000032  22841357          337.0          52         2180         NaN   
2    10000032  25742920          286.0          52         2180         NaN   
3    10000032  29079034          486.0          52         2180  39553978.0   
4    10000068  25022803          511.0          19         2160         NaN   

        los  num_comorbidities  \
0       NaN                8.0   
1       NaN                8.0   
2       NaN               10.0   
3  0.410266               13.0   
4       NaN                1.0   

                                      diagnosis_list  palliative_care_flag  \
0  ['5723', '78959', '5715', '07070', '496', '296...                   0.0   
1  ['07071', '78959', '2875', '2761', '496', '571...                   0.0   
2  ['07054', '78959', 'V462', '5715', '2767', '27..

In [7]:
print("🔍 Dataset Overview:")
print(df.info())  # Check column types and memory usage
print("\nMissing Values:\n", df.isnull().sum())  # Count missing values


🔍 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555244 entries, 0 to 555243
Data columns (total 98 columns):
 #   Column                                                          Non-Null Count   Dtype  
---  ------                                                          --------------   -----  
 0   subject_id                                                      555244 non-null  int64  
 1   hadm_id                                                         555244 non-null  int64  
 2   ed_time_spent                                                   385225 non-null  float64
 3   anchor_age                                                      555244 non-null  int64  
 4   anchor_year                                                     555244 non-null  int64  
 5   stay_id                                                         94458 non-null   float64
 6   los                                                             94444 non-null   float64
 7   num_comorbidities 

✅ Next Steps: Handling Missing Data



In [10]:
# Fill ICU LOS missing values (0 for non-ICU admissions)
df["los"] = df["los"].fillna(0)

# Fill num_comorbidities with median
df["num_comorbidities"] = df["num_comorbidities"].fillna(df["num_comorbidities"].median())

print("✅ Missing values handled!")



✅ Missing values handled!


In [12]:
# Drop identifiers that don't contribute to predictions
df = df.drop(columns=["subject_id", "hadm_id", "stay_id", "anchor_year"])

print("✅ Dropped unnecessary identifiers!")


✅ Dropped unnecessary identifiers!


In [14]:
print("🔍 Dataset Overview:")
print(df.info())  # Check column types and memory usage
print("\nMissing Values:\n", df.isnull().sum())  # Count missing values

🔍 Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555244 entries, 0 to 555243
Data columns (total 94 columns):
 #   Column                                                          Non-Null Count   Dtype  
---  ------                                                          --------------   -----  
 0   ed_time_spent                                                   385225 non-null  float64
 1   anchor_age                                                      555244 non-null  int64  
 2   los                                                             555244 non-null  float64
 3   num_comorbidities                                               555244 non-null  float64
 4   diagnosis_list                                                  554713 non-null  object 
 5   palliative_care_flag                                            554713 non-null  float64
 6   delirium                                                        554713 non-null  float64
 7   high_risk_med     

In [16]:
# Define target variable
y = df["delirium"]

# Drop target column from features
X = df.drop(columns=["delirium"])


In [22]:
# Check if there are missing values in the target variable
print("🔍 Missing values in target variable (y):", y.isnull().sum())

# Check dimensions of X and y
print("✅ Features Shape (X):", X.shape)
print("✅ Target Shape (y):", y.shape)


🔍 Missing values in target variable (y): 531
✅ Features Shape (X): (555244, 93)
✅ Target Shape (y): (555244,)


In [24]:
df = df.dropna(subset=["delirium"])  # Ensures no missing values in y
X = df.drop(columns=["delirium"])
y = df["delirium"]


In [26]:
# Drop non-numeric columns before splitting
non_numeric_cols = ["diagnosis_list"]
X = X.drop(columns=non_numeric_cols, errors="ignore")


In [28]:
from sklearn.model_selection import train_test_split

# Perform train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("✅ Train-test split completed!")
print(f"Training Set: {X_train.shape}, Test Set: {X_test.shape}")


✅ Train-test split completed!
Training Set: (443770, 92), Test Set: (110943, 92)


In [30]:
# Check for non-numeric columns
print("🔍 Non-Numeric Columns in X_train:")
print(X_train.select_dtypes(include=["object"]).columns)

# Check for missing values
print("🔍 Missing Values in X_train:", X_train.isnull().sum().sum())


🔍 Non-Numeric Columns in X_train:
Index([], dtype='object')
🔍 Missing Values in X_train: 201851


In [32]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)


In [34]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)


In [46]:
# Reapply SMOTE after fixes
from imblearn.over_sampling import SMOTE


In [50]:
# Initialize SMOTE with n_jobs=1 to disable multiprocessing
# smote = SMOTE(random_state=42,n_jobs=1)
# gave error , remove & redone
smote = SMOTE(random_state=42)

# Apply SMOTE only on the training data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("✅ SMOTE applied! New class distribution:")
print(pd.Series(y_train_resampled).value_counts(normalize=True))



✅ SMOTE applied! New class distribution:
delirium
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64


In [52]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Initialize Random Forest with class balancing
rf_model = RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42)

# Train model
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
print("✅ Random Forest Model Performance:")
print(classification_report(y_test, y_pred_rf))


✅ Random Forest Model Performance:
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99    108812
         1.0       0.50      0.09      0.15      2131

    accuracy                           0.98    110943
   macro avg       0.74      0.54      0.57    110943
weighted avg       0.97      0.98      0.97    110943



In [54]:
import numpy as np

# Get feature importances
importances = rf_model.feature_importances_
feature_names = X_train_resampled.columns

# Sort and display top 10 features
sorted_indices = np.argsort(importances)[::-1]

print("🔍 Top 10 Important Features for Delirium Prediction:")
for idx in sorted_indices[:10]:
    print(f"{feature_names[idx]}: {importances[idx]:.4f}")


🔍 Top 10 Important Features for Delirium Prediction:
num_comorbidities: 0.1318
discharge_location_SKILLED NURSING FACILITY: 0.1044
los: 0.0864
anchor_age: 0.0636
high_risk_med: 0.0582
insurance_Medicare: 0.0510
ed_time_spent: 0.0330
admission_type_EW EMER.: 0.0306
admission_type_OBSERVATION ADMIT: 0.0269
discharge_location_HOME HEALTH CARE: 0.0251


In [56]:
rf_model = RandomForestClassifier(n_estimators=200, class_weight="balanced_subsample", random_state=42)


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(RandomForestClassifier(class_weight="balanced_subsample", random_state=42), 
                           param_grid, cv=5, scoring="f1")
grid_search.fit(X_train_resampled, y_train_resampled)

print("✅ Best Parameters:", grid_search.best_params_)
