In [1]:
import pandas as pd
import os
import psutil

# Step 1: Define Data Path
# The MIMIC-IV root directory can be overridden with the MIMIC_IV_DIR
# environment variable so the notebook also runs on machines where the data
# does not live on D:. When the variable is unset the original location is used.
data_path = os.environ.get("MIMIC_IV_DIR", "D:/mimic-iv-3.1")

# Step 2: Report memory usage before any table is loaded
def check_memory():
    """Print the percentage of system virtual memory currently in use.

    The figure is read from ``psutil.virtual_memory().percent`` and written to
    standard output; nothing is returned.
    """
    used_percent = psutil.virtual_memory().percent
    print(f"Memory Usage: {used_percent}%")

check_memory()

# Step 3: Load MIMIC-IV tables, restricting each read to the listed columns
# Admissions table: identifiers, admission/discharge descriptors, and the two
# ED timestamps (edregtime / edouttime).
admission_columns = [
    "subject_id",
    "hadm_id",
    "admission_type",
    "admission_location",
    "discharge_location",
    "insurance",
    "race",
    "marital_status",
    "edregtime",
    "edouttime",
]
admissions = pd.read_csv(
    os.path.join(data_path, "hosp/admissions.csv.gz"),
    usecols=admission_columns,
    low_memory=False,
)

Memory Usage: 43.2%


In [43]:
# Re-read admissions so this cell can be re-run on its own: the ED timestamp
# columns it needs are dropped again at the end of the cell.
admission_columns = [
    "subject_id",
    "hadm_id",
    "admission_type",
    "admission_location",
    "discharge_location",
    "insurance",
    "race",
    "marital_status",
    "edregtime",
    "edouttime",
]
admissions = pd.read_csv(
    os.path.join(data_path, "hosp/admissions.csv.gz"),
    usecols=admission_columns,
    low_memory=False,
)

# Time between ED registration and ED departure, expressed in minutes.
ed_registered_at = pd.to_datetime(admissions["edregtime"])
ed_left_at = pd.to_datetime(admissions["edouttime"])
admissions["ed_time_spent"] = (ed_left_at - ed_registered_at).dt.total_seconds() / 60

# The raw timestamps are no longer needed once the duration is derived.
admissions = admissions.drop(columns=["edregtime", "edouttime"], errors="ignore")

In [3]:
 # Load patient demographics
patient_columns = ["subject_id", "gender", "anchor_age", "anchor_year"]
patients = pd.read_csv(
    os.path.join(data_path, "hosp/patients.csv.gz"),
    usecols=patient_columns,
    low_memory=False,
)

# ICU stays: identifiers, the last care unit, and the length of stay (los)
icu_columns = ["subject_id", "hadm_id", "stay_id", "last_careunit", "los"]
icustays = pd.read_csv(
    os.path.join(data_path, "icu/icustays.csv.gz"),
    usecols=icu_columns,
    low_memory=False,
)

In [24]:
# Load the diagnosis codes and summarise them per (subject_id, hadm_id)
diagnoses = pd.read_csv(
    os.path.join(data_path, "hosp/diagnoses_icd.csv.gz"),
    usecols=["subject_id", "hadm_id", "icd_code"],
    low_memory=False,
)

# Group the code column once and derive both summaries from the same grouping.
codes_by_admission = diagnoses.groupby(["subject_id", "hadm_id"])["icd_code"]
diagnosis_counts = pd.DataFrame(
    {
        # Number of distinct ICD codes recorded for the admission
        "num_comorbidities": codes_by_admission.nunique(),
        # Every ICD code recorded for the admission, kept as a Python list
        "diagnosis_list": codes_by_admission.agg(list),
    }
).reset_index()

In [25]:
# ICD codes in the diagnosis table that are treated as palliative care
palliative_care_codes = ["Z515", "V667"]

# Flag is 1 when the admission's code list shares at least one code with
# palliative_care_codes, otherwise 0.
palliative_code_set = set(palliative_care_codes)
diagnosis_counts["palliative_care_flag"] = diagnosis_counts["diagnosis_list"].apply(
    lambda admission_codes: 0 if palliative_code_set.isdisjoint(admission_codes) else 1
)



In [17]:
# Proportion of diagnosis_counts rows with each palliative_care_flag value
palliative_flag_share = diagnosis_counts["palliative_care_flag"].value_counts(normalize=True)
print("🔍 Palliative Care Flag Distribution:")
print(palliative_flag_share)


🔍 Palliative Care Flag Distribution:
palliative_care_flag
0    0.976385
1    0.023615
Name: proportion, dtype: float64


In [35]:
# ICD codes treated as delirium (the list mixes ICD-10 and ICD-9 codes)
delirium_icd_codes = ["F05", "2930", "2931","F051","F050","F059" ]


def _has_delirium_code(admission_codes):
    """Return 1 if the admission's code list contains a delirium code, else 0.

    Anything that is not a list (for example a missing value) yields 0.
    """
    if not isinstance(admission_codes, list):
        return 0
    for code in admission_codes:
        if code in delirium_icd_codes:
            return 1
    return 0


diagnosis_counts["delirium"] = diagnosis_counts["diagnosis_list"].apply(_has_delirium_code)


In [37]:
# Proportion of diagnosis_counts rows with each delirium flag value
delirium_share = diagnosis_counts["delirium"].value_counts(normalize=True)
print(delirium_share)

delirium
0    0.982277
1    0.017723
Name: proportion, dtype: float64


In [23]:
# Load prescriptions and flag high-risk medications
prescriptions = pd.read_csv(os.path.join(data_path, "hosp/prescriptions.csv.gz"),
                            usecols=["subject_id", "hadm_id", "drug"],
                            low_memory=False)

# Define expanded high-risk medications list based on literature
high_risk_meds = [
    "lorazepam", "midazolam", "diazepam", "clonazepam", "alprazolam", "temazepam", "chlordiazepoxide",
    "morphine", "fentanyl", "hydromorphone", "oxycodone", "meperidine", "codeine", "tramadol", "buprenorphine",
    "propofol", "dexmedetomidine",
    "haloperidol", "olanzapine", "quetiapine", "risperidone", "ziprasidone", "chlorpromazine", "aripiprazole",
    "diphenhydramine", "hydroxyzine", "promethazine", "oxybutynin", "scopolamine", "benztropine", "atropine",
    "dexamethasone", "prednisone", "methylprednisolone", "hydrocortisone",
    "metoclopramide", "cyclobenzaprine", "baclofen", "amantadine"
]

# Normalise drug names so matching is case- and whitespace-insensitive.
# astype(str) turns missing names into the string "nan", which contains none
# of the medications above and is therefore flagged 0.
prescriptions["drug"] = prescriptions["drug"].astype(str).str.lower().str.strip()

# A drug is high-risk when any listed medication name occurs as a substring of
# the normalised drug string. The table repeats the same drug strings many
# times, so the substring test is evaluated once per distinct string and the
# result is mapped back onto the rows, instead of re-running every check on
# every row through a Python-level apply.
high_risk_by_drug = {
    drug_name: int(any(med in drug_name for med in high_risk_meds))
    for drug_name in prescriptions["drug"].unique()
}
prescriptions["high_risk_med"] = prescriptions["drug"].map(high_risk_by_drug)

# Per (subject_id, hadm_id): 1 if any prescription of the admission was flagged, else 0.
high_risk_med_counts = prescriptions.groupby(["subject_id", "hadm_id"])['high_risk_med'].max().reset_index()


In [39]:
# Merge data to ensure one row per admission
admission_keys = ["subject_id", "hadm_id"]

# icustays carries a stay_id, i.e. it can hold several rows for one admission.
# Left-joining it directly would repeat that admission once per ICU stay and
# break the one-row-per-admission goal, so collapse it to admission level first:
#   stay_id        -> first stay listed for the admission (file order)
#   last_careunit  -> care unit of the last stay listed (file order)
#   los            -> total ICU length of stay across all stays of the admission
#   icu_stay_count -> number of ICU stays of the admission
# NOTE(review): "first"/"last" follow file order because intime is not loaded;
# load intime and sort by it if chronological order matters.
icu_per_admission = icustays.groupby(admission_keys, as_index=False).agg(
    stay_id=("stay_id", "first"),
    last_careunit=("last_careunit", "last"),
    # min_count=1 keeps the total missing when every los of the admission is missing
    los=("los", lambda stay_los: stay_los.sum(min_count=1)),
    icu_stay_count=("stay_id", "count"),
)

# validate="many_to_one" makes pandas raise a MergeError if a right-hand table
# unexpectedly has duplicate keys, instead of silently multiplying rows.
core_data = admissions.merge(patients, on="subject_id", how="inner", validate="many_to_one")
core_data = core_data.merge(icu_per_admission, on=admission_keys, how="left", validate="many_to_one")
core_data = core_data.merge(diagnosis_counts, on=admission_keys, how="left", validate="many_to_one")
core_data = core_data.merge(high_risk_med_counts, on=admission_keys, how="left", validate="many_to_one")

# Admissions without any ICU stay have no match in icu_per_admission.
core_data["icu_stay_count"] = core_data["icu_stay_count"].fillna(0).astype(int)

assert not core_data.duplicated(subset=admission_keys).any(), (
    "core_data contains more than one row for at least one (subject_id, hadm_id)"
)

In [21]:
# Fill missing values
# NOTE: this cell was marked "Skipped for now". It modifies core_data in place,
# so it has to run after the merge cell and before the save cell for the fills
# to reach the exported file.
#
# The left merges leave NaN in the diagnosis- and prescription-derived columns
# for admissions that have no matching rows in those tables. No rows means no
# recorded codes / no flagged drug, so the count and the flags are set to 0.
core_data["num_comorbidities"] = core_data["num_comorbidities"].fillna(0).astype(int)
core_data["high_risk_med"] = core_data["high_risk_med"].fillna(0).astype(int)

# The diagnosis-derived flags suffer from the same NaNs (and become float
# columns because of them). They are only filled when present, so the cell
# still works if the flag cells were not executed before the merge.
for flag_column in ("palliative_care_flag", "delirium"):
    if flag_column in core_data.columns:
        core_data[flag_column] = core_data[flag_column].fillna(0).astype(int)




In [41]:
# Step 4: Save Processed Data for ML Modeling
output_path = "D:/MIMIC-IV-Data-Pipeline/processed_data"
os.makedirs(output_path, exist_ok=True)
output_file = os.path.join(output_path, "delirium_prediction_data_v3.csv.gz")
core_data.to_csv(output_file, index=False, compression='gzip')

# Only claim "one row per admission" when it is actually true for the saved
# frame; otherwise report how many rows repeat an already-seen admission.
repeated_admission_rows = int(core_data.duplicated(subset=["subject_id", "hadm_id"]).sum())
if repeated_admission_rows == 0:
    print("✅ Cleaned dataset saved successfully! One row per admission.")
else:
    print(
        f"⚠️ Dataset saved, but {repeated_admission_rows} rows repeat an admission "
        "(subject_id, hadm_id) that already appears - NOT one row per admission."
    )

✅ Cleaned dataset saved successfully! One row per admission.
