In [1]:
import pandas as pd
import os
import psutil
import dask.dataframe as dd

# Step 1: Define Data Path for Local Execution
data_path = "D:/mimic-iv-3.1"

# Step 2: Check memory usage before loading data
def check_memory():
    print(f"Memory Usage: {psutil.virtual_memory().percent}%")

check_memory()

Memory Usage: 30.6%


In [9]:
# Step 3: Load MIMIC-IV Tables with optimized memory handling
# Load small tables directlyadmissions = pd.read_csv(os.path.join(data_path, "hosp/admissions.csv.gz"), 
admissions = pd.read_csv(os.path.join(data_path, "hosp/admissions.csv.gz"), 
                          usecols=["subject_id", "hadm_id", "admission_type", "admission_location", "discharge_location", "insurance", "race", "marital_status", "edregtime", "edouttime"], 
                          low_memory=False)

# Calculate time spent in ED before admission
admissions["ed_time_spent"] = (pd.to_datetime(admissions["edouttime"]) - pd.to_datetime(admissions["edregtime"])).dt.total_seconds() / 60  # Convert to minutes

# Drop original ED timestamps
admissions = admissions.drop(columns=["edregtime", "edouttime"], errors="ignore")



In [11]:

patients = pd.read_csv(os.path.join(data_path, "hosp/patients.csv.gz"), 
                        usecols=["subject_id", "gender", "anchor_age", "anchor_year"], 
                        low_memory=False)

icustays = pd.read_csv(os.path.join(data_path, "icu/icustays.csv.gz"), 
                        usecols=["subject_id", "hadm_id", "stay_id","last_careunit", "los",], 
                        low_memory=False)

diagnoses = pd.read_csv(os.path.join(data_path, "hosp/diagnoses_icd.csv.gz"), 
                         usecols=["subject_id", "hadm_id", "icd_code"], 
                         low_memory=False)


In [None]:
# Load large tables in chunks
chunk_size = 250000  # Adjusted chunk size for stability

prescriptions = pd.read_csv(os.path.join(data_path, "hosp/prescriptions.csv.gz"),
                            usecols=["subject_id", "hadm_id", "starttime", "stoptime", "drug", "drug_type", "formulary_drug_cd"],
                            low_memory=False)

lab_events_iter = pd.read_csv(os.path.join(data_path, "hosp/labevents.csv.gz"), 
                               usecols=["subject_id", "hadm_id", "itemid", "valuenum"], 
                               low_memory=False, 
                               chunksize=chunk_size)
lab_events = pd.concat(lab_events_iter)

chartevents_iter = pd.read_csv(os.path.join(data_path, "icu/chartevents.csv.gz"), 
                                usecols=["subject_id", "hadm_id", "itemid", "valuenum"], 
                                low_memory=False, 
                                chunksize=chunk_size)
chartevents = pd.concat(chartevents_iter)


In [13]:

# Load item labels
d_items = pd.read_csv(os.path.join(data_path, "icu/d_items.csv.gz"), usecols=["itemid", "label"], low_memory=False)
d_labitems = pd.read_csv(os.path.join(data_path, "hosp/d_labitems.csv.gz"), usecols=["itemid", "label"], low_memory=False)
d_procedures = pd.read_csv(os.path.join(data_path, "hosp/d_icd_procedures.csv.gz"), usecols=["icd_code", "long_title"], low_memory=False)


In [15]:
# Step 4: Check memory usage after loading data
check_memory()

# Step 5: Merge Data for Delirium Prediction
core_data = pd.merge(admissions, patients, on="subject_id", how="inner")
core_data = pd.merge(core_data, icustays, on=["subject_id", "hadm_id"], how="left")
core_data = pd.merge(core_data, diagnoses, on=["subject_id", "hadm_id"], how="left") 

# Identify Delirium Cases Using ICD Codes
# Define ICD Codes for Delirium
delirium_icd_codes = ["F05", "2930", "2931","F051","F050","F059" ]  # ICD-10 & ICD-9 Codes
core_data["delirium"] = core_data["icd_code"].isin(delirium_icd_codes).astype(int)

# Comorbidity Identification
comorbidity_count = diagnoses.groupby(["subject_id", "hadm_id"])['icd_code'].nunique().reset_index()
comorbidity_count.rename(columns={'icd_code': 'num_comorbidities'}, inplace=True)
core_data = pd.merge(core_data, comorbidity_count, on=["subject_id", "hadm_id"], how="left")

Memory Usage: 30.9%


In [17]:
# Merge ICU procedures
diagnoses_labeled = pd.merge(diagnoses, d_procedures, on="icd_code", how="left")
primary_procedure = diagnoses_labeled.groupby(["subject_id", "hadm_id"])['long_title'].first().reset_index()
primary_procedure.rename(columns={'long_title': 'primary_procedure'}, inplace=True)
core_data = pd.merge(core_data, primary_procedure, on=["subject_id", "hadm_id"], how="left")



In [19]:
prescriptions = pd.read_csv(os.path.join(data_path, "hosp/prescriptions.csv.gz"),
                            usecols=["subject_id", "hadm_id", "starttime", "stoptime", "drug", "drug_type", "formulary_drug_cd"],
                            low_memory=False)


In [3]:
# Define expanded high-risk medications list based on literature
high_risk_meds = [
    # **Benzodiazepines**
    "lorazepam", "midazolam", "diazepam", "clonazepam", "alprazolam", "temazepam", "chlordiazepoxide",

    # **Opioids**
    "morphine", "fentanyl", "hydromorphone", "oxycodone", "meperidine", "codeine", "tramadol", "buprenorphine",

    # **Propofol & Sedatives**
    "propofol", "dexmedetomidine",

    # **Antipsychotics**
    "haloperidol", "olanzapine", "quetiapine", "risperidone", "ziprasidone", "chlorpromazine", "aripiprazole",

    # **Anticholinergics**
    "diphenhydramine", "hydroxyzine", "promethazine", "oxybutynin", "scopolamine", "benztropine", "atropine",

    # **Corticosteroids**
    "dexamethasone", "prednisone", "methylprednisolone", "hydrocortisone",

    # **Other elirium-Associated Medications**
    "metoclopramide", "cyclobenzaprine", "baclofen", "amantadine"
]

# Convert drug names to lowercase for consistency
prescriptions["drug"] = prescriptions["drug"].astype(str).str.lower().str.strip()

# Identify if patient received any high-risk medications
prescriptions["high_risk_med"] = prescriptions["drug"].apply(lambda x: 1 if any(med in x for med in high_risk_meds) else 0)

# Aggregate per admission
medication_use = prescriptions.groupby(["subject_id", "hadm_id"])["high_risk_med"].max().reset_index()



NameError: name 'prescriptions' is not defined

In [35]:
# Count the number of admissions that received high-risk meds
high_risk_med_counts = prescriptions["high_risk_med"].value_counts()

print("üîç High-Risk Medication Distribution:")
print(high_risk_med_counts)

# Retain specific high-risk medication type given per admission
high_risk_med_types = prescriptions[prescriptions["high_risk_med"] == 1].groupby(["subject_id", "hadm_id"])["drug"].unique().reset_index()
high_risk_med_types.rename(columns={"drug": "high_risk_med_type"}, inplace=True)

# Convert lists of medications to comma-separated strings for easy readability
high_risk_med_types["high_risk_med_type"] = high_risk_med_types["high_risk_med_type"].apply(lambda x: ", ".join(x))

# Merge high-risk medication types into `medication_use`
medication_use = pd.merge(medication_use, high_risk_med_types, on=["subject_id", "hadm_id"], how="left")

print("‚úÖ High-Risk Medication Counts Processed!")
print(medication_use.head())  # Display first few rows to verify


üîç High-Risk Medication Distribution:
high_risk_med
0    18051937
1     2240674
Name: count, dtype: int64
‚úÖ High-Risk Medication Counts Processed!
   subject_id   hadm_id  high_risk_med                 high_risk_med_type_x  \
0    10000032  22595853              0                                  NaN   
1    10000032  22841357              0                                  NaN   
2    10000032  25742920              1  tramadol (ultram), morphine sulfate   
3    10000032  29079034              1                    tramadol (ultram)   
4    10000084  23052089              1                  quetiapine fumarate   

                  high_risk_med_type_y  
0                                  NaN  
1                                  NaN  
2  tramadol (ultram), morphine sulfate  
3                    tramadol (ultram)  
4                  quetiapine fumarate  


In [51]:
# Merge high-risk medication flag and counts into core_data
core_data = pd.merge(core_data, medication_use, on=["subject_id", "hadm_id"], how="left")

print("‚úÖ Prescriptions merged successfully. Updated shape:", core_data.shape)

‚úÖ Prescriptions merged successfully. Updated shape: (6599888, 28)


In [39]:
print("‚úÖ Core Data Shape:", core_data.shape)
print(core_data.head())  # Show first 5 rows


‚úÖ Core Data Shape: (6599888, 22)
   subject_id   hadm_id admission_type      admission_location  \
0    10000032  22595853         URGENT  TRANSFER FROM HOSPITAL   
1    10000032  22595853         URGENT  TRANSFER FROM HOSPITAL   
2    10000032  22595853         URGENT  TRANSFER FROM HOSPITAL   
3    10000032  22595853         URGENT  TRANSFER FROM HOSPITAL   
4    10000032  22595853         URGENT  TRANSFER FROM HOSPITAL   

  discharge_location insurance marital_status   race  ed_time_spent gender  \
0               HOME  Medicaid        WIDOWED  WHITE          253.0      F   
1               HOME  Medicaid        WIDOWED  WHITE          253.0      F   
2               HOME  Medicaid        WIDOWED  WHITE          253.0      F   
3               HOME  Medicaid        WIDOWED  WHITE          253.0      F   
4               HOME  Medicaid        WIDOWED  WHITE          253.0      F   

   ...  stay_id  last_careunit  los icd_code  delirium num_comorbidities  \
0  ...      NaN        

In [41]:
print("üîç Core Data Columns:")
print(core_data.columns.tolist())


üîç Core Data Columns:
['subject_id', 'hadm_id', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'marital_status', 'race', 'ed_time_spent', 'gender', 'anchor_age', 'anchor_year', 'stay_id', 'last_careunit', 'los', 'icd_code', 'delirium', 'num_comorbidities', 'primary_procedure', 'high_risk_med', 'high_risk_med_type_x', 'high_risk_med_type_y']


In [43]:
print("üîç Missing Values Summary:")
print(core_data.isnull().sum())


üîç Missing Values Summary:
subject_id                    0
hadm_id                       0
admission_type                0
admission_location            3
discharge_location      1098655
insurance                 63470
marital_status           210998
race                          0
ed_time_spent           1960984
gender                        0
anchor_age                    0
anchor_year                   0
stay_id                 4840622
last_careunit           4840622
los                     4840963
icd_code                    531
delirium                      0
num_comorbidities           531
primary_procedure       3879371
high_risk_med            443166
high_risk_med_type_x    1524367
high_risk_med_type_y    1524367
dtype: int64


In [45]:
print("üîç Unique values in high_risk_med:")
print(core_data["high_risk_med"].value_counts())


üîç Unique values in high_risk_med:
high_risk_med
1.0    5075521
0.0    1081201
Name: count, dtype: int64


In [47]:
print("üîç Data Types:")
print(core_data.dtypes)


üîç Data Types:
subject_id                int64
hadm_id                   int64
admission_type           object
admission_location       object
discharge_location       object
insurance                object
marital_status           object
race                     object
ed_time_spent           float64
gender                   object
anchor_age                int64
anchor_year               int64
stay_id                 float64
last_careunit            object
los                     float64
icd_code                 object
delirium                  int32
num_comorbidities       float64
primary_procedure        object
high_risk_med           float64
high_risk_med_type_x     object
high_risk_med_type_y     object
dtype: object


In [None]:
# DID NOT RUN SAVE FOR EDA ??  # Fill NaN values for missing medications
core_data["high_risk_med"] = core_data["high_risk_med"].fillna(0).astype(int)

In [53]:
# Step 6: Save Processed Data for ML Modeling
output_path = "D:/MIMIC-IV-Data-Pipeline/processed_data"
os.makedirs(output_path, exist_ok=True)
core_data.to_csv(os.path.join(output_path, "delirium_prediction_data_v2.csv.gz"), index=False, compression='gzip')

print("‚úÖ Delirium prediction dataset saved successfully!")


‚úÖ Delirium prediction dataset saved successfully!


In [1]:
#Check if icd_code was kept in core_data
print("üîç Unique Diagnoses in Dataset:")
print(core_data["icd_code"].nunique())  # Count unique ICD codes
#List Sample of Diagnoses
print("üîç Sample Diagnoses in Dataset:")
print(core_data["icd_code"].value_counts().head(20))  # Show top 20 diagnoses
#Check If Palliative Care ICD Codes Exist  
# didnt work
palliative_care_codes = ["Z51.5", "V66.7"]
print("üîç Palliative Care Diagnoses Found:")
print(core_data[core_data["icd_code"].isin(palliative_care_codes)])

üîç Unique Diagnoses in Dataset:


NameError: name 'core_data' is not defined