In [1]:
import pandas as pd
import os
import psutil

# Step 1: Define Data Path
data_path = "D:/mimic-iv-3.1"

# Step 2: Check memory usage before loading data
def check_memory():
    print(f"Memory Usage: {psutil.virtual_memory().percent}%")
check_memory()


Memory Usage: 55.8%


In [289]:
#DID NOT NEED TO  RUN SKIPP IT # Create an empty DataFrame with the key columns
admissions = pd.DataFrame(columns=["subject_id", "hadm_id"])
diagnosis_count = pd.DataFrame(columns=["subject_id", "hadm_id"])
diagnoses = pd.DataFrame(columns=["subject_id", "hadm_id"])

In [3]:
# Step 3: Load MIMIC-IV Tables with optimized memory handling
# =============================
# 1. Load and Process Admissions & Patients (One row per admission)
# =============================
admissions = pd.read_csv(
    os.path.join(data_path, "hosp/admissions.csv.gz"),
    usecols=["subject_id", "hadm_id", "admission_type", "admission_location", "discharge_location", 
             "admittime", "dischtime", "insurance", "race", "marital_status", "edregtime", "edouttime"],
    low_memory=False
)

# Calculate ED time spent (in minutes) and hospital length of stay (in days)
admissions["ed_time_spent"] = (pd.to_datetime(admissions["edouttime"]) - pd.to_datetime(admissions["edregtime"])).dt.total_seconds() / 60
admissions["los_hosp"] = (pd.to_datetime(admissions["dischtime"]) - pd.to_datetime(admissions["admittime"])).dt.total_seconds() / (60*60*24)

# Drop raw timestamp columns to reduce redundancy
admissions = admissions.drop(columns=["edregtime", "edouttime", "admittime", "dischtime"], errors="ignore")
 # Load patient demographics
patients = pd.read_csv(
    os.path.join(data_path, "hosp/patients.csv.gz"),
    usecols=["subject_id", "gender", "anchor_age", "anchor_year"],
    low_memory=False
)

In [5]:
print(admissions.head())  # Show first 5 rows

   subject_id   hadm_id  admission_type      admission_location  \
0    10000032  22595853          URGENT  TRANSFER FROM HOSPITAL   
1    10000032  22841357        EW EMER.          EMERGENCY ROOM   
2    10000032  25742920        EW EMER.          EMERGENCY ROOM   
3    10000032  29079034        EW EMER.          EMERGENCY ROOM   
4    10000068  25022803  EU OBSERVATION          EMERGENCY ROOM   

  discharge_location insurance marital_status   race  ed_time_spent  los_hosp  
0               HOME  Medicaid        WIDOWED  WHITE          253.0  0.786111  
1               HOME  Medicaid        WIDOWED  WHITE          337.0  1.015278  
2            HOSPICE  Medicaid        WIDOWED  WHITE          286.0  1.754167  
3               HOME  Medicaid        WIDOWED  WHITE          486.0  2.222222  
4                NaN       NaN         SINGLE  WHITE          511.0  0.298611  


In [56]:
# =============================
# 2A.  Diagnoses Data ICD Codes conversion
# =============================
# Load GEMs ICD-9 to ICD-10 Mapping File
gem_file_path = "D:/MIMIC-IV-Data-Pipeline/2015_I9gem.txt"  # Replace with the correct path
col_names = ["icd9_code", "icd10_code", "flags"]

# Read the file (it's space-separated)
icd9_to_icd10_gem = pd.read_csv(gem_file_path, sep="\t", header=None, names=col_names, dtype=str)

# Display first few rows
print(icd9_to_icd10_gem.head())


             icd9_code icd10_code flags
0  0010  A000    00000        NaN   NaN
1  0011  A001    00000        NaN   NaN
2  0019  A009    00000        NaN   NaN
3  0020  A0100   10000        NaN   NaN
4  0021  A011    00000        NaN   NaN


In [88]:
# =============================
# 2. Load and Aggregate Diagnoses Data
# =============================
diagnoses = pd.read_csv(
    os.path.join(data_path, "hosp/diagnoses_icd.csv.gz"),
    usecols=["subject_id", "hadm_id", "icd_code","icd_version"],
    low_memory=False
)

#=============================
# 2A.  Diagnoses Data ICD Codes conversion
# =============================
# Load GEMs ICD-9 to ICD-10 Mapping File
gem_file_path = "D:/MIMIC-IV-Data-Pipeline/2015_I9gem.txt"  # Replace with the correct path
col_names = ["icd9_code", "icd10_code", "flags"]

# Read the file (it's space-separated)
icd9_to_icd10_gem = pd.read_csv(gem_file_path, sep=r"\s+", header=None, names=col_names, dtype=str)


  icd9_code icd10_code  flags
0      0010       A000  00000
1      0011       A001  00000
2      0019       A009  00000
3      0020      A0100  10000
4      0021       A011  00000


In [90]:
# Display first few rows
print(icd9_to_icd10_gem.head())
print(icd9_to_icd10_gem.dtypes)


  icd9_code icd10_code  flags
0      0010       A000  00000
1      0011       A001  00000
2      0019       A009  00000
3      0020      A0100  10000
4      0021       A011  00000
icd9_code     object
icd10_code    object
flags         object
dtype: object


In [92]:
print("Diagnoses Data Shape:", diagnoses.shape)
print(diagnoses.head())
print(diagnoses.dtypes)
print("Unique values in icd_version:", diagnoses["icd_version"].unique())


Diagnoses Data Shape: (6364488, 4)
   subject_id   hadm_id icd_code  icd_version
0    10000032  22595853     5723            9
1    10000032  22595853    78959            9
2    10000032  22595853     5715            9
3    10000032  22595853    07070            9
4    10000032  22595853      496            9
subject_id      int64
hadm_id         int64
icd_code       object
icd_version     int64
dtype: object
Unique values in icd_version: [ 9 10]


In [94]:
icd9_diagnoses = diagnoses[diagnoses["icd_version"] == 9].copy()

print("ICD-9 Diagnoses Shape:", icd9_diagnoses.shape)
print(icd9_diagnoses.head())
print("Unique ICD-9 codes in Diagnoses:", icd9_diagnoses["icd_code"].nunique())


ICD-9 Diagnoses Shape: (2908741, 4)
   subject_id   hadm_id icd_code  icd_version
0    10000032  22595853     5723            9
1    10000032  22595853    78959            9
2    10000032  22595853     5715            9
3    10000032  22595853    07070            9
4    10000032  22595853      496            9
Unique ICD-9 codes in Diagnoses: 9143


In [96]:
# Remove spaces and ensure consistency
icd9_diagnoses["icd_code"] = icd9_diagnoses["icd_code"].str.strip()
icd9_to_icd10_gem["icd9_code"] = icd9_to_icd10_gem["icd9_code"].str.strip()

# Remove trailing decimals in MIMIC-IV ICD-9 codes (e.g., "123.0" → "123")
icd9_diagnoses["icd_code"] = icd9_diagnoses["icd_code"].str.split(".").str[0]

# Ensure codes are uppercase (if GEM uses uppercase)
icd9_diagnoses["icd_code"] = icd9_diagnoses["icd_code"].str.upper()
icd9_to_icd10_gem["icd9_code"] = icd9_to_icd10_gem["icd9_code"].str.upper()

# Print unique values after fixing
print("First 5 ICD-9 codes in Diagnoses (after fix):", icd9_diagnoses["icd_code"].unique()[:5])
print("First 5 ICD-9 codes in GEM Mapping (after fix):", icd9_to_icd10_gem["icd9_code"].unique()[:5])


First 5 ICD-9 codes in Diagnoses (after fix): ['5723' '78959' '5715' '07070' '496']
First 5 ICD-9 codes in GEM Mapping (after fix): ['0010' '0011' '0019' '0020' '0021']


In [98]:
# Remove leading zeros from GEM ICD-9 codes
icd9_to_icd10_gem["icd9_code"] = icd9_to_icd10_gem["icd9_code"].astype(str).str.lstrip("0")

# Print first 5 values again after fixing
print("First 5 ICD-9 codes in GEM Mapping (after fix):", icd9_to_icd10_gem["icd9_code"].unique()[:5])


First 5 ICD-9 codes in GEM Mapping (after fix): ['10' '11' '19' '20' '21']


In [84]:
# =============================
# 2B. Perform ICD-9 to ICD-10 Mapping
# =============================
# Ensure ICD codes are string type
diagnoses["icd_code"] = diagnoses["icd_code"].astype(str)

# Separate ICD-9 and ICD-10 diagnoses
icd9_diagnoses = diagnoses[diagnoses["icd_version"] == "9"]
icd10_diagnoses = diagnoses[diagnoses["icd_version"] == "10"]


# QA Steps pre-merge 
# Remove spaces
icd9_diagnoses["icd_code"] = icd9_diagnoses["icd_code"].str.strip()
icd9_to_icd10_gem["icd9_code"] = icd9_to_icd10_gem["icd9_code"].str.strip()

# Remove trailing decimals in MIMIC-IV ICD-9 codes (if present)
icd9_diagnoses["icd_code"] = icd9_diagnoses["icd_code"].str.split(".").str[0]

# Ensure codes are uppercase (if GEM uses uppercase)
icd9_diagnoses["icd_code"] = icd9_diagnoses["icd_code"].str.upper()
icd9_to_icd10_gem["icd9_code"] = icd9_to_icd10_gem["icd9_code"].str.upper()

# Print unique values again after fixing
print("First 5 ICD-9 codes in Diagnoses (after fix):", icd9_diagnoses["icd_code"].unique()[:5])
print("First 5 ICD-9 codes in GEM Mapping (after fix):", icd9_to_icd10_gem["icd9_code"].unique()[:5])



  icd9_to_icd10_gem = pd.read_csv(gem_file_path, sep="\s+", header=None, names=col_names, dtype=str)


  icd9_code icd10_code  flags
0      0010       A000  00000
1      0011       A001  00000
2      0019       A009  00000
3      0020      A0100  10000
4      0021       A011  00000
First 5 ICD-9 codes in Diagnoses (after fix): []
First 5 ICD-9 codes in GEM Mapping (after fix): ['0010' '0011' '0019' '0020' '0021']


In [60]:

#=====================================
# Merge ICD-9 diagnoses with GEM mapping
mapped_icd10 = icd9_diagnoses.merge(
    icd9_to_icd10_gem, left_on="icd_code", right_on="icd9_code", how="left"
)

# Use ICD-10 codes where available, otherwise keep original ICD-9 codes
mapped_icd10["final_icd_code"] = mapped_icd10["icd10_code"].fillna(mapped_icd10["icd_code"])

# Keep only relevant columns
mapped_icd10 = mapped_icd10[["subject_id", "hadm_id", "final_icd_code"]]

# Rename ICD-10 column for consistency
icd10_diagnoses = icd10_diagnoses.rename(columns={"icd_code": "final_icd_code"})
icd10_diagnoses = icd10_diagnoses[["subject_id", "hadm_id", "final_icd_code"]]

# Combine mapped ICD-10 diagnoses with original ICD-10 data
final_diagnoses = pd.concat([icd10_diagnoses, mapped_icd10], ignore_index=True)

             icd9_code icd10_code flags
0  0010  A000    00000        NaN   NaN
1  0011  A001    00000        NaN   NaN
2  0019  A009    00000        NaN   NaN
3  0020  A0100   10000        NaN   NaN
4  0021  A011    00000        NaN   NaN


In [66]:
print("Diagnoses Data Shape:", diagnoses.shape)
print(diagnoses.head())


Diagnoses Data Shape: (6364488, 4)
   subject_id   hadm_id icd_code  icd_version
0    10000032  22595853     5723            9
1    10000032  22595853    78959            9
2    10000032  22595853     5715            9
3    10000032  22595853    07070            9
4    10000032  22595853      496            9


In [68]:
print(diagnoses["icd_version"].unique())  # Check unique values


[ 9 10]


In [70]:
print("ICD-9 to ICD-10 GEM Mapping Shape:", icd9_to_icd10_gem.shape)
print(icd9_to_icd10_gem.head())


ICD-9 to ICD-10 GEM Mapping Shape: (23947, 3)
             icd9_code icd10_code flags
0  0010  A000    00000        NaN   NaN
1  0011  A001    00000        NaN   NaN
2  0019  A009    00000        NaN   NaN
3  0020  A0100   10000        NaN   NaN
4  0021  A011    00000        NaN   NaN


In [72]:
print("Mapped ICD-10 Diagnoses Shape:", mapped_icd10.shape)
print(mapped_icd10.head())


Mapped ICD-10 Diagnoses Shape: (0, 3)
Empty DataFrame
Columns: [subject_id, hadm_id, final_icd_code]
Index: []


In [74]:
print("ICD-9 Diagnoses Shape:", icd9_diagnoses.shape)
print(icd9_diagnoses.head())

print("Unique ICD-9 Codes in Diagnoses:", icd9_diagnoses["icd_code"].nunique())


ICD-9 Diagnoses Shape: (0, 4)
Empty DataFrame
Columns: [subject_id, hadm_id, icd_code, icd_version]
Index: []
Unique ICD-9 Codes in Diagnoses: 0


In [76]:
print("GEM Mapping Shape:", icd9_to_icd10_gem.shape)
print(icd9_to_icd10_gem.head())

print("Unique ICD-9 Codes in GEM Mapping:", icd9_to_icd10_gem["icd9_code"].nunique())


GEM Mapping Shape: (23947, 3)
             icd9_code icd10_code flags
0  0010  A000    00000        NaN   NaN
1  0011  A001    00000        NaN   NaN
2  0019  A009    00000        NaN   NaN
3  0020  A0100   10000        NaN   NaN
4  0021  A011    00000        NaN   NaN
Unique ICD-9 Codes in GEM Mapping: 23947


In [78]:
print("First 5 ICD-9 codes in Diagnoses:", icd9_diagnoses["icd_code"].unique()[:5])
print("First 5 ICD-9 codes in GEM Mapping:", icd9_to_icd10_gem["icd9_code"].unique()[:5])


First 5 ICD-9 codes in Diagnoses: []
First 5 ICD-9 codes in GEM Mapping: ['0010  A000    00000' '0011  A001    00000' '0019  A009    00000'
 '0020  A0100   10000' '0021  A011    00000']


In [80]:
print(diagnoses["icd_version"].unique())  # Check unique values


[ 9 10]


In [62]:
print(diagnoses.head())

   subject_id   hadm_id icd_code  icd_version
0    10000032  22595853     5723            9
1    10000032  22595853    78959            9
2    10000032  22595853     5715            9
3    10000032  22595853    07070            9
4    10000032  22595853      496            9


In [64]:
print(final_diagnoses.head())

Empty DataFrame
Columns: [subject_id, hadm_id, final_icd_code]
Index: []


In [58]:


# =====================================
# 2C. Aggregate Diagnoses Per Admission
# ======================================

# Aggregate diagnoses per admission: count unique diagnoses and collect all codes as a list
diagnosis_counts = diagnoses.groupby(["subject_id", "hadm_id"]).agg(
    num_comorbidities=('final_icd_code', 'nunique'),
    diagnosis_list=('final_icd_code', lambda x: list(x))
).reset_index()
# =============================
# 2D. Create Key Clinical Flags
# =============================
# Create key clinical flags based on diagnosis_list
# (These lambda functions operate on the aggregated list for each admission.)

# Palliative Care Flag: 1 if any ICD code in the list matches these codes
palliative_care_codes = ["Z515", "V667"]
diagnosis_counts["palliative_care_flag"] = diagnosis_counts["diagnosis_list"].apply(
    lambda x: 1 if any(code in palliative_care_codes for code in x) else 0
)

# Cognitive Impairment Flag: 1 if any code is in the specified list (ensure codes are strings)
cognitive_impairment_codes = [
    "G30", "G300", "G301", "G308", "G309",   # Alzheimer's
    "F0150", "F0151", "F01.A0", "F01A1", "F01B0", "F01B1", "F01C0", "F01C1",  # Vascular Dementia
    "F0280", "F0281",                         # Dementia in other diseases
    "F03", "F0390", "F0391",                   # Unspecified Dementia
    "G31", "G3184"                            # Mild Cognitive Impairment
]
diagnosis_counts["cognitive_impairment_flag"] = diagnosis_counts["diagnosis_list"].apply(
    lambda x: 1 if any(str(code) in cognitive_impairment_codes for code in x) else 0
)

# Delirium Flag: 1 if any code in the list matches these ICD codes
delirium_icd_codes = ["F05", "2930", "2931", "F051", "F050", "F059"]
diagnosis_counts["delirium"] = diagnosis_counts["diagnosis_list"].apply(
    lambda x: 1 if isinstance(x, list) and any(code in delirium_icd_codes for code in x) else 0
)

             icd9_code icd10_code flags
0  0010  A000    00000        NaN   NaN
1  0011  A001    00000        NaN   NaN
2  0019  A009    00000        NaN   NaN
3  0020  A0100   10000        NaN   NaN
4  0021  A011    00000        NaN   NaN


KeyError: "Column(s) ['final_icd_code'] do not exist"

In [None]:
# =============================
# 6. Save Final Dataset
# =============================
import ace_tools as tools
tools.display_dataframe_to_user(name="Final Diagnosis Data with ICD Mapping", dataframe=diagnosis_counts)

In [9]:
print(diagnosis_counts.head())

   subject_id   hadm_id  num_comorbidities  \
0    10000032  22595853                  8   
1    10000032  22841357                  8   
2    10000032  25742920                 10   
3    10000032  29079034                 13   
4    10000068  25022803                  1   

                                      diagnosis_list  palliative_care_flag  \
0  [5723, 78959, 5715, 07070, 496, 29680, 30981, ...                     0   
1   [07071, 78959, 2875, 2761, 496, 5715, V08, 3051]                     0   
2  [07054, 78959, V462, 5715, 2767, 2761, 496, V0...                     0   
3  [45829, 07044, 7994, 2761, 78959, 2767, 3051, ...                     0   
4                                            [30500]                     0   

   cognitive_impairment_flag  delirium  
0                          0         0  
1                          0         0  
2                          0         0  
3                          0         0  
4                          0         0  


In [21]:
# # =============================
# 3. Load and Aggregate ICU Data
# =============================
# Load ICU stays and aggregate: keep the first ICU stay per admission.
icustays = pd.read_csv(
    os.path.join(data_path, "icu/icustays.csv.gz"),
    usecols=["subject_id", "hadm_id", "stay_id", "last_careunit", "los"],
    low_memory=False
)
icustays_agg = icustays.sort_values(["subject_id", "hadm_id", "stay_id"]).groupby(
    ["subject_id", "hadm_id"]
).first().reset_index()

# Count total ICU stays per patient (for history)
icu_history = icustays.groupby("subject_id")["stay_id"].count().reset_index()
icu_history.rename(columns={"stay_id": "prior_icu_admissions"}, inplace=True)

# ---- ICU Chartevents: Process only a subset of items ----
chunksize = 200000
icu_item_ids = [
    # 1. Physiological Parameters
    "220045",  # Heart Rate
    "220052",  # Mean Arterial Blood Pressure
    "220210",  # Respiratory Rate
    "220277",  # Oxygen Saturation (pulseoxymetry)
    "223761",  # Temperature Fahrenheit
    "220739",  # GCS – Eye Opening
    "223900",  # GCS – Verbal Response
    "223901",  # GCS – Motor Response
    "228096",  # Richmond-RAS Scale

    # 2. Laboratory Results
    "220546",  # WBC
    "220228",  # Hemoglobin
    "227457",  # Platelet Count
    "220645",  # Sodium (serum)
    "227442",  # Potassium (serum)
    "220602",  # Chloride (serum)
    "227443",  # HCO3 (serum)
    "225624",  # BUN
    "220615",  # Creatinine (serum)
    "220621",  # Glucose (serum)
    "225668",  # Lactic Acid
    "223830",  # pH (Arterial)
    "220224",  # Arterial O2 pressure (PaO2)
    "220235",  # Arterial CO2 pressure (PaCO2)
    "224828",  # Arterial Base Excess # 3. Intervention-Based Variables
    # 3. Intervention-Based Variables
    "223849",  # Ventilator Mode
    "223836",  # Airway Type
    "223835",  # Inspired O2 Fraction (FiO2)
    "220339",  # PEEP set
    "227210",  # Propofol (Intubation)
    "227211",  # Ketamine (Intubation)
    "227213",  # Vecuronium (Intubation)
    "220120"   # Intra-Aortic Balloon Pump (IABP)
]

chart_file = os.path.join(data_path, "icu/chartevents.csv.gz")
chart_chunks = []
for chunk in pd.read_csv(
    chart_file,
    usecols=["subject_id", "hadm_id", "stay_id", "itemid", "charttime", "valuenum"],
    chunksize=chunksize,
    low_memory=True
):
    # Filter: convert itemid to string and keep only our subset
    chunk = chunk[chunk["itemid"].astype(str).isin(icu_item_ids)]
    chart_chunks.append(chunk)
icu_chartevents = pd.concat(chart_chunks, ignore_index=True)
icu_chartevents.sort_values(["subject_id", "hadm_id", "charttime"], inplace=True)
icu_first = icu_chartevents.groupby(["subject_id", "hadm_id", "itemid"]).first().reset_index()

# Pivot so that each admission is one row with a column per itemid
icu_pivot = icu_first.pivot(
    index=["subject_id", "hadm_id"],
    columns="itemid",
    values="valuenum"
).reset_index()

# (Optional) You may later rename these columns using a mapping if desired.
print("ICU pivot shape:", icu_pivot.shape)
print(icu_pivot.head())

ICU pivot shape: (85242, 31)
itemid  subject_id   hadm_id  220045  220052  220120  220210  220224  220228  \
0         10000032  29079034    91.0     NaN     NaN    24.0     NaN     NaN   
1         10000690  25860671    79.0     NaN     NaN    23.0   123.0     9.5   
2         10000980  26913865    77.0     NaN     NaN    23.0     NaN     NaN   
3         10001217  24597018    86.0     NaN     NaN    18.0     NaN    11.2   
4         10001217  27703517    96.0     NaN     NaN    11.0     NaN    12.3   

itemid  220235  220277  ...  223849  223900  223901  224828  225624  225668  \
0          NaN    98.0  ...     NaN     4.0     6.0     NaN    33.0     NaN   
1         54.0   100.0  ...     NaN     5.0     6.0     0.0    21.0     NaN   
2          NaN   100.0  ...    11.0     5.0     6.0     NaN     NaN     1.7   
3          NaN    99.0  ...     NaN     5.0     6.0     NaN     9.0     NaN   
4          NaN   100.0  ...     NaN     5.0     6.0     NaN    10.0     NaN   

itemid  227442 

In [34]:
id_to_label = {
    220045: "heart_rate",      # Heart Rate
    220052: "map",             # Mean Arterial Blood Pressure
    220210: "resp_rate",       # Respiratory Rate
    220277: "spo2",            # Oxygen Saturation
    223761: "temp",            # Temperature Fahrenheit
    220739: "gcs_eye",         # GCS – Eye Opening
    223900: "gcs_verbal",      # GCS – Verbal Response
    223901: "gcs_motor",       # GCS – Motor Response
    228096: "ras_scale",       # Richmond-RAS Scale
    220546: "wbc",             # White Blood Cell Count
    220228: "hb",              # Hemoglobin
    227457: "plt",             # Platelet Count
    220645: "sodium",          # Sodium (serum)
    227442: "potassium",       # Potassium (serum)
    220602: "chloride",        # Chloride (serum)
    227443: "hco3",            # HCO3 (serum)
    225624: "bun",             # BUN
    220615: "creatinine",      # Creatinine (serum)
    220621: "glucose",         # Glucose (serum)
    225668: "lactate",         # Lactic Acid
    223830: "ph",              # pH (Arterial)
    220224: "pao2",            # Arterial O2 pressure (PaO₂)
    220235: "paco2",           # Arterial CO₂ pressure (PaCO₂)
    224828: "base_excess",     # Arterial Base Excess
    223849: "vent_mode",       # Ventilator Mode
    223836: "airway",          # Airway Type
    223835: "fio2",            # Inspired O₂ Fraction (FiO₂)
    220339: "peep",            # PEEP set
    227210: "propofol",        # Propofol (Intubation)
    227211: "ketamine",        # Ketamine (Intubation)
    227213: "vecuronium",      # Vecuronium (Intubation)
    220120: "iabp"             # Intra-Aortic Balloon Pump (IABP)
}

icu_pivot.rename(columns=lambda col: "icu_" + id_to_label[col]
                                  if col not in ["subject_id", "hadm_id"] and col in id_to_label
                                  else col,
                 inplace=True)



In [36]:
print("ICU pivot shape:", icu_pivot.shape)
print(icu_pivot.head())

ICU pivot shape: (85242, 31)
itemid  subject_id   hadm_id  icu_heart_rate  icu_map  icu_iabp  \
0         10000032  29079034            91.0      NaN       NaN   
1         10000690  25860671            79.0      NaN       NaN   
2         10000980  26913865            77.0      NaN       NaN   
3         10001217  24597018            86.0      NaN       NaN   
4         10001217  27703517            96.0      NaN       NaN   

itemid  icu_resp_rate  icu_pao2  icu_hb  icu_paco2  icu_spo2  ...  \
0                24.0       NaN     NaN        NaN      98.0  ...   
1                23.0     123.0     9.5       54.0     100.0  ...   
2                23.0       NaN     NaN        NaN     100.0  ...   
3                18.0       NaN    11.2        NaN      99.0  ...   
4                11.0       NaN    12.3        NaN     100.0  ...   

itemid  icu_vent_mode  icu_gcs_verbal  icu_gcs_motor  icu_base_excess  \
0                 NaN             4.0            6.0              NaN   
1      

In [22]:
# =============================
# 4. Load and Process Lab Data
# =============================
lab_ids = [
    "51300",  # WBC Count
    "51111",  # Hemoglobin
    "51265",  # Platelet Count
    "52623",  # Sodium
    "52610",  # Potassium
    "52535",  # Chloride
    "51739",  # Total CO2 (as surrogate for bicarbonate)
    "52569",  # Glucose
    "52647",  # BUN
    "52546",  # Creatinine
    "53084",  # ALT
    "53088",  # AST
    "53086",  # Alkaline Phosphatase
    "53089",  # Total Bilirubin
    "50889",  # CRP
    "50862"   # Albumin
]
lab_chunks = []
lab_file = os.path.join(data_path, "hosp/labevents.csv.gz")
for chunk in pd.read_csv(
    lab_file,
    usecols=["subject_id", "hadm_id", "itemid", "charttime", "valuenum"],
    chunksize=200000,
    low_memory=False
):
    # Filter for desired lab tests (cast itemid to string for safe comparison)
    chunk = chunk[chunk["itemid"].astype(str).isin(lab_ids)]
    lab_chunks.append(chunk)
lab_events = pd.concat(lab_chunks, ignore_index=True)
lab_events.sort_values(["subject_id", "hadm_id", "charttime"], inplace=True)
# Keep only the first recorded value for each lab test per admission
lab_first = lab_events.groupby(["subject_id", "hadm_id", "itemid"]).first().reset_index()
lab_pivot = lab_first.pivot(
    index=["subject_id", "hadm_id"],
    columns="itemid",
    values="valuenum"
).reset_index()
# Rename lab columns using a mapping for clarity
lab_pivot.rename(columns={
    50862: "LabH_Alb",
    50889: "LabH_CRP",
    51111: "LabH_Hb",
    51265: "LabH_Plt",
    51300: "LabH_WBC"
}, inplace=True)


In [None]:
print(lab_pivot.head())

In [23]:
# =============================
# 5. Load and Aggregate Prescriptions (High-Risk Medications & Drug List)
# =============================
prescriptions = pd.read_csv(
    os.path.join(data_path, "hosp/prescriptions.csv.gz"),
    usecols=["subject_id", "hadm_id", "drug"],
    low_memory=False
)
# Standardize drug names
prescriptions["drug"] = prescriptions["drug"].astype(str).str.lower().str.strip()

# Aggregate all prescribed drugs per admission
all_drugs = prescriptions.groupby(["subject_id", "hadm_id"])["drug"].agg(list).reset_index()

# Define expanded high-risk medications list (based on literature)
high_risk_meds = [
    "lorazepam", "midazolam", "diazepam", "clonazepam", "alprazolam", "temazepam", "chlordiazepoxide",
    "morphine", "fentanyl", "hydromorphone", "oxycodone", "meperidine", "codeine", "tramadol", "buprenorphine",
    "propofol", "dexmedetomidine",
    "haloperidol", "olanzapine", "quetiapine", "risperidone", "ziprasidone", "chlorpromazine", "aripiprazole",
    "diphenhydramine", "hydroxyzine", "promethazine", "oxybutynin", "scopolamine", "benztropine", "atropine",
    "dexamethasone", "prednisone", "methylprednisolone", "hydrocortisone",
    "metoclopramide", "cyclobenzaprine", "baclofen", "amantadine"
]
prescriptions["high_risk_med"] = prescriptions["drug"].apply(
    lambda x: 1 if any(med in x for med in high_risk_meds) else 0
)
# Compute per-admission metrics:
unique_high_risk_med = prescriptions[prescriptions["high_risk_med"] == 1].groupby(
    ["subject_id", "hadm_id"]
)["drug"].nunique().reset_index(name="unique_high_risk_med")
high_risk_med_count = prescriptions.groupby(
    ["subject_id", "hadm_id"]
)["high_risk_med"].sum().reset_index(name="high_risk_med_count")
high_risk_med_flag = prescriptions.groupby(
    ["subject_id", "hadm_id"]
)["high_risk_med"].max().reset_index(name="high_risk_med_flag")
high_risk_med_summary = high_risk_med_flag.merge(
    unique_high_risk_med, on=["subject_id", "hadm_id"], how="left"
).merge(high_risk_med_count, on=["subject_id", "hadm_id"], how="left")
high_risk_med_summary["unique_high_risk_med"] = high_risk_med_summary["unique_high_risk_med"].fillna(0).astype(int)


In [38]:
# Create an empty DataFrame with the key columns
core_data = pd.DataFrame(columns=["subject_id", "hadm_id"])

In [40]:
# =============================
# 6. Merge All Components into a Final Core Dataset
# =============================

# Start with admissions merged with patients (ensuring one row per admission)
core_data = admissions.merge(patients, on="subject_id", how="inner")

# Merge aggregated diagnosis_counts (one row per admission)
core_data = core_data.merge(
    diagnosis_counts[["subject_id", "hadm_id", "palliative_care_flag", "delirium", "cognitive_impairment_flag", "num_comorbidities"]],
    on=["subject_id", "hadm_id"],
    how="left"
)
# Fill missing flag values with 0 and convert to integer
core_data["palliative_care_flag"] = core_data["palliative_care_flag"].fillna(0).astype(int)
core_data["delirium"] = core_data["delirium"].fillna(0).astype(int)
core_data["cognitive_impairment_flag"] = core_data["cognitive_impairment_flag"].fillna(0).astype(int)

# (Optional) Avoid merging raw diagnoses to prevent duplication
# core_data = core_data.merge(diagnoses, on=["subject_id", "hadm_id"], how="left")

# Merge aggregated ICU stays (first ICU stay per admission)
core_data = core_data.merge(icustays_agg, on=["subject_id", "hadm_id"], how="left")

# Merge ICU history (number of ICU stays per patient)
core_data = core_data.merge(icu_history, on="subject_id", how="left")

# Merge aggregated ICU vitals (icu_pivot)
core_data = core_data.merge(icu_pivot, on=["subject_id", "hadm_id"], how="left")

# Merge high-risk medication summary and the full drug list (all_drugs)
core_data = core_data.merge(high_risk_med_summary, on=["subject_id", "hadm_id"], how="left")
core_data = core_data.merge(all_drugs, on=["subject_id", "hadm_id"], how="left")

# Merge aggregated lab results (lab_pivot)
core_data = core_data.merge(lab_pivot, on=["subject_id", "hadm_id"], how="left")

# Fill any remaining missing numeric fields
core_data["num_comorbidities"] = core_data["num_comorbidities"].fillna(0).astype(int)
core_data["prior_icu_admissions"] = core_data["prior_icu_admissions"].fillna(0).astype(int)


In [42]:
#Linkage Data Quality 
df=core_data
#Count occurrences of each patient (subject_id)
patient_counts = df["subject_id"].value_counts()

# Display summary
print("🔍 Number of Unique Patients:", df["subject_id"].nunique())
print("🔍 Total Rows in Dataset:", df.shape[0])
print("🔍 Distribution of Patient Admissions:")
print(patient_counts.describe())  # Summary statistics for occurrences

🔍 Number of Unique Patients: 223452
🔍 Total Rows in Dataset: 546028
🔍 Distribution of Patient Admissions:
count    223452.000000
mean          2.443603
std           3.574410
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max         238.000000
Name: count, dtype: float64


In [48]:
#test any merge issues
print(diagnosis_counts.duplicated(subset=["subject_id", "hadm_id"]).sum())


0


In [50]:
# Print the column names
print(core_data.columns)

# Display the first 5 rows
print(core_data.head())

# Alternatively, get a concise summary including non-null counts and dtypes
print(core_data.info())


Index(['subject_id', 'hadm_id', 'admission_type', 'admission_location',
       'discharge_location', 'insurance', 'marital_status', 'race',
       'ed_time_spent', 'los_hosp', 'gender', 'anchor_age', 'anchor_year',
       'palliative_care_flag', 'delirium', 'cognitive_impairment_flag',
       'num_comorbidities', 'stay_id', 'last_careunit', 'los',
       'prior_icu_admissions', 'icu_heart_rate', 'icu_map', 'icu_iabp',
       'icu_resp_rate', 'icu_pao2', 'icu_hb', 'icu_paco2', 'icu_spo2',
       'icu_peep', 'icu_wbc', 'icu_chloride', 'icu_creatinine', 'icu_glucose',
       'icu_sodium', 'icu_gcs_eye', 'icu_temp', 'icu_ph', 'icu_fio2',
       'icu_airway', 'icu_vent_mode', 'icu_gcs_verbal', 'icu_gcs_motor',
       'icu_base_excess', 'icu_bun', 'icu_lactate', 'icu_potassium',
       'icu_hco3', 'icu_plt', 'icu_ras_scale', 'high_risk_med_flag',
       'unique_high_risk_med', 'high_risk_med_count', 'drug', 'LabH_Alb',
       'LabH_CRP', 'LabH_Hb', 'LabH_Plt', 'LabH_WBC'],
      dtype='objec

In [52]:
# =============================
# 7. Final Summary and Save Output
# =============================
print("✅ Cleaned dataset prepared. One row per admission.")
print("Unique Patients:", core_data["subject_id"].nunique())
print("Total Rows:", core_data.shape[0])

# Optionally, save the final dataset (compressed CSV)
output_path = "D:/MIMIC-IV-Data-Pipeline/processed_data"
os.makedirs(output_path, exist_ok=True)
core_data.to_csv(os.path.join(output_path, "delirium_prediction_data_v7.csv.gz"), index=False, compression="gzip")
print("✅ Cleaned dataset saved successfully!")

✅ Cleaned dataset prepared. One row per admission.
Unique Patients: 223452
Total Rows: 546028
✅ Cleaned dataset saved successfully!
