In [2]:
# autoreload
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import numpy as np
import pandas as pd

In [4]:
mimic_iv_path = "/data/wang/junh/datasets/physionet.org/files/mimiciv/2.2"
mm_dir = "/data/wang/junh/datasets/multimodal"

output_dir = os.path.join(mm_dir, "preprocessing")
os.makedirs(output_dir, exist_ok=True)

In [5]:
f_path = os.path.join(mimic_iv_path, "hosp", "admissions.csv.gz")
admissions_df = pd.read_csv(f_path, low_memory=False)
admissions_df['admittime'] = pd.to_datetime(admissions_df['admittime'])
admissions_df['dischtime'] = pd.to_datetime(admissions_df['dischtime'])

icustays_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "icustays.csv.gz"), low_memory=False)
icustays_df['intime'] = pd.to_datetime(icustays_df['intime'])
icustays_df['outtime'] = pd.to_datetime(icustays_df['outtime'])

procedureevents_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "procedureevents.csv.gz"), low_memory=False)
procedureevents_df['starttime'] = pd.to_datetime(procedureevents_df['starttime'])
procedureevents_df['endtime'] = pd.to_datetime(procedureevents_df['endtime'])
# format='mixed' is not valid, use errors='coerce' instead
procedureevents_df['storetime'] = pd.to_datetime(procedureevents_df['storetime'], errors='coerce')

In [6]:
chartevents_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "chartevents.csv.gz"), low_memory=False)
chartevents_df['charttime'] = pd.to_datetime(chartevents_df['charttime'])
chartevents_df['storetime'] = pd.to_datetime(chartevents_df['storetime'])
print(chartevents_df.columns)

Index(['subject_id', 'hadm_id', 'stay_id', 'caregiver_id', 'charttime',
      dtype='object')


In [7]:
hosp_lab_events = pd.read_csv(os.path.join(mimic_iv_path, "hosp", "labevents.csv.gz"), low_memory=False)
hosp_lab_events['charttime'] = pd.to_datetime(hosp_lab_events['charttime'])
hosp_lab_events['storetime'] = pd.to_datetime(hosp_lab_events['storetime'])

# Drop hosp_lab_events where hadm_id is nan
hosp_lab_events = hosp_lab_events.dropna(subset=['hadm_id'])

In [8]:
d_lab_items_df = pd.read_csv(os.path.join(mimic_iv_path, "hosp", "d_labitems.csv.gz"), low_memory=False)

# Drop rows with missing values
d_lab_items_df = d_lab_items_df.dropna()

# Search labels for something that looks like ph
ph_labels = d_lab_items_df[d_lab_items_df['label'].str.contains('Glucose', case=False)]
print(ph_labels)

      itemid                 label                fluid    category
7      50809               Glucose                Blood   Blood Gas
40     50842      Glucose, Ascites              Ascites   Chemistry
129    50931               Glucose                Blood   Chemistry
210    51022  Glucose, Joint Fluid          Joint Fluid   Chemistry
222    51034   Glucose, Body Fluid     Other Body Fluid   Chemistry
241    51053      Glucose, Pleural              Pleural   Chemistry
272    51084        Glucose, Urine                Urine   Chemistry
638    51478               Glucose                Urine  Hematology
908    51790          Glucose, CSF  Cerebrospinal Fluid   Chemistry
1034   51941        Glucose, Stool                Stool   Chemistry
1074   51981               Glucose                Urine   Chemistry
1120   52027  Glucose, Whole Blood                Blood   Blood Gas
1528   52569               Glucose                Blood   Chemistry


In [9]:
print(hosp_lab_events.head)

<bound method NDFrame.head of            labevent_id  subject_id     hadm_id  specimen_id  itemid  \
110                111    10000032  22595853.0     39504011   51464   
111                112    10000032  22595853.0     39504011   51466   
112                113    10000032  22595853.0     39504011   51478   
113                114    10000032  22595853.0     39504011   51484   
114                115    10000032  22595853.0     39504011   51486   
...                ...         ...         ...          ...     ...   
118171359    118352498    19999987  23865745.0     85842100   51250   
118171360    118352499    19999987  23865745.0     85842100   51265   
118171361    118352500    19999987  23865745.0     85842100   51277   
118171362    118352501    19999987  23865745.0     85842100   51279   
118171363    118352502    19999987  23865745.0     85842100   51301   

          order_provider_id           charttime           storetime value  \
110                     NaN 2180-05-07 0

In [10]:
d_items_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "d_items.csv.gz"), low_memory=False)
# d_items_df = d_items_df[d_items_df['category'] == "Labs"]
# ph_labels = d_items_df[d_items_df['label'].str.contains('pressure', case=False)]
# print(ph_labels)

def get_procedures_of_interest(df):
    df = df.copy()

    event_list = ['Foley Catheter', 'PICC Line', 'Intubation', 'Peritoneal Dialysis', 
                            'Bronchoscopy', 'EEG', 'Dialysis - CRRT', 'Dialysis Catheter', 
                            'Chest Tube Removed', 'Hemodialysis']
    event_links_df = pd.DataFrame()
    for event in event_list:
        curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

        tmp_dict = {"event": event, "itemid": curr_event_item_id}
        event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

    df = df[df["itemid"].isin(event_links_df['itemid'])]
    df = df.merge(event_links_df, on="itemid", how="left")
    df.drop(columns=["itemid"], inplace=True)
    return df

def get_labs_of_interest(df):
    df = df.copy()

    event_list = ['Glucose', 'Potassium', 'Sodium', 'Chloride', 'Creatinine',
           'Urea Nitrogen', 'Bicarbonate', 'Anion Gap', 'Hemoglobin', 'Hematocrit',
           'Magnesium', 'Platelet Count', 'Phosphate', 'White Blood Cells',
           'Calcium, Total', 'MCH', 'Red Blood Cells', 'MCHC', 'MCV', 'RDW', 
                      'Platelet Count', 'Neutrophils', 'Vancomycin'
                  ]

    event_links_df = pd.DataFrame()
    for event in event_list:
        # print(event)
        curr_event_item_id = d_lab_items_df[d_lab_items_df["label"] == event]["itemid"].values[0]

        tmp_dict = {"event": event, "itemid": curr_event_item_id}
        event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

    df = df[df["itemid"].isin(event_links_df['itemid'])]
    df = df.merge(event_links_df, on="itemid", how="left")
    df.drop(columns=["itemid"], inplace=True)

    print("Labs of interest: ", df['event'].unique())
    print(event_links_df)
    print("Number of labs of interest: ", df.columns, "\n", df.shape)

    return df

def get_vitals_of_interest(df):
    df = df.copy()

    event_list = [ #CHART EVENTS
                  'Heart Rate','Non Invasive Blood Pressure systolic',
                    'Non Invasive Blood Pressure diastolic', 'Non Invasive Blood Pressure mean', 
                    'Respiratory Rate','O2 saturation pulseoxymetry', 
                    'GCS - Verbal Response', 'GCS - Eye Opening', 'GCS - Motor Response']

    event_links_df = pd.DataFrame()
    for event in event_list:
        # print(event)
        curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

        tmp_dict = {"event": event, "itemid": curr_event_item_id}
        event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

    df = df[df["itemid"].isin(event_links_df['itemid'])]
    df = df.merge(event_links_df, on="itemid", how="left")
    df.drop(columns=["itemid"], inplace=True)

    rename_dict = {
        'Non Invasive Blood Pressure systolic': 'Systolic BP',
        'Non Invasive Blood Pressure diastolic': 'Diastolic BP',
        'Non Invasive Blood Pressure mean': 'Mean BP',
        'O2 saturation pulseoxymetry': 'O2 Saturation'
    }

    df['event'] = df['event'].replace(rename_dict)
    print("Vitals of interest: ", df['event'].unique())
    print(event_links_df)
    print("Number of vitals of interest: ", df.columns, "\n", df.shape)
    
    return df


# procedureevents_df = get_procedures_of_interest(procedureevents_df)
print("labevents_df")
labevents_df = get_labs_of_interest(hosp_lab_events)
print("vitals_df")
vitals_df = get_vitals_of_interest(chartevents_df)


labevents_df
Labs of interest:  ['Hematocrit' 'MCH' 'MCHC' 'MCV' 'Platelet Count' 'RDW' 'Red Blood Cells'
 'White Blood Cells' 'Anion Gap' 'Bicarbonate' 'Calcium, Total' 'Chloride'
 'Creatinine' 'Magnesium' 'Phosphate' 'Sodium' 'Urea Nitrogen'
 'Neutrophils' 'Vancomycin' 'Hemoglobin' 'Glucose']
                event  itemid
0             Glucose   50809
1           Potassium   50833
2              Sodium   50983
3            Chloride   50902
4          Creatinine   50912
5       Urea Nitrogen   51006
6         Bicarbonate   50882
7           Anion Gap   50868
8          Hemoglobin   50811
9          Hematocrit   51221
10          Magnesium   50960
11     Platelet Count   51265
12          Phosphate   50970
13  White Blood Cells   51301
14     Calcium, Total   50893
15                MCH   51248
16    Red Blood Cells   51279
17               MCHC   51249
18                MCV   51250
19                RDW   51277
20     Platelet Count   51265
21        Neutrophils   51256
22         Van

In [11]:
print(labevents_df['event'].unique())
print(vitals_df['event'].unique())

['Hematocrit' 'MCH' 'MCHC' 'MCV' 'Platelet Count' 'RDW' 'Red Blood Cells'
 'White Blood Cells' 'Anion Gap' 'Bicarbonate' 'Calcium, Total' 'Chloride'
 'Creatinine' 'Magnesium' 'Phosphate' 'Sodium' 'Urea Nitrogen'
 'Neutrophils' 'Vancomycin' 'Hemoglobin' 'Glucose']
['Systolic BP' 'Diastolic BP' 'Mean BP' 'Heart Rate' 'Respiratory Rate'
 'O2 Saturation' 'GCS - Eye Opening' 'GCS - Verbal Response'
 'GCS - Motor Response']


In [12]:
labevents_df = labevents_df[['subject_id', 'hadm_id', 'charttime', 'event', 'valuenum']]
vitals_df = vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
# procedureevents_df = procedureevents_df[['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'storetime', 'value', 'event']]

# labs_df = get_labs_of_interest(hosp_lab_events)
# vitals_df = get_vitals_of_interest(chartevents_df)
# labs_vitals_df = labs_vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]

In [13]:
labevents_df.columns

Index(['subject_id', 'hadm_id', 'charttime', 'event', 'valuenum'], dtype='object')

In [14]:
# d_items_df = pd.read_csv(os.path.join(mimic_iv_path, "icu", "d_items.csv"), low_memory=False)
# # d_items_df = d_items_df[d_items_df['category'] == "Labs"]

# def get_procedures_of_interest(df):
#     df = df.copy()

#     event_list = ['Foley Catheter', 'PICC Line', 'Intubation', 'Peritoneal Dialysis', 
#                             'Bronchoscopy', 'EEG', 'Dialysis - CRRT', 'Dialysis Catheter', 
#                             'Chest Tube Removed', 'Hemodialysis']
#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)
#     return df

# def get_labs_of_interest(df):
#     df = df.copy()

#     event_list = [  #LAB EVENTS
#                   'Glucose (serum)', 'Glucose (whole blood)',
#                   'Potassium (serum)', 'Potassium (whole blood)', 
#                   'Sodium (serum)', 'Sodium (whole blood)',
#                   'Chloride (serum)', 'Chloride (whole blood)',
#                   'Creatinine (serum)', 'Creatinine (whole blood)',
#                   'BUN', #   'Urea Nitrogen', 
#                   'HCO3 (serum)', #   'Bicarbonate', 
#                   'Anion gap', 
#                   'Hemoglobin', 
#                   'Hematocrit (serum)', 'Hematocrit (whole blood - calc)',
#                   'Magnesium', 
#                   'Platelet Count', 
#                   'Alkaline Phosphate', 
#                   'WBC', #'White Blood Cells',
#                   'Calcium non-ionized', 'Ionized Calcium', #'Calcium, Total', 
#                 #   'MCH', 
#                 #   'Red Blood Cells', 
#                 #   'MCHC', 
#                 #   'MCV', 
#                 #   'RDW', 
#                   'Absolute Neutrophil Count', #  'Neutrophils', 
#                   'Vancomycin (Peak)', 'Vancomycin (Random)', 'Vancomycin (Trough)',
#                   # NEW
#                   'PH (Arterial)', 'PH (dipstick)', 'PH (SOFT)', 'PH (Venous)',
#                   'Capillary Refill R', 'Capillary Refill L',
#                   'Temperature Celsius',
#                   'Daily Weight', 'Admission Weight (Kg)',
#                   'Inspired O2 Fraction'
#                   ]

#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         # print(event)
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)

#     rename_dict = {
#         'Glucose (serum)': 'Glucose',
#         'Glucose (whole blood)': 'Glucose',
#         'Potassium (serum)': 'Potassium',
#         'Potassium (whole blood)': 'Potassium',
#         'Sodium (serum)': 'Sodium',
#         'Sodium (whole blood)': 'Sodium',
#         'Chloride (serum)': 'Chloride',
#         'Chloride (whole blood)': 'Chloride',
#         'Creatinine (serum)': 'Creatinine',
#         'Creatinine (whole blood)': 'Creatinine',
#         'BUN': 'Urea Nitrogen',
#         'HCO3 (serum)': 'Bicarbonate',
#         'Hematocrit (serum)': 'Hematocrit',
#         'Hematocrit (whole blood - calc)': 'Hematocrit',
#         'Calcium non-ionized': 'Calcium',
#         'Ionized Calcium': 'Calcium',
#         'Vancomycin (Peak)': 'Vancomycin',
#         'Vancomycin (Random)': 'Vancomycin',
#         'Vancomycin (Trough)': 'Vancomycin',
#         'PH (Arterial)': 'PH',
#         'PH (dipstick)': 'PH',
#         'PH (SOFT)': 'PH',
#         'PH (Venous)': 'PH',
#         'Capillary Refill R': 'Capillary Refill',
#         'Capillary Refill L': 'Capillary Refill',
#         'Temperature Celsius': 'Temperature',
#         'Daily Weight': 'Weight',
#         'Admission Weight (Kg)': 'Weight',
#         'Inspired O2 Fraction': 'Inspired O2 Fraction'
#     }

#     df['event'] = df['event'].replace(rename_dict)

#     return df

# def get_vitals_of_interest(df):
#     df = df.copy()

#     event_list = [ #CHART EVENTS
#                   'Heart Rate','Non Invasive Blood Pressure systolic',
#                     'Non Invasive Blood Pressure diastolic', 'Non Invasive Blood Pressure mean', 
#                     'Respiratory Rate','O2 saturation pulseoxymetry', 
#                     'GCS - Verbal Response', 'GCS - Eye Opening', 'GCS - Motor Response']

#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         # print(event)
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)

#     rename_dict = {
#         'Non Invasive Blood Pressure systolic': 'Systolic BP',
#         'Non Invasive Blood Pressure diastolic': 'Diastolic BP',
#         'Non Invasive Blood Pressure mean': 'Mean BP',
#         'O2 saturation pulseoxymetry': 'O2 Saturation'
#     }

#     df['event'] = df['event'].replace(rename_dict)
    
#     return df

# def get_labs_vitals(df):
#     df = df.copy()

#     event_list = [ #LAB EVENTS
#                   'Glucose (serum)', 'Glucose (whole blood)',
#                   'Potassium (serum)', 'Potassium (whole blood)', 
#                   'Sodium (serum)', 'Sodium (whole blood)',
#                   'Chloride (serum)', 'Chloride (whole blood)',
#                   'Creatinine (serum)', 'Creatinine (whole blood)',
#                   'BUN', #   'Urea Nitrogen', 
#                   'HCO3 (serum)', #   'Bicarbonate', 
#                   'Anion gap', 
#                   'Hemoglobin', 
#                   'Hematocrit (serum)', 'Hematocrit (whole blood - calc)',
#                   'Magnesium', 
#                   'Platelet Count', 
#                   'Alkaline Phosphate', 
#                   'WBC', #'White Blood Cells',
#                   'Calcium non-ionized', 'Ionized Calcium', #'Calcium, Total', 
#                 #   'MCH', 
#                 #   'Red Blood Cells', 
#                 #   'MCHC', 
#                 #   'MCV', 
#                 #   'RDW', 
#                   'Absolute Neutrophil Count', #  'Neutrophils', 
#                   'Vancomycin (Peak)', 'Vancomycin (Random)', 'Vancomycin (Trough)',
                  
#                   # NEW
#                   'PH (Arterial)', 'PH (dipstick)', 'PH (SOFT)', 'PH (Venous)',
#                   'Capillary Refill R', 'Capillary Refill L',
#                   'Temperature Celsius',
#                   'Daily Weight', 'Admission Weight (Kg)',
#                   'Inspired O2 Fraction',

#                   #CHART EVENTS
#                   'Heart Rate','Non Invasive Blood Pressure systolic',
#                     'Non Invasive Blood Pressure diastolic', 'Non Invasive Blood Pressure mean', 
#                     'Respiratory Rate','O2 saturation pulseoxymetry', 
#                     'GCS - Verbal Response', 'GCS - Eye Opening', 'GCS - Motor Response'
#                     ]

#     event_links_df = pd.DataFrame()
#     for event in event_list:
#         curr_event_item_id = d_items_df[d_items_df["label"] == event]["itemid"].values[0]

#         tmp_dict = {"event": event, "itemid": curr_event_item_id}
#         event_links_df = pd.concat([event_links_df, pd.DataFrame(tmp_dict, index=[0])], axis=0, ignore_index=True)

#     df = df[df["itemid"].isin(event_links_df['itemid'])]
#     df = df.merge(event_links_df, on="itemid", how="left")
#     df.drop(columns=["itemid"], inplace=True)

#     rename_dict = {
#         'Glucose (serum)': 'Glucose',
#         'Glucose (whole blood)': 'Glucose',
#         'Potassium (serum)': 'Potassium',
#         'Potassium (whole blood)': 'Potassium',
#         'Sodium (serum)': 'Sodium',
#         'Sodium (whole blood)': 'Sodium',
#         'Chloride (serum)': 'Chloride',
#         'Chloride (whole blood)': 'Chloride',
#         'Creatinine (serum)': 'Creatinine',
#         'Creatinine (whole blood)': 'Creatinine',
#         'BUN': 'Urea Nitrogen',
#         'HCO3 (serum)': 'Bicarbonate',
#         'Hematocrit (serum)': 'Hematocrit',
#         'Hematocrit (whole blood - calc)': 'Hematocrit',
#         'Calcium non-ionized': 'Calcium',
#         'Ionized Calcium': 'Calcium',
#         'Vancomycin (Peak)': 'Vancomycin',
#         'Vancomycin (Random)': 'Vancomycin',
#         'Vancomycin (Trough)': 'Vancomycin',
#         'PH (Arterial)': 'PH',
#         'PH (dipstick)': 'PH',
#         'PH (SOFT)': 'PH',
#         'PH (Venous)': 'PH',
#         'Capillary Refill R': 'Capillary Refill',
#         'Capillary Refill L': 'Capillary Refill',
#         'Temperature Celsius': 'Temperature',
#         'Daily Weight': 'Weight',
#         'Admission Weight (Kg)': 'Weight',
#         'Inspired O2 Fraction': 'Inspired O2 Fraction',

#         'Non Invasive Blood Pressure systolic': 'Systolic BP',
#         'Non Invasive Blood Pressure diastolic': 'Diastolic BP',
#         'Non Invasive Blood Pressure mean': 'Mean BP',
#         'O2 saturation pulseoxymetry': 'O2 Saturation'
#     }

#     df['event'] = df['event'].replace(rename_dict)

#     return df



# # procedureevents_df = get_procedures_of_interest(procedureevents_df)
# labevents_df = get_labs_of_interest(chartevents_df)
# vitals_df = get_vitals_of_interest(chartevents_df)
# # labevents_df = labevents_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
# # vitals_df = vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]
# # procedureevents_df = procedureevents_df[['subject_id', 'hadm_id', 'stay_id', 'starttime', 'endtime', 'storetime', 'value', 'event']]



# # labs_vitals_df = get_labs_vitals(chartevents_df)
# # labs_vitals_df = labs_vitals_df[['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum']]

In [15]:
from tqdm import tqdm

def calc_time_delta_hrs(icu_intime, charttime):
    return (charttime - icu_intime).total_seconds() / 3600



def add_time_delta(df):
    df = df.copy()
    print(df.columns)
    if 'stay_id' in df.columns:
        stay_id_in_cols = True
    else:
        stay_id_in_cols = False
        df['stay_id'] = None
        
    df['icu_time_delta'] = None
    df['hosp_time_delta'] = None

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        if 'charttime' in row:
            ref_time = row['charttime']
        elif 'storetime' in row:
            ref_time = row['storetime']
        else:
            print(f"No reference time found for row index {index}")
            continue  # Skip this iteration if no time reference is found

        curr_admission = admissions_df[(admissions_df['subject_id'] == row['subject_id']) & (admissions_df['hadm_id'] == row['hadm_id'])]
        
        df.loc[index, 'hosp_time_delta'] = calc_time_delta_hrs(curr_admission['admittime'].iloc[0], ref_time)

        if stay_id_in_cols:
            curr_icu_stay = icustays_df[(icustays_df['subject_id'] == row['subject_id']) & (icustays_df['stay_id'] == row['stay_id'])]
            df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(curr_icu_stay['intime'].iloc[0], ref_time)
        else:
            curr_pts_icustays = icustays_df[icustays_df['subject_id'] == row['subject_id']]

            for icu_index, icu_row in curr_pts_icustays.iterrows():
                if icu_row['intime'] <= ref_time <= icu_row['outtime']:
                    df.loc[index, 'stay_id'] = icu_row['stay_id']
                    df.loc[index, 'icu_time_delta'] = calc_time_delta_hrs(icu_row['intime'], ref_time)
            

    df = df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'])
    return df


#procedureevents_df = add_time_delta(procedureevents_df)
labevents_df_time = add_time_delta(labevents_df)
vitals_df_time = add_time_delta(vitals_df)

# labs_df = add_time_delta(labs_df)
# labs_df = labs_df[['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta', 'icu_time_delta', 'charttime', 'storetime', 'event', 'valuenum']]
# labs_df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'], inplace=True)
# vitals_df = add_time_delta(vitals_df)
# vitals_df = vitals_df[['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta', 'icu_time_delta', 'charttime', 'storetime', 'event', 'valuenum']]
# vitals_df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'], inplace=True)

Index(['subject_id', 'hadm_id', 'charttime', 'event', 'valuenum'], dtype='object')


100%|██████████| 33094639/33094639 [11:43:08<00:00, 784.45it/s]  


Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum'], dtype='object')


100%|██████████| 36259441/36259441 [12:23:41<00:00, 812.60it/s]  


In [20]:
labevents_df_time.to_csv(os.path.join(output_dir, "labevents.csv"), index=False)

In [29]:
print(labevents_df_time.event.unique())

['Hematocrit' 'MCH' 'MCHC' 'MCV' 'Platelet Count' 'RDW' 'Red Blood Cells'
 'White Blood Cells' 'Anion Gap' 'Bicarbonate' 'Calcium, Total' 'Chloride'
 'Creatinine' 'Magnesium' 'Phosphate' 'Sodium' 'Urea Nitrogen'
 'Neutrophils' 'Vancomycin' 'Hemoglobin' 'Glucose']


In [21]:
vitals_df_time.to_csv(os.path.join(output_dir, "vitals.csv"), index=False)

In [30]:
print(vitals_df_time.event.unique())

['Systolic BP' 'Diastolic BP' 'Mean BP' 'Heart Rate' 'Respiratory Rate'
 'O2 Saturation' 'GCS - Eye Opening' 'GCS - Motor Response'
 'GCS - Verbal Response']


In [26]:
concat_df_time = pd.concat([labevents_df_time, vitals_df_time], axis=0, ignore_index=True)

In [27]:
def convert_events_table_to_ts_array(df):
    # Ensure 'valuenum' or 'value' columns exist
    value_column = 'valuenum' if 'valuenum' in df.columns else 'value'

    # Create a pivot table
    print(df.columns)
    pivot_df = df.pivot_table(index=['hadm_id', 'hosp_time_delta'], 
                              columns='event', 
                              values=value_column, 
                              aggfunc='first').reset_index()

    # Join with the original DataFrame to get other required columns
    keys = ['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta', 'icu_time_delta']
    merged_df = pd.merge(df[keys].drop_duplicates(), pivot_df, on=['hadm_id', 'hosp_time_delta'])

    # Reorder the columns
    cols = merged_df.columns.tolist()
    cols = [col for col in keys if col in cols] + [col for col in cols if col not in keys]
    merged_df = merged_df[cols]

    # Sort the DataFrame
    merged_df.sort_values(by=['subject_id', 'hadm_id', 'stay_id', 'hosp_time_delta'], inplace=True)

    return merged_df

# procedureevents_ts_df = convert_events_table_to_ts_array(procedureevents_df)
labevents_ts_df = convert_events_table_to_ts_array(labevents_df_time)
vitals_ts_df = convert_events_table_to_ts_array(vitals_df_time)

concat_df_pivot = convert_events_table_to_ts_array(concat_df_time)

Index(['subject_id', 'hadm_id', 'charttime', 'event', 'valuenum', 'stay_id',
       'icu_time_delta', 'hosp_time_delta'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'stay_id', 'charttime', 'event', 'valuenum',
       'icu_time_delta', 'hosp_time_delta'],
      dtype='object')
Index(['subject_id', 'hadm_id', 'charttime', 'event', 'valuenum', 'stay_id',
       'icu_time_delta', 'hosp_time_delta'],
      dtype='object')


In [32]:
print(concat_df_pivot.head())

   subject_id     hadm_id stay_id hosp_time_delta icu_time_delta  Anion Gap  \
0    10000032  22595853.0    None             6.7           None        9.0   
1    10000032  22841357.0    None       10.716667           None       14.0   
2    10000032  25742920.0    None        6.866667           None       11.0   
3    10000032  25742920.0    None       15.933333           None        NaN   
4    10000032  25742920.0    None           25.25           None        6.0   

   Bicarbonate  Calcium, Total  Chloride  Creatinine  ...  Phosphate  \
0         28.0             7.8     105.0         0.3  ...        3.6   
1         25.0             7.8      92.0         0.3  ...        3.3   
2         26.0             8.6      95.0         0.6  ...        3.9   
3          NaN             NaN      94.0         NaN  ...        NaN   
4         25.0             8.7      93.0         0.4  ...        3.6   

   Platelet Count   RDW  Red Blood Cells  Respiratory Rate  Sodium  \
0            71.0  15.

In [28]:
# procedureevents_ts_df.to_pickle(os.path.join(output_dir, "ts_procedureevents_icu.pkl"))
labevents_ts_df.to_pickle(os.path.join(output_dir, "ts_labs_icu_new.pkl"))
vitals_ts_df.to_pickle(os.path.join(output_dir, "ts_vitals_icu_new.pkl"))

concat_df_pivot.to_pickle(os.path.join(output_dir, "ts_labs_vitals_new.pkl"))