In [29]:
EXPORT_PATH = "../data/processed/"
PATH = "../data/raw/"

import os
import pandas as pd
import numpy as np

def export_to_csv(df, filename, export_dir=None):
    """
    Exports a DataFrame to a CSV file, creating the target directory if needed.

    Parameters:
    df (pd.DataFrame): The DataFrame to export.
    filename (str): The name of the file to save the DataFrame to.
    export_dir (str, optional): Directory the file is written into. When
        omitted (None), the module-level EXPORT_PATH is used.

    The DataFrame index is not written to the file.
    """
    target_dir = EXPORT_PATH if export_dir is None else export_dir
    if target_dir:
        # exist_ok=True replaces the separate exists()/makedirs() pair, which
        # could fail if the directory appeared between the check and the call.
        os.makedirs(target_dir, exist_ok=True)
    df.to_csv(os.path.join(target_dir, filename), index=False)

In [30]:
# Read the previously generated sepsis cohort file and reduce it to the
# distinct (SUBJECT_ID, HADM_ID) pairs.
cohort_csv = os.path.join(EXPORT_PATH, 'sepsis_cohort.csv')
sepsis_ids = pd.read_csv(cohort_csv)

id_columns = ['SUBJECT_ID', 'HADM_ID']
cohort = sepsis_ids.loc[:, id_columns].drop_duplicates()

display(cohort.head())
# Last expression of the cell: the count of distinct SUBJECT_ID values.
cohort['SUBJECT_ID'].nunique()

Unnamed: 0,SUBJECT_ID,HADM_ID
0,51797,104616
1,44534,183659
2,14828,144708
3,14828,125239
4,44500,101872


3068

In [31]:
# Restrict ICU stays to the cohort's (SUBJECT_ID, HADM_ID) pairs, then drop
# stays without a positive length of stay or with an inconsistent time window.
icustays = pd.read_csv(os.path.join(PATH, 'icustays.csv'))

cohort_icu = icustays.merge(cohort, on=["SUBJECT_ID", "HADM_ID"], how="inner")

# `LOS > 0` evaluates to False for missing values, so it also removes null LOS.
has_positive_los = cohort_icu["LOS"] > 0

# Compare the stay boundaries as parsed timestamps instead of the raw values
# read from the CSV, so the ordering check is chronological regardless of how
# the timestamps are formatted. Missing or unparseable values become NaT,
# which never satisfies the comparison. The INTIME/OUTTIME columns themselves
# are left as loaded.
intime = pd.to_datetime(cohort_icu["INTIME"], errors="coerce")
outtime = pd.to_datetime(cohort_icu["OUTTIME"], errors="coerce")
has_valid_window = outtime > intime

cohort_icu = cohort_icu[has_positive_los & has_valid_window]
# Keep a single row per ICU stay.
cohort_icu = cohort_icu.drop_duplicates(subset=["ICUSTAY_ID"])

print(f"Valid ICU Admissions for Cohort: {cohort_icu.shape[0]}")
display(cohort_icu[["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "LOS"]].head())

Valid ICU Admissions for Cohort: 3685


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,LOS
0,269,106296,206613,3.2788
1,275,129886,219649,7.1314
2,292,179726,222505,0.8854
3,305,194340,217232,2.437
4,323,143334,264375,3.0252


In [32]:
# Attach patient demographics and admission details to each cohort ICU stay,
# then derive the age in whole years at hospital admission.
patients = pd.read_csv(os.path.join(PATH, 'PATIENTS.csv'), parse_dates=["DOB"])
admissions = pd.read_csv(os.path.join(PATH, 'ADMISSIONS.csv'), parse_dates=["ADMITTIME"])

patient_columns = ["SUBJECT_ID", "GENDER", "DOB"]
admission_columns = [
    "SUBJECT_ID", "HADM_ID", "ADMITTIME", "ADMISSION_TYPE", "ADMISSION_LOCATION",
    "INSURANCE", "HOSPITAL_EXPIRE_FLAG",
]

# Left joins: every ICU stay row is kept even when no match is found.
df = (
    cohort_icu
    .merge(patients[patient_columns], on="SUBJECT_ID", how="left")
    .merge(admissions[admission_columns], on=["SUBJECT_ID", "HADM_ID"], how="left")
)

admit_time = df["ADMITTIME"]
birth_date = df["DOB"]

# True where the birthday has not yet occurred in the admission year, in which
# case the plain year difference overstates the age by one.
earlier_month = admit_time.dt.month < birth_date.dt.month
same_month_earlier_day = (
    (admit_time.dt.month == birth_date.dt.month)
    & (admit_time.dt.day < birth_date.dt.day)
)
adjust = earlier_month | same_month_earlier_day

year_difference = admit_time.dt.year - birth_date.dt.year
# Ages are capped at 91.
# NOTE(review): presumably this handles de-identified birth dates of very old
# patients - confirm against the dataset documentation.
df["AGE"] = (year_difference - adjust.astype(int)).clip(upper=91)

In [33]:
# Convert the ICU-in and admission timestamps to datetimes (values that cannot
# be parsed become NaT) and add hour-of-day and weekday columns for each.
timestamp_cols = ["INTIME", "ADMITTIME"]
for col in timestamp_cols:
    parsed = pd.to_datetime(df[col], errors="coerce")
    df[col] = parsed
    df[col + "_HOUR"] = parsed.dt.hour
    df[col + "_WEEKDAY"] = parsed.dt.weekday

In [34]:
# Select the static (per-ICU-stay) columns, keep rows with a recorded length of
# stay, and write the result to the processed-data directory.
static_columns = [
    "SUBJECT_ID", "HADM_ID", "ICUSTAY_ID", "AGE", "GENDER",
    "ADMISSION_TYPE", "ADMISSION_LOCATION", "INSURANCE",
    "FIRST_CAREUNIT", "LOS", "HOSPITAL_EXPIRE_FLAG",
    "INTIME_HOUR", "INTIME_WEEKDAY", "ADMITTIME_HOUR", "ADMITTIME_WEEKDAY", "INTIME"
]
df_final = df[static_columns]

df_final = df_final[df_final["LOS"].notnull()]
print(f"df_final shape: {df_final.shape}")
# Use the notebook's export helper rather than concatenating EXPORT_PATH by
# hand: it joins the path with os.path.join and creates the directory if it
# does not exist yet.
export_to_csv(df_final, "df_final_static.csv")
display(df_final.head())

df_final shape: (3685, 16)


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,AGE,GENDER,ADMISSION_TYPE,ADMISSION_LOCATION,INSURANCE,FIRST_CAREUNIT,LOS,HOSPITAL_EXPIRE_FLAG,INTIME_HOUR,INTIME_WEEKDAY,ADMITTIME_HOUR,ADMITTIME_WEEKDAY,INTIME
0,269,106296,206613,40,M,EMERGENCY,EMERGENCY ROOM ADMIT,Medicaid,MICU,3.2788,0,11,0,11,0,2170-11-05 11:05:29
1,275,129886,219649,82,M,EMERGENCY,EMERGENCY ROOM ADMIT,Medicare,CCU,7.1314,1,11,6,3,5,2170-10-07 11:28:53
2,292,179726,222505,57,F,URGENT,TRANSFER FROM HOSP/EXTRAM,Private,MICU,0.8854,1,18,3,18,3,2103-09-27 18:29:30
3,305,194340,217232,76,F,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,Medicare,SICU,2.437,1,12,5,18,5,2129-09-03 12:31:31
4,323,143334,264375,57,M,EMERGENCY,EMERGENCY ROOM ADMIT,Medicare,MICU,3.0252,0,15,3,15,3,2120-01-11 15:48:28
