In [35]:
import os
import polars as pl

In [36]:
# read data from csv.gz files into a dictionary of DataFrames
dataframes = {}
data_path = "data"
filenames = ["admissions", "chartevents", "d_icd_diagnoses", "d_items", "d_labitems", "diagnoses_icd", "icustays", "labevents", "patients"]

for name in filenames:
    path = os.path.join(data_path, f"{name}.csv.gz")
    dataframes[name] = pl.read_csv(path, infer_schema_length=1000)#ignore_errors=True)

In [37]:
# filter out pneumonia diagnosis based on ICD codes
filtered_diagnosis_df = dataframes["d_icd_diagnoses"].filter(
    pl.col("long_title").str.contains("pneumonia") &
    (
        ( # ICD 9 = "84..."
            (pl.col("icd_version") == 9) &
            (pl.col("icd_code").cast(pl.Utf8).str.starts_with("48"))
        ) |
        ( # ICD 10 = "J1..."
            (pl.col("icd_version") == 10) &
            (pl.col("icd_code").cast(pl.Utf8).str.starts_with("J1"))
        )
    ))

#print(filtered_diagnosis_df)
dataframes["d_icd_diagnoses"] = filtered_diagnosis_df # save back in df dictionary

In [38]:
# Filter patients with diagnosis based on ICD codes by inner join
dataframes["diagnoses_icd"] = dataframes["diagnoses_icd"].select(['subject_id', 'hadm_id', 'icd_code', 'icd_version'])

pneumonia_diagnosis_temp0 = dataframes["diagnoses_icd"].join(
    filtered_diagnosis_df,
    on=["icd_code", "icd_version"],
    how="inner"
)

#print(pneumonia_diagnosis_temp0) # Leaves 9 patients with pneumonia diagnosis

In [39]:
# Filter patients dataframe for relevant columns
dataframes["patients"] = dataframes["patients"].select(["subject_id", "gender", "anchor_age"])

# Add information to diagnosis dataframe
pneumonia_diagnosis_temp1 = pneumonia_diagnosis_temp0.join(
    dataframes["patients"],
    on="subject_id",
    how="inner"
)

#print(pneumonia_diagnosis_temp1)

In [40]:
# Filter admissions dataframe for relevant columns
dataframes["admissions"] = dataframes["admissions"].select(["subject_id", "hadm_id", "admission_location"])
#print(dataframes["admissions"])


# Add information to diagnosis dataframe
pneumonia_patients_df = pneumonia_diagnosis_temp1.join(
    dataframes["admissions"],
    on=["subject_id", "hadm_id"],
    how="inner"
)

print(pneumonia_patients_df)
# finished patient dataframe

shape: (9, 8)
┌────────────┬──────────┬──────────┬─────────────┬─────────────┬────────┬────────────┬─────────────┐
│ subject_id ┆ hadm_id  ┆ icd_code ┆ icd_version ┆ long_title  ┆ gender ┆ anchor_age ┆ admission_l │
│ ---        ┆ ---      ┆ ---      ┆ ---         ┆ ---         ┆ ---    ┆ ---        ┆ ocation     │
│ i64        ┆ i64      ┆ str      ┆ i64         ┆ str         ┆ str    ┆ i64        ┆ ---         │
│            ┆          ┆          ┆             ┆             ┆        ┆            ┆ str         │
╞════════════╪══════════╪══════════╪═════════════╪═════════════╪════════╪════════════╪═════════════╡
│ 10037975   ┆ 27617929 ┆ 4820     ┆ 9           ┆ Pneumonia   ┆ M      ┆ 60         ┆ TRANSFER    │
│            ┆          ┆          ┆             ┆ due to      ┆        ┆            ┆ FROM        │
│            ┆          ┆          ┆             ┆ Klebsiella  ┆        ┆            ┆ HOSPITAL    │
│            ┆          ┆          ┆             ┆ pn…         ┆        ┆    

In [41]:
# Filter out icustays for relevant patients by subject_id & hadm_id from pneumonia_patients_df

filtered_icustays = dataframes["icustays"].join(
    pneumonia_patients_df.select(["subject_id", "hadm_id"]),
    on=["subject_id", "hadm_id"],
    how="inner"
).select(["subject_id", "hadm_id", "stay_id"])

print(filtered_icustays) # 7 patients with icu stay

shape: (7, 3)
┌────────────┬──────────┬──────────┐
│ subject_id ┆ hadm_id  ┆ stay_id  │
│ ---        ┆ ---      ┆ ---      │
│ i64        ┆ i64      ┆ i64      │
╞════════════╪══════════╪══════════╡
│ 10021487   ┆ 28998349 ┆ 38197705 │
│ 10020944   ┆ 29974575 ┆ 30757476 │
│ 10037975   ┆ 27617929 ┆ 39061571 │
│ 10027602   ┆ 28166872 ┆ 32391858 │
│ 10009049   ┆ 22995465 ┆ 35636875 │
│ 10004733   ┆ 27411876 ┆ 39635619 │
│ 10035631   ┆ 29276678 ┆ 30932571 │
└────────────┴──────────┴──────────┘


In [44]:
# TODO: Reduce by filtered d_items, then join with selected patients on subject_id & hadm_id & stay_id
dataframes["chartevents"] = dataframes["chartevents"].select(['subject_id', 'hadm_id', 'stay_id', 'itemid', 'value', 'valuenum', 'valueuom', 'warning'])

filtered_chartevents = dataframes["chartevents"].join(
    filtered_icustays.select(["subject_id", "hadm_id", "stay_id"]),
    on=["subject_id", "hadm_id", "stay_id"],
    how="inner"
)

print(filtered_chartevents)

shape: (101_630, 8)
┌────────────┬──────────┬──────────┬────────┬──────────────────────┬──────────┬──────────┬─────────┐
│ ---        ┆ ---      ┆ ---      ┆ ---    ┆ ---                  ┆ ---      ┆ ---      ┆ ---     │
│ i64        ┆ i64      ┆ i64      ┆ i64    ┆ str                  ┆ f64      ┆ str      ┆ i64     │
╞════════════╪══════════╪══════════╪════════╪══════════════════════╪══════════╪══════════╪═════════╡
│ 10037975   ┆ 27617929 ┆ 39061571 ┆ 223988 ┆ Rhonchi              ┆ null     ┆ null     ┆ 0       │
│ 10037975   ┆ 27617929 ┆ 39061571 ┆ 223837 ┆ 8mm                  ┆ null     ┆ null     ┆ 0       │
│ 10037975   ┆ 27617929 ┆ 39061571 ┆ 224696 ┆ 22                   ┆ 22.0     ┆ cmH2O    ┆ 0       │
│ 10037975   ┆ 27617929 ┆ 39061571 ┆ 224415 ┆ 24cm                 ┆ null     ┆ null     ┆ 0       │
│ 10037975   ┆ 27617929 ┆ 39061571 ┆ 227517 ┆ Active               ┆ null     ┆ null     ┆ 0       │
│ …          ┆ …        ┆ …        ┆ …      ┆ …                    ┆ … 

In [43]:
# TODO: Filter for relevant parameters & columns
dataframes["d_items"] = dataframes["d_items"].select(['itemid', 'label','linksto', 'abbreviation','category','unitname'])

print(dataframes["d_items"].columns)
print(dataframes["d_items"])

['itemid', 'label', 'linksto', 'abbreviation', 'category', 'unitname']
shape: (4_014, 6)
┌────────┬──────────────────────────┬─────────────┬──────────────────────────┬──────────┬──────────┐
│ itemid ┆ label                    ┆ linksto     ┆ abbreviation             ┆ category ┆ unitname │
│ ---    ┆ ---                      ┆ ---         ┆ ---                      ┆ ---      ┆ ---      │
│ i64    ┆ str                      ┆ str         ┆ str                      ┆ str      ┆ str      │
╞════════╪══════════════════════════╪═════════════╪══════════════════════════╪══════════╪══════════╡
│ 226228 ┆ Gender                   ┆ chartevents ┆ Gender                   ┆ ADT      ┆ null     │
│ 226545 ┆ Race                     ┆ chartevents ┆ Race                     ┆ ADT      ┆ null     │
│ 229877 ┆ Suction events (CH)      ┆ chartevents ┆ Suction events (CH)      ┆ ECMO     ┆ null     │
│ 229875 ┆ Oxygenator visible (CH)  ┆ chartevents ┆ Oxygenator visible (CH)  ┆ ECMO     ┆ null     │
│ 

In [23]:
# TODO: Filter for relevant parameters & columns
#print(dataframes["d_labitems"].head(2))

# TODO: Reduce by filtered d_labitems, then join with selected patients on subject_id & hadm_id
#print(dataframes["labevents"].head(2))

In [None]:
# TODO: Save that shit as a .parquet or similar