In [64]:
import os
import polars as pl

In [65]:
# read data from csv.gz files into a dictionary of DataFrames
dataframes = {}
data_path = "data"
filenames = ["admissions", "chartevents", "d_icd_diagnoses", "d_items", "d_labitems", "diagnoses_icd", "icustays", "labevents", "patients"]

for name in filenames:
    path = os.path.join(data_path, f"{name}.csv.gz")
    dataframes[name] = pl.read_csv(path, infer_schema_length=1000)#ignore_errors=True)

In [66]:
# filter dataframes for relevant columns
print(dataframes["diagnoses_icd"].columns)


['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version']


In [67]:
# filter out pneumonia diagnosis based on ICD codes
filtered_diagnosis_df = dataframes["d_icd_diagnoses"].filter(
    pl.col("long_title").str.contains("pneumonia") &
    (
        (
            (pl.col("icd_version") == 9) &
            (pl.col("icd_code").cast(pl.Utf8).str.starts_with("48"))
        ) |
        (
            (pl.col("icd_version") == 10) &
            (pl.col("icd_code").cast(pl.Utf8).str.starts_with("J1"))
        )
    ))

#print(filtered_diagnosis_df)
dataframes["d_icd_diagnoses"] = filtered_diagnosis_df # save back in df dictionary

In [68]:
# Filter patients with diagnosis based on ICD codes by inner join
dataframes["diagnoses_icd"] = dataframes["diagnoses_icd"].select(['subject_id', 'hadm_id', 'icd_code', 'icd_version'])

pneumonia_diagnosis_temp0 = dataframes["diagnoses_icd"].join(
    filtered_diagnosis_df,
    on=["icd_code", "icd_version"],
    how="inner"
)

#print(pneumonia_diagnosis_temp0) # Leaves 9 patients with pneumonia diagnosis

In [69]:
# Filter patients dataframe for relevant columns
dataframes["patients"] = dataframes["patients"].select(["subject_id", "gender", "anchor_age"])

# Add information to diagnosis dataframe
pneumonia_diagnosis_temp1 = pneumonia_diagnosis_temp0.join(
    dataframes["patients"],
    on="subject_id",
    how="inner"
)

#print(pneumonia_diagnosis_temp1)

In [70]:
# Filter admissions dataframe for relevant columns
dataframes["admissions"] = dataframes["admissions"].select(["subject_id", "hadm_id", "admission_location"])

# Add information to diagnosis dataframe
pneumonia_patients_df = pneumonia_diagnosis_temp1.join(
    dataframes["admissions"],
    on=["subject_id", "hadm_id"],
    how="inner"
)

print(pneumonia_patients_df)

shape: (9, 8)
┌────────────┬──────────┬──────────┬─────────────┬─────────────┬────────┬────────────┬─────────────┐
│ subject_id ┆ hadm_id  ┆ icd_code ┆ icd_version ┆ long_title  ┆ gender ┆ anchor_age ┆ admission_l │
│ ---        ┆ ---      ┆ ---      ┆ ---         ┆ ---         ┆ ---    ┆ ---        ┆ ocation     │
│ i64        ┆ i64      ┆ str      ┆ i64         ┆ str         ┆ str    ┆ i64        ┆ ---         │
│            ┆          ┆          ┆             ┆             ┆        ┆            ┆ str         │
╞════════════╪══════════╪══════════╪═════════════╪═════════════╪════════╪════════════╪═════════════╡
│ 10037975   ┆ 27617929 ┆ 4820     ┆ 9           ┆ Pneumonia   ┆ M      ┆ 60         ┆ TRANSFER    │
│            ┆          ┆          ┆             ┆ due to      ┆        ┆            ┆ FROM        │
│            ┆          ┆          ┆             ┆ Klebsiella  ┆        ┆            ┆ HOSPITAL    │
│            ┆          ┆          ┆             ┆ pn…         ┆        ┆    

In [None]:
# TODO: Filter for relevant parameters & columns
print(dataframes["d_labitems"].head(2))

shape: (2, 4)
┌────────┬──────────────┬───────┬───────────┐
│ itemid ┆ label        ┆ fluid ┆ category  │
│ ---    ┆ ---          ┆ ---   ┆ ---       │
│ i64    ┆ str          ┆ str   ┆ str       │
╞════════╪══════════════╪═══════╪═══════════╡
│ 50808  ┆ Free Calcium ┆ Blood ┆ Blood Gas │
│ 50826  ┆ Tidal Volume ┆ Blood ┆ Blood Gas │
└────────┴──────────────┴───────┴───────────┘


In [None]:
# TODO: Reduce by filtered d_labitems, then join with selected patients on subject_id & hadm_id
print(dataframes["labevents"].head(2))

shape: (2, 16)
┌────────────┬────────────┬──────────┬────────────┬───┬───────────┬──────────┬──────────┬──────────┐
│ labevent_i ┆ subject_id ┆ hadm_id  ┆ specimen_i ┆ … ┆ ref_range ┆ flag     ┆ priority ┆ comments │
│ d          ┆ ---        ┆ ---      ┆ d          ┆   ┆ _upper    ┆ ---      ┆ ---      ┆ ---      │
│ ---        ┆ i64        ┆ i64      ┆ ---        ┆   ┆ ---       ┆ str      ┆ str      ┆ str      │
│ i64        ┆            ┆          ┆ i64        ┆   ┆ f64       ┆          ┆          ┆          │
╞════════════╪════════════╪══════════╪════════════╪═══╪═══════════╪══════════╪══════════╪══════════╡
│ 172061     ┆ 10014354   ┆ 29600294 ┆ 1808066    ┆ … ┆ 15.5      ┆ null     ┆ ROUTINE  ┆ null     │
│ 172062     ┆ 10014354   ┆ 29600294 ┆ 1808066    ┆ … ┆ 6.1       ┆ abnormal ┆ ROUTINE  ┆ null     │
└────────────┴────────────┴──────────┴────────────┴───┴───────────┴──────────┴──────────┴──────────┘


In [None]:
# TODO: Filter for relevant parameters & columns
print(dataframes["d_items"].head(2))

shape: (2, 9)
┌────────┬────────┬─────────────┬────────────┬───┬──────────┬────────────┬────────────┬────────────┐
│ itemid ┆ label  ┆ abbreviatio ┆ linksto    ┆ … ┆ unitname ┆ param_type ┆ lownormalv ┆ highnormal │
│ ---    ┆ ---    ┆ n           ┆ ---        ┆   ┆ ---      ┆ ---        ┆ alue       ┆ value      │
│ i64    ┆ str    ┆ ---         ┆ str        ┆   ┆ str      ┆ str        ┆ ---        ┆ ---        │
│        ┆        ┆ str         ┆            ┆   ┆          ┆            ┆ str        ┆ str        │
╞════════╪════════╪═════════════╪════════════╪═══╪══════════╪════════════╪════════════╪════════════╡
│ 226228 ┆ Gender ┆ Gender      ┆ chartevent ┆ … ┆ null     ┆ Text       ┆ null       ┆ null       │
│        ┆        ┆             ┆ s          ┆   ┆          ┆            ┆            ┆            │
│ 226545 ┆ Race   ┆ Race        ┆ chartevent ┆ … ┆ null     ┆ Text       ┆ null       ┆ null       │
│        ┆        ┆             ┆ s          ┆   ┆          ┆            ┆   

In [None]:
# TODO: filter for relevant patients by subject_id & hadm_id
print(dataframes["icustays"].head(2))

In [None]:
# TODO: Reduce by filtered d_items, then join with selected patients on subject_id & hadm_id & stay_id
print(dataframes["chartevents"].head(2))

shape: (2, 11)
┌────────────┬──────────┬──────────┬──────────────┬───┬───────┬──────────┬──────────┬─────────┐
│ ---        ┆ ---      ┆ ---      ┆ ---          ┆   ┆ ---   ┆ ---      ┆ ---      ┆ ---     │
│ i64        ┆ i64      ┆ i64      ┆ i64          ┆   ┆ str   ┆ f64      ┆ str      ┆ i64     │
╞════════════╪══════════╪══════════╪══════════════╪═══╪═══════╪══════════╪══════════╪═════════╡
│ 10005817   ┆ 20626031 ┆ 32604416 ┆ 6770         ┆ … ┆ On    ┆ null     ┆ null     ┆ 0       │
│ 10005817   ┆ 20626031 ┆ 32604416 ┆ 6770         ┆ … ┆ 100   ┆ 100.0    ┆ %        ┆ 0       │
└────────────┴──────────┴──────────┴──────────────┴───┴───────┴──────────┴──────────┴─────────┘


In [None]:
# TODO: Save that shit as a .parquet or similar