In [126]:
import os
import polars as pl

pl.Config.set_tbl_rows(20)      # Increase max rows (default is 10)
pl.Config.set_tbl_cols(15)        # Increase max columns (default is 10)

polars.config.Config

In [127]:
# read data from csv.gz files into a dictionary of DataFrames
dataframes = {}
data_path = "data"
filenames = ["admissions", "chartevents", "d_icd_diagnoses", "d_items", "d_labitems", "diagnoses_icd", "icustays", "labevents", "patients"]

for name in filenames:
    path = os.path.join(data_path, f"{name}.csv.gz")
    dataframes[name] = pl.read_csv(path, infer_schema_length=1000)#ignore_errors=True)

In [128]:
# filter out pneumonia diagnosis based on ICD codes
filtered_diagnosis_df = dataframes["d_icd_diagnoses"].filter(
    pl.col("long_title").str.contains("pneumonia") &
    (
        ( # ICD 9 = "84..."
            (pl.col("icd_version") == 9) &
            (pl.col("icd_code").cast(pl.Utf8).str.starts_with("48"))
        ) |
        ( # ICD 10 = "J1..."
            (pl.col("icd_version") == 10) &
            (pl.col("icd_code").cast(pl.Utf8).str.starts_with("J1"))
        )
    ))

#print(filtered_diagnosis_df)
dataframes["d_icd_diagnoses"] = filtered_diagnosis_df # save back in df dictionary

In [129]:
# Filter patients with diagnosis based on ICD codes by inner join
dataframes["diagnoses_icd"] = dataframes["diagnoses_icd"].select(['subject_id', 'hadm_id', 'icd_code', 'icd_version'])

pneumonia_diagnosis_temp0 = dataframes["diagnoses_icd"].join(
    filtered_diagnosis_df,
    on=["icd_code", "icd_version"],
    how="inner"
)

#print(pneumonia_diagnosis_temp0) # Leaves 9 patients with pneumonia diagnosis

In [130]:
# Filter patients dataframe for relevant columns
dataframes["patients"] = dataframes["patients"].select(["subject_id", "gender", "anchor_age"])

# Add information to diagnosis dataframe
pneumonia_diagnosis_temp1 = pneumonia_diagnosis_temp0.join(
    dataframes["patients"],
    on="subject_id",
    how="inner"
)

#print(pneumonia_diagnosis_temp1)

In [131]:
# Filter admissions dataframe for relevant columns
dataframes["admissions"] = dataframes["admissions"].select(["subject_id", "hadm_id", "admission_location",'admittime'])

# Add information to diagnosis dataframe
pneumonia_patients_df = pneumonia_diagnosis_temp1.join(
    dataframes["admissions"],
    on=["subject_id", "hadm_id"],
    how="inner"
)

print(pneumonia_patients_df.columns)
print(pneumonia_patients_df) # finished patient dataframe with 9 cases

['subject_id', 'hadm_id', 'icd_code', 'icd_version', 'long_title', 'gender', 'anchor_age', 'admission_location', 'admittime']
shape: (9, 9)
┌───────────┬──────────┬──────────┬───────────┬──────────┬────────┬──────────┬──────────┬──────────┐
│ subject_i ┆ hadm_id  ┆ icd_code ┆ icd_versi ┆ long_tit ┆ gender ┆ anchor_a ┆ admissio ┆ admittim │
│ d         ┆ ---      ┆ ---      ┆ on        ┆ le       ┆ ---    ┆ ge       ┆ n_locati ┆ e        │
│ ---       ┆ i64      ┆ str      ┆ ---       ┆ ---      ┆ str    ┆ ---      ┆ on       ┆ ---      │
│ i64       ┆          ┆          ┆ i64       ┆ str      ┆        ┆ i64      ┆ ---      ┆ str      │
│           ┆          ┆          ┆           ┆          ┆        ┆          ┆ str      ┆          │
╞═══════════╪══════════╪══════════╪═══════════╪══════════╪════════╪══════════╪══════════╪══════════╡
│ 10037975  ┆ 27617929 ┆ 4820     ┆ 9         ┆ Pneumoni ┆ M      ┆ 60       ┆ TRANSFER ┆ 2185-01- │
│           ┆          ┆          ┆           ┆ a du

In [132]:
# Filter out icustays for relevant patients by subject_id & hadm_id from pneumonia_patients_df
filtered_icustays = dataframes["icustays"].join(
    pneumonia_patients_df.select(["subject_id", "hadm_id"]),
    on=["subject_id", "hadm_id"],
    how="inner"
).select(["subject_id", "hadm_id", "stay_id"])

#print(filtered_icustays) # 7 patients with icu stay for pneumonia

In [133]:
# Filter chartevents for relevant patients by subject_id, hadm_id, and stay_id
dataframes["chartevents"] = dataframes["chartevents"].select(['subject_id', 'hadm_id', 'stay_id', 'itemid', 'value', 'valuenum', 'valueuom','charttime'])

chartevents_temp0 = dataframes["chartevents"].join(
    filtered_icustays.select(["subject_id", "hadm_id", "stay_id"]),
    on=["subject_id", "hadm_id", "stay_id"],
    how="inner"
)

#print(chartevents_temp0)

In [134]:
# Filter d_items for chartevents for joining
dataframes["d_items"] = dataframes["d_items"].select(['itemid', 'label','linksto', 'abbreviation'])

chartevents_temp1 = dataframes["d_items"].filter(
    pl.col("linksto") == "chartevents"
).drop("linksto")

# Join d_items with filtered chartevents based on itemid
patients_all_chartevents = chartevents_temp0.join(
    chartevents_temp1,
    on="itemid",
    how="inner"
)

#print(patients_all_chartevents)

In [135]:
# list of all relevant itemids for filtering chartevents
itemid_list = [226984, 226228, 228395, 228394, 229381, 223898, 228396, 226104, 229382, 228688, 2930, 2903, 2931, 2982, 29281, 29011, 78097, 29041, 230040, 225309, 227243, 220179, 220050, 224167, 224027, 223761, 223762, 226329, 50825, 229770, 223942, 223936, 223948, 223941, 223946, 223949, 223945, 223939, 223944, 223940, 223938, 225624, 51842, 50820, 223830, 220645, 226534, 228389, 228390, 50983, 52623, 50809, 50931, 52569, 226537, 225664, 220621, 228388, 52028, 51638, 51639, 51221, 226540, 220545, 220227, 220277, 223835, 50817, 51181, 5119]

# filter all patients chartevents by itemis list
filtered_chartevents = patients_all_chartevents.filter(
    pl.col("itemid").is_in(itemid_list)
)

print(filtered_chartevents.columns)
print(filtered_chartevents)

['subject_id', 'hadm_id', 'stay_id', 'itemid', 'value', 'valuenum', 'valueuom', 'charttime', 'label', 'abbreviation']
shape: (6_111, 10)
┌─────────┬─────────┬─────────┬────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ subject ┆ hadm_id ┆ stay_id ┆ itemid ┆ value   ┆ valuenu ┆ valueuo ┆ chartti ┆ label   ┆ abbrevi │
│ _id     ┆ ---     ┆ ---     ┆ ---    ┆ ---     ┆ m       ┆ m       ┆ me      ┆ ---     ┆ ation   │
│ ---     ┆ i64     ┆ i64     ┆ i64    ┆ str     ┆ ---     ┆ ---     ┆ ---     ┆ str     ┆ ---     │
│ i64     ┆         ┆         ┆        ┆         ┆ f64     ┆ str     ┆ str     ┆         ┆ str     │
╞═════════╪═════════╪═════════╪════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ 1003797 ┆ 2761792 ┆ 3906157 ┆ 223835 ┆ 100     ┆ 100.0   ┆ null    ┆ 2185-01 ┆ Inspire ┆ FiO2    │
│ 5       ┆ 9       ┆ 1       ┆        ┆         ┆         ┆         ┆ -17 20: ┆ d O2    ┆         │
│         ┆         ┆         ┆        ┆         ┆     

In [136]:
# Melt metadata columns into long format
patient_metadata = pneumonia_patients_df.select([
    "subject_id", "hadm_id", "gender", "anchor_age", "admission_location",'admittime', 'long_title'
]).rename({
    "admittime": "charttime",
    "long_title": "diagnosis"
}).unpivot(
    index=["subject_id", "hadm_id",'charttime'],
    on=["gender", "anchor_age", "admission_location", 'diagnosis'],
    variable_name="label",
    value_name="value"
)

#Join stay_id from filtered_chartevents (distinct) into metadata
stay_keys = filtered_chartevents.select(["subject_id", "hadm_id", "stay_id"]).unique()
patient_metadata_with_stay = patient_metadata.join(
    stay_keys,
    on=["subject_id", "hadm_id"],
    how="inner"
)

In [137]:
# Prepare for concatenation by renaming and filling in None values
patient_metadata_structured = patient_metadata_with_stay.with_columns([
    pl.lit(None).alias("itemid"),
    pl.lit(None).alias("valuenum"),
    pl.lit(None).alias("valueuom"),
    pl.lit(None).alias("abbreviation")
]).select([
    "subject_id","hadm_id","stay_id","itemid","value","valuenum","valueuom","charttime","label","abbreviation"
])

# Concatenate patient metadata with filtered chartevents
bronze_df = pl.concat([
    filtered_chartevents,
    patient_metadata_structured
], how="vertical")

print(bronze_df.columns)
print(bronze_df.head(10))

['subject_id', 'hadm_id', 'stay_id', 'itemid', 'value', 'valuenum', 'valueuom', 'charttime', 'label', 'abbreviation']
shape: (10, 10)
┌─────────┬─────────┬─────────┬────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┐
│ subject ┆ hadm_id ┆ stay_id ┆ itemid ┆ value   ┆ valuenu ┆ valueuo ┆ chartti ┆ label   ┆ abbrevi │
│ _id     ┆ ---     ┆ ---     ┆ ---    ┆ ---     ┆ m       ┆ m       ┆ me      ┆ ---     ┆ ation   │
│ ---     ┆ i64     ┆ i64     ┆ i64    ┆ str     ┆ ---     ┆ ---     ┆ ---     ┆ str     ┆ ---     │
│ i64     ┆         ┆         ┆        ┆         ┆ f64     ┆ str     ┆ str     ┆         ┆ str     │
╞═════════╪═════════╪═════════╪════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╡
│ 1003797 ┆ 2761792 ┆ 3906157 ┆ 223835 ┆ 100     ┆ 100.0   ┆ null    ┆ 2185-01 ┆ Inspire ┆ FiO2    │
│ 5       ┆ 9       ┆ 1       ┆        ┆         ┆         ┆         ┆ -17 20: ┆ d O2    ┆         │
│         ┆         ┆         ┆        ┆         ┆        

In [27]:
# Save as parquet file
bronze_df.write_parquet("data/bronze_df.parquet")