In [2]:
# Shell escape: list the files in the data directory used by this notebook.
!ls ~/data/Diagnoses_ITA_Procedures/

bitabel_newest.parquet	    intensive_care.parquet
blood_tests_newest.parquet  procedures_newest.parquet
diagnoses_newest.parquet    SKS_Codes_Cookbook.parquet


In [3]:
import math

import numpy as np
import pandas as pd
import polars as pl

In [4]:
# Directory holding the input parquet files; paths below are built as f"{data_folder}/<name>.parquet".
# NOTE(review): absolute, user-specific path -- adjust when running on another machine/account.
data_folder = "/home/nclow23/data/Diagnoses_ITA_Procedures"

def read_parquet(fname):
    """Read ``{data_folder}/{fname}.parquet`` with pandas and report its shape.

    Parameters
    ----------
    fname : str
        File name inside ``data_folder`` without the ``.parquet`` extension.

    Returns
    -------
    pandas.DataFrame
        The loaded table. Its ``(rows, columns)`` shape is printed as a side effect.
    """
    path = f"{data_folder}/{fname}.parquet"
    frame = pd.read_parquet(path)
    print(frame.shape)
    return frame


def get_sks_kapitler(fname_in="public_data/sks.csv", fname_out="public_data/sks_kapitler.csv"):
    """Extract the chapter ("Kap.") rows of the SKS table into their own CSV.

    The input is read as a ``;``-separated file whose header is on the third
    line and which has a ``Tekst`` column. Every row whose text starts with
    ``"Kap."`` is expected to look like ``"<Kap>: <Desc>[<Min>-<Max>]"`` and is
    split into the columns ``Desc``, ``Range``, ``Range_Min`` and ``Range_Max``,
    indexed by ``Kap``.

    Parameters
    ----------
    fname_in : str
        Path of the SKS CSV to read.
    fname_out : str
        Path of the chapter CSV to write.

    Returns
    -------
    None
        The result is written to ``fname_out``.
    """
    real_sks = pd.read_csv(fname_in, sep=";", header=2)

    # na=False: rows without any text cannot be chapter headings (and would
    # otherwise put NaN into the boolean mask).
    is_chapter = real_sks["Tekst"].str.startswith("Kap.", na=False)
    chapter_text = real_sks.loc[is_chapter, "Tekst"]

    # n=1 keeps descriptions that themselves contain ": " in one piece.
    sks_kaps = chapter_text.str.split(": ", n=1, expand=True).rename(
        columns={0: "Kap", 1: "Desc"}
    )

    # Text before the first "[" stays the description (any trailing space is
    # kept as-is); text after it is the code range.
    sks_kaps[["Desc", "Range"]] = sks_kaps["Desc"].str.split("[", n=1, expand=True)

    # Splitting on "[" already consumed the opening bracket, so only the
    # closing "]" is left to remove. Slicing with [1:-1] here would also cut
    # the first letter of the lower bound.
    sks_kaps["Range"] = sks_kaps["Range"].str.rstrip("]")

    sks_kaps[["Range_Min", "Range_Max"]] = sks_kaps["Range"].str.split("-", n=1, expand=True)

    sks_kaps.set_index("Kap").to_csv(fname_out)


def get_sks_mapping(fname_in="public_data/sks.csv", fname_out="public_data/sks_mapping.csv"):
    """Attach to every SKS code the chapter heading it is listed under.

    The input is read as a ``;``-separated file whose header is on the third
    line and which has the columns ``Kode`` and ``Tekst``. Rows whose text
    starts with ``"Kap."`` act as chapter headings for all rows that follow,
    up to the next heading. Only rows whose ``Kode`` is a string longer than
    two characters receive a chapter; every other row is dropped.

    The output, indexed by ``Kode``, gains the columns ``Kapitel`` (full
    heading text), ``Kap_n`` (the part of the heading between the first "."
    and ": ", stripped) and ``Kap_desc`` (the part after ": ").

    Parameters
    ----------
    fname_in : str
        Path of the SKS CSV to read.
    fname_out : str
        Path of the mapping CSV to write.

    Returns
    -------
    None
        The result is written to ``fname_out``.
    """
    real_sks = pd.read_csv(fname_in, sep=";", header=2)

    # Explicit isinstance checks keep missing values (NaN) and any non-string
    # cells out of both masks without relying on exceptions.
    is_chapter = (
        real_sks["Tekst"]
        .map(lambda text: isinstance(text, str) and text.startswith("Kap."))
        .astype(bool)
    )
    has_code = (
        real_sks["Kode"]
        .map(lambda code: isinstance(code, str) and len(code) > 2)
        .astype(bool)
    )

    # Carry each chapter heading forward over the rows listed beneath it.
    # Rows above the first heading stay NaN and are dropped further down.
    current_chapter = real_sks["Tekst"].where(is_chapter).ffill()

    # Short codes and rows without a code get no chapter; this covers every
    # short code, not just a single hard-coded one.
    real_sks["Kapitel"] = current_chapter.where(has_code)

    # n=1 keeps descriptions that themselves contain ": " in one piece.
    real_sks[["Kap_n", "Kap_desc"]] = real_sks["Kapitel"].str.split(": ", n=1, expand=True)

    # Drops every row with a missing value in any column, which removes the
    # heading rows and all rows that did not receive a chapter.
    real_sks = real_sks.dropna()

    # "Kap. XIX" -> "XIX"
    real_sks = real_sks.assign(
        Kap_n=real_sks["Kap_n"].str.split(".").str[1].str.strip()
    )

    real_sks.set_index("Kode").to_csv(fname_out)

In [3]:
# Load the code-to-chapter table (this path is the default output of get_sks_mapping),
# using the first CSV column as the index.
sks_mapping = pd.read_csv("public_data/sks_mapping.csv", index_col=0)

# Plain dict from index value to its "Kap_desc" entry, used for the lookups below.
sks_dict = sks_mapping["Kap_desc"].to_dict()

## Pandas

### Apply SKS chapter mapping

In [3]:
# Load the diagnoses table into pandas; read_parquet prints its shape.
di = read_parquet("diagnoses_newest")

(65632899, 6)


In [130]:
# Look up the chapter description for every diagnosis code.
# Series.map does the dict lookup without a Python-level lambda per row and
# yields NaN for codes that are missing or not in sks_dict.
codes = di["Aktionsdiagnose kode"]

di["SKS_group"] = codes.map(sks_dict).fillna(
    # Fallback for codes without an exact match: retry with the last character
    # removed. .str[:-1] passes missing codes through as NaN instead of raising.
    codes.str[:-1].map(sks_dict)
)

In [134]:
# Write the diagnoses table (with the SKS_group column added above) to a new parquet file in data_folder.
di.to_parquet(f"{data_folder}/diagnoses_newest_with_sks_group.parquet")

### Impute median in biomarker columns from the blood tests

_Not implemented with pandas; see the section with the same title under "Polars" below._

## Polars

### Apply SKS chapter mapping

In [4]:
# Lazy scan of the diagnoses file: this only builds a query, data is read when .collect() is called.
dipl = pl.scan_parquet(f"{data_folder}/diagnoses_newest.parquet")

In [9]:
# Chapter lookup with native polars expressions instead of a per-row Python
# callback. Codes without any match become null, which fits the String output
# column (a float NaN fallback does not).
code = pl.col("Aktionsdiagnose kode")

dipl.select(
    code,
    pl.coalesce(
        # 1) exact match on the full code
        code.replace_strict(sks_dict, default=None, return_dtype=pl.String),
        # 2) otherwise retry with the last character removed
        code.str.head(-1).replace_strict(sks_dict, default=None, return_dtype=pl.String),
    ).alias("SKS_group"),
).collect()

Aktionsdiagnose kode,SKS_group
str,str
"""DZ031DA""","""Faktorer af betydning for sund…"
"""DR339B""","""Symptomer og abnorme fund IKA …"
"""DI109""","""Sygdomme i kredsløbsorganer [D…"
"""DM199""","""Sygdomme i knogler, muskler og…"
"""DS828D""","""Læsioner, forgiftninger og vis…"
…,…
"""DG403F""","""Sygdomme i nervesystemet [DG00…"
"""DC509""","""Neoplasmer [DC00-DD48]"""
"""DJ189""","""Sygdomme i åndedrætsorganer [D…"
"""DC209""","""Neoplasmer [DC00-DD48]"""


In [13]:
# `blood` was not defined in any cell of this notebook, so on a fresh kernel
# this preview raised NameError. Define the lazy scan here before using it.
blood = pl.scan_parquet(f"{data_folder}/blood_tests_newest.parquet")

# Only the first rows are materialised.
blood.head().collect()

EnterpriseID,PT_ID,Blood_Test_Status,Blood_Test_Start,Blood_Test_End,Hæmoglobin;B,Leukocytter;B,Trombocytter;B,Kreatinin;P,Alanintransaminase [ALAT];P,Laktatdehydrogenase [LDH];P,Albumin;P,C-reaktivt protein [CRP];P,Laktat;P(aB),Troponin T;P,Laktat;P(vB)
str,str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 13:12:00,,,,,"""13""",,,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 12:22:00,"""7.9""","""6.2""","""364""",,,,,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 13:00:00,,,,,,,,"""<3""",,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 12:55:00,,,,"""58""",,"""161""","""46""",,,,
"""E999""","""Z1000""","""Endelig""",2019-05-13 13:33:00,2019-05-13 16:09:00,"""8.0""","""7.8""","""347""",,,,,,,,


### Impute median in biomarker columns from the blood tests

In [182]:
# Lazy scan of the blood-test file; the imputation below extends this query and it is executed on .collect().
blpl = pl.scan_parquet(f"{data_folder}/blood_tests_newest.parquet")

In [184]:
# Names of the last 11 columns of the schema. In the preview of this table those are the
# string-typed measurement columns that follow Blood_Test_End.
# NOTE(review): relies on column order -- confirm if the file layout changes.
columns = blpl.collect_schema().names()[-11:]

In [174]:
# For every measurement column, replace censored readings such as "<3" or
# ">100" by the median of the plainly numeric readings on that side of the
# bound, and store the result as a Float64 column named "<col>_imputed".
for col in columns:
    mapping = {}

    # Numeric view of the column: every value that does not parse as a float
    # (including the censored strings) becomes null.
    sure_col = blpl.select(
        pl.col(col).cast(pl.Float64, strict=False).alias(f"{col}_sure")
    )

    # Distinct values starting with ">" or "<".
    # Series.to_list() always returns a list. Going through
    # to_numpy().squeeze().tolist() collapses a single match to a bare string,
    # which the loop below would then walk character by character.
    old_vals = (
        blpl.select(col)
        .filter(pl.col(col).str.starts_with(">") | pl.col(col).str.starts_with("<"))
        .unique()
        .collect()
        .to_series()
        .to_list()
    )

    for elm in old_vals:
        # Operator ">" or "<"
        operator = elm[0]
        # The bounding number
        bound = float(elm[1:])

        if operator == "<":
            # Median of the numeric readings strictly below the bound
            sure_col_filtered = sure_col.filter(pl.col(f"{col}_sure") < bound)
        else:
            # Median of the numeric readings strictly above the bound
            sure_col_filtered = sure_col.filter(pl.col(f"{col}_sure") > bound)

        # None when no numeric reading lies on that side of the bound; the
        # censored value then ends up as null in the imputed column.
        mapping[elm] = sure_col_filtered.median().collect().item()

    blpl = blpl.with_columns(
        pl.col(col)
        # Replace the censored strings by their imputed medians
        .replace(mapping)
        # Cast to float now that the problematic values are handled;
        # anything still unparsable becomes null
        .cast(pl.Float64, strict=False)
        # Keep the original column and add the result next to it
        .alias(f"{col}_imputed")
    )

In [175]:
# Execute the lazy query built above and materialise the full table,
# i.e. the original columns plus the "<col>_imputed" columns.
blpl.collect()

EnterpriseID,PT_ID,Blood_Test_Status,Blood_Test_Start,Blood_Test_End,Hæmoglobin;B,Leukocytter;B,Trombocytter;B,Kreatinin;P,Alanintransaminase [ALAT];P,Laktatdehydrogenase [LDH];P,Albumin;P,C-reaktivt protein [CRP];P,Laktat;P(aB),Troponin T;P,Laktat;P(vB),Hæmoglobin;B_imputed,Leukocytter;B_imputed,Trombocytter;B_imputed,Kreatinin;P_imputed,Alanintransaminase [ALAT];P_imputed,Laktatdehydrogenase [LDH];P_imputed,Albumin;P_imputed,C-reaktivt protein [CRP];P_imputed,Laktat;P(aB)_imputed,Troponin T;P_imputed,Laktat;P(vB)_imputed
str,str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 13:12:00,,,,,"""13""",,,,,,,,,,,13.0,,,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 12:22:00,"""7.9""","""6.2""","""364""",,,,,,,,,7.9,6.2,364.0,,,,,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 13:00:00,,,,,,,,"""<3""",,,,,,,,,,,1.4,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 12:55:00,,,,"""58""",,"""161""","""46""",,,,,,,,58.0,,161.0,46.0,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-13 13:33:00,2019-05-13 16:09:00,"""8.0""","""7.8""","""347""",,,,,,,,,8.0,7.8,347.0,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""E999995""","""Z999996""","""Endelig""",2021-07-25 15:17:00,2021-07-25 15:19:00,,,,,,,,,,,"""0.9""",,,,,,,,,,,0.9
"""E999995""","""Z999996""","""Endelig""",2021-07-25 15:26:00,2021-07-25 15:43:00,"""8.2""","""7.2""","""224""",,,,,,,,,8.2,7.2,224.0,,,,,,,,
"""E999995""","""Z999996""","""Endelig""",2021-07-25 15:26:00,2021-07-25 16:08:00,,,,"""59""","""33""",,"""38""","""<1""",,,,,,,59.0,33.0,,38.0,0.7,,,
"""E999995""","""Z999996""","""Endelig""",2022-08-19 09:38:00,2022-08-19 14:28:00,"""9.1""","""6.3""","""213""",,,,,,,,,9.1,6.3,213.0,,,,,,,,
