In [2]:
# List the files available in the data folder.
!ls ~/data/Diagnoses_ITA_Procedures/

bitabel_newest.parquet	    intensive_care.parquet
blood_tests_newest.parquet  procedures_newest.parquet
diagnoses_newest.parquet    SKS_Codes_Cookbook.parquet


In [1]:
import math
import os

import numpy as np
import pandas as pd
import polars as pl

In [3]:
# Resolve the data folder from the current user's home directory (the same
# location the `!ls ~/data/...` cell lists) instead of hard-coding one user's
# absolute path.
data_folder = os.path.expanduser("~/data/Diagnoses_ITA_Procedures")


def read_parquet(fname: str) -> pd.DataFrame:
    """Load ``<data_folder>/<fname>.parquet`` into a pandas DataFrame.

    Parameters
    ----------
    fname : str
        File name inside ``data_folder`` without the ``.parquet`` extension.

    Returns
    -------
    pandas.DataFrame
        The loaded table. Its shape is printed as a side effect.
    """
    df = pd.read_parquet(f"{data_folder}/{fname}.parquet")

    # Quick sanity check of what was loaded.
    print(df.shape)

    return df


def get_sks_kapitler(fname_in="public_data/sks.csv", fname_out="public_data/sks_kapitler.csv"):
    """Extract the SKS chapter headings into a lookup table and save it as CSV.

    Chapter headings are the rows whose ``Tekst`` starts with ``"Kap."``, e.g.
    ``"Kap. II: Neoplasmer [DC00-DD48]"``. Each heading is split into

    * ``Kap``       - the part before the first ``": "`` (``"Kap. II"``), used as index
    * ``Desc``      - the description before the bracket (``"Neoplasmer"``)
    * ``Range``     - the text inside the brackets (``"DC00-DD48"``)
    * ``Range_Min`` / ``Range_Max`` - the two sides of the first ``"-"`` in ``Range``

    Parameters
    ----------
    fname_in : str
        Semicolon-separated SKS file whose header is on the third line and
        which has a ``Tekst`` column.
    fname_out : str
        Path of the CSV file to write.

    Returns
    -------
    None
        The table is written to ``fname_out``.
    """
    real_sks = pd.read_csv(fname_in, sep=";", header=2)

    # na=False: rows without any text are simply not chapter headings.
    is_chapter = real_sks["Tekst"].str.startswith("Kap.", na=False)
    chapter_text = real_sks.loc[is_chapter, "Tekst"]

    def split_once(values, sep):
        # Split on the first `sep` only and always return columns 0 and 1, so a
        # second separator inside the text (or no separator at all) cannot change
        # the number of columns. Missing parts are NaN.
        parts = values.astype(object).str.split(sep, n=1, expand=True)
        return parts.reindex(columns=[0, 1]).astype(object)

    kap_and_rest = split_once(chapter_text, ": ")
    desc_and_range = split_once(kap_and_rest[1], "[")

    # The split already consumed the opening "[", so only the closing "]" has to
    # go. Slicing with [1:-1] here would also cut off the first character of the
    # range ("DC00-DD48]" -> "C00-DD48").
    code_range = desc_and_range[1].str.rstrip("] ")
    bounds = split_once(code_range, "-")

    sks_kaps = pd.DataFrame(
        {
            "Kap": kap_and_rest[0],
            # Drop the blank that separated the description from the bracket.
            "Desc": desc_and_range[0].str.rstrip(),
            "Range": code_range,
            "Range_Min": bounds[0],
            "Range_Max": bounds[1],
        }
    )

    sks_kaps.set_index("Kap").to_csv(fname_out)


def get_sks_mapping(fname_in="public_data/sks.csv", fname_out="public_data/sks_mapping.csv"):
    """Attach to every SKS code the chapter it is listed under and save the table.

    The SKS file is read top to bottom. A row whose ``Tekst`` starts with
    ``"Kap."`` opens a chapter that applies to all following rows until the
    next chapter heading. Every string code longer than two characters is
    assigned the chapter it appears under; if a code occurs more than once,
    the chapter of its last occurrence is used for all of its rows.

    The written CSV is indexed by ``Kode`` and gains three columns:

    * ``Kapitel``  - the full heading, e.g. ``"Kap. II: Neoplasmer [DC00-DD48]"``
    * ``Kap_n``    - the chapter number, e.g. ``"II"``
    * ``Kap_desc`` - everything after the first ``": "``

    Rows with a missing value in any column (chapter headings themselves,
    codes without a chapter, short group codes) are dropped.

    Parameters
    ----------
    fname_in : str
        Semicolon-separated SKS file whose header is on the third line and
        which has ``Kode`` and ``Tekst`` columns.
    fname_out : str
        Path of the CSV file to write.

    Returns
    -------
    None
        The table is written to ``fname_out``.
    """
    real_sks = pd.read_csv(fname_in, sep=";", header=2)

    # Carry each chapter heading down to the rows below it. na=False treats
    # rows without text as ordinary (non-heading) rows.
    is_chapter = real_sks["Tekst"].str.startswith("Kap.", na=False)
    chapter = real_sks["Tekst"].where(is_chapter).ffill()

    # Only real codes (strings longer than two characters) get a chapter.
    has_code = (
        real_sks["Kode"]
        .map(lambda kode: isinstance(kode, str) and len(kode) > 2)
        .astype(bool)
    )

    # dict(zip(...)) keeps the last chapter seen for a repeated code.
    code_to_chapter = dict(zip(real_sks.loc[has_code, "Kode"], chapter[has_code]))

    # Series.map leaves codes without an entry (missing or short codes) as NaN
    # instead of raising KeyError; those rows are removed by dropna() below.
    real_sks["Kapitel"] = real_sks["Kode"].map(code_to_chapter).astype(object)

    # Split on the first ": " only, so a description containing ": " stays whole.
    kap_parts = (
        real_sks["Kapitel"]
        .str.split(": ", n=1, expand=True)
        .reindex(columns=[0, 1])
        .astype(object)
    )

    # "Kap. II" -> "II"
    real_sks["Kap_n"] = kap_parts[0].str.split(".").str[1].str.strip()
    real_sks["Kap_desc"] = kap_parts[1]

    real_sks.dropna().set_index("Kode").to_csv(fname_out)

In [3]:
# Load the code -> chapter table. No earlier cell generates the file, so build
# it on demand when it is missing; this keeps the notebook runnable from a
# fresh kernel.
try:
    sks_mapping = pd.read_csv("public_data/sks_mapping.csv", index_col=0)
except FileNotFoundError:
    get_sks_mapping()
    sks_mapping = pd.read_csv("public_data/sks_mapping.csv", index_col=0)

# Plain dict {Kode: chapter description} used for the lookups below.
sks_dict = sks_mapping["Kap_desc"].to_dict()

## Pandas

### Apply SKS chapter mapping

In [3]:
# Load the full diagnoses table into memory with pandas; read_parquet prints its shape.
di = read_parquet("diagnoses_newest")

(65632899, 6)


In [130]:
# Look up the SKS chapter of every action diagnosis: try the full code first,
# then the code without its last character; codes found neither way get NaN.
# Series.map does the dictionary lookup without a Python call per row, and it
# passes missing codes through as NaN instead of failing on the `[:-1]` slice.
# NOTE: this adds the column to `di` in place.
diagnosis_codes = di["Aktionsdiagnose kode"]

di["SKS_group"] = diagnosis_codes.map(sks_dict).fillna(
    diagnosis_codes.str[:-1].map(sks_dict)
)

In [134]:
# Write the diagnoses table to a new parquet file in the data folder.
di.to_parquet(f"{data_folder}/diagnoses_newest_with_sks_group.parquet")

## Polars

### Apply SKS chapter mapping

In [4]:
# Lazily scan the diagnoses file with polars; data is only read on .collect().
dipl = pl.scan_parquet(f"{data_folder}/diagnoses_newest.parquet")

In [9]:
# Chapter lookup per diagnosis code: full code first, then the code without its
# last character. The result is only displayed, not stored.
dipl.select(
    "Aktionsdiagnose kode",
    pl.col("Aktionsdiagnose kode")
    .map_elements(
        # Unknown codes yield None (a polars null); a float NaN would not match
        # the declared String return dtype.
        lambda x: sks_dict.get(x, sks_dict.get(x[:-1])),
        return_dtype=pl.String,
    )
    .alias("SKS_group"),
).collect()

Aktionsdiagnose kode,SKS_group
str,str
"""DZ031DA""","""Faktorer af betydning for sund…"
"""DR339B""","""Symptomer og abnorme fund IKA …"
"""DI109""","""Sygdomme i kredsløbsorganer [D…"
"""DM199""","""Sygdomme i knogler, muskler og…"
"""DS828D""","""Læsioner, forgiftninger og vis…"
…,…
"""DG403F""","""Sygdomme i nervesystemet [DG00…"
"""DC509""","""Neoplasmer [DC00-DD48]"""
"""DJ189""","""Sygdomme i åndedrætsorganer [D…"
"""DC209""","""Neoplasmer [DC00-DD48]"""


In [13]:
# `blood` is not defined by any earlier cell, so scan the blood-test file here
# to make this preview runnable on a fresh kernel.
# NOTE(review): assumes `blood` was a lazy scan of blood_tests_newest.parquet
# (the displayed columns are blood-test columns) - confirm.
blood = pl.scan_parquet(f"{data_folder}/blood_tests_newest.parquet")

blood.head().collect()

EnterpriseID,PT_ID,Blood_Test_Status,Blood_Test_Start,Blood_Test_End,Hæmoglobin;B,Leukocytter;B,Trombocytter;B,Kreatinin;P,Alanintransaminase [ALAT];P,Laktatdehydrogenase [LDH];P,Albumin;P,C-reaktivt protein [CRP];P,Laktat;P(aB),Troponin T;P,Laktat;P(vB)
str,str,str,datetime[μs],datetime[μs],str,str,str,str,str,str,str,str,str,str,str
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 13:12:00,,,,,"""13""",,,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 12:22:00,"""7.9""","""6.2""","""364""",,,,,,,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 13:00:00,,,,,,,,"""<3""",,,
"""E999""","""Z1000""","""Endelig""",2019-05-11 12:06:00,2019-05-11 12:55:00,,,,"""58""",,"""161""","""46""",,,,
"""E999""","""Z1000""","""Endelig""",2019-05-13 13:33:00,2019-05-13 16:09:00,"""8.0""","""7.8""","""347""",,,,,,,,


### Impute median in biomarker columns from the blood tests

In [None]:
# Lazily scan the blood-test file; the imputation loop below extends this LazyFrame.
blpl = pl.scan_parquet(f"{data_folder}/blood_tests_newest.parquet")

In [None]:
# For every biomarker column, add "<col>_imputed": the numeric value of each
# reading, where censored readings such as "<3" or ">100" are replaced by the
# median of the directly measured values below / above that bound.
# NOTE(review): `columns` is not defined in the visible part of the notebook -
# presumably the list of biomarker column names; define it before this cell.
for col in columns:
    # Readings that parse directly as numbers; censored strings do not parse
    # and become null (strict=False).
    sure = pl.col(col).cast(pl.Float64, strict=False)

    # Distinct censored readings of this column, always as a list of strings.
    # (to_numpy().squeeze().tolist() collapses a single reading like "<3" into
    # a bare string, which the loop below would iterate character by character.)
    old_vals = (
        blpl.select(col)
        .filter(pl.col(col).str.starts_with(">") | pl.col(col).str.starts_with("<"))
        .unique()
        .collect()
        .to_series()
        .to_list()
    )

    new_vals = []

    for elm in old_vals:
        operator = elm[0]
        bound = float(elm[1:])

        # "<b" -> median of the sure values below b, ">b" -> median of those above b.
        beyond_bound = sure < bound if operator == "<" else sure > bound

        # The median is None when no sure value lies beyond the bound.
        new_vals.append(
            blpl.filter(beyond_bound).select(sure.median()).collect().item()
        )

    imputed = pl.col(col)

    # Only replace when there is something to replace; empty replacement lists
    # would otherwise produce null-typed Series.
    if old_vals:
        imputed = imputed.replace(
            pl.Series(values=old_vals, dtype=pl.String),
            pl.Series(values=new_vals, dtype=pl.Float64),
        )

    blpl = blpl.with_columns(
        imputed.cast(pl.Float64, strict=False).alias(f"{col}_imputed")
    )