In [2]:
# Shell listing of the source data folder, to see which parquet files exist.
!ls ~/data/Diagnoses_ITA_Procedures/

bitabel_newest.parquet	    intensive_care.parquet
blood_tests_newest.parquet  procedures_newest.parquet
diagnoses_newest.parquet    SKS_Codes_Cookbook.parquet


In [135]:
import math
import os

import numpy as np
import pandas as pd
import polars as pl

In [136]:
# Root folder of the source parquet files. It is resolved from the current
# user's home directory (the same `~/data/...` location the `!ls` cell lists)
# instead of a hard-coded, user-specific absolute path. Kept as a plain string
# because it is interpolated into f-string paths.
data_folder = os.path.expanduser("~/data/Diagnoses_ITA_Procedures")

def read_parquet(fname):
    """Load ``<data_folder>/<fname>.parquet`` into a pandas DataFrame.

    Args:
        fname: File name inside ``data_folder``, without the ``.parquet``
            extension.

    Returns:
        The loaded DataFrame. Its ``shape`` is printed as a side effect.
    """
    parquet_path = f"{data_folder}/{fname}.parquet"
    frame = pd.read_parquet(parquet_path)

    # Quick size feedback for the notebook reader.
    print(frame.shape)

    return frame


def get_sks_kapitler(fname_in="public_data/sks.csv", fname_out="public_data/sks_kapitler.csv"):
    """Extract the chapter ("Kap.") rows of the SKS export into a CSV table.

    The input is read as a ';'-separated file whose header is on the third
    line. Every row whose ``Tekst`` starts with ``"Kap."`` is split into:

    * ``Kap``       - text before the first ``": "``
    * ``Desc``      - text between that separator and the last ``"["``
                      (not whitespace-stripped)
    * ``Range``     - text after the last ``"["`` without the closing ``"]"``
    * ``Range_Min`` / ``Range_Max`` - ``Range`` split on its first ``"-"``

    Chapter rows are assumed to look like ``"Kap. I: description [DA00-DB99]"``
    - verify against the actual export.

    Args:
        fname_in: Path of the SKS export to read.
        fname_out: Path of the CSV to write; it is indexed by ``Kap``.

    Returns:
        None. The result is written to ``fname_out``.
    """
    real_sks = pd.read_csv(fname_in, sep=";", header=2)

    # na=False treats rows without any text as "not a chapter" instead of
    # producing a NaN mask that cannot be used for filtering.
    is_chapter = real_sks["Tekst"].str.startswith("Kap.", na=False)
    chapter_text = real_sks.loc[is_chapter, "Tekst"]

    # n=1 keeps a description that itself contains ": " in one piece.
    sks_kaps = chapter_text.str.split(": ", n=1, expand=True).rename(
        columns={0: "Kap", 1: "Desc"}
    )

    # Split on the last "[" so only the trailing "[min-max]" part is taken.
    sks_kaps[["Desc", "Range"]] = sks_kaps["Desc"].str.rsplit("[", n=1, expand=True)

    # The piece after "[" no longer contains the opening bracket, so only the
    # closing "]" has to go; slicing [1:-1] here would also cut off the first
    # character of the range.
    sks_kaps["Range"] = sks_kaps["Range"].str.strip().str.rstrip("]")

    sks_kaps[["Range_Min", "Range_Max"]] = sks_kaps["Range"].str.split(
        "-", n=1, expand=True
    )

    sks_kaps.set_index("Kap").to_csv(fname_out)


def get_sks_mapping(fname_in="public_data/sks.csv", fname_out="public_data/sks_mapping.csv"):
    """Map every SKS code to the chapter heading it is listed under.

    The input is read as a ';'-separated file whose header is on the third
    line. Rows are walked in file order: a row whose ``Tekst`` starts with
    ``"Kap."`` becomes the current chapter, and every later row with a string
    ``Kode`` longer than two characters is assigned to it. If a code occurs
    more than once, its last occurrence wins.

    The written table is indexed by ``Kode`` and adds three columns:

    * ``Kapitel``  - the full chapter heading text
    * ``Kap_n``    - the part of the heading between ``"Kap."`` and the first
                     ``": "``, whitespace-stripped
    * ``Kap_desc`` - the heading text after the first ``": "``

    Rows with a missing value in any column are dropped. That removes the
    heading rows themselves, codes of one or two characters, and codes that
    appear before the first chapter heading.

    Args:
        fname_in: Path of the SKS export to read.
        fname_out: Path of the CSV to write.

    Returns:
        None. The result is written to ``fname_out``.
    """
    real_sks = pd.read_csv(fname_in, sep=";", header=2)

    kap = ""
    data = {}
    for kode, tekst in zip(real_sks["Kode"], real_sks["Tekst"]):
        # Missing cells come back as float NaN, so test for str explicitly
        # instead of relying on math.isnan raising TypeError for strings.
        if isinstance(tekst, str) and tekst.startswith("Kap."):
            kap = tekst
        if isinstance(kode, str) and len(kode) > 2:
            data[kode] = kap

    # Series.map yields NaN for every code that has no entry (missing codes and
    # codes of one or two characters), so no sentinel keys are needed and an
    # unexpected short code can no longer raise KeyError.
    real_sks["Kapitel"] = real_sks["Kode"].map(data)

    # n=1 keeps a chapter description that itself contains ": " in one piece.
    real_sks[["Kap_n", "Kap_desc"]] = real_sks["Kapitel"].str.split(
        ": ", n=1, expand=True
    )

    # "Kap. I" -> "I": take what follows the first "." and strip the blank.
    mapped = real_sks.dropna().assign(
        Kap_n=lambda frame: frame["Kap_n"].str.split(".").str[1].str.strip()
    )

    mapped.set_index("Kode").to_csv(fname_out)

In [137]:
# Load the diagnoses table into pandas; read_parquet also prints its shape.
di = read_parquet("diagnoses_newest")

(65632899, 6)


In [8]:
# Preview the first rows.
# NOTE(review): the rendered output shows per-row identifier columns
# (EnterpriseID, PT_ID, CSN) - presumably patient-level data; consider clearing
# this output before the notebook is shared.
di.head()

Unnamed: 0,EnterpriseID,PT_ID,CSN,Department ID,Aktionsdiagnose kode,Aktionsdiagnose
0,E2910733,Z2910734,62770176,1481,DZ031DA,Obs. pga mistanke om kræft i tyktarmen
1,E1405190,Z1405191,65657549,1002394,DR339B,Akut urinretention
2,E2092618,Z2092619,57352128,1000333,DI109,Essentiel hypertension
3,E1021114,Z1021115,77401770,426,DM199,Artrose UNS
4,E1875864,Z1875865,65922690,1002226,DS828D,Ankelfraktur UNS


## Apply SKS chapter mapping in Pandas

In [124]:
# Load the code -> chapter table; the first CSV column becomes the index.
# NOTE(review): this is the default output path of get_sks_mapping(), but no
# visible cell calls that function - confirm the CSV exists when running from a
# fresh kernel.
sks_mapping = pd.read_csv("public_data/sks_mapping.csv", index_col=0)

# Plain dict {index label: Kap_n} used for the per-code lookups.
sks_dict = sks_mapping["Kap_n"].to_dict()

In [130]:
# Look up the SKS chapter of every diagnosis code: try the full code first and,
# where that is not a key of `sks_dict`, retry with the last character removed.
# Series.map performs the dictionary lookup in bulk and yields NaN for unknown
# codes, replacing a Python lambda call per row; a null code now simply stays
# NaN instead of raising on `x[:-1]`.
diagnosis_codes = di["Aktionsdiagnose kode"]
di["SKS_group"] = diagnosis_codes.map(sks_dict).fillna(
    diagnosis_codes.str[:-1].map(sks_dict)
)

In [134]:
# Write the diagnoses table, including the SKS_group column, into the data folder.
# NOTE(review): the Polars cell further down sinks to this same path.
di.to_parquet(f"{data_folder}/diagnoses_newest_with_sks_group.parquet")

## Same thing in Polars

In [142]:
# Open the same diagnoses parquet file as a Polars lazy query (scan_parquet).
dipl = pl.scan_parquet(f"{data_folder}/diagnoses_newest.parquet")

In [171]:
# Polars version of the chapter lookup: add `SKS_group` and stream the result
# to parquet. `with_columns` keeps every input column; a `select` would write
# only the two listed columns to the path the pandas cell also writes to.
# Codes found neither in full nor with the last character removed yield None,
# so the String column gets a null rather than a float NaN.
dipl.with_columns(
    pl.col("Aktionsdiagnose kode")
    .map_elements(
        lambda x: sks_dict.get(x, sks_dict.get(x[:-1])),
        return_dtype=pl.String,
    )
    .alias("SKS_group"),
).sink_parquet(f"{data_folder}/diagnoses_newest_with_sks_group.parquet")