In [8]:
import re
from pathlib import Path
import pandas as pd

# -----------------------------
# Config
# -----------------------------
LABKA_PATH = Path(r"C:\Users\kfq6\Documents\Data\LABKA.xlsx")
OUT_XLSX   = Path(r"C:\Users\kfq6\Documents\Data\LABKA_wide_numeric.xlsx")
OUT_PARQ   = Path(r"C:\Users\kfq6\Documents\Data\LABKA_wide_numeric.parquet")

ID_COL   = "DW_EK_Borger"
DATE_SRC = "Dato_Proevetagningstid"   # source datetime col
DATE_COL = "Testdato"                 # derived date col

In [11]:
# -----------------------------
# Helpers
# -----------------------------
def decode_excel_xml(s):
    if not isinstance(s, str):
        return s
    return re.sub(r"_x([0-9A-Fa-f]{4})_", lambda m: chr(int(m.group(1), 16)), s)

def make_col_name(analy, unit):
    analy = "" if pd.isna(analy) else str(analy).strip()
    unit  = "" if pd.isna(unit)  else str(unit).strip()
    return f"{analy} [{unit}]" if unit else analy

def coerce_numeric_series(s: pd.Series):
    x = s.astype(str).str.strip()
    # Decode Excel escapes
    x = x.str.replace(r"_x([0-9A-Fa-f]{4})_", lambda m: chr(int(m.group(1),16)), regex=True)
    # Censor flags
    censor = pd.Series(0, index=x.index, dtype="int8")
    censor = censor.mask(x.str.contains(r"^\s*[<≤]\s*"), -1)
    censor = censor.mask(x.str.contains(r"^\s*[>≥]\s*"), +1)
    # Strip qualifiers and parentheses negatives
    x = x.str.replace(r"^\s*[<≤>≥=]\s*", "", regex=True)
    x = x.str.replace(r"^\((.*)\)$", r"-\1", regex=True)
    # Map Pos/Neg
    lower = x.str.lower()
    x = x.mask(lower.isin({"pos","positiv","positive"}), "1")
    x = x.mask(lower.isin({"neg","negativ","negative"}), "0")
    # Remove thousand dots only when followed by 3 digits
    x = x.str.replace(r"\.(?=\d{3}(\D|$))", "", regex=True)
    # BUG FIX: must use .str.replace here, not Series.replace
    x = x.str.replace(",", ".", regex=False)
    num = pd.to_numeric(x, errors="coerce")
    return num, censor

def sanitize_header(c: str) -> str:
    c = str(c).replace("\n"," ").replace("/", "_").replace("  "," ").strip()
    return c.replace(" ", "_")

In [12]:
# -----------------------------
# Load once, as strings; decode
# -----------------------------
df = pd.read_excel(LABKA_PATH, engine="openpyxl", dtype=str)
df.columns = [decode_excel_xml(c).strip() for c in df.columns]
df = df.applymap(decode_excel_xml)

# Required cols
cols = [
    "DW_EK_Borger","Dato_Proevetagningstid","Klok_Proevetagningstid",
    "Alder_Proevetagningstid","Analysenavn","Svar","Enhed",
]
missing = [c for c in cols if c not in df.columns]
if missing:
    raise KeyError(f"Mangler forventede kolonner: {missing}")
df = df[cols].copy()

# Parse datetime (Danish), derive date
df[DATE_SRC] = pd.to_datetime(df[DATE_SRC], errors="coerce", dayfirst=True)
df[DATE_COL] = df[DATE_SRC].dt.date

# Coerce numeric ONCE on long data
df["Svar_num"], df["Svar_censor"] = coerce_numeric_series(df["Svar"])

# Build key; try to backfill missing unit with most common unit per analyte
df["Analysenavn_clean"] = df["Analysenavn"].astype(str).str.strip()
unit_mode = (df.dropna(subset=["Enhed"])
               .groupby("Analysenavn_clean")["Enhed"]
               .agg(lambda s: s.mode().iloc[0] if not s.mode().empty else None))
df["Enhed_filled"] = df.apply(
    lambda r: r["Enhed"] if pd.notna(r["Enhed"]) and str(r["Enhed"]).strip() != "" 
              else unit_mode.get(r["Analysenavn_clean"], None),
    axis=1
)
df["AnalyseKolonne"] = [make_col_name(a, u) for a, u in zip(df["Analysenavn_clean"], df["Enhed_filled"])]

# Deduplicate: last result per patient-day-test
df_last = (
    df.sort_values([ID_COL, DATE_COL, "AnalyseKolonne", DATE_SRC])
      .groupby([ID_COL, DATE_COL, "AnalyseKolonne"], as_index=False)
      .agg(
          Svar_num=("Svar_num","mean"),   # mean if multiple within day after dedup safety
          n_meas=("Svar_num","count"),
          Alder_Proevetagningstid=("Alder_Proevetagningstid","first"),
          Dato_Proevetagningstid=(DATE_SRC,"last"),
          Klok_Proevetagningstid=("Klok_Proevetagningstid","last"),
      )
)

# Pivot numeric wide
wide = df_last.pivot(
    index=[ID_COL, DATE_COL, "Alder_Proevetagningstid","Dato_Proevetagningstid","Klok_Proevetagningstid"],
    columns="AnalyseKolonne",
    values="Svar_num"
).reset_index()

# Sanitize headers
wide.columns = [sanitize_header(c) for c in wide.columns]


In [6]:
# -----------------------------
# Collapse duplicate pairs: no-unit vs unit
# Keep unit version if both exist; drop empty duplicates
# -----------------------------
meta_keep = {
    ID_COL, DATE_COL, "Alder_Proevetagningstid","Dato_Proevetagningstid","Klok_Proevetagningstid"
}

cols_set = set(wide.columns)
unit_cols = [c for c in wide.columns if "_[" in c and c not in meta_keep]
drop_cols = []

# Map unitless -> unit candidates
# e.g. "P-Kalium" vs "P-Kalium_[mmol_l]"
base_names = {}
for c in unit_cols:
    base = c.split("_[", 1)[0]  # strip unit suffix
    base_names.setdefault(base, []).append(c)

for base, with_units in base_names.items():
    if base in cols_set:
        # If unitless exists and any unit column exists, prefer unit column(s)
        # If unitless is fully empty or duplicates existing non-null values, drop it
        u = base
        # consider it empty if all NaN
        if wide[u].isna().all():
            drop_cols.append(u)
        else:
            # if exactly one unit column, and unitless has no extra info, collapse: fillna unitless into unit col, drop unitless
            if len(with_units) == 1:
                wu = with_units[0]
                # if unit col has many nulls but unitless has values, move them across
                wide[wu] = wide[wu].fillna(wide[u])
                drop_cols.append(u)
            else:
                # multiple units: keep them; drop unitless if it adds nothing
                if wide[u].isna().all():
                    drop_cols.append(u)

# Drop fully empty analyte columns (all NaN)
empty_cols = [c for c in wide.columns if c not in meta_keep and wide[c].isna().all()]
drop_cols = sorted(set(drop_cols + empty_cols))
if drop_cols:
    wide = wide.drop(columns=drop_cols)

Numeric lab columns: 17
Sample lab columns: ['B-Hæmoglobin_[mmol_l]', 'Hb(B)-Hæmoglobin_A1c_(IFCC)_[mmol_mol]', 'P-25-Hydroxy-Vitamin_D(D3+D2)_[nmol_l]', 'P-Albumin_[g_l]', 'P-Calcium_(albuminkorrigeret)_[mmol_l]', 'P-Calcium_[mmol_l]', 'P-Kalium_[mmol_l]', 'P-Kolesterol_HDL_[mmol_l]', 'P-Kolesterol_LDL_[mmol_l]', 'P-Kolesterol_[mmol_l]']


In [7]:
# -----------------------------
# Save result
# -----------------------------
num_df.to_excel(OUT_PATH, index=False)
print("Wide table saved to:", OUT_PATH)
print("Shape:", num_df.shape)

Wide table saved to: C:\Users\kfq6\Documents\Data\LABKA_wide_rawSvar.xlsx
Shape: (57480, 22)
