In [7]:
import pandas as pd
from pathlib import Path

# --- Load data ---
# Absolute, machine-specific location of the source workbook; the notebook
# only runs where this exact path exists.
LABKA_PATH = Path(r"C:\Users\kfq6\Documents\Data\LABKA.xlsx")
# Read the workbook into a DataFrame using the openpyxl engine. The recorded
# run emits openpyxl's "Workbook contains no default style" warning here.
# NOTE(review): recorded outputs further down show "Analysenavn" values that
# contain sequences such as "_x0020_" — these look like undecoded OOXML
# character escapes; confirm whether they should be decoded after loading.
df = pd.read_excel(LABKA_PATH, engine="openpyxl")



  warn("Workbook contains no default style, apply openpyxl's default")


In [8]:
# --- Clean up column names ---
# Trim leading/trailing whitespace from every column header.
stripped_headers = df.columns.str.strip()
df.columns = stripped_headers

# --- Check total unique tests ---
# Number of distinct non-missing values found in "Analysenavn".
analysis_names = df["Analysenavn"]
unique_tests = analysis_names.nunique()
print(f"Antal unikke testtyper: {unique_tests}")



Antal unikke testtyper: 16


In [13]:
# --- Optional: number of recorded results per test type ---
# For each "Analysenavn", count the non-null "Svar" entries.
counts_per_test = df.groupby("Analysenavn").agg(Antal_test=("Svar", "count"))

# Most frequently recorded tests first; reset_index turns "Analysenavn"
# back into an ordinary column.
test_summary = counts_per_test.sort_values(by="Antal_test", ascending=False).reset_index()

print("\nTop 20 tests efter antal registrerede målinger:")
print(test_summary.head(20))



Top 20 tests efter antal registrerede målinger:
                                          Analysenavn  Antal_test
0                                         P-Kreatinin       49476
1   eGFR_x0020__x002F__x0020_1_x002C_73m_x00B2__x0...       49436
2                                            P-Kalium       44914
3                                           P-Natrium       44301
4                                        B-Hæmoglobin       44279
5                                           P-Albumin       26924
6   Hb_x0028_B_x0029_-Hæmoglobin_x0020_A1c_x0020__...       26671
7                                           P-Calcium       20008
8     P-Calcium_x0020__x0028_albuminkorrigeret_x0029_       20008
9                                        P-Kolesterol       15769
10                             P-Kolesterol_x0020_HDL       15721
11                                      P-Triglycerid       15373
12                             P-Kolesterol_x0020_LDL       15359
13                         

In [9]:
# --- Restrict the frame to the columns this notebook works with ---
# Every column not listed here is dropped; listed columns that are absent
# from the frame are skipped silently.
cols = [
    "DW_EK_Borger",
    "Dato_Proevetagningstid",
    "Klok_Proevetagningstid",
    "Alder_Proevetagningstid",
    "Analysenavn",
    "Svar",
    "Enhed",
]
present_cols = [name for name in cols if name in df.columns]
df = df.loc[:, present_cols].copy()

# --- Parse the sampling timestamp and derive the calendar day ---
# errors="coerce": values that cannot be parsed become NaT instead of raising.
sample_ts = pd.to_datetime(df["Dato_Proevetagningstid"], errors="coerce")
df["Dato_Proevetagningstid"] = sample_ts
# "Testdato" is the date part only; it serves as the per-day key further down.
df["Testdato"] = sample_ts.dt.date

# --- Build column header: "Analysenavn [Enhed]" (unit optional) ---
def make_col_name(row):
    """Return the wide-table column label for one lab-result row.

    Parameters
    ----------
    row : mapping-like
        Anything offering ``.get`` (e.g. the pandas Series passed by
        ``df.apply(..., axis=1)`` or a plain dict). The keys
        ``"Analysenavn"`` and ``"Enhed"`` are read.

    Returns
    -------
    str
        ``"<Analysenavn> [<Enhed>]"`` when a unit is present, otherwise
        just ``"<Analysenavn>"``.

    Notes
    -----
    A unit counts as absent when it is missing (None/NaN/NA), empty, or
    consists only of whitespace. Surrounding whitespace is stripped from the
    unit so that e.g. ``"mmol/L "`` and ``"mmol/L"`` map to the same column
    instead of producing two near-identical ones.
    The analysis name is passed through ``str`` unchanged, so a missing name
    (NaN) becomes the literal text ``"nan"``.
    """
    analy = str(row.get("Analysenavn", ""))
    unit = row.get("Enhed")
    # Normalise the unit before the emptiness check: a blank-but-not-empty
    # unit must be treated like a missing one, not rendered as "name [ ]".
    unit = "" if pd.isna(unit) else str(unit).strip()
    return f"{analy} [{unit}]" if unit else analy

# Label every row with its target wide-table column (row-wise via make_col_name).
df["AnalyseKolonne"] = df.apply(make_col_name, axis=1)

# --- One row per test per day: keep the last entry, dropping earlier duplicates ---
day_keys = ["DW_EK_Borger", "Testdato"]
test_keys = day_keys + ["AnalyseKolonne"]

# Sort by timestamp inside each (DW_EK_Borger, Testdato, AnalyseKolonne)
# group; tail(1) then keeps the final row of every group.
# NOTE(review): if "Dato_Proevetagningstid" carries no time of day, it cannot
# order samples taken on the same day and which row counts as "last" is left
# to the sort's tie handling — confirm whether "Klok_Proevetagningstid"
# should take part in the ordering.
df = df.sort_values(by=test_keys + ["Dato_Proevetagningstid"])
df_last = df.groupby(test_keys, as_index=False).tail(1)

# --- Long -> wide: one row per (DW_EK_Borger, Testdato), one column per
# --- "Analysenavn [Enhed]" label, cells hold the "Svar" values as they are ---
wide = df_last.pivot(index=day_keys, columns="AnalyseKolonne", values="Svar")

# --- Re-attach per-day metadata to the pivoted table ---
wanted_meta = ["Alder_Proevetagningstid", "Dato_Proevetagningstid", "Klok_Proevetagningstid"]
meta_cols = [name for name in wanted_meta if name in df.columns]

# One metadata row per (DW_EK_Borger, Testdato). GroupBy.first() picks the
# first non-null value of each column separately, after ordering by timestamp.
# NOTE(review): the result values come from the *last* row per test and day,
# while this metadata comes from the *first* rows of the day — confirm that
# this mix is intended.
by_timestamp = df.sort_values(["DW_EK_Borger", "Testdato", "Dato_Proevetagningstid"])
meta = by_timestamp.groupby(["DW_EK_Borger", "Testdato"], as_index=False)[meta_cols].first()

# Left join: every metadata row is kept and the matching test columns are
# attached; reset_index() exposes the pivot's index levels as join columns.
wide_flat = wide.reset_index()
wide = meta.merge(wide_flat, on=["DW_EK_Borger", "Testdato"], how="left")

# --- Normalise the column headers only; cell contents are left alone ---
def _safe_header(label):
    """Return ``label`` as text with newlines, slashes and spaces replaced.

    Steps, in order: newline -> space, "/" -> "_", one pass of
    double-space -> single space, trim both ends, remaining spaces -> "_".
    The double-space replacement runs once, so longer runs of spaces are
    shortened but not fully collapsed.
    """
    text = str(label)
    for old, new in (("\n", " "), ("/", "_"), ("  ", " ")):
        text = text.replace(old, new)
    return text.strip().replace(" ", "_")


safe_cols = [_safe_header(label) for label in wide.columns]
wide.columns = safe_cols

In [10]:
# --- Remove test columns that carry no unit; metadata columns always stay ---
meta_cols = ["DW_EK_Borger", "Testdato", "Alder_Proevetagningstid", "Dato_Proevetagningstid", "Klok_Proevetagningstid"]

# A header counts as having a unit when it contains "_[" — the form that
# "name [unit]" labels take once spaces have been turned into underscores.
unitless_cols = []
for header in wide.columns:
    if "_[" in header or header in meta_cols:
        continue
    unitless_cols.append(header)

print(f"Dropping {len(unitless_cols)} unitless columns:")
print(unitless_cols)

# Results recorded without a unit are discarded together with their columns.
wide = wide.drop(columns=unitless_cols)


Dropping 16 unitless columns:
['B-Hæmoglobin', 'Hb_x0028_B_x0029_-Hæmoglobin_x0020_A1c_x0020__x0028_IFCC_x0029_', 'P-25-Hydroxy-Vitamin_x0020_D_x0028_D3_x002B_D2_x0029_', 'P-Albumin', 'P-Calcium', 'P-Calcium_x0020__x0028_albuminkorrigeret_x0029_', 'P-Kalium', 'P-Kolesterol', 'P-Kolesterol_x0020_HDL', 'P-Kolesterol_x0020_LDL', 'P-Kreatinin', 'P-Natrium', 'P-Triglycerid', 'P-Vitamin_x0020_B12', 'U-Albumin_x0020__x002F__x0020_Kreatinin-ratio', 'eGFR_x0020__x002F__x0020_1_x002C_73m_x00B2__x0028_CKD-EPI_x0029_']


In [11]:
# --- Save ---
# Absolute, machine-specific output location; an existing file at this path
# is overwritten.
out_path = Path(r"C:\Users\kfq6\Documents\Data\LABKA_wide_rawSvar.xlsx")
# index=False: the DataFrame's row index is not written as a column.
wide.to_excel(out_path, index=False)

# Report where the file went and the table's (rows, columns) shape.
print("Wide table saved to:", out_path)
print("Shape:", wide.shape)

Wide table saved to: C:\Users\kfq6\Documents\Data\LABKA_wide_rawSvar.xlsx
Shape: (57480, 22)
