In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression

# ---------------- CONFIG ----------------
# Single folder that holds every input/output workbook. Repoint the notebook
# at another machine or data drop by editing this one line only.
DATA_DIR = Path(r"C:\Users\kfq6\Documents\Data")

# Inputs
SAMMEDAG_4M2M_PATH = DATA_DIR / "Sammedag_master_means_4m2m_3to4.xlsx"
LABKA_PATH         = DATA_DIR / "LABKA_wide_numeric.xlsx"

# Output workbook written by the export cell
OUT_PATH = DATA_DIR / "Sammedag_master_LDL_Features.xlsx"

# Patient identifier column shared by the Sammedag master and LABKA
ID_COL = "DW_EK_Borger"

# Lab columns in Sammedag master
LDL_COL  = "LABmean__p_kolesterol_ldl_mmol_l"

# LDL in LABKA (CHANGE THIS NAME IF IT DOES NOT MATCH YOUR FILE!)
LDL_LABKA_COL = "P-Kolesterol_LDL_[mmol_l]"   # <--- check LABKA_wide_numeric.xlsx for the exact name




In [2]:
# ---------------- HELPERS ----------------
def missing_table(df_in: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column.

    Returns a frame indexed by column name with two columns,
    "Missing Count" (number of NaN cells) and "Missing %" (share of rows,
    rounded to two decimals). Columns without any missing value are left
    out, and the remaining rows are ordered by "Missing %" descending.
    """
    n_rows = len(df_in)
    null_counts = df_in.isna().sum()
    null_share = (null_counts / n_rows) * 100

    summary = pd.DataFrame({
        "Missing Count": null_counts,
        "Missing %": null_share.round(2),
    })
    has_missing = summary["Missing Count"] > 0
    return summary[has_missing].sort_values("Missing %", ascending=False)


def _slope_cv_mac(dates: pd.Series, values: pd.Series) -> tuple:
    """Return (slope, CV, MAC) for one patient's in-window measurements.

    `dates` and `values` are parallel, NaN-free and in chronological order.
    With fewer than two measurements all three features are NaN.
    """
    if len(values) < 2:
        return np.nan, np.nan, np.nan

    y = values.to_numpy(dtype=float)
    days = (dates - dates.min()).dt.days.to_numpy(dtype=float)

    # Ordinary least-squares slope of y on days (closed form for one regressor).
    days_centered = days - days.mean()
    sxx = float(np.dot(days_centered, days_centered))
    if sxx > 0:
        slope = float(np.dot(days_centered, y - y.mean()) / sxx)
    else:
        # Every measurement falls on the same day: a trend per day is
        # undefined, so report NaN rather than a number that would read as
        # "perfectly stable".
        slope = np.nan

    # Coefficient of variation (sample std / mean); undefined for a zero mean.
    mean = y.mean()
    if mean == 0 or np.isnan(mean):
        cv = np.nan
    else:
        cv = float(y.std(ddof=1) / mean)

    # Mean absolute change between consecutive measurements.
    mac = float(np.mean(np.abs(np.diff(y))))
    return slope, cv, mac


def add_lab_timeseries_features(df_final: pd.DataFrame,
                                df_labka_ts: pd.DataFrame,
                                lab_col: str,
                                prefix: str,
                                id_col: "str | None" = None) -> pd.DataFrame:
    """
    Compute slope, CV and MAC of a lab value over the year before each anchor.

    For every row of `df_final`, the measurements of `lab_col` in
    `df_labka_ts` for the same patient with
    `anchor_date - 1 year <= Testdato < anchor_date` are collected. Rows with a
    missing `anchor_date` or fewer than two usable measurements get NaN for
    all three features.

    Parameters
    ----------
    df_final : frame with the patient id column and an "anchor_date" column.
        It is modified in place (three columns are added) and also returned.
    df_labka_ts : long-format lab frame with the patient id column, a datetime
        "Testdato" column and `lab_col`.
    lab_col : name of the lab value column in `df_labka_ts`.
    prefix : prefix used for the new column names.
    id_col : patient id column name; defaults to the notebook-level `ID_COL`.

    Adds the columns:
        {prefix}_slope_prev_year   (lab units per day; NaN if all
                                    measurements share one date)
        {prefix}_CV_prev_year
        {prefix}_MAC_prev_year

    Prints the number of missing values for each new column.
    """
    if id_col is None:
        id_col = ID_COL

    # Clean and order the lab series once, then split it per patient so each
    # cohort row only has to look at its own patient's measurements instead of
    # re-filtering the whole lab frame.
    labs = (
        df_labka_ts[[id_col, "Testdato", lab_col]]
        .dropna(subset=[lab_col, "Testdato"])
        .sort_values("Testdato", kind="stable")
    )
    history_by_patient = {pid: grp for pid, grp in labs.groupby(id_col, sort=False)}

    no_features = (np.nan, np.nan, np.nan)
    features = []
    for pid, anchor_date in zip(df_final[id_col], df_final["anchor_date"]):
        history = None if pd.isna(anchor_date) else history_by_patient.get(pid)
        if history is None:
            features.append(no_features)
            continue

        # Half-open window: the anchor date itself is excluded.
        window_start = anchor_date - pd.DateOffset(years=1)
        in_window = (
            (history["Testdato"] >= window_start)
            & (history["Testdato"] < anchor_date)
        )
        features.append(
            _slope_cv_mac(history.loc[in_window, "Testdato"],
                          history.loc[in_window, lab_col])
        )

    feature_matrix = np.array(features, dtype=float).reshape(-1, 3)

    slope_col = f"{prefix}_slope_prev_year"
    cv_col    = f"{prefix}_CV_prev_year"
    mac_col   = f"{prefix}_MAC_prev_year"

    # Positional assignment: one value per row, independent of the index.
    df_final[slope_col] = feature_matrix[:, 0]
    df_final[cv_col]    = feature_matrix[:, 1]
    df_final[mac_col]   = feature_matrix[:, 2]

    # AUDIT
    total = len(df_final)
    for col in (slope_col, cv_col, mac_col):
        n_nan = int(df_final[col].isna().sum())
        pct = n_nan / total * 100 if total else 0.0  # avoid dividing by zero on an empty cohort
        print(f"Missing {col}: {n_nan} / {total} ({pct:.2f}%)")

    return df_final


In [3]:
# %% ---------------- LOAD SAMMEDAG & BASIC CLEANING ----------------
# Read the master sheet, parse anchor_date (unparseable values become NaT)
# and derive the calendar year of each anchor.
df = pd.read_excel(SAMMEDAG_4M2M_PATH, sheet_name="Sheet1").assign(
    anchor_date=lambda d: pd.to_datetime(d["anchor_date"], errors="coerce"),
    year=lambda d: d["anchor_date"].dt.year,
)

# Size of the raw table before any filtering.
print("=== RAW 4m2m_3to4 AT LOAD ===")
print("Rows:", df.shape[0])
print("Unique patients:", df[ID_COL].nunique())

# Distribution of "how many rows does a patient have".
anchor_counts_raw = df.groupby(ID_COL).size()
rows_per_patient_raw = anchor_counts_raw.value_counts().sort_index()
print("\nRows per patient (raw):")
print(rows_per_patient_raw)


=== RAW 4m2m_3to4 AT LOAD ===
Rows: 2481
Unique patients: 818

Rows per patient (raw):
3    791
4     27
Name: count, dtype: int64


In [4]:
# %% ---------------- DROP PATIENTS WITH NO WHO ANYWHERE ----------------
# A patient is removed only when who_score is missing in every one of their
# rows; patients with at least one recorded score keep all rows.
who_missing = df["who_score"].isna()
all_missing_per_patient = (
    who_missing
    .groupby(df[ID_COL])
    .transform("all")
)
ids_no_who_in_master = df[ID_COL][all_missing_per_patient].unique()

print("\n=== WHO FILTER ===")
print("Number of patients with NO WHO in any row:", len(ids_no_who_in_master))

# Keep every row whose patient is not on the exclusion list.
mask_keep_who = ~df[ID_COL].isin(ids_no_who_in_master)
df = df.loc[mask_keep_who].copy()

print("Rows after WHO filter:", df.shape[0])
print("Patients after WHO filter:", df[ID_COL].nunique())



=== WHO FILTER ===
Number of patients with NO WHO in any row: 66
Rows after WHO filter: 2281
Patients after WHO filter: 752


In [5]:
# %% ---------------- DROP META COLUMNS ----------------
# Columns that are removed before feature building. Names that are not
# present in the frame are skipped, so the list may be a superset.
drop_meta = [
    "window_type",
    "has_who_in_window",
    "window_start",
    "window_end",
    "anchor_first_time",
    "anchor_last_time",
    "visit_datetimes",
    "visit_topics",
    "visit_statuses",
    "lab_window_start",
    "lab_window_end",
    "diabetes_type",
    "who_date",
    "LABmean__u_albumin_kreatinin_ratio_x_10_3",
    "comp_ketoac",
    "comp_coma",
    "LABmean__p_25_hydroxy_vitamin_d_d3_d2_nmol_l",
    "n_anchor_bookings_that_day",
    "LABmean__p_calcium_albuminkorrigeret_mmol_l",
    "LABmean__p_calcium_mmol_l",
    "who_days_from_anchor",
    "comp_any",
    "has_lab_in_window",
    "lab_window_n_rows",
    "n_anchors",
    "LABmean__p_albumin_g_l",
]

# Preserve the order of drop_meta while keeping only names the frame has.
drop_meta_existing = []
for column_name in drop_meta:
    if column_name in df.columns:
        drop_meta_existing.append(column_name)

df = df.drop(drop_meta_existing, axis="columns")
print(f"\nDropped {len(drop_meta_existing)} meta columns.")




Dropped 26 meta columns.


In [6]:
# %% ---------------- AUDIT MISSINGNESS IN LDL ----------------
# Report-only cell: nothing is dropped here. row_missing_any is reused by the
# cell that removes the LDL-missing rows.
target_cols = [LDL_COL]

print("\n=== MISSINGNESS AUDIT (LDL) ===")

# True for every row that lacks a value in any of the target columns.
row_missing_any = df[target_cols].isna().any(axis="columns")
rows_with_missing = df.loc[row_missing_any]

n_rows_missing = row_missing_any.sum()
bad_ids = rows_with_missing[ID_COL].unique()

print(f"Total rows: {len(df)}")
print(f"Rows with missing in {target_cols}: {n_rows_missing}")
print(f"Patients with >=1 row missing LDL: {len(bad_ids)}")

# How many missing rows each affected patient has, as a distribution.
miss_counts = rows_with_missing.groupby(ID_COL).size()
print("\nMissing LDL rows per patient (only patients with at least one missing):")
print(miss_counts.value_counts().sort_index())

rows_per_patient_before = df.groupby(ID_COL).size()
print("\nRows per patient BEFORE dropping LDL-missing rows:")
print(rows_per_patient_before.value_counts().sort_index())




=== MISSINGNESS AUDIT (LDL) ===
Total rows: 2281
Rows with missing in ['LABmean__p_kolesterol_ldl_mmol_l']: 84
Patients with >=1 row missing LDL: 72

Missing LDL rows per patient (only patients with at least one missing):
1    62
2     8
3     2
Name: count, dtype: int64

Rows per patient BEFORE dropping LDL-missing rows:
3    727
4     25
Name: count, dtype: int64


In [7]:
# %% ---------------- DROP INCOMPLETE LDL ROWS ----------------
# row_missing_any comes from the missingness audit cell. The flag is stored
# on df (df keeps the "missing_ldl" column); df_complete does not carry it.
df["missing_ldl"] = row_missing_any

df_complete = (
    df.loc[~df["missing_ldl"]]
    .drop(columns=["missing_ldl"])
    .copy()
)

print("\n=== AFTER DROPPING LDL-MISSING YEARS ===")
print("Rows:", df_complete.shape[0])
print("Patients:", df_complete[ID_COL].nunique())

anchor_counts_complete = df_complete.groupby(ID_COL).size()
print("\nRows per patient (after dropping LDL-missing years):")
print(anchor_counts_complete.value_counts().sort_index())

# Patients present before the drop but with no row left afterwards.
ids_before = set(df[ID_COL].unique())
ids_after = set(df_complete[ID_COL].unique())
lost_all = ids_before.difference(ids_after)
print(f"\nPatients who lost ALL rows after LDL row drop: {len(lost_all)}")



=== AFTER DROPPING LDL-MISSING YEARS ===
Rows: 2197
Patients: 750

Rows per patient (after dropping LDL-missing years):
1      8
2     61
3    657
4     24
Name: count, dtype: int64

Patients who lost ALL rows after LDL row drop: 2


In [8]:
# %% ---------------- ENFORCE MINIMUM YEARS ----------------
# Keep only patients that still have at least min_years rows with LDL.
min_years = 3   # ≥3 complete LDL-years
counts_after = df_complete.groupby(ID_COL).size()
good_ids = counts_after.loc[counts_after >= min_years].index

print(f"\nPatients with ≥{min_years} COMPLETE LDL years:", len(good_ids))

in_cohort = df_complete[ID_COL].isin(good_ids)
df_final = df_complete.loc[in_cohort].copy()

print("\n=== FINAL COHORT FOR LDL FEATURES (after min_years filter) ===")
print("Final rows:", df_final.shape[0])
print("Final patients:", df_final[ID_COL].nunique())
print("\nRows per patient in FINAL df:")
print(df_final.groupby(ID_COL).size().value_counts().sort_index())

# Spot check: up to five randomly drawn patients (fixed seed) and their years.
print("\nExample patients and their years in final df:")
candidate_ids = df_final[ID_COL].drop_duplicates()
n_candidates = len(df_final[ID_COL].unique())
if n_candidates > 0:
    example_ids = candidate_ids.sample(min(5, n_candidates), random_state=42)
    for pid in example_ids:
        sub = (
            df_final.loc[df_final[ID_COL] == pid, [ID_COL, "year"]]
            .sort_values("year")
        )
        print(sub)



Patients with ≥3 COMPLETE LDL years: 681

=== FINAL COHORT FOR LDL FEATURES (after min_years filter) ===
Final rows: 2067
Final patients: 681

Rows per patient in FINAL df:
3    657
4     24
Name: count, dtype: int64

Example patients and their years in final df:
      DW_EK_Borger  year
2376       1237869  2022
2377       1237869  2024
2378       1237869  2025
      DW_EK_Borger  year
1100        584797  2022
1101        584797  2024
1102        584797  2025
     DW_EK_Borger  year
229        137744  2022
230        137744  2023
231        137744  2024
      DW_EK_Borger  year
1184        633320  2022
1185        633320  2023
1186        633320  2025
     DW_EK_Borger  year
363        193190  2022
364        193190  2023
365        193190  2025


In [9]:
# %% ---------------- LOAD LABKA TIME SERIES (LDL) ----------------
df_labka_ts = pd.read_excel(LABKA_PATH)

# Parse the test date on the full sheet first; unparseable values become NaT.
df_labka_ts["Testdato"] = pd.to_datetime(
    df_labka_ts["Testdato"], dayfirst=True, errors="coerce"
)

# Only patients that are in df_final, and only the columns needed downstream
cohort_ids = df_final[ID_COL].unique()
df_labka_ts = df_labka_ts.loc[
    df_labka_ts[ID_COL].isin(cohort_ids),
    [ID_COL, "Testdato", LDL_LABKA_COL],
].copy()

# Make the LDL column numeric (non-numeric entries become NaN)
df_labka_ts[LDL_LABKA_COL] = pd.to_numeric(df_labka_ts[LDL_LABKA_COL], errors="coerce")

# A measurement is usable only with both a value and a date.
df_labka_ts = df_labka_ts.dropna(subset=[LDL_LABKA_COL, "Testdato"])

print("\n=== LABKA LDL TS OVERVIEW ===")
print("Rows:", df_labka_ts.shape[0])
print("Unique patients in LABKA LDL:", df_labka_ts[ID_COL].nunique())



=== LABKA LDL TS OVERVIEW ===
Rows: 3530
Unique patients in LABKA LDL: 681


In [10]:
# %% ---------------- LDL SLOPE / CV / MAC OVER LAST YEAR ----------------
# Adds LDL_slope_prev_year, LDL_CV_prev_year and LDL_MAC_prev_year to df_final
# and prints how many rows are missing each of them.
df_final = add_lab_timeseries_features(
    df_final,
    df_labka_ts,
    lab_col=LDL_LABKA_COL,
    prefix="LDL",
)

Missing LDL_slope_prev_year: 1612 / 2067 (77.99%)
Missing LDL_CV_prev_year: 1612 / 2067 (77.99%)
Missing LDL_MAC_prev_year: 1612 / 2067 (77.99%)


In [11]:
# %% ---------------- BUILD NEXT-YEAR LDL TARGET ----------------
# Order rows chronologically within each patient so that shifting by one row
# moves to the neighbouring anchor of the same patient.
df_final = df_final.sort_values([ID_COL, "anchor_date"]).reset_index(drop=True)

# "next" / "prev" refer to the adjacent row of the same patient, which is not
# necessarily the adjacent calendar year; delta_years_to_next records the
# actual gap between the two anchor dates in years.
per_patient = df_final.groupby(ID_COL)
df_final = df_final.assign(
    LDL_next=per_patient[LDL_COL].shift(-1),
    year_next=per_patient["year"].shift(-1),
    anchor_date_next=per_patient["anchor_date"].shift(-1),
    delta_years_to_next=lambda d: (
        (d["anchor_date_next"] - d["anchor_date"]).dt.days / 365.25
    ),
    LDL_prev=per_patient[LDL_COL].shift(1),
)

print("\n=== LDL TARGET CONSTRUCTION AUDIT ===")
has_target = df_final["LDL_next"].notna()
print("Rows with non-missing LDL_next:", has_target.sum())
print("Rows with missing LDL_next (expected: last per patient):", df_final["LDL_next"].isna().sum())

# Number of usable (current, next) pairs each patient contributes.
pairs_per_patient = df_final.loc[has_target].groupby(ID_COL).size()
print("LDL prediction pairs per patient (distribution):")
print(pairs_per_patient.value_counts().sort_index())

preview_cols = [
    ID_COL, "year", "anchor_date",
    LDL_COL, "year_next", "anchor_date_next", "LDL_next",
]
print("\nPreview of year, LDL_current, LDL_next for first 10 rows:")
print(df_final.loc[:, preview_cols].head(10))


=== LDL TARGET CONSTRUCTION AUDIT ===
Rows with non-missing LDL_next: 1386
Rows with missing LDL_next (expected: last per patient): 681
LDL prediction pairs per patient (distribution):
2    657
3     24
Name: count, dtype: int64

Preview of year, LDL_current, LDL_next for first 10 rows:
   DW_EK_Borger  year anchor_date  LABmean__p_kolesterol_ldl_mmol_l  \
0          7371  2022  2022-11-30                               2.5   
1          7371  2024  2024-02-13                               2.4   
2          7371  2025  2025-03-05                               2.6   
3         11946  2022  2022-12-08                               3.2   
4         11946  2024  2024-02-13                               2.9   
5         11946  2025  2025-03-03                               2.5   
6         15619  2022  2022-11-30                               1.4   
7         15619  2024  2024-02-20                               3.7   
8         15619  2025  2025-01-08                               1.9   
9

In [12]:
# %% ---------------- MISSINGNESS TABLE & EXPORT ----------------
# Final per-column missingness report, limited to the 20 worst columns.
missing_tbl = missing_table(df_final)
print("\nMissingness overview (top 20):")
print(missing_tbl.iloc[:20])

# Write the feature-augmented cohort to OUT_PATH without the row index.
df_final.to_excel(OUT_PATH, index=False)
print(f"\nSaved LDL feature-augmented dataset to: {OUT_PATH}")

n_final_patients = df_final[ID_COL].nunique()
print("\nUnique patients in final LDL dataset:", n_final_patients)


Missingness overview (top 20):
                                         Missing Count  Missing %
LDL_slope_prev_year                               1612      77.99
LDL_CV_prev_year                                  1612      77.99
LDL_MAC_prev_year                                 1612      77.99
LDL_next                                           681      32.95
year_next                                          681      32.95
anchor_date_next                                   681      32.95
delta_years_to_next                                681      32.95
LDL_prev                                           681      32.95
who_score                                          391      18.92
LABmean__b_haemoglobin_mmol_l                      355      17.17
LABmean__u_albumin_kreatinin_ratio_mg_g            352      17.03
LABmean__p_vitamin_b12_pmol_l                        9       0.44
LABmean__p_kalium_mmol_l                             2       0.10
LABmean__egfr_1_73m2_ckd_epi_ml_min         