In [132]:
# %%
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression

# ---------------- CONFIG ----------------
SAMMEDAG_4M2M_PATH = Path(r"C:\Users\kfq6\Documents\Data\Sammedag_master_means_4m2m_3to4.xlsx")
LABKA_PATH         = Path(r"C:\Users\kfq6\Documents\Data\LABKA_wide_numeric.xlsx")

OUT_PATH = Path(r"C:\Users\kfq6\Documents\Data\Sammedag_master_HbA1c_Features.xlsx")

ID_COL = "DW_EK_Borger"

# Lab columns in Sammedag master
HBA1C_COL = "LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol"
LDL_COL   = "LABmean__p_kolesterol_ldl_mmol_l"
EGFR_COL  = "LABmean__egfr_1_73m2_ckd_epi_ml_min"

# HbA1c in LABKA
HBA1C_LABKA_COL = "Hb(B)-Hæmoglobin_A1c_(IFCC)_[mmol_mol]"

In [133]:
# ---------------- HELPERS ----------------
def missing_table(df_in: pd.DataFrame) -> pd.DataFrame:
    missing_count = df_in.isna().sum()
    missing_percent = (missing_count / len(df_in)) * 100
    tbl = pd.DataFrame({
        "Missing Count": missing_count,
        "Missing %": missing_percent.round(2),
    })
    tbl = tbl[tbl["Missing Count"] > 0].sort_values("Missing %", ascending=False)
    return tbl


In [134]:
# %% ---------------- LOAD SAMMEDAG & BASIC CLEANING ----------------
df = pd.read_excel(SAMMEDAG_4M2M_PATH, sheet_name="Sheet1")

df["anchor_date"] = pd.to_datetime(df["anchor_date"], errors="coerce")
df["year"] = df["anchor_date"].dt.year

print("=== RAW 4m2m_3to4 AT LOAD ===")
print("Rows:", len(df))
print("Unique patients:", df[ID_COL].nunique())

anchor_counts_raw = df.groupby(ID_COL).size()
print("\nRows per patient (raw):")
print(anchor_counts_raw.value_counts().sort_index())

=== RAW 4m2m_3to4 AT LOAD ===
Rows: 2481
Unique patients: 818

Rows per patient (raw):
3    791
4     27
Name: count, dtype: int64


In [135]:
# %% ---------------- DROP PATIENTS WITH NO WHO ANYWHERE ----------------
who_missing = df["who_score"].isna()
all_missing_per_patient = who_missing.groupby(df[ID_COL]).transform("all")
ids_no_who_in_master = df.loc[all_missing_per_patient, ID_COL].unique()

print("\n=== WHO FILTER ===")
print("Number of patients with NO WHO in any row:", len(ids_no_who_in_master))

mask_keep_who = ~df[ID_COL].isin(ids_no_who_in_master)
df = df[mask_keep_who].copy()

print("Rows after WHO filter:", len(df))
print("Patients after WHO filter:", df[ID_COL].nunique())


=== WHO FILTER ===
Number of patients with NO WHO in any row: 66
Rows after WHO filter: 2281
Patients after WHO filter: 752


In [136]:
# %% ---------------- DROP META COLUMNS ----------------
drop_meta = [
    "window_type", "has_who_in_window",
    "window_start", "window_end",
    "anchor_first_time", "anchor_last_time",
    "visit_datetimes", "visit_topics", "visit_statuses",
    "lab_window_start", "lab_window_end", "diabetes_type",
    "who_date",
    "LABmean__u_albumin_kreatinin_ratio_x_10_3", "comp_ketoac", "comp_coma",
    "LABmean__p_25_hydroxy_vitamin_d_d3_d2_nmol_l",
    "n_anchor_bookings_that_day",
    "LABmean__p_calcium_albuminkorrigeret_mmol_l",
    "LABmean__p_calcium_mmol_l",
    "who_days_from_anchor",
    "comp_any",
    "has_lab_in_window",
    "lab_window_n_rows",
    "n_anchors", "LABmean__p_albumin_g_l",
]
drop_meta_existing = [c for c in drop_meta if c in df.columns]
df = df.drop(columns=drop_meta_existing)
print(f"\nDropped {len(drop_meta_existing)} meta columns.")


Dropped 26 meta columns.


In [137]:
# %% ---------------- AUDIT MISSINGNESS IN REQUIRED LABS ----------------
target_cols = [HBA1C_COL, LDL_COL, EGFR_COL]

print("\n=== MISSINGNESS AUDIT (before dropping lab-missing rows) ===")
row_missing_any = df[target_cols].isna().any(axis=1)

n_rows_missing = row_missing_any.sum()
rows_with_missing = df[row_missing_any]
bad_ids = rows_with_missing[ID_COL].unique()

print(f"Total rows: {len(df)}")
print(f"Rows with missing in any of {target_cols}: {n_rows_missing}")
print(f"Patients with >=1 row missing any required lab: {len(bad_ids)}")

miss_counts = rows_with_missing.groupby(ID_COL).size()
print("\nMissing rows per patient (only patients with at least one missing):")
print(miss_counts.value_counts().sort_index())

print("\nRows per patient BEFORE dropping any lab-missing rows:")
print(df.groupby(ID_COL).size().value_counts().sort_index())


=== MISSINGNESS AUDIT (before dropping lab-missing rows) ===
Total rows: 2281
Rows with missing in any of ['LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol', 'LABmean__p_kolesterol_ldl_mmol_l', 'LABmean__egfr_1_73m2_ckd_epi_ml_min']: 85
Patients with >=1 row missing any required lab: 73

Missing rows per patient (only patients with at least one missing):
1    63
2     8
3     2
Name: count, dtype: int64

Rows per patient BEFORE dropping any lab-missing rows:
3    727
4     25
Name: count, dtype: int64


In [138]:
# %% ---------------- DROP ONLY INCOMPLETE ROWS (LABS) ----------------
df["missing_any_of_three"] = row_missing_any

df_complete = df[~df["missing_any_of_three"]].copy()
df_complete = df_complete.drop(columns=["missing_any_of_three"])

print("\n=== AFTER DROPPING INCOMPLETE YEARS (ROW-WISE LABS) ===")
print("Rows:", len(df_complete))
print("Patients:", df_complete[ID_COL].nunique())

anchor_counts_complete = df_complete.groupby(ID_COL).size()
print("\nRows per patient (after dropping incomplete years):")
print(anchor_counts_complete.value_counts().sort_index())

ids_after = set(df_complete[ID_COL].unique())
ids_before = set(df[ID_COL].unique())
lost_all = ids_before - ids_after
print(f"\nPatients who lost ALL rows after row-wise lab drop: {len(lost_all)}")



=== AFTER DROPPING INCOMPLETE YEARS (ROW-WISE LABS) ===
Rows: 2196
Patients: 750

Rows per patient (after dropping incomplete years):
1      8
2     62
3    656
4     24
Name: count, dtype: int64

Patients who lost ALL rows after row-wise lab drop: 2


In [139]:
#%% ---------------- ENFORCE MINIMUM YEARS ----------------
min_years = 3   # ≥3 complete lab-years
counts_after = df_complete.groupby(ID_COL).size()
good_ids = counts_after[counts_after >= min_years].index

print(f"\nPatients with ≥{min_years} COMPLETE years:", len(good_ids))

df_final = df_complete[df_complete[ID_COL].isin(good_ids)].copy()

print("\n=== FINAL COHORT FOR FEATURES (after min_years filter) ===")
print("Final rows:", len(df_final))
print("Final patients:", df_final[ID_COL].nunique())
print("\nRows per patient in FINAL df:")
print(df_final.groupby(ID_COL).size().value_counts().sort_index())

print("\nExample patients and their years in final df:")
example_ids = df_final[ID_COL].drop_duplicates().sample(
    min(5, len(df_final[ID_COL].unique())),
    random_state=42
)
for pid in example_ids:
    sub = df_final[df_final[ID_COL] == pid][[ID_COL, "year"]].sort_values("year")
    print(sub)


Patients with ≥3 COMPLETE years: 680

=== FINAL COHORT FOR FEATURES (after min_years filter) ===
Final rows: 2064
Final patients: 680

Rows per patient in FINAL df:
3    656
4     24
Name: count, dtype: int64

Example patients and their years in final df:
      DW_EK_Borger  year
2376       1237869  2022
2377       1237869  2024
2378       1237869  2025
      DW_EK_Borger  year
2244       1178410  2022
2245       1178410  2023
2246       1178410  2025
     DW_EK_Borger  year
229        137744  2022
230        137744  2023
231        137744  2024
      DW_EK_Borger  year
1181        631815  2023
1182        631815  2024
1183        631815  2025
     DW_EK_Borger  year
363        193190  2022
364        193190  2023
365        193190  2025


In [140]:
# %% ---------------- HBA1C SLOPE / CV / MAC OVER LAST YEAR ----------------
df_labka_ts = pd.read_excel(LABKA_PATH)
df_labka_ts["Testdato"] = pd.to_datetime(df_labka_ts["Testdato"], dayfirst=True, errors="coerce")

# Only keep LABKA rows for patients that survived all filtering
df_labka_ts = df_labka_ts[df_labka_ts[ID_COL].isin(df_final[ID_COL].unique())].copy()

df_labka_ts = df_labka_ts[[ID_COL, "Testdato", HBA1C_LABKA_COL]].copy()
df_labka_ts[HBA1C_LABKA_COL] = pd.to_numeric(df_labka_ts[HBA1C_LABKA_COL], errors="coerce")
df_labka_ts = df_labka_ts.dropna(subset=[HBA1C_LABKA_COL, "Testdato"])

slope_results = []
cv_results = []
mac_results = []

for _, row in df_final.iterrows():
    pid = row[ID_COL]
    anchor_date = row["anchor_date"]

    if pd.isna(anchor_date):
        slope_results.append(np.nan)
        cv_results.append(np.nan)
        mac_results.append(np.nan)
        continue

    start_date = anchor_date - pd.DateOffset(years=1)
    end_date = anchor_date

    df_patient = df_labka_ts[
        (df_labka_ts[ID_COL] == pid) &
        (df_labka_ts["Testdato"] >= start_date) &
        (df_labka_ts["Testdato"] < end_date)
    ].copy()

    if len(df_patient) < 2:
        slope_results.append(np.nan)
        cv_results.append(np.nan)
        mac_results.append(np.nan)
        continue

    df_patient = df_patient.sort_values("Testdato")
    X = (df_patient["Testdato"] - df_patient["Testdato"].min()).dt.days.values.reshape(-1, 1)
    y = df_patient[HBA1C_LABKA_COL].values

    # Slope
    model = LinearRegression().fit(X, y)
    slope = model.coef_[0]
    slope_results.append(slope)

    # CV
    mean = y.mean()
    if mean == 0 or np.isnan(mean):
        cv_results.append(np.nan)
    else:
        std = y.std(ddof=1)
        cv = std / mean
        cv_results.append(cv)

    # MAC
    if len(y) < 2:
        mac_results.append(np.nan)
    else:
        mac = np.mean(np.abs(np.diff(y)))
        mac_results.append(mac)

df_final["HbA1c_slope_prev_year"] = slope_results
df_final["HbA1c_CV_prev_year"] = cv_results
df_final["HbA1c_MAC_prev_year"] = mac_results

total = len(df_final)
n_slope_nan = df_final["HbA1c_slope_prev_year"].isna().sum()
n_cv_nan    = df_final["HbA1c_CV_prev_year"].isna().sum()
n_mac_nan   = df_final["HbA1c_MAC_prev_year"].isna().sum()

print(f"\nMissing HbA1c slope values: {n_slope_nan} / {total} ({n_slope_nan / total * 100:.2f}%)")
print(f"Missing HbA1c CV values:    {n_cv_nan} / {total} ({n_cv_nan / total * 100:.2f}%)")
print(f"Missing HbA1c MAC values:   {n_mac_nan} / {total} ({n_mac_nan / total * 100:.2f}%)")




Missing HbA1c slope values: 954 / 2064 (46.22%)
Missing HbA1c CV values:    954 / 2064 (46.22%)
Missing HbA1c MAC values:   954 / 2064 (46.22%)


In [141]:
# %% ---------------- BUILD NEXT-YEAR HBA1C TARGET ----------------
# Sort to guarantee temporal order within each patient
df_final = df_final.sort_values([ID_COL, "anchor_date"]).reset_index(drop=True)

# Target: HbA1c at next annual screening for the same patient
df_final["HbA1c_next"] = (
    df_final
    .groupby(ID_COL)[HBA1C_COL]
    .shift(-1)  # move next value up
)

# Also store timing of the next screening
df_final["year_next"] = df_final.groupby(ID_COL)["year"].shift(-1)
df_final["anchor_date_next"] = df_final.groupby(ID_COL)["anchor_date"].shift(-1)

# Time gap in years to next screening (for sanity / later filtering if needed)
df_final["delta_years_to_next"] = (
    (df_final["anchor_date_next"] - df_final["anchor_date"]).dt.days / 365.25
)

# Optional: previous HbA1c as a history feature
df_final["HbA1c_prev"] = (
    df_final
    .groupby(ID_COL)[HBA1C_COL]
    .shift(1)
)

print("\n=== TARGET CONSTRUCTION AUDIT ===")
print("Rows with non-missing HbA1c_next:", df_final["HbA1c_next"].notna().sum())
print("Rows with missing HbA1c_next (expected: last per patient):", df_final["HbA1c_next"].isna().sum())

pairs_per_patient = (
    df_final.dropna(subset=["HbA1c_next"])
            .groupby(ID_COL)
            .size()
)
print("Prediction pairs per patient (distribution):")
print(pairs_per_patient.value_counts().sort_index())

# Quick sanity print of first few rows
print("\nPreview of year, HbA1c_current, HbA1c_next for first 10 rows:")
print(
    df_final[["DW_EK_Borger", "year", "anchor_date",
              HBA1C_COL, "year_next", "anchor_date_next", "HbA1c_next"]]
    .head(10)
)


=== TARGET CONSTRUCTION AUDIT ===
Rows with non-missing HbA1c_next: 1384
Rows with missing HbA1c_next (expected: last per patient): 680
Prediction pairs per patient (distribution):
2    656
3     24
Name: count, dtype: int64

Preview of year, HbA1c_current, HbA1c_next for first 10 rows:
   DW_EK_Borger  year anchor_date  \
0          7371  2022  2022-11-30   
1          7371  2024  2024-02-13   
2          7371  2025  2025-03-05   
3         11946  2022  2022-12-08   
4         11946  2024  2024-02-13   
5         11946  2025  2025-03-03   
6         15619  2022  2022-11-30   
7         15619  2024  2024-02-20   
8         15619  2025  2025-01-08   
9         15621  2022  2022-06-15   

   LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol  year_next anchor_date_next  \
0                                         79.0     2024.0       2024-02-13   
1                                         71.0     2025.0       2025-03-05   
2                                         71.0        NaN            

In [142]:
# %% ---------------- HBA1C BANDS & HISTORY FLAGS ----------------
import numpy as np  # just to be safe

HBA1C_CURR_COL = HBA1C_COL        # alias for readability
HBA1C_PREV_COL = "HbA1c_prev"     # created in the target block

# 0) Clean up any old HbA1c band / flag columns if this cell is re-run
cols_to_drop = [c for c in df_final.columns 
                if c.startswith("HbA1c_band_") 
                or c in ["HbA1c_band", "HbA1c_prev_high", "HbA1c_prev_missing"]]

if cols_to_drop:
    print("Dropping existing HbA1c feature columns:", cols_to_drop)
    df_final = df_final.drop(columns=cols_to_drop)

# 1) Band current HbA1c into <53, 53–69, ≥70 mmol/mol
df_final["HbA1c_band"] = pd.cut(
    df_final[HBA1C_CURR_COL],
    bins=[-np.inf, 53, 70, np.inf],
    labels=["lt53", "53_69", "ge70"],
    right=False,          # [-inf,53), [53,70), [70,inf)
    include_lowest=True
)

# One-hot encode the bands
hba1c_band_dummies = pd.get_dummies(
    df_final["HbA1c_band"],
    prefix="HbA1c_band"
)

df_final = pd.concat([df_final, hba1c_band_dummies], axis=1)

# 2) Previous-year high HbA1c: prev ≥ 75 mmol/mol
prev = df_final[HBA1C_PREV_COL]

df_final["HbA1c_prev_high"] = (prev >= 75)
df_final.loc[prev.isna(), "HbA1c_prev_high"] = pd.NA
df_final["HbA1c_prev_high"] = df_final["HbA1c_prev_high"].astype("Int64")

# 3) Previous HbA1c missing flag (simple 0/1 is fine)
df_final["HbA1c_prev_missing"] = prev.isna().astype(int)

# Quick sanity check
print("\nHbA1c band counts:")
print(df_final["HbA1c_band"].value_counts(dropna=False))

print("\nPreview of HbA1c band & prev-high flags:")
print(
    df_final[
        [ID_COL, "year", HBA1C_CURR_COL, HBA1C_PREV_COL,
         "HbA1c_band",
         "HbA1c_band_lt53", "HbA1c_band_53_69", "HbA1c_band_ge70",
         "HbA1c_prev_high", "HbA1c_prev_missing"]
    ].head(10)
)



HbA1c band counts:
HbA1c_band
53_69    1147
lt53      500
ge70      417
Name: count, dtype: int64

Preview of HbA1c band & prev-high flags:
   DW_EK_Borger  year  LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol  \
0          7371  2022                                         79.0   
1          7371  2024                                         71.0   
2          7371  2025                                         71.0   
3         11946  2022                                         71.0   
4         11946  2024                                         69.0   
5         11946  2025                                         75.5   
6         15619  2022                                         72.0   
7         15619  2024                                         78.0   
8         15619  2025                                         70.0   
9         15621  2022                                         62.0   

   HbA1c_prev HbA1c_band  HbA1c_band_lt53  HbA1c_band_53_69  HbA1c_band_ge70  \
0       

  df_final.loc[prev.isna(), "HbA1c_prev_high"] = pd.NA


In [143]:

# %% ---------------- MISSINGNESS TABLE & EXPORT ----------------
missing_tbl = missing_table(df_final)
print("\nMissingness overview (top 20):")
print(missing_tbl.head(20))

df_final.to_excel(OUT_PATH, index=False)
print(f"\nSaved feature-augmented dataset to: {OUT_PATH}")


df_final["DW_EK_Borger"].nunique()


Missingness overview (top 20):
                                         Missing Count  Missing %
HbA1c_slope_prev_year                              954      46.22
HbA1c_CV_prev_year                                 954      46.22
HbA1c_MAC_prev_year                                954      46.22
HbA1c_next                                         680      32.95
year_next                                          680      32.95
anchor_date_next                                   680      32.95
delta_years_to_next                                680      32.95
HbA1c_prev                                         680      32.95
HbA1c_prev_high                                    680      32.95
who_score                                          389      18.85
LABmean__b_haemoglobin_mmol_l                      354      17.15
LABmean__u_albumin_kreatinin_ratio_mg_g            352      17.05
LABmean__p_vitamin_b12_pmol_l                        8       0.39
LABmean__p_kalium_mmol_l                    

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\kfq6\\Documents\\Data\\Sammedag_master_HbA1c_Features.xlsx'