In [481]:
# %%
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression

# ---------------- CONFIG ----------------
# Single root directory for every input/output file used by this notebook.
# To run against data stored elsewhere, edit only this line.
DATA_DIR = Path(r"C:\Users\kfq6\Documents\Data")

# Input workbooks
SAMMEDAG_4M2M_PATH = DATA_DIR / "Sammedag_master_means_4m2m_3to4.xlsx"
LABKA_PATH         = DATA_DIR / "LABKA_wide_numeric.xlsx"

# Output workbook (feature-augmented dataset written by the export cell)
OUT_PATH = DATA_DIR / "Sammedag_master_HbA1c_Features.xlsx"

# Patient identifier column shared by the Sammedag and LABKA tables
ID_COL = "DW_EK_Borger"

# Lab columns in Sammedag master
HBA1C_COL = "LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol"
LDL_COL   = "LABmean__p_kolesterol_ldl_mmol_l"
EGFR_COL  = "LABmean__egfr_1_73m2_ckd_epi_ml_min"

# HbA1c in LABKA
HBA1C_LABKA_COL = "Hb(B)-Hæmoglobin_A1c_(IFCC)_[mmol_mol]"

In [480]:
# ---------------- HELPERS ----------------
def missing_table(df_in: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column of ``df_in``.

    Returns a frame indexed by column name with two columns:
    "Missing Count" (number of NaN cells) and "Missing %" (share of rows
    that are NaN, rounded to 2 decimals). Only columns with at least one
    missing value are kept, ordered by "Missing %" descending.
    """
    n_missing = df_in.isna().sum()

    summary = n_missing.to_frame(name="Missing Count")
    summary["Missing %"] = (n_missing / len(df_in) * 100).round(2)

    has_missing = summary["Missing Count"] > 0
    return summary.loc[has_missing].sort_values("Missing %", ascending=False)


In [470]:
# %% ---------------- LOAD SAMMEDAG & BASIC CLEANING ----------------
# Read the master sheet, parse the anchor date (unparseable values become NaT)
# and derive the calendar year of each anchor.
df = pd.read_excel(SAMMEDAG_4M2M_PATH, sheet_name="Sheet1")
df = df.assign(anchor_date=lambda d: pd.to_datetime(d["anchor_date"], errors="coerce"))
df = df.assign(year=lambda d: d["anchor_date"].dt.year)

print("=== RAW 4m2m_3to4 AT LOAD ===")
print("Rows:", df.shape[0])
print("Unique patients:", df[ID_COL].nunique())

# Distribution of "how many rows does a patient have" in the raw sheet.
anchor_counts_raw = df.groupby(ID_COL).size()
print("\nRows per patient (raw):")
print(anchor_counts_raw.value_counts().sort_index())


# Columns not wanted for modelling; names absent from the sheet are skipped
# so the cell does not fail when the sheet layout differs.
drop_meta = [
    "window_type",
    "has_who_in_window",
    "window_start",
    "window_end",
    "anchor_first_time",
    "anchor_last_time",
    "visit_datetimes",
    "visit_topics",
    "visit_statuses",
    "lab_window_start",
    "lab_window_end",
    "diabetes_type",
    "who_date",
    "LABmean__u_albumin_kreatinin_ratio_x_10_3",
    "LABmean__p_albumin_g_l",
    "comp_ketoac",
    "comp_coma",
    "LABmean__p_25_hydroxy_vitamin_d_d3_d2_nmol_l",
    "n_anchor_bookings_that_day",
    "LABmean__p_calcium_albuminkorrigeret_mmol_l",
    "LABmean__p_calcium_mmol_l",
    "who_days_from_anchor",
    "n_visits_in_window",
    "comp_any",
    "has_lab_in_window",
    "lab_window_n_rows",
    "n_anchors",
]
columns_present = set(df.columns)
drop_meta_existing = [name for name in drop_meta if name in columns_present]
df = df.drop(columns=drop_meta_existing)
print(f"\nDropped {len(drop_meta_existing)} meta columns.")


Dropped 27 meta columns.


In [471]:
# %% ---------------- AUDIT MISSINGNESS IN REQUIRED LABS ----------------
# The three labs every row must have to count as a "complete" year.
target_cols = [HBA1C_COL, LDL_COL, EGFR_COL]

# Boolean mask aligned with df's index: True where at least one required lab is NaN.
row_missing_any = df[target_cols].isna().any(axis=1)
rows_with_missing = df.loc[row_missing_any]
n_rows_missing = row_missing_any.sum()
bad_ids = rows_with_missing[ID_COL].unique()

print("\n=== MISSINGNESS AUDIT (before any dropping) ===")
print(f"Total rows: {len(df)}")
print(f"Rows with missing in any of {target_cols}: {n_rows_missing}")
print(f"Patients with >=1 row missing any required lab: {len(bad_ids)}")

# Among affected patients: how many of their rows are incomplete?
miss_counts = rows_with_missing.groupby(ID_COL).size()
print("\nMissing rows per patient (only patients with at least one missing):")
print(miss_counts.value_counts().sort_index())

# Baseline distribution of rows per patient, for comparison after dropping.
rows_per_patient_before = df.groupby(ID_COL).size()
print("\nRows per patient BEFORE dropping any rows:")
print(rows_per_patient_before.value_counts().sort_index())



Missingness overview (top 20):
                                             Missing Count  Missing %
who_score                                              710      25.67
LABmean__u_albumin_kreatinin_ratio_mg_g                542      19.60
LABmean__b_haemoglobin_mmol_l                          497      17.97
LABmean__p_kolesterol_ldl_mmol_l                       117       4.23
LABmean__p_vitamin_b12_pmol_l                           83       3.00
LABmean__p_kolesterol_hdl_mmol_l                        74       2.68
LABmean__p_kolesterol_mmol_l                            73       2.64
LABmean__p_triglycerid_mmol_l                           73       2.64
LABmean__p_kalium_mmol_l                                63       2.28
LABmean__p_natrium_mmol_l                               61       2.21
LABmean__egfr_1_73m2_ckd_epi_ml_min                     56       2.02
LABmean__p_kreatinin_umol_l                             56       2.02
LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol             53

In [472]:
# %% ---------------- DROP ONLY INCOMPLETE ROWS ----------------
# Flag incomplete rows on df (mask computed in the audit cell), then keep only
# the complete rows in a separate frame without the helper flag.
df["missing_any_of_three"] = row_missing_any

is_complete = ~df["missing_any_of_three"]
df_complete = df.loc[is_complete].copy().drop(columns=["missing_any_of_three"])

print("\n=== AFTER DROPPING INCOMPLETE YEARS (ROW-WISE) ===")
print("Rows:", len(df_complete))
print("Patients:", df_complete[ID_COL].nunique())

anchor_counts_complete = df_complete.groupby(ID_COL).size()
print("\nRows per patient (after dropping incomplete years):")
print(anchor_counts_complete.value_counts().sort_index())

# Patients present before the drop but with no complete row left afterwards.
ids_before = set(df[ID_COL].unique())
ids_after = set(df_complete[ID_COL].unique())
lost_all = ids_before.difference(ids_after)
print(f"\nPatients who lost ALL rows after row-wise drop: {len(lost_all)}")



Number of patients with at least one missing required HbA1c: 101


In [473]:
# %% ---------------- OPTIONAL: ENFORCE MINIMUM YEARS ----------------
min_years = 3   # ≥3 complete lab-years

# Keep only patients that still have at least `min_years` complete rows.
counts_after = df_complete.groupby(ID_COL).size()
has_enough_years = counts_after >= min_years
good_ids = counts_after.loc[has_enough_years].index

print(f"\nPatients with ≥{min_years} COMPLETE years:", len(good_ids))

in_cohort = df_complete[ID_COL].isin(good_ids)
df_final = df_complete.loc[in_cohort].copy()

print("=== FINAL COHORT FOR FEATURES (after min_years filter) ===")
print("Final rows:", len(df_final))
print("Final patients:", df_final[ID_COL].nunique())
print("\nRows per patient in FINAL df:")
print(df_final.groupby(ID_COL).size().value_counts().sort_index())

# Sanity: print the years available for up to five patients (fixed seed so the
# same patients are shown on every run).
print("\nExample patients and their years in final df:")
distinct_ids = df_final[ID_COL].drop_duplicates()
example_ids = distinct_ids.sample(min(5, len(distinct_ids)), random_state=42)
for pid in example_ids:
    sub = df_final.loc[df_final[ID_COL] == pid, [ID_COL, "year"]].sort_values("year")
    print(sub)


Rows per patient distribution in features df:
3    862
4     45
Name: count, dtype: int64


In [474]:
# %% ---------------- HBA1C SLOPE / CV / MAC OVER LAST YEAR ----------------
def _hba1c_window_features(window):
    """Compute (slope, cv, mac) from one patient's HbA1c values in one window.

    Parameters
    ----------
    window : pd.DataFrame
        Rows with a datetime "Testdato" column and a numeric HBA1C_LABKA_COL
        column, already sorted by "Testdato" and free of NaN in both columns.

    Returns
    -------
    tuple
        slope : least-squares slope of HbA1c against whole days elapsed since
            the earliest measurement in the window (HbA1c units per day).
            NaN when every measurement falls on the same calendar day, because
            a trend over zero elapsed time is undefined (a fitted 0.0 would be
            indistinguishable from a genuinely flat trend).
        cv : sample standard deviation (ddof=1) divided by the mean; NaN when
            the mean is 0 or NaN.
        mac : mean absolute change between consecutive measurements.
        All three are NaN when the window holds fewer than two measurements.
    """
    if len(window) < 2:
        return np.nan, np.nan, np.nan

    elapsed_days = (window["Testdato"] - window["Testdato"].min()).dt.days.values
    y = window[HBA1C_LABKA_COL].values

    # Slope
    if elapsed_days.max() == 0:
        slope = np.nan
    else:
        slope = LinearRegression().fit(elapsed_days.reshape(-1, 1), y).coef_[0]

    # CV
    mean = y.mean()
    if mean == 0 or np.isnan(mean):
        cv = np.nan
    else:
        cv = y.std(ddof=1) / mean

    # MAC (at least two values are guaranteed by the guard above)
    mac = np.mean(np.abs(np.diff(y)))

    return slope, cv, mac


df_labka_ts = pd.read_excel(LABKA_PATH)
df_labka_ts["Testdato"] = pd.to_datetime(df_labka_ts["Testdato"], dayfirst=True, errors="coerce")

# Only keep LABKA rows for patients that survived all filtering
df_labka_ts = df_labka_ts[df_labka_ts[ID_COL].isin(df_final[ID_COL].unique())].copy()

df_labka_ts = df_labka_ts[[ID_COL, "Testdato", HBA1C_LABKA_COL]].copy()
df_labka_ts[HBA1C_LABKA_COL] = pd.to_numeric(df_labka_ts[HBA1C_LABKA_COL], errors="coerce")
df_labka_ts = df_labka_ts.dropna(subset=[HBA1C_LABKA_COL, "Testdato"])

# Split the lab table per patient ONCE (date-sorted, stable for same-day ties)
# instead of re-scanning the whole table for every cohort row.
labka_by_patient = {
    pid: grp
    for pid, grp in df_labka_ts.sort_values("Testdato", kind="stable").groupby(ID_COL)
}
no_measurements = df_labka_ts.iloc[0:0]  # used for patients without any HbA1c rows

slope_results = []
cv_results = []
mac_results = []

# Iterate only over the two columns needed (avoids iterrows' per-row Series
# construction and dtype coercion).
for pid, anchor_date in zip(df_final[ID_COL], df_final["anchor_date"]):
    if pd.isna(anchor_date):
        slope, cv, mac = np.nan, np.nan, np.nan
    else:
        # Window is the year before the anchor: [anchor - 1 year, anchor)
        start_date = anchor_date - pd.DateOffset(years=1)
        end_date = anchor_date

        patient_ts = labka_by_patient.get(pid, no_measurements)
        in_window = (patient_ts["Testdato"] >= start_date) & (patient_ts["Testdato"] < end_date)
        slope, cv, mac = _hba1c_window_features(patient_ts[in_window])

    slope_results.append(slope)
    cv_results.append(cv)
    mac_results.append(mac)

df_final["HbA1c_slope_prev_year"] = slope_results
df_final["HbA1c_CV_prev_year"] = cv_results
df_final["HbA1c_MAC_prev_year"] = mac_results

total = len(df_final)
n_slope_nan = df_final["HbA1c_slope_prev_year"].isna().sum()
n_cv_nan    = df_final["HbA1c_CV_prev_year"].isna().sum()
n_mac_nan   = df_final["HbA1c_MAC_prev_year"].isna().sum()

print(f"\nMissing HbA1c slope values: {n_slope_nan} / {total} ({n_slope_nan / total * 100:.2f}%)")
print(f"Missing HbA1c CV values:    {n_cv_nan} / {total} ({n_cv_nan / total * 100:.2f}%)")
print(f"Missing HbA1c MAC values:   {n_mac_nan} / {total} ({n_mac_nan / total * 100:.2f}%)")



Total rows: 2766
Rows with missing in any target: 119
Rows after dropping incomplete years: 2647
Patients remaining after row-wise drop: 904
Patients with ≥4 complete years: 42
Final rows: 168
Final patients: 42


In [477]:
# %% ---------------- BUILD NEXT-YEAR HBA1C TARGET ----------------
# Sort to guarantee temporal order within each patient; every shift below
# relies on this ordering.
df_final = df_final.sort_values([ID_COL, "anchor_date"]).reset_index(drop=True)

# Target: HbA1c at the next available row for the same patient.
# The last row of each patient has no successor and gets NaN.
df_final["HbA1c_next"] = (
    df_final
    .groupby(ID_COL)[HBA1C_COL]
    .shift(-1)  # move next value up
)

# Also store timing of the next screening
df_final["year_next"] = df_final.groupby(ID_COL)["year"].shift(-1)
df_final["anchor_date_next"] = df_final.groupby(ID_COL)["anchor_date"].shift(-1)

# Time gap in years to next screening (for sanity / later filtering if needed).
# Rows were dropped upstream, so the "next" row is not necessarily one year later.
df_final["delta_years_to_next"] = (
    (df_final["anchor_date_next"] - df_final["anchor_date"]).dt.days / 365.25
)

# Optional: previous HbA1c as a history feature (NaN on each patient's first row)
df_final["HbA1c_prev"] = (
    df_final
    .groupby(ID_COL)[HBA1C_COL]
    .shift(1)
)

print("\n=== TARGET CONSTRUCTION AUDIT ===")
print("Rows with non-missing HbA1c_next:", df_final["HbA1c_next"].notna().sum())
print("Rows with missing HbA1c_next (expected: last per patient):", df_final["HbA1c_next"].isna().sum())

pairs_per_patient = (
    df_final.dropna(subset=["HbA1c_next"])
            .groupby(ID_COL)
            .size()
)
print("Prediction pairs per patient (distribution):")
print(pairs_per_patient.value_counts().sort_index())

# Quick sanity print of first few rows.
# Uses ID_COL (not a hard-coded column name) so the preview stays in sync
# with the configured patient identifier.
print("\nPreview of year, HbA1c_current, HbA1c_next for first 10 rows:")
print(
    df_final[[ID_COL, "year", "anchor_date",
              HBA1C_COL, "year_next", "anchor_date_next", "HbA1c_next"]]
    .head(10)
)


Missingness overview (top 20):
                                         Missing Count  Missing %
LABmean__u_albumin_kreatinin_ratio_mg_g             39      30.95
who_score                                           37      29.37
LABmean__b_haemoglobin_mmol_l                       27      21.43
LABmean__p_vitamin_b12_pmol_l                        1       0.79

Saved feature-augmented dataset to: C:\Users\kfq6\Documents\Data\Sammedag_master_HbA1c_Features.xlsx


In [478]:

# %% ---------------- MISSINGNESS TABLE & EXPORT ----------------
# Report the columns with the most missing values in the final frame,
# then write the frame (without its index) to the configured output workbook.
missing_tbl = missing_table(df_final)
top_missing = missing_tbl.head(20)
print("\nMissingness overview (top 20):", top_missing, sep="\n")

df_final.to_excel(OUT_PATH, index=False)
print(f"\nSaved feature-augmented dataset to: {OUT_PATH}")

KeyboardInterrupt: 