In [182]:

# %%
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression

## Paths and column names

In [183]:

# %%
# Input workbooks. NOTE(review): these are absolute, machine-specific Windows paths;
# the notebook will only run where these files exist at exactly these locations.
SAMMEDAG_4M2M_PATH = Path(r"C:\Users\kfq6\Documents\Data\Sammedag_master_means_4m2m_3to4.xlsx")
LABKA_PATH         = Path(r"C:\Users\kfq6\Documents\Data\LABKA_wide_numeric.xlsx")

# Patient identifier column; used below on both the Sammedag and the LABKA frames.
ID_COL = "DW_EK_Borger"
# HbA1c column as named in the Sammedag sheet.
HBA1C_SAM_COL = "LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol"
# HbA1c column as named in the LABKA workbook.
HBA1C_LABKA_COL = "Hb(B)-Hæmoglobin_A1c_(IFCC)_[mmol_mol]"

## Load Sammedag (4m2m) and create lag features


In [184]:
# %%
# Load the Sammedag sheet, parse anchor_date (unparseable values become NaT) and
# order rows by patient and anchor date so that shift(1) below means
# "the previous anchor row of the same patient".
df = (
    pd.read_excel(SAMMEDAG_4M2M_PATH, sheet_name="Sheet1")
    .assign(anchor_date=lambda frame: pd.to_datetime(frame["anchor_date"], errors="coerce"))
    .sort_values(by=[ID_COL, "anchor_date"])
)

# Source lab column -> name of the lag feature derived from it.
# NOTE(review): the "_prev_year" suffix assumes consecutive anchors are about one
# year apart; the code itself only takes the previous row per patient.
target_features = {
    "LABmean__hb_b_haemoglobin_a1c_ifcc_mmol_mol": "HbA1c_prev_year",
    "LABmean__egfr_1_73m2_ckd_epi_ml_min": "eGFR_prev_year",
    "LABmean__p_triglycerid_mmol_l": "TG_prev_year",
    "LABmean__p_kolesterol_ldl_mmol_l": "LDL_prev_year",
}

for col, new_col in target_features.items():
    if col in df.columns:
        # Previous value within the same patient; the first row of each patient gets NaN.
        df[new_col] = df.groupby(ID_COL)[col].shift(1)
    else:
        print(f"Warning: {col} not found, skipping {new_col}")

df.head()


Unnamed: 0,DW_EK_Borger,sex,age,diabetes_type,diagnosis,comp_any,comp_eye,comp_renal,comp_neuro,comp_periph,...,LABmean__p_natrium_mmol_l,LABmean__p_triglycerid_mmol_l,LABmean__p_vitamin_b12_pmol_l,LABmean__u_albumin_kreatinin_ratio_mg_g,LABmean__u_albumin_kreatinin_ratio_x_10_3,n_anchors,HbA1c_prev_year,eGFR_prev_year,TG_prev_year,LDL_prev_year
0,7371,M,56.8,T1DM,Type 1-diabetes,1,0,0,1,0,...,139.0,2.0,497.0,133.0,,3,,,,
1,7371,M,56.8,T1DM,Type 1-diabetes,1,0,0,1,0,...,136.0,1.3,309.0,32.0,,3,79.0,90.0,2.0,2.5
2,7371,M,56.8,T1DM,Type 1-diabetes,1,0,0,1,0,...,140.0,0.9,427.0,31.0,,3,71.0,90.0,1.3,2.4
3,8395,M,33.8,T1DM,Type 1-diabetes,0,0,0,0,0,...,138.0,1.5,531.0,,6.4,3,,,,
4,8395,M,33.8,T1DM,Type 1-diabetes,0,0,0,0,0,...,140.0,0.8,410.0,5.0,,3,67.0,90.0,1.5,2.6



## Drop meta columns

In [185]:
# Columns removed from the feature frame in this cell.
drop_meta = [
    "window_type",
    "has_who_in_window",
    "window_start",
    "window_end",
    "anchor_first_time",
    "anchor_last_time",
    "visit_datetimes",
    "visit_topics",
    "visit_statuses",
    "lab_window_start",
    "lab_window_end",
    "diabetes_type",
    "who_date",
    "LABmean__u_albumin_kreatinin_ratio_x_10_3",
    "LABmean__p_albumin_g_l",
    "comp_ketoac",
    "comp_coma",
    "LABmean__p_25_hydroxy_vitamin_d_d3_d2_nmol_l",
    "n_anchor_bookings_that_day",
    "LABmean__p_calcium_albuminkorrigeret_mmol_l",
    "LABmean__p_calcium_mmol_l",
    "who_days_from_anchor",
    "n_visits_in_window",
    "comp_any",
    "has_lab_in_window",
    "lab_window_n_rows",
    "n_anchors",
]

# Only drop what is actually present, so the cell can be re-run without a KeyError.
present_columns = set(df.columns)
drop_meta_existing = [name for name in drop_meta if name in present_columns]
df = df.drop(columns=drop_meta_existing)

df.head()

Unnamed: 0,DW_EK_Borger,sex,age,diagnosis,comp_eye,comp_renal,comp_neuro,comp_periph,comp_foot,comp_multiple,...,LABmean__p_kolesterol_mmol_l,LABmean__p_kreatinin_umol_l,LABmean__p_natrium_mmol_l,LABmean__p_triglycerid_mmol_l,LABmean__p_vitamin_b12_pmol_l,LABmean__u_albumin_kreatinin_ratio_mg_g,HbA1c_prev_year,eGFR_prev_year,TG_prev_year,LDL_prev_year
0,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,4.9,81.0,139.0,2.0,497.0,133.0,,,,
1,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,4.3,78.0,136.0,1.3,309.0,32.0,79.0,90.0,2.0,2.5
2,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,4.4,91.0,140.0,0.9,427.0,31.0,71.0,90.0,1.3,2.4
3,8395,M,33.8,Type 1-diabetes,0,0,0,0,0,0,...,4.3,80.0,138.0,1.5,531.0,,,,,
4,8395,M,33.8,Type 1-diabetes,0,0,0,0,0,0,...,5.4,83.0,140.0,0.8,410.0,5.0,67.0,90.0,1.5,2.6


## Patients with missing *required* HbA1c

In [186]:
# Columns that must be present for a row to count as complete.
required_cols = [HBA1C_SAM_COL]

# True for every row where at least one required column is NaN.
missing_required_mask = df[required_cols].isna().any(axis=1)
rows_with_missing = df.loc[missing_required_mask]
# Distinct patients having at least one such row.
# NOTE(review): in the cells visible here these ids are only counted, not dropped.
bad_ids = rows_with_missing[ID_COL].unique()

print(f"Number of patients to drop completely (missing required HbA1c): {len(bad_ids)}")


Number of patients to drop completely (missing required HbA1c): 52


## Patients that **always** have missing HbA1c (in all windows)



In [187]:
# Per patient: is HbA1c missing in every one of their rows?
all_missing_by_patient = df[HBA1C_SAM_COL].isna().groupby(df[ID_COL]).all()

# Keep only the patients for whom the flag is True; the index holds their ids.
patients_always_missing = all_missing_by_patient[all_missing_by_patient]
patients_always_missing_ids = patients_always_missing.index

patients_always_missing_ids

Index([], dtype='int64', name='DW_EK_Borger')

## Patients with missing HbA1c by year 

In [188]:
# Fresh read of the Sammedag sheet, with anchor_date parsed and its calendar year added.
df_3m = pd.read_excel(SAMMEDAG_4M2M_PATH, sheet_name="Sheet1").assign(
    anchor_date=lambda frame: pd.to_datetime(frame["anchor_date"], errors="coerce"),
    year=lambda frame: frame["anchor_date"].dt.year,
)

# Rows whose HbA1c value is missing.
hba1c_is_missing = df_3m[HBA1C_SAM_COL].isna()
missing_hba1c_3m = df_3m.loc[hba1c_is_missing]

# One row per (patient, year) with the number of missing-HbA1c rows.
missing_counts = missing_hba1c_3m.groupby([ID_COL, "year"]).size()
missing_table_3m = missing_counts.reset_index(name="missing_count")

# Per patient: list of the years in which HbA1c was missing.
years_by_patient = missing_table_3m.groupby(ID_COL)["year"].apply(list)
missing_years_per_patient = years_by_patient.reset_index(name="years_missing")

missing_years_per_patient.head()

Unnamed: 0,DW_EK_Borger,years_missing
0,49262,[2025]
1,51061,[2025]
2,78002,[2025]
3,84269,[2024]
4,94249,[2025]


## HbA1c slope over the year before each anchor date

In [189]:
df_labka = pd.read_excel(LABKA_PATH)

# Parse dates; values that cannot be parsed become NaT and never fall inside a window.
df_labka["Testdato"] = pd.to_datetime(df_labka["Testdato"], errors="coerce")
df["anchor_date"] = pd.to_datetime(df["anchor_date"], errors="coerce")  # ensure

# Keep only LABKA rows belonging to patients that also occur in df.
common_ids = df[ID_COL].unique()
df_labka = df_labka[df_labka[ID_COL].isin(common_ids)]

# Index the usable HbA1c measurements per patient ONCE. The loop below then only
# touches one patient's rows per anchor instead of re-scanning the whole LABKA
# frame with three boolean masks for every row of df.
hba1c_by_patient = {
    patient_id: patient_labs
    for patient_id, patient_labs in (
        df_labka[[ID_COL, "Testdato", HBA1C_LABKA_COL]]
        .dropna(subset=["Testdato", HBA1C_LABKA_COL])
        .groupby(ID_COL)
    )
}

slope_results = []

# Only the id and the anchor date are needed, so iterate over those two columns
# rather than materialising a full Series per row with iterrows().
for pid, anchor_date in zip(df[ID_COL], df["anchor_date"]):
    patient_labs = hba1c_by_patient.get(pid)

    # No anchor date, or no HbA1c measurement at all for this patient.
    if pd.isna(anchor_date) or patient_labs is None:
        slope_results.append(np.nan)
        continue

    # Window is [anchor - 1 year, anchor): the anchor day itself is excluded.
    start_date = anchor_date - pd.DateOffset(years=1)
    end_date = anchor_date
    in_window = (patient_labs["Testdato"] >= start_date) & (patient_labs["Testdato"] < end_date)
    df_patient = patient_labs.loc[in_window, ["Testdato", HBA1C_LABKA_COL]]

    # Need at least 2 points to compute slope
    if len(df_patient) < 2:
        slope_results.append(np.nan)
        continue

    df_patient = df_patient.sort_values("Testdato")
    # Days since the first measurement in the window, as a single-feature matrix.
    X = (df_patient["Testdato"] - df_patient["Testdato"].min()).dt.days.to_numpy().reshape(-1, 1)
    y = df_patient[HBA1C_LABKA_COL].to_numpy()

    model = LinearRegression().fit(X, y)
    slope_results.append(model.coef_[0])  # HbA1c units per day

df["HbA1c_slope_prev_year"] = slope_results

df.head()


Unnamed: 0,DW_EK_Borger,sex,age,diagnosis,comp_eye,comp_renal,comp_neuro,comp_periph,comp_foot,comp_multiple,...,LABmean__p_kreatinin_umol_l,LABmean__p_natrium_mmol_l,LABmean__p_triglycerid_mmol_l,LABmean__p_vitamin_b12_pmol_l,LABmean__u_albumin_kreatinin_ratio_mg_g,HbA1c_prev_year,eGFR_prev_year,TG_prev_year,LDL_prev_year,HbA1c_slope_prev_year
0,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,81.0,139.0,2.0,497.0,133.0,,,,,
1,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,78.0,136.0,1.3,309.0,32.0,79.0,90.0,2.0,2.5,
2,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,91.0,140.0,0.9,427.0,31.0,71.0,90.0,1.3,2.4,
3,8395,M,33.8,Type 1-diabetes,0,0,0,0,0,0,...,80.0,138.0,1.5,531.0,,,,,,
4,8395,M,33.8,Type 1-diabetes,0,0,0,0,0,0,...,83.0,140.0,0.8,410.0,5.0,67.0,90.0,1.5,2.6,


## Summary of HbA1c slope feature

In [190]:
# %%
# Share of anchor rows for which no slope could be computed.
slope_is_missing = df["HbA1c_slope_prev_year"].isna()
n_nans = slope_is_missing.sum()
n_total = len(df)

print(f"Missing slope values: {n_nans} / {n_total} ({(n_nans / n_total) * 100:.2f}%)")

print("\nSample rows with valid HbA1c slopes:")
# First ten rows that do have a slope, shown as the cell's rich output.
df.loc[~slope_is_missing, [ID_COL, "anchor_date", "HbA1c_slope_prev_year"]].head(10)

Missing slope values: 1294 / 2766 (46.78%)

Sample rows with valid HbA1c slopes:


Unnamed: 0,DW_EK_Borger,anchor_date,HbA1c_slope_prev_year
7,10562,2023-10-09,-3e-05
8,10562,2024-12-02,0.011292
10,10834,2023-05-24,0.025
15,15619,2022-11-30,-0.134583
17,15619,2025-01-08,-0.024922
19,15621,2023-01-11,-0.152783
22,16209,2023-01-02,0.014493
28,20288,2022-12-15,-0.036296
31,22118,2022-05-05,-0.142507
32,22118,2023-12-14,0.034924


## HbA1c variability over the previous year: slope, CV and MAC

In [191]:
def _hba1c_window_stats(window):
    """Return (slope, cv, mac) for one patient's HbA1c measurements in a window.

    Parameters
    ----------
    window : pd.DataFrame
        Rows with a non-missing "Testdato" and a numeric, non-missing
        HBA1C_LABKA_COL. Must contain at least two rows.

    Returns
    -------
    tuple of float
        slope : least-squares slope of HbA1c against days since the first
                measurement in the window (HbA1c units per day).
        cv    : sample standard deviation (ddof=1) divided by the mean;
                NaN when the mean is 0 or NaN.
        mac   : mean absolute change between consecutive measurements,
                taken in date order.
    """
    window = window.sort_values("Testdato")
    days = (window["Testdato"] - window["Testdato"].min()).dt.days.to_numpy().reshape(-1, 1)
    values = window[HBA1C_LABKA_COL].to_numpy(dtype=float)

    # NOTE(review): when every measurement falls on the same day the slope is not
    # identifiable; LinearRegression appears to return 0.0 in that case. Consider
    # reporting NaN instead if that matters downstream.
    slope = LinearRegression().fit(days, values).coef_[0]

    mean = values.mean()
    cv = np.nan if (mean == 0 or np.isnan(mean)) else values.std(ddof=1) / mean

    mac = np.mean(np.abs(np.diff(values)))
    return slope, cv, mac


# Coerce HbA1c to numeric BEFORE filtering, so that a non-numeric entry removes its
# whole row (date and value together). Coercing after the dates were extracted and
# then truncating the dates to the new length pairs values with the wrong dates.
labka_hba1c = df_labka[[ID_COL, "Testdato", HBA1C_LABKA_COL]].copy()
labka_hba1c[HBA1C_LABKA_COL] = pd.to_numeric(labka_hba1c[HBA1C_LABKA_COL], errors="coerce")
labka_hba1c = labka_hba1c.dropna(subset=["Testdato", HBA1C_LABKA_COL])

# Per-patient lookup built once, instead of masking the full LABKA frame per anchor row.
hba1c_by_patient = {
    patient_id: patient_labs for patient_id, patient_labs in labka_hba1c.groupby(ID_COL)
}

slope_results = []
cv_results = []
mac_results = []  # MAC = mean absolute change between consecutive measurements

for pid, anchor_date in zip(df[ID_COL], df["anchor_date"]):
    # Default: not computable (no anchor date, unknown patient, or < 2 usable points).
    slope, cv, mac = np.nan, np.nan, np.nan

    patient_labs = hba1c_by_patient.get(pid)
    if patient_labs is not None and not pd.isna(anchor_date):
        # Window is [anchor - 1 year, anchor): the anchor day itself is excluded.
        start_date = anchor_date - pd.DateOffset(years=1)
        end_date = anchor_date
        in_window = (patient_labs["Testdato"] >= start_date) & (patient_labs["Testdato"] < end_date)
        df_patient = patient_labs.loc[in_window]

        # The count is taken on numeric, non-missing values, so the statistics
        # never see fewer than two usable points.
        if len(df_patient) >= 2:
            slope, cv, mac = _hba1c_window_stats(df_patient)

    slope_results.append(slope)
    cv_results.append(cv)
    mac_results.append(mac)

# Assign results to df
df["HbA1c_slope_prev_year"] = slope_results
df["HbA1c_CV_prev_year"] = cv_results
df["HbA1c_MAC_prev_year"] = mac_results

df.head()


Unnamed: 0,DW_EK_Borger,sex,age,diagnosis,comp_eye,comp_renal,comp_neuro,comp_periph,comp_foot,comp_multiple,...,LABmean__p_triglycerid_mmol_l,LABmean__p_vitamin_b12_pmol_l,LABmean__u_albumin_kreatinin_ratio_mg_g,HbA1c_prev_year,eGFR_prev_year,TG_prev_year,LDL_prev_year,HbA1c_slope_prev_year,HbA1c_CV_prev_year,HbA1c_MAC_prev_year
0,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,2.0,497.0,133.0,,,,,,,
1,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,1.3,309.0,32.0,79.0,90.0,2.0,2.5,,,
2,7371,M,56.8,Type 1-diabetes,0,0,1,0,0,0,...,0.9,427.0,31.0,71.0,90.0,1.3,2.4,,,
3,8395,M,33.8,Type 1-diabetes,0,0,0,0,0,0,...,1.5,531.0,,,,,,,,
4,8395,M,33.8,Type 1-diabetes,0,0,0,0,0,0,...,0.8,410.0,5.0,67.0,90.0,1.5,2.6,,,


In [192]:
def _report_missing(column, count_header, sample_header, n_rows):
    """Print the missing-value line and ten non-missing sample rows for `column`.

    `count_header` is the text before the colon of the first printed line,
    `sample_header` is printed verbatim before the sample rows, and `n_rows`
    is the denominator for the percentage. Returns the number of missing values.
    """
    n_missing = df[column].isna().sum()
    print(f"{count_header}: {n_missing} / {n_rows} "
          f"({n_missing / n_rows * 100:.2f}%)")
    print(sample_header)
    print(df.loc[df[column].notna(), [ID_COL, "anchor_date", column]].head(10))
    return n_missing


total = len(df)

# --- Slope missingness ---
n_slope_nan = _report_missing(
    "HbA1c_slope_prev_year",
    "Missing HbA1c slope values",
    "\nSample rows with valid HbA1c slopes:",
    total,
)

# --- CV missingness ---
n_cv_nan = _report_missing(
    "HbA1c_CV_prev_year",
    "\nMissing HbA1c CV values",
    "\nSample rows with valid HbA1c CV values:",
    total,
)

# --- MAC missingness ---
mac_cv_nan = _report_missing(
    "HbA1c_MAC_prev_year",
    "\nMissing HbA1c MAC values",
    "\nSample rows with valid HbA1c MAC values:",
    total,
)


Missing HbA1c slope values: 1294 / 2766 (46.78%)

Sample rows with valid HbA1c slopes:
    DW_EK_Borger anchor_date  HbA1c_slope_prev_year
7          10562  2023-10-09              -0.000030
8          10562  2024-12-02               0.011292
10         10834  2023-05-24               0.025000
15         15619  2022-11-30              -0.134583
17         15619  2025-01-08              -0.024922
19         15621  2023-01-11              -0.152783
22         16209  2023-01-02               0.014493
28         20288  2022-12-15              -0.036296
31         22118  2022-05-05              -0.142507
32         22118  2023-12-14               0.034924

Missing HbA1c CV values: 1294 / 2766 (46.78%)

Sample rows with valid HbA1c CV values:
    DW_EK_Borger anchor_date  HbA1c_CV_prev_year
7          10562  2023-10-09            0.032696
8          10562  2024-12-02            0.047042
10         10834  2023-05-24            0.054814
15         15619  2022-11-30            0.174722
17      

## Missing value table (all columns)

In [193]:
def missing_table(df_in: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing values per column.

    Returns a frame indexed by column name with "Missing Count" (number of NaN
    entries) and "Missing %" (share of rows, rounded to two decimals). Columns
    without any missing value are left out; rows are ordered by "Missing %",
    largest first.
    """
    n_rows = len(df_in)
    counts = df_in.isna().sum()

    summary = counts.to_frame("Missing Count")
    summary["Missing %"] = (counts / n_rows * 100).round(2)

    has_missing = summary["Missing Count"] > 0
    return summary[has_missing].sort_values("Missing %", ascending=False)

# Missing-value summary for every column of df; displayed as the cell output.
missing_tbl = missing_table(df)
missing_tbl

Unnamed: 0,Missing Count,Missing %
HbA1c_MAC_prev_year,1294,46.78
HbA1c_CV_prev_year,1294,46.78
HbA1c_slope_prev_year,1294,46.78
LDL_prev_year,951,34.38
TG_prev_year,923,33.37
eGFR_prev_year,915,33.08
HbA1c_prev_year,912,32.97
who_score,722,26.1
LABmean__u_albumin_kreatinin_ratio_mg_g,542,19.6
LABmean__b_haemoglobin_mmol_l,497,17.97


## One-hot encode the diagnosis and sex columns

In [194]:
# Replace the categorical columns by indicator columns; drop_first=True omits the
# first level of each so the indicators are not redundant.
# NOTE(review): the source columns are consumed here, so this cell presumably
# fails if it is run a second time on the same df — confirm before re-running.
categorical_cols = ["diagnosis", "sex"]
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [196]:
# Write the feature table to Excel without the DataFrame index.
features_out_path = r"C:\Users\kfq6\Documents\Data\Sammedag_master_HbA1c_Features.xlsx"
df.to_excel(features_out_path, index=False)