In [12]:
# %%
import pandas as pd
import numpy as np
from pathlib import Path
import datetime as dt
import re
import unicodedata

# ---------------- CONFIG ----------------
# Patient identifier column used to join the windows, WHO, population and lab data.
ID_COL = "DW_EK_Borger"

# Root folder for every input and output workbook (edit here if the data moves).
OUT_DIR = Path(r"C:\Users\kfq6\Documents\Data")

# Prebuilt anchor windows (4 months before, 2 months after).
WINDOWS_4M2M_PATH = OUT_DIR.joinpath("Bookplan_sammedag_month_windows.xlsx")

# Remaining input workbooks.
POP_PATH = OUT_DIR.joinpath("Population.xlsx")
WHO_PATH = OUT_DIR.joinpath("WHO-5 (PRO).xlsx")
LAB_PATH = OUT_DIR.joinpath("LABKA_wide_numeric.xlsx")

# Base name for master outputs; the concrete output filenames are derived from its stem.
OUT_PATH_BASE = OUT_DIR.joinpath("Sammedag_master_means.xlsx")

# Output workbook for the 4m2m windows.
OUT_PATH_4M2M = OUT_PATH_BASE.with_name(f"{OUT_PATH_BASE.stem}_4m2m.xlsx")

# When True, only analyte columns that are numeric after coercion are averaged.
STRICT_NUMERIC_ANALYTES = True



In [13]:
# %% ---------------- HELPERS ----------------
def ensure_dates(df: pd.DataFrame, cols):
    """Convert the named columns of ``df`` to datetimes in place and return ``df``.

    Values are parsed day-first; anything unparseable becomes ``NaT``.

    Raises:
        KeyError: as soon as a requested column is not present in ``df``
            (columns listed before it have already been converted).
    """
    for name in cols:
        if name in df.columns:
            df[name] = pd.to_datetime(df[name], errors="coerce", dayfirst=True)
            continue
        raise KeyError(f"Missing required datetime column: {name}")
    return df


def pick_closest_in_window(df: pd.DataFrame,
                           date_col: str,
                           anchor: pd.Timestamp,
                           start: pd.Timestamp,
                           end: pd.Timestamp):
    """Select the row of ``df`` whose date lies closest to ``anchor`` inside [start, end].

    Both window bounds are inclusive. Distance is measured in whole days; when two
    rows are equally far from the anchor, the one with the later date wins.

    Returns:
        A one-row DataFrame (a copy, with an extra ``abs_days_from_anchor`` column),
        or ``None`` when no row falls inside the window.
    """
    inside = df[date_col].between(start, end)
    candidates = df.loc[inside].copy()
    if candidates.empty:
        return None

    candidates["abs_days_from_anchor"] = (candidates[date_col] - anchor).abs().dt.days
    ranked = candidates.sort_values(
        by=["abs_days_from_anchor", date_col],
        ascending=[True, False],
    )
    return ranked.head(1)


In [14]:
# %% ---------------- LOAD WINDOWS (4M2M) ----------------
def load_windows(path: Path) -> pd.DataFrame:
    """Read the prebuilt anchor-window workbook and verify its required columns.

    The header row is read and validated first. Passing ``parse_dates`` for a
    column that is absent makes ``read_excel`` fail on its own, which would hide
    the descriptive ``KeyError`` below for any missing date column.

    Parameters:
        path: location of the windows ``.xlsx`` file.

    Returns:
        The windows table with ``anchor_date``, ``window_start`` and
        ``window_end`` parsed as dates.

    Raises:
        KeyError: if any of the three date columns or ``ID_COL`` is missing.
    """
    date_cols = ["anchor_date", "window_start", "window_end"]
    need_win = set(date_cols) | {ID_COL}

    # Header-only read: cheap, and lets us report missing columns ourselves.
    header = pd.read_excel(path, engine="openpyxl", nrows=0)
    missing = need_win - set(header.columns)
    if missing:
        raise KeyError(f"'windows' file {path} missing columns: {missing}")

    return pd.read_excel(
        path,
        engine="openpyxl",
        parse_dates=date_cols,
    )

windows_4m2m = load_windows(WINDOWS_4M2M_PATH)

# Store the patient ID as nullable integer so it compares cleanly with the other tables;
# values that are not numeric become <NA>.
_window_ids = pd.to_numeric(windows_4m2m[ID_COL], errors="coerce")
windows_4m2m[ID_COL] = _window_ids.astype("Int64")


In [15]:
# %% ---------------- LOAD WHO ----------------
who = pd.read_excel(WHO_PATH, engine="openpyxl")
who.columns = who.columns.str.strip()  # drop stray whitespace around header names

# Use the first candidate date column that exists in the file.
WHO_DATE_CANDIDATES = ["Dato_Besoeg"]
who_date_col = None
for _candidate in WHO_DATE_CANDIDATES:
    if _candidate in who.columns:
        who_date_col = _candidate
        break
if who_date_col is None:
    raise KeyError(
        f"WHO file missing a date column; expected one of: {', '.join(WHO_DATE_CANDIDATES)}"
    )

who = ensure_dates(who, [who_date_col])

if ID_COL not in who.columns:
    raise KeyError(f"WHO file missing ID column: {ID_COL}")

# Nullable integer IDs, matching the windows table.
who[ID_COL] = pd.to_numeric(who[ID_COL], errors="coerce").astype("Int64")

# The score column is optional: who_score_col stays None when no candidate is present.
WHO_SCORE_CANDIDATES = ["WHO5_score", "WHO5", "WHO-5", "WHO5_total"]
who_score_col = None
for _candidate in WHO_SCORE_CANDIDATES:
    if _candidate in who.columns:
        who_score_col = _candidate
        break



In [16]:
# %% ---------------- LOAD POPULATION ----------------
pop = pd.read_excel(POP_PATH, engine="openpyxl")

# Basic sanity checks: report the first required column that is absent.
required_cols = ["Aktionsdiagnosegruppe", "CPRNummer"]
_missing_pop_col = next((name for name in required_cols if name not in pop.columns), None)
if _missing_pop_col is not None:
    raise KeyError(f"Population file missing column: '{_missing_pop_col}'")
if ID_COL not in pop.columns:
    raise KeyError(f"Population file missing ID column: {ID_COL}")

pop[ID_COL] = pd.to_numeric(pop[ID_COL], errors="coerce").astype("Int64")

# Clean Aktionsdiagnosegruppe (used for diabetes type): trim whitespace and turn
# the textual stand-ins for "missing" back into real NaN.
_diag_group = pop["Aktionsdiagnosegruppe"].astype(str).str.strip()
pop["Aktionsdiagnosegruppe"] = _diag_group.mask(
    _diag_group.isin(["", "nan", "None"]), np.nan
)

def pick_mode(series: pd.Series):
    """Return the most frequent non-missing value of ``series``.

    Ties are broken by taking the smallest of the tied values; a series with no
    non-missing values yields ``NaN``.
    """
    present = series.dropna()
    if present.empty:
        return np.nan
    tied_modes = present.mode()
    return tied_modes.sort_values().iloc[0]

# ---- CPR parsing: birthdate, sex, age ----
CPR_COL = "CPRNummer"


def _clean_cpr(value):
    """Return a CPR value as a zero-padded 10-digit string, or NaN when there is none.

    Missing values stay missing instead of being stringified to "nan" and padded
    to "0000000000", which the sex parser would otherwise read as a valid CPR.
    Whole-number floats (e.g. 101901234.0, as Excel yields when the column also
    holds blanks) are converted to int first so the trailing ".0" does not add a
    spurious digit.
    """
    if pd.isna(value):
        return np.nan
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    digits = re.sub(r"\D", "", str(value))
    # zfill restores leading zeros lost when the CPR was stored as a number.
    return digits.zfill(10) if digits else np.nan


pop[CPR_COL] = pop[CPR_COL].map(_clean_cpr)

def parse_cpr_birth(cpr: str):
    """Derive the birth date from a Danish CPR number (``DDMMYY`` + 4-digit serial).

    The century is taken from the 7th digit together with the two-digit year,
    following the official CPR rule:

        7th digit 0-3        -> 1900-1999
        7th digit 4 or 9     -> 2000-2036 when YY <= 36, otherwise 1937-1999
        7th digit 5-8        -> 2000-2057 when YY <= 57, otherwise 1858-1899

    Guessing the century from today's date alone places everyone born 19xx with
    YY <= the current two-digit year in the 2000s (e.g. 1920 -> 2020). That guess
    is only kept as a fallback when no 7th digit is available.

    Parameters:
        cpr: CPR as a digit string; at least the six date digits are required.

    Returns:
        ``datetime.date`` of birth, or ``NaN`` when ``cpr`` is not a string, is
        too short, contains non-digits in the date part, or encodes an invalid
        calendar date.
    """
    if not isinstance(cpr, str) or len(cpr) < 6:
        return np.nan
    try:
        dd, mm, yy = int(cpr[:2]), int(cpr[2:4]), int(cpr[4:6])

        if len(cpr) >= 7 and cpr[6].isdigit():
            seventh = int(cpr[6])
            if seventh <= 3:
                century = 1900
            elif seventh in (4, 9):
                century = 2000 if yy <= 36 else 1900
            else:  # 5, 6, 7, 8
                century = 2000 if yy <= 57 else 1800
        else:
            # No century digit available: fall back to the "not in the future" guess.
            century = 1900 if yy > dt.date.today().year % 100 else 2000

        # dt.date validates day/month (incl. leap years) and raises ValueError if invalid.
        return dt.date(century + yy, mm, dd)
    except ValueError:
        return np.nan

# Birth date per population row; unparseable or missing CPR values yield NaN.
pop["birth_date"] = pop[CPR_COL].map(parse_cpr_birth)

def parse_cpr_sex(cpr: str):
    """Derive sex from a CPR number: odd last digit -> "M", even last digit -> "F".

    The date part is sanity-checked first, so placeholder or garbage values
    (notably the all-zero string left over from padding an empty CPR) do not get
    classified as "F" just because their last digit is even.

    Parameters:
        cpr: CPR as a digit string of at least 10 characters.

    Returns:
        "M", "F", or ``NaN`` when ``cpr`` is not a string, is shorter than 10
        characters, has a day outside 1-31 or month outside 1-12, or contains
        non-digit characters where digits are required.
    """
    if not isinstance(cpr, str) or len(cpr) < 10:
        return np.nan
    try:
        day, month = int(cpr[:2]), int(cpr[2:4])
        if not (1 <= day <= 31 and 1 <= month <= 12):
            return np.nan
        return "M" if int(cpr[-1]) % 2 == 1 else "F"
    except ValueError:
        return np.nan

pop["sex"] = pop[CPR_COL].map(parse_cpr_sex)

# Age in years (one decimal) as of the day the notebook is run.
today_ts = pd.Timestamp.today().normalize()
_birth_ts = pd.to_datetime(pop["birth_date"], errors="coerce")
pop["age"] = (today_ts - _birth_ts).dt.days.div(365.25).round(1)


In [17]:
# ----------------------------------------------------
# DIAGNOSE-PARSING: type + komplikationer
# ----------------------------------------------------

# Diabetes type derived from Aktionsdiagnosegruppe
type_txt = pop["Aktionsdiagnosegruppe"].fillna("")

# (text to look for, label) in priority order: when several match, the first one wins.
_DIABETES_TYPE_RULES = [
    ("Type 1-diabetes", "T1DM"),
    ("Type 2-diabetes", "T2DM"),
    ("Anden diabetes",  "OtherDM"),
    ("Diabetes UNS",    "Diabetes_UNS"),
]

# Built with pandas rather than np.select(..., default=np.nan): mixing string
# choices with a float default either turns "no match" into the literal string
# "nan" (which dropna() in pick_mode does not remove) or raises a TypeError on
# NumPy versions that refuse to promote str and float.
_diabetes_type = pd.Series(np.nan, index=pop.index, dtype="object")
# Apply rules from lowest to highest priority so earlier rules overwrite later ones.
for _pattern, _label in reversed(_DIABETES_TYPE_RULES):
    _matches = type_txt.str.contains(_pattern, case=False, na=False)
    _diabetes_type.loc[_matches] = _label

pop["diabetes_type"] = _diabetes_type

# DE code (taken from DiagnoseKode when that column exists – otherwise left empty)
pop["diag_code"] = (
    pop["DiagnoseKode"].astype(str).str.extract(r"(DE\d{3})")[0]
    if "DiagnoseKode" in pop.columns
    else np.nan
)

# Make sure the detailed diagnosis text columns exist
_DETAIL_TEXT_COLS = ["DiagnoseKodeTekst", "DiagnoseNiveau5KodeTekst", "DiagnoseNiveau6KodeTekst"]
for c in _DETAIL_TEXT_COLS:
    if c not in pop.columns:
        pop[c] = ""

# Combined text that is scanned for complications (columns joined by single spaces)
comp_txt = pop[_DETAIL_TEXT_COLS[0]].fillna("")
for c in _DETAIL_TEXT_COLS[1:]:
    comp_txt = comp_txt + " " + pop[c].fillna("")


def _comp_has(pattern):
    """Case-insensitive search of ``comp_txt`` for ``pattern``; returns a boolean Series."""
    return comp_txt.str.contains(pattern, case=False, na=False)


# Complication flags per row (bool)
# "Any complication": mentions a complication and does not say "without complications".
pop["comp_any"] = _comp_has("komplikation") & ~_comp_has("uden komplikationer")

_COMP_PATTERNS = {
    "comp_eye":      "øjenkomplikation",
    "comp_renal":    "nyrekomplikation",
    "comp_neuro":    "neurologisk komplikation",
    "comp_periph":   "perifere karsystem",
    "comp_foot":     "fodsår",
    "comp_multiple": "multiple komplikationer",
    "comp_ketoac":   "ketoacidose",
    "comp_coma":     r"\bkoma\b",
}
for _flag, _pattern in _COMP_PATTERNS.items():
    pop[_flag] = _comp_has(_pattern)

# Unspecified / other complication
pop["comp_uns"] = _comp_has("komplikation UNS") | _comp_has("anden komplikation")

# ----------------------------------------------------
# AGGREGATION TO ONE ROW PER PATIENT
# ----------------------------------------------------
comp_cols = [
    "comp_any", "comp_eye", "comp_renal", "comp_neuro",
    "comp_periph", "comp_foot", "comp_multiple",
    "comp_ketoac", "comp_coma", "comp_uns"
]

agg_dict = {
    "Aktionsdiagnosegruppe": pick_mode,  # overall diagnosis text
    "sex": pick_mode,
    "age": "mean",
    "diabetes_type": pick_mode,
    # "max" over boolean rows acts as a logical OR across a patient's rows
    **dict.fromkeys(comp_cols, "max"),
}

_per_patient = pop.groupby(ID_COL, as_index=False).agg(agg_dict)
pop_diag = _per_patient.rename(columns={"Aktionsdiagnosegruppe": "diagnosis"})

# Convert complication flags to 0/1 (int) for modelling
pop_diag = pop_diag.astype({flag: int for flag in comp_cols})

# Final column order
pop_diag = pop_diag[[ID_COL, "sex", "age", "diabetes_type", "diagnosis", *comp_cols]]

print("Population summary (one row per patient):")
print(pop_diag.head())

Population summary (one row per patient):
   DW_EK_Borger sex   age diabetes_type        diagnosis  comp_any  comp_eye  \
0          2611   M  59.9          T2DM  Type 2-diabetes         1         0   
1          2822   M  55.9          T1DM  Type 1-diabetes         1         0   
2          2897   M  54.9          T2DM  Type 2-diabetes         1         0   
3          3557   M  43.9          T1DM  Type 1-diabetes         0         0   
4          4001   M  37.9          T1DM  Type 1-diabetes         1         1   

   comp_renal  comp_neuro  comp_periph  comp_foot  comp_multiple  comp_ketoac  \
0           1           0            0          0              0            0   
1           0           0            0          0              0            0   
2           0           0            1          1              0            0   
3           0           0            0          0              0            0   
4           0           0            0          0              0        

In [18]:
# %% ---------------- LABKA NORMALISATION HELPERS ----------------
def normalize_header(s: str) -> str:
    """Turn a raw column header into a lowercase ASCII ``snake_case`` identifier.

    Steps, in order:
      1. stringify and trim;
      2. decode ``_xHHHH_`` escapes into the character they encode;
      3. drop ``<sup>`` markup and its escaped variants;
      4. unify symbols (both mu code points -> ``u``, ``×`` -> ``x``,
         ``−`` -> ``-``, slashes -> ``_``);
      5. transliterate Danish letters (æ->ae, ø->oe, å->aa);
      6. NFKD-fold the rest (strips accents, maps e.g. ``²`` -> ``2``);
      7. lowercase, collapse every run of non-word characters to one ``_``;
      8. harmonise a few unit spellings (``mmol_liter`` -> ``mmol_l`` ...).

    The Danish transliteration must run *before* the NFKD fold: NFKD splits
    ``å`` into ``a`` + combining ring, so doing it afterwards silently produced
    ``a`` instead of ``aa``.

    Parameters:
        s: header value; non-strings are converted with ``str()``.

    Returns:
        The normalised header.
    """
    if not isinstance(s, str):
        s = str(s)
    s = s.strip()

    # Decode _xHHHH_
    s = re.sub(r"_x([0-9A-Fa-f]{4})_", lambda m: chr(int(m.group(1), 16)), s)

    # Strip simple HTML sup tags and variants
    for tag in ("<sup>", "</sup>", "<_sup>", "&sup;", "&lt;sup&gt;", "&lt;/sup&gt;"):
        s = s.replace(tag, "")

    # Unify symbols. Both the micro sign (U+00B5) and Greek small mu (U+03BC)
    # occur in unit prefixes, and NFKD maps the former onto the latter.
    for old, new in (
        ("\u00b5", "u"),
        ("\u03bc", "u"),
        ("×", "x"),
        ("−", "-"),
        ("/", "_"),
        ("\\", "_"),
    ):
        s = s.replace(old, new)

    # Danish letters -> ASCII digraphs. NFC first so a decomposed "a + ring"
    # is recognised as "å" as well.
    s = unicodedata.normalize("NFC", s)
    for old, new in (
        ("æ", "ae"), ("Æ", "Ae"),
        ("ø", "oe"), ("Ø", "Oe"),
        ("å", "aa"), ("Å", "Aa"),
    ):
        s = s.replace(old, new)

    # NFKD + drop combining marks: folds remaining accents and maps ² -> 2 etc.
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))

    # Special-case: 1,73m2 -> 1_73m2
    s = (s.replace("1,73m2", "1_73m2")
           .replace("1,73m", "1_73m"))

    s = s.lower()
    s = re.sub(r"[^\w]+", "_", s)
    s = re.sub(r"_+", "_", s).strip("_")

    # Units harmonization
    s = (s.replace("mmol_liter", "mmol_l")
           .replace("mmol_per_l", "mmol_l")
           .replace("pmol_liter", "pmol_l"))
    return s


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose column labels were each run through ``normalize_header``.

    The input frame is left untouched.
    """
    renamed = df.copy()
    renamed.columns = list(map(normalize_header, renamed.columns))
    return renamed


def guess_lab_date_column(cols):
    """Pick the lab date column among already-normalised header names.

    The patterns are tried in priority order; for each pattern the columns are
    scanned in their given order and the first match is returned.

    Returns:
        The matching column name, or ``None`` when no pattern matches any column.
    """
    candidates = (
        r"^testdato$",
        r"^dato_proevetagningstid$",
        r"^dato_proevetagning$",
        r"^proevetagning.*(dato|tid)$",
        r"^(pr\w*tagning|sample).*date.*$",
        r"^(analysetid|analysedato)$",
    )
    hits = (name for pat in candidates for name in cols if re.search(pat, name))
    return next(hits, None)


# Maps a canonical analyte alias to the regexes used to locate its column among
# the *normalised* lab headers (see normalize_header). find_analyte_columns tries
# each alias's patterns in list order and stops at the first pattern that matches
# at least one column, so more specific patterns are listed before looser ones.
ANALYTE_PATTERNS = {
    "b_haemoglobin_mmol_l": [
        r"^b_haemoglobin_mmol_l$",
        r"^b_haemoglobin.*mmol_l$",
        r"^b_h?ae?moglobin_?mmol_l$",
    ],
    "hba1c_ifcc_mmol_mol": [
        r"(hb|hba1c).*(a1c).*(ifcc).*(mmol_mol)",
        r"hb_?b_?.*h?ae?moglobin.*a1c.*ifcc.*mmol_mol",
        r"hba1c.*ifcc.*mmol_mol",
        r"hba1c.*mmol_mol",
    ],
    "p_25oh_vitd_nmol_l": [
        r"25.*hydroxy.*vitamin.*d.*nmol_l",
        r"vitamin.*d3.*d2.*nmol_l",
    ],
    "p_albumin_g_l": [r"^p_albumin_g_l$"],
    "p_calcium_mmol_l": [r"^p_calcium_mmol_l$"],
    "p_calcium_albuminkorrigeret_mmol_l": [r"^p_calcium_.*albumin.*mmol_l$"],
    # Danish header first, English spelling as alternative.
    "p_kalium_mmol_l": [r"^p_kalium_mmol_l$", r"^p_potassium_mmol_l$"],
    "p_kolesterol_total_mmol_l": [r"^p_kolesterol_mmol_l$", r"^p_cholesterol_mmol_l$"],
    "p_hdl_mmol_l": [r"^p_kolesterol_hdl_mmol_l$", r"^p_hdl_mmol_l$"],
    "p_ldl_mmol_l": [r"^p_kolesterol_ldl_mmol_l$", r"^p_ldl_mmol_l$"],
    # "u?" accepts both "umol" and "mol" in the unit part of the header.
    "p_kreatinin_umol_l": [r"^p_kreatinin_u?mol_l$"],
    "p_natrium_mmol_l": [r"^p_natrium_mmol_l$", r"^p_sodium_mmol_l$"],
    "p_triglycerid_mmol_l": [r"^p_triglycerid_mmol_l$", r"^p_triglycerides?_mmol_l$"],
    "p_vitamin_b12_pmol_l": [r"^p_vitamin_b12_pmol_l$"],
    # Urine albumin/creatinine ratio reported in mg/g ...
    "u_acr_mg_g": [
        r"^u_.*albumin.*kreatinin.*mg_g$",
        r"^uacr_mg_g$"
    ],
    # ... and the same ratio reported with a "x 10^-3" style unit.
    "u_acr_x10e_3": [
        r"u_.*albumin.*kreatinin.*(x|times)_?10_?-?3",
        r"u_.*albumin.*kreatinin.*10_?-?3",
        r"uacr.*(x|times)?_?10_?-?3",
    ],
    "egfr_ckd_epi_ml_min_1_73m2": [
        r"^egfr.*ckd.*epi.*ml_min",
        r"^egfr_.*1_73m2.*ml_min",
        r"^egfr_ml_min(_1_73m2)?$",
    ],
}


def find_analyte_columns(cols):
    """Map each analyte alias in ``ANALYTE_PATTERNS`` to the column that represents it.

    For every alias the patterns are tried in order; the first pattern that
    matches one or more columns decides, and among its matches the longest
    column name is chosen (the earliest one when lengths tie). Aliases without
    any match are left out of the result.

    Returns:
        dict of ``alias -> column name``.
    """
    found = {}
    for alias, patterns in ANALYTE_PATTERNS.items():
        hits_per_pattern = (
            [name for name in cols if re.search(pattern, name)]
            for pattern in patterns
        )
        first_hits = next((hits for hits in hits_per_pattern if hits), None)
        if first_hits:
            found[alias] = max(first_hits, key=len)
    return found


def coerce_numeric_series(s: pd.Series) -> pd.Series:
    """Parse a column of lab results, possibly stored as text, into numbers.

    Handles ``_xHHHH_`` escapes, a leading ``<``/``>``/``<=``/``>=`` comparator
    (the bound itself is kept as the value), decimal commas, non-breaking spaces
    and the Unicode minus sign. Anything that still is not a number becomes NaN.
    """
    def _decode_escape(match):
        return chr(int(match.group(1), 16))

    text = s.astype(str).str.replace(r"_x([0-9A-Fa-f]{4})_", _decode_escape, regex=True)

    # (pattern, replacement, is_regex) – applied strictly in this order.
    cleanup_steps = (
        (r'^\s*[<>]=?\s*', '', True),
        (',', '.', False),
        ('\u00A0', '', False),
        ('−', '-', False),
    )
    for old, new, is_regex in cleanup_steps:
        text = text.str.replace(old, new, regex=is_regex)

    return pd.to_numeric(text.str.strip(), errors='coerce')



In [19]:
# %% ---------------- LOAD LABKA ----------------
lab = pd.read_excel(LAB_PATH, engine="openpyxl")
lab = normalize_columns(lab)

lab_date_col = guess_lab_date_column(lab.columns)
if not lab_date_col:
    raise KeyError(
        "Kunne ikke finde dato-kolonnen. Ledte efter Testdato/Prøvetagningstid mv. "
        f"Første 20 normaliserede headers: {', '.join(list(lab.columns)[:20])}"
    )

lab[lab_date_col] = pd.to_datetime(lab[lab_date_col], errors="coerce", dayfirst=True)

# Align ID column name + type (headers were normalised, so look for the normalised ID name)
id_norm = normalize_header(ID_COL)
if id_norm not in lab.columns:
    raise KeyError(
        f"LAB mangler ID-kolonnen. Forventede '{id_norm}'. "
        f"Havde: {', '.join(list(lab.columns)[:10])} ..."
    )

if id_norm != ID_COL:
    lab = lab.rename(columns={id_norm: ID_COL})

lab[ID_COL] = pd.to_numeric(lab[ID_COL], errors="coerce").astype("Int64")

mapping = find_analyte_columns(lab.columns)
if not mapping:
    raise RuntimeError("Fandt ingen analysekolonner. Headers er ændret, udvid ANALYTE_PATTERNS.")

# Distinct matched columns in mapping order. Two aliases could in principle
# resolve to the same column; a duplicated label would break the per-column
# mean lookup later on.
_analyte_cols = list(dict.fromkeys(mapping.values()))

for col in _analyte_cols:
    lab[col] = coerce_numeric_series(lab[col])

if STRICT_NUMERIC_ANALYTES:
    # After coercion every column has a numeric dtype, so a dtype test keeps
    # everything. "Actually numeric" is taken to mean that at least one value
    # survived the coercion.
    lab_value_cols = [c for c in _analyte_cols if lab[c].notna().any()]
    _dropped_cols = [c for c in _analyte_cols if c not in lab_value_cols]
    if _dropped_cols:
        print("Analyte columns without any numeric value (dropped):", ", ".join(_dropped_cols))
else:
    lab_value_cols = _analyte_cols

print("LAB date column:", lab_date_col)
print("Antal analyter fundet (mapping):", len(mapping))
print("Antal analyter der er numeriske (lab_value_cols):", len(lab_value_cols))
print("Match (alias -> kolonne):")
for k, v in mapping.items():
    print(f"  {k:30s} -> {v}")


LAB date column: testdato
Antal analyter fundet (mapping): 17
Antal analyter der er numeriske (lab_value_cols): 17
Match (alias -> kolonne):
  b_haemoglobin_mmol_l           -> b_haemoglobin_mmol_l
  hba1c_ifcc_mmol_mol            -> hb_b_haemoglobin_a1c_ifcc_mmol_mol
  p_25oh_vitd_nmol_l             -> p_25_hydroxy_vitamin_d_d3_d2_nmol_l
  p_albumin_g_l                  -> p_albumin_g_l
  p_calcium_mmol_l               -> p_calcium_mmol_l
  p_calcium_albuminkorrigeret_mmol_l -> p_calcium_albuminkorrigeret_mmol_l
  p_kalium_mmol_l                -> p_kalium_mmol_l
  p_kolesterol_total_mmol_l      -> p_kolesterol_mmol_l
  p_hdl_mmol_l                   -> p_kolesterol_hdl_mmol_l
  p_ldl_mmol_l                   -> p_kolesterol_ldl_mmol_l
  p_kreatinin_umol_l             -> p_kreatinin_umol_l
  p_natrium_mmol_l               -> p_natrium_mmol_l
  p_triglycerid_mmol_l           -> p_triglycerid_mmol_l
  p_vitamin_b12_pmol_l           -> p_vitamin_b12_pmol_l
  u_acr_mg_g                   

In [20]:
# %% ---------------- BUILD MASTER (4M2M) ----------------
# Optional columns of the windows table that build_master_for_windows copies
# unchanged into each master row; a column that is absent from the windows
# table is simply skipped.
CARRY_COLS = [
    "n_visits_in_window",
    "visit_datetimes",
    "visit_topics",
    "visit_statuses",
    "n_anchor_bookings_that_day",
    "anchor_first_time",
    "anchor_last_time",
]

def build_master_for_windows(windows: pd.DataFrame, window_label: str) -> pd.DataFrame:
    """Build the master table: one row per anchor window with WHO, lab and population data.

    For every row of ``windows`` the function

    * picks the WHO row closest to the anchor inside ``[window_start, window_end]``;
      when there is none it falls back to the closest WHO row in the anchor's
      calendar year;
    * averages each column of ``lab_value_cols`` over the patient's lab rows
      dated inside the window (output columns ``LABmean__<column>``);
    * copies through whichever ``CARRY_COLS`` are present in ``windows``;

    and finally left-joins the per-patient summary ``pop_diag``.

    Reads these module-level objects: ``who``, ``who_date_col``,
    ``who_score_col``, ``lab``, ``lab_date_col``, ``lab_value_cols``,
    ``pop_diag``, ``comp_cols``, ``CARRY_COLS`` and ``ID_COL``.

    Parameters:
        windows: anchor windows with at least ``ID_COL``, ``anchor_date``,
            ``window_start`` and ``window_end``.
        window_label: value written to the ``window_type`` column of every row.

    Returns:
        DataFrame ordered as: population columns, anchor/window columns, carried
        booking columns, WHO columns, lab meta columns, any other columns, then
        the ``LABmean__*`` columns sorted by name. For an empty ``windows`` an
        empty frame carrying the fixed leading columns is returned.
    """
    # Preferred leading column order; only the ones that exist are used.
    front = [
        ID_COL,
        "sex", "age",
        "diabetes_type", "diagnosis",
        # complication flags (if present)
        "comp_any", "comp_eye", "comp_renal", "comp_neuro",
        "comp_periph", "comp_foot", "comp_multiple",
        "comp_ketoac", "comp_coma", "comp_uns",
        # anchor + windows
        "anchor_date", "window_start", "window_end", "window_type",
        # booking / visits
        "n_visits_in_window", "visit_datetimes", "visit_topics", "visit_statuses",
        "n_anchor_bookings_that_day", "anchor_first_time", "anchor_last_time",
        # WHO
        "has_who_in_window", "has_who_in_year",
        "who_date", "who_days_from_anchor", "who_score",
        # LAB window meta
        "has_lab_in_window", "lab_window_start", "lab_window_end", "lab_window_n_rows",
    ]

    # Nothing to build: return an empty, correctly-shaped frame instead of
    # failing later in the merge on a frame that has no columns at all.
    if windows.empty:
        return pd.DataFrame(columns=front)

    # Split WHO and LAB by patient once, instead of scanning the full tables for
    # every window row. groupby keeps the original row order inside each group
    # and leaves out rows whose ID is missing.
    who_by_pid = {key: grp for key, grp in who.groupby(ID_COL, sort=False)}
    lab_by_pid = {key: grp for key, grp in lab.groupby(ID_COL, sort=False)}
    who_none = who.iloc[0:0]
    lab_none = lab.iloc[0:0]

    rows = []

    # to_dict("records") keeps the real column names as keys; namedtuple-based
    # iteration renames any column that is not a valid Python identifier.
    for wd in windows.to_dict("records"):
        pid       = wd[ID_COL]
        anchor    = pd.to_datetime(wd["anchor_date"])
        win_start = pd.to_datetime(wd["window_start"])
        win_end   = pd.to_datetime(wd["window_end"])

        # A missing ID matches no patient; avoid using <NA> as a dict key.
        if pd.isna(pid):
            who_pid, lab_pid = who_none, lab_none
        else:
            who_pid = who_by_pid.get(pid, who_none)
            lab_pid = lab_by_pid.get(pid, lab_none)

        # ---------- WHO (window first, then calendar-year fallback) ----------
        who_pick = pick_closest_in_window(
            who_pid,
            date_col=who_date_col,
            anchor=anchor,
            start=win_start,
            end=win_end,
        )
        who_within_6m = who_pick is not None

        if not who_within_6m:
            # Same calendar year as the anchor. The upper bound is the last
            # microsecond of 31 December so that timestamps carrying a
            # time-of-day on that date still count as "in the year".
            who_pick = pick_closest_in_window(
                who_pid,
                date_col=who_date_col,
                anchor=anchor,
                start=pd.Timestamp(anchor.year, 1, 1),
                end=pd.Timestamp(anchor.year, 12, 31, 23, 59, 59, 999999),
            )

        who_present = who_pick is not None
        if who_present:
            who_date = who_pick[who_date_col].iloc[0]
            who_days = (who_date - anchor).days
            # who_score_col is None when the WHO file has no recognised score column.
            who_score = (
                who_pick[who_score_col].iloc[0]
                if (who_score_col and who_score_col in who_pick.columns)
                else None
            )
        else:
            who_date  = pd.NaT
            who_days  = None
            who_score = None

        # ---------- LAB: rows dated inside the window (bounds inclusive) ----------
        sub = lab_pid[lab_pid[lab_date_col].between(win_start, win_end)]
        lab_present = not sub.empty

        lab_means = {}
        if lab_present and lab_value_cols:
            means = sub[lab_value_cols].mean(axis=0, skipna=True)
            lab_means = {f"LABmean__{c}": means[c] for c in lab_value_cols}

        # ---------- assemble row ----------
        row = {
            ID_COL: pid,
            "anchor_date": anchor,
            "window_start": win_start,
            "window_end": win_end,
            "window_type": window_label,
            # WHO
            "has_who_in_window": bool(who_present and who_within_6m),
            "has_who_in_year":   bool(who_present),
            "who_date": who_date,
            "who_days_from_anchor": who_days,
            "who_score": who_score,
            # LAB
            "has_lab_in_window": bool(lab_present),
            "lab_window_start": win_start,
            "lab_window_end": win_end,
            "lab_window_n_rows": int(len(sub)),
        }

        for c in CARRY_COLS:
            if c in wd:
                row[c] = wd[c]

        row.update(lab_means)
        rows.append(row)

    master = pd.DataFrame(rows)

    # --- merge population (diagnosis + type + complications) ---
    cols_from_pop = [ID_COL, "sex", "age", "diabetes_type", "diagnosis"] + comp_cols
    pop_view = pop_diag[[c for c in cols_from_pop if c in pop_diag.columns]]
    master = master.merge(pop_view, on=ID_COL, how="left")

    # --- order columns ---
    # All LAB means at the end, sorted
    mean_cols = sorted(c for c in master.columns if c.startswith("LABmean__"))

    # Anything else that sneaks in goes between the fixed front and the means
    placed = set(front) | set(mean_cols)
    leftovers = [c for c in master.columns if c not in placed]

    ordered_cols = [c for c in front if c in master.columns] + leftovers + mean_cols
    return master[ordered_cols]


In [21]:
# %% ---------------- RUN BUILD FOR 4M2M ----------------
master_4m2m = build_master_for_windows(windows_4m2m, "4m2m")

# Write the full 4m-before / 2m-after dataset to a single workbook.
master_4m2m.to_excel(OUT_PATH_4M2M, index=False)
print(f"Saved 4m2m window dataset: {OUT_PATH_4M2M}")

n_all = len(master_4m2m)
if n_all:
    # An anchor is "complete" when it has both a WHO row and lab rows inside its window.
    mask_complete = master_4m2m["has_who_in_window"] & master_4m2m["has_lab_in_window"]
    n_complete = int(mask_complete.sum())
    pct = n_complete / n_all * 100
    print(f"[4m2m] Complete anchors (WHO+LAB): {n_complete}/{n_all} ({pct:.1f}%)")
    print("Unique patients in 4m2m:", master_4m2m[ID_COL].nunique())
else:
    print("[4m2m] No rows at all. Something is wrong upstream.")
# %% ---------------- FILTER BY NUMBER OF ANCHORS PER PATIENT ----------------

# Number of anchor rows per patient
anchor_counts = master_4m2m.groupby(ID_COL).size().rename("n_anchors")

# Attach the per-patient count to every row (on a fresh copy of the frame)
master_4m2m = master_4m2m.assign(n_anchors=master_4m2m[ID_COL].map(anchor_counts))

print("\nAnchor count per patient (summary):")
print(master_4m2m["n_anchors"].describe())

# Keep only patients that have at least 2 anchor rows
_has_two_plus = master_4m2m["n_anchors"] >= 2
master_4m2m_ge2 = master_4m2m.loc[_has_two_plus].copy()

print(f"\nRows after dropping patients with <2 anchors: {len(master_4m2m_ge2)}")
print("Unique patients with >=2 anchors:", master_4m2m_ge2[ID_COL].nunique())

_n_anchors_ge2 = master_4m2m_ge2["n_anchors"]

# --- Dataset 1: patients with exactly 2, 3, or 4 anchors ---
mask_2to4 = (_n_anchors_ge2 >= 2) & (_n_anchors_ge2 <= 4)
ds_4m2m_2to4 = master_4m2m_ge2.loc[mask_2to4].copy()

print(f"\n[4m2m, 2–4 anchors] rows: {len(ds_4m2m_2to4)}, patients: {ds_4m2m_2to4[ID_COL].nunique()}")

OUT_PATH_4M2M_2TO4 = OUT_PATH_BASE.with_name(f"{OUT_PATH_BASE.stem}_4m2m_2to4.xlsx")
ds_4m2m_2to4.to_excel(OUT_PATH_4M2M_2TO4, index=False)
print(f"Saved 4m2m dataset (2–4 anchors): {OUT_PATH_4M2M_2TO4}")

# --- Dataset 2: patients with exactly 3 or 4 anchors ---
mask_3to4 = (_n_anchors_ge2 >= 3) & (_n_anchors_ge2 <= 4)
ds_4m2m_3to4 = master_4m2m_ge2.loc[mask_3to4].copy()

print(f"\n[4m2m, 3–4 anchors] rows: {len(ds_4m2m_3to4)}, patients: {ds_4m2m_3to4[ID_COL].nunique()}")

OUT_PATH_4M2M_3TO4 = OUT_PATH_BASE.with_name(f"{OUT_PATH_BASE.stem}_4m2m_3to4.xlsx")
ds_4m2m_3to4.to_excel(OUT_PATH_4M2M_3TO4, index=False)
print(f"Saved 4m2m dataset (3–4 anchors): {OUT_PATH_4M2M_3TO4}")


Saved 4m2m window dataset: C:\Users\kfq6\Documents\Data\Sammedag_master_means_4m2m.xlsx
[4m2m] Complete anchors (WHO+LAB): 3250/4597 (70.7%)
Unique patients in 4m2m: 2037

Anchor count per patient (summary):
count    4597.000000
mean        2.547531
std         0.716106
min         1.000000
25%         2.000000
50%         3.000000
75%         3.000000
max         4.000000
Name: n_anchors, dtype: float64

Rows after dropping patients with <2 anchors: 4168
Unique patients with >=2 anchors: 1608

[4m2m, 2–4 anchors] rows: 4168, patients: 1608
Saved 4m2m dataset (2–4 anchors): C:\Users\kfq6\Documents\Data\Sammedag_master_means_4m2m_2to4.xlsx

[4m2m, 3–4 anchors] rows: 2766, patients: 907
Saved 4m2m dataset (3–4 anchors): C:\Users\kfq6\Documents\Data\Sammedag_master_means_4m2m_3to4.xlsx


In [22]:
# %% ---------------- VALIDATION: RANDOM ROW CHECKS (4M2M DATASET) ----------------
import numpy as np

# Spot-check a few random master rows against the raw inputs: for each sampled
# anchor, show the source window row, the population row, the WHO rows in the
# window and the lab means recomputed from scratch.

master = master_4m2m        # already in memory
win    = windows_4m2m       # original 4m2m windows

N = 5  # how many random anchors to inspect

np.random.seed(42)  # fixed seed so the same rows are inspected on every run
sample_idx = np.random.choice(len(master), size=min(N, len(master)), replace=False)

def pretty_print_separator(title=None):
    """Print a horizontal rule; when ``title`` is given, print it plus a closing rule."""
    print("\n" + "="*80)
    if title:
        print(title)
        print("="*80)

for i in sample_idx:
    row = master.iloc[i]
    pid = row[ID_COL]
    anchor = row["anchor_date"]
    w_start = row["window_start"]
    w_end   = row["window_end"]

    pretty_print_separator(f"CHECKING ROW {i} | PID={pid} | anchor={anchor.date()}")

    # 1) WINDOW ROW(S)
    print("\n[WINDOW ROW(S) FROM ORIGINAL 4M2M FILE]")
    win_rows = win[(win[ID_COL] == pid) &
                   (win["anchor_date"] == anchor)]
    if win_rows.empty:
        print("  !! No matching window row found in original window file.")
    else:
        print(win_rows)

    # 2) POPULATION
    print("\n[POPULATION ROW]")
    pop_row = pop_diag[pop_diag[ID_COL] == pid]
    if pop_row.empty:
        print("  !! No population row for this ID in pop_diag.")
    else:
        print(pop_row)

    # 3) WHO CHECK
    print("\n[WHO ROWS IN WINDOW]")
    who_pid = who[who[ID_COL] == pid]
    who_in_win = who_pid[
        (who_pid[who_date_col] >= w_start) &
        (who_pid[who_date_col] <= w_end)
    ].copy()

    # who_score_col is None when the WHO file has no recognised score column;
    # selecting a None label would raise, so only show columns that exist.
    who_show_cols = [c for c in (who_date_col, who_score_col) if c is not None]
    who_show_cols.append("abs_days_from_anchor")

    if who_in_win.empty:
        print("  No WHO rows in this window.")
    else:
        # Same ranking as pick_closest_in_window: nearest first, later date wins ties.
        who_in_win["abs_days_from_anchor"] = (who_in_win[who_date_col] - anchor).abs().dt.days
        who_in_win = who_in_win.sort_values(["abs_days_from_anchor", who_date_col], ascending=[True, False])
        chosen = who_in_win.iloc[0]

        print("  All WHO rows in window:")
        print(who_in_win[who_show_cols])

        print("\n  Chosen WHO (closest to anchor):")
        print(chosen[who_show_cols])

    # Always show what the master holds: when the window is empty the stored
    # WHO values may come from the calendar-year fallback.
    print("\n  Values stored in MASTER:")
    print("    has_who_in_window:  ", row["has_who_in_window"])
    print("    has_who_in_year:    ", row["has_who_in_year"])
    print("    who_date:           ", row["who_date"])
    print("    who_days_from_anchor", row["who_days_from_anchor"])
    print("    who_score:          ", row["who_score"])

    # 4) LAB CHECK
    print("\n[LAB ROWS IN WINDOW]")
    lab_pid = lab[lab[ID_COL] == pid]
    lab_in_win = lab_pid[
        (lab_pid[lab_date_col] >= w_start) &
        (lab_pid[lab_date_col] <= w_end)
    ].copy()

    if lab_in_win.empty:
        print("  No LAB rows in this window.")
    else:
        print(f"  Found {lab_in_win.shape[0]} lab rows in window.")
        print("  Date range in LAB:")
        print("    min:", lab_in_win[lab_date_col].min(), "max:", lab_in_win[lab_date_col].max())

        # recompute means
        recomputed_means = lab_in_win[lab_value_cols].mean(axis=0, skipna=True)

        print("\n  Comparing LAB means:")
        for col in lab_value_cols:
            master_col = f"LABmean__{col}"
            if master_col not in master.columns:
                continue

            master_val = row[master_col]
            recalced_val = recomputed_means[col]

            print(f"    {master_col:40s}  master={master_val!r:12}  recomputed={recalced_val!r}")


CHECKING ROW 1393 | PID=422473 | anchor=2023-11-07

[WINDOW ROW(S) FROM ORIGINAL 4M2M FILE]
      DW_EK_Borger anchor_date window_start window_end  \
1393        422473  2023-11-07   2023-07-07 2024-01-07   

      n_anchor_bookings_that_day anchor_first_time anchor_last_time  \
1393                           1        2023-11-07       2023-11-07   

      n_visits_in_window                                    visit_datetimes  \
1393                  10  2023-08-07 00:00 | 2023-08-07 00:00 | 2023-09-...   

                                           visit_topics  \
1393  FUS_x0020_diskus_x002C__x0020_lænd | FUS_x0020...   

                                         visit_statuses  
1393  Afviklet | Afviklet | Afviklet | Afviklet | Af...  

[POPULATION ROW]
     DW_EK_Borger sex   age diabetes_type        diagnosis  comp_any  \
835        422473   F  70.0          T2DM  Type 2-diabetes         1   

     comp_eye  comp_renal  comp_neuro  comp_periph  comp_foot  comp_multiple  \
835       