In [7]:
# %% -----------------------------------------------------------
# Sammedagsscreening-anchored monthly windows from Bookplan
# One row per (patient, anchor screening), with all visits in-window
# ---------------------------------------------------------------

import pandas as pd
from pathlib import Path

# ------------------ CONFIG ------------------
BOOK_PATH = Path(r"C:\Users\kfq6\Documents\Data\Bookinger.xlsx")

# Window mode:
#   "calendar" -> whole calendar month of the anchor date
#   "rolling_30d" -> [anchor-30 days, anchor+30 days]
WINDOW_MODE = "calendar"   # change to "rolling_30d" if you must

# Keep only completed/active bookings?
FILTER_ACTIVE = True
ACTIVE_STATES = {"Afviklet", "Booket"}

# Case-insensitive match for Sammedagsscreening
SAMMEDAG_TOKEN = "sammedags"

# Output paths (optional)
OUT_DIR = BOOK_PATH.parent
OUT_WINDOWS = OUT_DIR / "Bookplan_sammedag_month_windows.xlsx"


In [8]:
# ------------------ LOAD ------------------
book = pd.read_excel(BOOK_PATH, engine="openpyxl")

# Basic hygiene
book.columns = book.columns.str.strip()
for col in ["Angaaende", "BookingStatusTekst"]:
    if col in book.columns:
        book[col] = book[col].astype(str).str.strip()

# Robust parse of date+time into a single datetime
def parse_meeting_dt(row, dcol="Dato_MoedeDato", tcol="Klok_MoedeTid"):
    d = pd.to_datetime(row.get(dcol), errors="coerce", dayfirst=True)
    if pd.isna(d):
        return pd.NaT
    t_raw = row.get(tcol)
    if pd.isna(t_raw):
        return d
    t_str = str(t_raw).strip()
    for fmt in ("%H:%M:%S", "%H:%M", "%H%M%S", "%H%M"):
        try:
            t_parsed = pd.to_datetime(t_str, format=fmt)
            return pd.to_datetime(f"{d.date()} {t_parsed.time()}")
        except Exception:
            continue
    # If time format is a mess, just keep the date
    return d

book = book.copy()
book["meeting_dt"] = book.apply(parse_meeting_dt, axis=1)
book["meeting_date"] = book["meeting_dt"].dt.normalize()

# Optional state filter
if FILTER_ACTIVE and "BookingStatusTekst" in book.columns:
    book = book[book["BookingStatusTekst"].isin(ACTIVE_STATES)].copy()


  warn("Workbook contains no default style, apply openpyxl's default")


In [9]:
# ------------------ FIND ANCHORS ------------------
# Filter to Sammedagsscreening anchors
if "Angaaende" not in book.columns:
    raise KeyError("Column 'Angaaende' missing. Can't find Sammedagsscreening rows.")

anchors = book[
    book["Angaaende"].str.lower().str.contains(SAMMEDAG_TOKEN, na=False)
].copy()

# Safety: drop rows without a valid meeting_date
anchors = anchors[anchors["meeting_date"].notna()].copy()

# If multiple anchor bookings same day/patient, collapse to one
anchor_days = (
    anchors.groupby(["DW_EK_Borger", "meeting_date"], dropna=False)
    .agg(
        n_anchor_bookings=("DW_EK_Borger", "size"),
        first_time=("meeting_dt", "min"),
        last_time=("meeting_dt", "max"),
    )
    .reset_index()
    .rename(columns={"meeting_date": "anchor_date"})
)

In [10]:
# ------------------ BUILD WINDOWS ------------------
def calendar_window(d: pd.Timestamp):
    # Whole calendar month of the anchor date
    start = d.replace(day=1)
    # first day of next month then minus 1 day
    if d.month == 12:
        next_month_start = pd.Timestamp(year=d.year + 1, month=1, day=1)
    else:
        next_month_start = pd.Timestamp(year=d.year, month=d.month + 1, day=1)
    end = next_month_start - pd.Timedelta(days=1)
    return start, end

def rolling_30d_window(d: pd.Timestamp):
    return d - pd.Timedelta(days=30), d + pd.Timedelta(days=30)

# Precompute window bounds for every anchor row
if WINDOW_MODE == "calendar":
    bounds = anchor_days["anchor_date"].apply(calendar_window)
elif WINDOW_MODE == "rolling_30d":
    bounds = anchor_days["anchor_date"].apply(rolling_30d_window)
else:
    raise ValueError("WINDOW_MODE must be 'calendar' or 'rolling_30d'.")

anchor_days["window_start"] = [s for s, _ in bounds]
anchor_days["window_end"]   = [e for _, e in bounds]

In [11]:
# ------------------ COLLECT VISITS IN EACH WINDOW ------------------
# Reduce bookings to only columns we need for aggregation
need_cols = ["DW_EK_Borger", "meeting_dt", "meeting_date", "BookingStatusTekst", "Angaaende"]
for c in need_cols:
    if c not in book.columns:
        raise KeyError(f"Expected column '{c}' missing in Bookinger data.")
vis = book[need_cols].copy()

# Helper to extract visits in a given window
def visits_in_window(df_vis, pid, start, end):
    sub = df_vis[df_vis["DW_EK_Borger"] == pid]
    sub = sub[(sub["meeting_date"] >= start) & (sub["meeting_date"] <= end)].copy()
    # Order by datetime
    sub = sub.sort_values("meeting_dt")
    return sub

rows = []
for r in anchor_days.itertuples(index=False):
    pid = r.DW_EK_Borger
    ws, we = r.window_start, r.window_end
    within = visits_in_window(vis, pid, ws, we)
    # Build aggregated fields
    n_visits = len(within)
    # To make Excel readable, join with " | "
    dates_join = " | ".join(within["meeting_dt"].dt.strftime("%Y-%m-%d %H:%M").fillna("").tolist())
    topics_join = " | ".join(within["Angaaende"].fillna("").tolist())
    status_join = " | ".join(within["BookingStatusTekst"].fillna("").tolist())

    rows.append(
        {
            "DW_EK_Borger": pid,
            "anchor_date": r.anchor_date,
            "window_start": ws,
            "window_end": we,
            "n_anchor_bookings_that_day": r.n_anchor_bookings,
            "anchor_first_time": r.first_time,
            "anchor_last_time": r.last_time,
            "n_visits_in_window": n_visits,
            "visit_datetimes": dates_join,
            "visit_topics": topics_join,
            "visit_statuses": status_join,
        }
    )

windows = pd.DataFrame(rows).sort_values(["DW_EK_Borger", "anchor_date"])


In [12]:
# ------------------ SAVE ------------------
try:
    
    windows.to_excel(OUT_WINDOWS, index=False)
    print(f"Saved windows: {OUT_WINDOWS}")
except Exception as e:
    print("Skipping file save:", e)

# ------------------ SHOW SAMPLE ------------------
print("\n=== Anchors (sample) ===")
print(anchor_days.head(10))
print("\n=== Windows (sample) ===")
print(windows.head(10))

Saved windows: C:\Users\kfq6\Documents\Data\Bookplan_sammedag_month_windows.xlsx

=== Anchors (sample) ===
   DW_EK_Borger anchor_date  n_anchor_bookings first_time  last_time  \
0          2822  2023-11-13                  1 2023-11-13 2023-11-13   
1          2822  2025-01-08                  1 2025-01-08 2025-01-08   
2          2897  2022-10-04                  1 2022-10-04 2022-10-04   
3          2897  2025-06-17                  1 2025-06-17 2025-06-17   
4          3557  2024-05-21                  1 2024-05-21 2024-05-21   
5          4001  2022-05-12                  1 2022-05-12 2022-05-12   
6          4001  2023-03-23                  1 2023-03-23 2023-03-23   
7          7022  2023-09-29                  1 2023-09-29 2023-09-29   
8          7036  2023-03-09                  1 2023-03-09 2023-03-09   
9          7036  2025-05-21                  1 2025-05-21 2025-05-21   

  window_start window_end  
0   2023-11-01 2023-11-30  
1   2025-01-01 2025-01-31  
2   2022-10-01 2

In [13]:
# --- Coverage stats ---
total_bookings = book.shape[0]
used_bookings = vis[vis["meeting_date"].isin(windows["visit_datetimes"].str.extractall(r"(\d{4}-\d{2}-\d{2})")[0].dropna())].shape[0]

# Simpler and more accurate: count all visits that fall inside ANY window
used_mask = pd.Series(False, index=vis.index)
for r in windows.itertuples(index=False):
    mask = (
        (vis["DW_EK_Borger"] == r.DW_EK_Borger)
        & (vis["meeting_date"] >= r.window_start)
        & (vis["meeting_date"] <= r.window_end)
    )
    used_mask |= mask

used_bookings = used_mask.sum()
coverage_pct = used_bookings / total_bookings * 100
waste_pct = 100 - coverage_pct

print(f"\n--- Coverage summary ---")
print(f"Total bookings in Bookinger: {total_bookings:,}")
print(f"Bookings falling within any 1-month Sammedag window: {used_bookings:,} ({coverage_pct:.1f}%)")
print(f"Bookings outside all windows: {total_bookings - used_bookings:,} ({waste_pct:.1f}%)")



--- Coverage summary ---
Total bookings in Bookinger: 137,220
Bookings falling within any 1-month Sammedag window: 26,163 (19.1%)
Bookings outside all windows: 111,057 (80.9%)


In [14]:
# --- Identify visits NOT used in any window ---
unused = vis[~used_mask].copy()

print(f"\nUnused consultations: {unused.shape[0]:,} ({100 - coverage_pct:.1f}% of total)")

# --- Quick overview of what they're about ---
# Normalize text a bit
unused["Angaaende_clean"] = (
    unused["Angaaende"].astype(str).str.lower().str.strip()
)

# Top 20 most common consultation topics
topic_counts = (
    unused["Angaaende_clean"]
    .value_counts()
    .head(20)
)

print("\nTop 20 'Angaaende' among unused consultations:")
print(topic_counts)

# --- Optional: percentage distribution ---
topic_pct = (topic_counts / unused.shape[0] * 100).round(2)
print("\nTop 20 topics (% of unused visits):")
print(topic_pct)





Unused consultations: 111,057 (80.9% of total)

Top 20 'Angaaende' among unused consultations:
Angaaende_clean
telefonkonsultation                                                                    11496
nan                                                                                     8893
hæmodialyse-alb                                                                         7263
kba-med_x0020_blodprøvetagning_x0020_5_x0020_min                                        3882
sygeplejerskekontrol                                                                    2514
sygeplejerske_x0020_vejledning                                                          2345
øb                                                                                      1915
kba-med_x0020_blodprøve_x0020_5_x0020_min                                               1836
lægesamtale                                                                             1824
kba-nord_x0020_blodprøvetagning_x0020_5_x0020_min  

In [15]:
patients_with_sammedag = set(windows["DW_EK_Borger"].unique())
unused["has_sammedag"] = unused["DW_EK_Borger"].isin(patients_with_sammedag)
print(unused["has_sammedag"].value_counts(normalize=True).mul(100).round(1))


has_sammedag
True     65.8
False    34.2
Name: proportion, dtype: float64


In [22]:
import pandas as pd
from pathlib import Path

# Paths
WHO_PATH = Path(r"C:\Users\kfq6\Documents\Data\WHO-5 (PRO).xlsx")

# Load WHO-5 data
who = pd.read_excel(WHO_PATH)
who.columns = who.columns.str.strip()

# Use the right identifier column name
ID_COL = "DW_EK_Borger"  

# Unique patient IDs in each dataset
samedag_ids = set(windows[ID_COL].unique())  # or per_patient if you prefer
who_ids = set(who[ID_COL].dropna().unique())

# Compare overlap
total_sammedag = len(samedag_ids)
has_who = len(samedag_ids & who_ids)
missing_who = total_sammedag - has_who

ratio_with = has_who / total_sammedag * 100
ratio_missing = missing_who / total_sammedag * 100

print(f"Total Sammedag patients: {total_sammedag:,}")
print(f"With WHO-5 data: {has_who:,} ({ratio_with:.1f}%)")
print(f"Without WHO-5 data: {missing_who:,} ({ratio_missing:.1f}%)")

# Optional: list missing ones
missing_ids = samedag_ids - who_ids
missing_df = pd.DataFrame({ID_COL: list(missing_ids)})


Total Sammedag patients: 2,037
With WHO-5 data: 1,760 (86.4%)
Without WHO-5 data: 277 (13.6%)


## LABKA

In [24]:
# Paths
LABKA_PATH = Path(r"C:\Users\kfq6\Documents\Data\LABKA_wide_rawSvar.xlsx")

# Load LABKA data
labka = pd.read_excel(LABKA_PATH)
labka.columns = labka.columns.str.strip()

# Use consistent ID column
ID_COL = "DW_EK_Borger"

# Unique IDs in each dataset
sammedag_ids = set(windows[ID_COL].unique())   # or per_patient, same thing
labka_ids = set(labka[ID_COL].dropna().unique())

# Compare overlap
total_sammedag = len(sammedag_ids)
has_labka = len(sammedag_ids & labka_ids)
missing_labka = total_sammedag - has_labka

ratio_with = has_labka / total_sammedag * 100
ratio_missing = missing_labka / total_sammedag * 100

print(f"Total Sammedag patients: {total_sammedag:,}")
print(f"With LABKA data: {has_labka:,} ({ratio_with:.1f}%)")
print(f"Without LABKA data: {missing_labka:,} ({ratio_missing:.1f}%)")

# Optional: export list of patients with no lab data
missing_ids = sammedag_ids - labka_ids
missing_df = pd.DataFrame({ID_COL: list(missing_ids)})


Total Sammedag patients: 2,037
With LABKA data: 2,035 (99.9%)
Without LABKA data: 2 (0.1%)


In [26]:



# Identifier
ID_COL = "DW_EK_Borger"



# Clean up columns
who.columns = who.columns.str.strip()
labka.columns = labka.columns.str.strip()

# Unique IDs
sammedag_ids = set(windows[ID_COL].unique())    # or per_patient if that's what you use
who_ids = set(who[ID_COL].dropna().unique())
labka_ids = set(labka[ID_COL].dropna().unique())

# --- Compute intersections ---
have_all = sammedag_ids & who_ids & labka_ids
have_sammedag_only = sammedag_ids - (who_ids | labka_ids)
have_two_of_three = sammedag_ids - have_all

# --- Counts and ratios ---
total_sammedag = len(sammedag_ids)
n_all = len(have_all)
pct_all = n_all / total_sammedag * 100

print(f"Total Sammedag patients: {total_sammedag:,}")
print(f"With ALL (WHO + LABKA): {n_all:,} ({pct_all:.1f}%)")
print(f"Missing one or both: {total_sammedag - n_all:,} ({100 - pct_all:.1f}%)")

Total Sammedag patients: 2,037
With ALL (WHO + LABKA): 1,759 (86.4%)
Missing one or both: 278 (13.6%)
