# 1. ID mapping 

Your `final_master_sheet_clean.xlsx` has a unified patient ID column that lines up with your intraday data.
What we'll do:

##### 1. load final_master_sheet_clean.xlsx.
##### 2. build a mapping Code → patientID (from the table you provided).
##### 3. for every visit sheet (Ramadan, Visit 1 … Visit 7), replace/add a column PatientID (Huawei Data).
##### 4. keep the original Code if you like, or drop it once you confirm the mapping is correct.
##### 5. save a new Excel (e.g., final_master_sheet_clean_with_huawei.xlsx).

In [None]:
import pandas as pd
from pathlib import Path

# input/output
MASTER_PATH = Path("/kaggle/input/static-variables/final_master_sheet_clean.xlsx")
OUT_PATH = Path("/kaggle/working/final_master_sheet_clean_with_huawei.xlsx")

# Column that receives the mapped ID in every sheet that has a "Code" column.
HUAWEI_ID_COL = "PatientID (Huawei Data)"

# your mapping dictionary: study Code -> patient ID used in the Huawei/intraday data
PATIENT_ID_MAP = {
    "R01": 45, "R02": 46, "R04": 47, "R05": 48, "R06": 49, "R07": 53,
    "R10": 54, "R11": 55, "R12": 57, "R15": 59, "R16": 60, "R17": 61,
    "R20": 63, "R21": 64, "R22": 66, "R23": 67, "R24": 68, "R25": 69,
    "R26": 70, "R27": 71, "R28": 72, "R29": 73, "R30": 74, "R31": 75,
    "R32": 76, "R33": 77, "R34": 78, "R35": 79, "R36": 80, "R37": 81,
    "R39": 82, "R40": 83, "R41": 84, "R42": 85, "R43": 86,
}
id_map_df = pd.DataFrame(list(PATIENT_ID_MAP.items()), columns=["Code", HUAWEI_ID_COL])

# load workbook: sheet_name=None reads every sheet in one pass and returns
# {sheet name: DataFrame}, so no ExcelFile handle is left open.
sheets = pd.read_excel(MASTER_PATH, sheet_name=None)

# update each sheet that has "Code"; sheets without it are passed through unchanged
updated_sheets = {}
for name, df in sheets.items():
    if "Code" in df.columns:
        # "replace/add": drop a pre-existing ID column first, otherwise the merge
        # would emit "<col>_x"/"<col>_y" duplicates (e.g. when this cell is re-run
        # on a workbook that was already mapped).
        df = df.drop(columns=[HUAWEI_ID_COL], errors="ignore")
        # Left join keeps every original row, in its original order.
        df = df.merge(id_map_df, on="Code", how="left")

        # Surface codes that are absent from PATIENT_ID_MAP instead of leaving
        # silent NaNs in the output.
        unmapped = df.loc[df[HUAWEI_ID_COL].isna() & df["Code"].notna(), "Code"].unique()
        if len(unmapped):
            print(f"⚠️ {name}: no patient ID mapped for Code(s): {sorted(map(str, unmapped))}")
        # optional: drop old Code col and just keep Huawei ID
        # df = df.drop(columns=["Code"])
    updated_sheets[name] = df

# save to new Excel
with pd.ExcelWriter(OUT_PATH, engine="openpyxl") as writer:
    for name, df in updated_sheets.items():
        df.to_excel(writer, index=False, sheet_name=name)

print(f"saved → {OUT_PATH}")


# 2. subperiods + Ramadan/Shawwal periods

Sets the main periods with your exact dates (both inclusive): Ramadan = 2023-03-22 → 2023-04-19 and Shawwal = 2023-04-20 → 2023-05-19.

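Defines visit subperiods as inclusive date ranges (note: the `VISIT_SUBPERIODS` constant in the code cell below currently uses different boundaries for Visits 1–5 — reconcile the two before relying on either):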

V1 = 2023-03-13 → 2023-03-26
V2 = 2023-03-27 → 2023-04-02
V3 = 2023-04-03 → 2023-04-09
V4 = 2023-04-10 → 2023-04-19 (ends at Ramadan end)
V5 = 2023-04-20 → 2023-04-26
V6 = 2023-04-27 → 2023-05-08
V7 = 2023-05-09 → 2023-05-19 (ends at Shawwal end)


Add explicit Visit subperiods + Ramadan/Shawwal periods:
- Annotates intraday rows with `period_main` and `visit_assigned`
- Writes spec sheets into master workbook
- Saves:
  /kaggle/working/intraday_with_visits.csv
  /kaggle/working/final_master_sheet_clean_with_visits.xlsx



In [None]:
# -*- coding: utf-8 -*-
"""
Annotate intraday with Visit subperiods + Ramadan/Shawwal periods
- Adds `visit_assigned` (inclusive visit windows) and `period_main` (Ramadan/Shawwal/outside)
- Saves annotated intraday to CSV and **single-sheet Excel (Intraday_All)**
- Injects spec sheets into the master workbook and adds Visit_Anchor_Date to visit sheets
- Also writes a dynamic missingness-by-visit CSV (optional QA artifact)
"""

import pandas as pd
import numpy as np
from pathlib import Path

# ---------------- CONFIG ----------------
# Inputs: the raw intraday CSV and the master workbook (same path the ID-mapping
# cell writes to).
INTRADAY_CSV_PATH = Path("/kaggle/input/intraday/intraday.csv")
MASTER_XLSX_IN    = Path("/kaggle/working/final_master_sheet_clean_with_huawei.xlsx")
# Output: master workbook with the spec sheets appended.
MASTER_XLSX_OUT   = Path("/kaggle/working/final_master_sheet_clean_with_visits.xlsx")

# Outputs: annotated intraday data as CSV and as a single-sheet Excel file.
INTRADAY_OUT_CSV  = Path("/kaggle/working/intraday_with_visits.csv")
INTRADAY_OUT_XLSX = Path("/kaggle/working/intraday_with_visits.xlsx")

DYN_VISIT_OUT     = Path("/kaggle/working/missingness_dynamic_by_visit.csv")  # optional QA

# Main periods (inclusive on both ends)
I_RAMADAN_START = pd.Timestamp("2023-03-22")
I_RAMADAN_END   = pd.Timestamp("2023-04-19")
I_SHAWWAL_START = pd.Timestamp("2023-04-20")
I_SHAWWAL_END   = pd.Timestamp("2023-05-19")

# Visit subperiods (all bounds inclusive).
# NOTE(review): these windows do not match the V1–V7 ranges listed in the notes
# above this cell (e.g. V1 ends 2023-03-26 there but 2023-03-21 here; V4 ends
# 2023-04-19 there but 2023-04-16 here) — confirm which specification is intended.
# NOTE(review): Visit 3 ends 2023-04-06 and Visit 4 starts 2023-04-08, so
# 2023-04-07 belongs to no window and will stay unassigned — likely a typo.
VISIT_SUBPERIODS = [
    {"Visit": "Visit 1", "start": "2023-03-13", "end": "2023-03-21"},
    {"Visit": "Visit 2", "start": "2023-03-22", "end": "2023-03-30"},
    {"Visit": "Visit 3", "start": "2023-03-31", "end": "2023-04-06"},
    {"Visit": "Visit 4", "start": "2023-04-08", "end": "2023-04-16"},
    {"Visit": "Visit 5", "start": "2023-04-17", "end": "2023-04-26"},
    {"Visit": "Visit 6", "start": "2023-04-27", "end": "2023-05-08"},
    {"Visit": "Visit 7", "start": "2023-05-09", "end": "2023-05-19"},
]

# Sheets in master we do not modify (copied to the output workbook as-is)
SKIP_SHEETS = {"HMC_map_patientID"}


# ---------------- HELPERS ----------------
def _normalize_date_col(s: pd.Series) -> pd.Series:
    """Parse *s* as datetimes (unparseable values become NaT) and truncate to midnight."""
    return pd.to_datetime(s, errors="coerce").dt.normalize()

def build_visit_spec_df(rows: list[dict]) -> pd.DataFrame:
    """
    Build inclusive visit windows as a tidy DataFrame sorted by (start, end).

    Each item of *rows* needs the keys "Visit", "start" and "end". Problems are
    reported on stdout as warnings and never raise:
      - a start/end that cannot be parsed as a date,
      - a window whose end precedes its start,
      - a window that overlaps the previous one,
      - a gap between consecutive windows (those dates match no visit).

    Returns the windows with "start"/"end" as normalized timestamps.
    """
    df = pd.DataFrame(rows)
    df["start"] = _normalize_date_col(df["start"])
    df["end"]   = _normalize_date_col(df["end"])
    df = df.sort_values(["start", "end"]).reset_index(drop=True)

    # A NaT bound makes every comparison False, so such a window never matches anything.
    unparsed = df[df["start"].isna() | df["end"].isna()]
    if not unparsed.empty:
        print("⚠️ Visit with unparseable start/end date:\n", unparsed)

    bad_span = df[df["end"] < df["start"]]
    if not bad_span.empty:
        print("⚠️ Visit with end < start:\n", bad_span)

    # Compare each window with the one before it; the first row has no
    # predecessor (prev_end is NaT) and therefore never triggers a warning.
    df["prev_end"] = df["end"].shift(1)
    overlap = df[(df.index > 0) & (df["start"] <= df["prev_end"])]
    if not overlap.empty:
        print("⚠️ Overlapping visits detected (inclusive ranges). Ensure previous 'end' < next 'start'.")
        print(overlap[["Visit", "start", "end", "prev_end"]])

    # Contiguous inclusive windows start exactly one day after the previous end.
    one_day = pd.Timedelta(days=1)
    gaps = df[df["start"] > df["prev_end"] + one_day]
    if not gaps.empty:
        print("⚠️ Gap between consecutive visits: these dates fall in no visit window.")
        for _, row in gaps.iterrows():
            first_uncovered = (row["prev_end"] + one_day).date()
            last_uncovered = (row["start"] - one_day).date()
            print(f"   before {row['Visit']}: {first_uncovered} → {last_uncovered}")

    return df.drop(columns=["prev_end"], errors="ignore")

def tag_main_period(ts: pd.Timestamp) -> str | float:
    """
    Label a timestamp with the main study period it falls in (bounds inclusive).

    Returns NaN for a missing timestamp, the Ramadan or Shawwal label when *ts*
    lies inside the corresponding configured range, and an "outside" label otherwise.
    """
    if pd.isna(ts):
        return np.nan

    # Checked in order; the first range containing ts wins.
    labelled_ranges = (
        (I_RAMADAN_START, I_RAMADAN_END, "Ramadan (Mar 22–Apr 19, 2023)"),
        (I_SHAWWAL_START, I_SHAWWAL_END, "Shawwal (Apr 20–May 19, 2023)"),
    )
    for first_day, last_day, label in labelled_ranges:
        if first_day <= ts <= last_day:
            return label
    return "Outside Ramadan/Shawwal"

def assign_visit_inclusive(dates: pd.Series, visit_spec: pd.DataFrame) -> pd.Series:
    """
    Map each date to the visit whose inclusive window [start, end] contains it.

    The result is an ordered categorical Series (categories follow the row order
    of *visit_spec*) sharing the index of *dates*. Dates that match no window
    stay NaN; if windows overlap, the later row of *visit_spec* wins.
    """
    visit_dtype = pd.CategoricalDtype(categories=list(visit_spec["Visit"]), ordered=True)
    unassigned = pd.Categorical([None] * len(dates), dtype=visit_dtype)
    assigned = pd.Series(unassigned, index=dates.index)

    windows = zip(visit_spec["Visit"], visit_spec["start"], visit_spec["end"])
    for label, first_day, last_day in windows:
        in_window = (dates >= first_day) & (dates <= last_day)
        assigned.loc[in_window] = label
    return assigned

def _order_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return *df* with the key columns that exist moved to the front, others in original order."""
    existing = list(df.columns)
    preferred = ["date", "visit_assigned", "period_main", "patientID", "huaweiID"]
    leading = [name for name in preferred if name in existing]
    trailing = [name for name in existing if name not in leading]
    return df[leading + trailing]


# ---------------- 1) LOAD & ANNOTATE INTRADAY ----------------
print("Annotating intraday with visit subperiods & main periods…")
intraday = pd.read_csv(INTRADAY_CSV_PATH)
if "date" not in intraday.columns:
    raise ValueError("intraday.csv must have a 'date' column")

# Midnight-normalized dates so the inclusive comparisons below are day-based.
intraday["date"] = _normalize_date_col(intraday["date"])

visit_spec_df = build_visit_spec_df(VISIT_SUBPERIODS)
intraday["period_main"]    = intraday["date"].apply(tag_main_period)
intraday["visit_assigned"] = assign_visit_inclusive(intraday["date"], visit_spec_df)

# Save CSV once
intraday.to_csv(INTRADAY_OUT_CSV, index=False)
print(f"✅ Saved annotated intraday CSV → {INTRADAY_OUT_CSV}")

# ---------------- 2) UPDATE MASTER WORKBOOK ----------------
print("Injecting Visit_Subperiods_Spec & Period_Bounds into master…")
# sheet_name=None reads every sheet in one pass ({sheet name: DataFrame}) and
# leaves no ExcelFile handle open.
sheets = pd.read_excel(MASTER_XLSX_IN, sheet_name=None)

# optional: add a Visit_Anchor_Date to each Visit sheet (start of its inclusive subperiod)
ANCHORS = dict(zip(visit_spec_df["Visit"], visit_spec_df["start"]))


def _anchor_for_sheet(sheet_name):
    """Return the anchor for a sheet named like a visit, or None.

    Accepts an exact visit label ("Visit 4") or the label followed by a space
    and a suffix ("Visit 4 (whole Ramadan)"). Requiring the space keeps
    "Visit 1" from matching a hypothetical "Visit 10".
    """
    if sheet_name in ANCHORS:
        return ANCHORS[sheet_name]
    for visit_label, start in ANCHORS.items():
        if sheet_name.startswith(f"{visit_label} "):
            return start
    return None


updated = {}
for name, df in sheets.items():
    anchor = None if name in SKIP_SHEETS else _anchor_for_sheet(name)
    if anchor is not None and "Visit_Anchor_Date" not in df.columns:
        df = df.copy()
        df["Visit_Anchor_Date"] = anchor.date()
    updated[name] = df

period_bounds = pd.DataFrame({
    "Period":    ["Ramadan", "Shawwal"],
    "Start":     [I_RAMADAN_START.date(), I_SHAWWAL_START.date()],
    "End":       [I_RAMADAN_END.date(),   I_SHAWWAL_END.date()],
    "Inclusive": [True, True],
})

with pd.ExcelWriter(MASTER_XLSX_OUT, engine="openpyxl") as writer:
    for name, df in updated.items():
        df.to_excel(writer, index=False, sheet_name=name[:31])  # Excel caps sheet names at 31 chars
    visit_spec_df.to_excel(writer, index=False, sheet_name="Visit_Subperiods_Spec")
    period_bounds.to_excel(writer, index=False, sheet_name="Period_Bounds")

print(f"✅ Saved master with specs → {MASTER_XLSX_OUT}")

# ---------------- 3) OPTIONAL QA: MISSINGNESS BY VISIT (CSV) ----------------
EXCLUDE = {"Unnamed: 0", "huaweiID", "date", "start", "period_main", "visit_assigned"}
dfv = intraday  # only read below, so no copy is needed
feature_cols = [c for c in dfv.columns if c not in EXCLUDE and c != "patientID"]

if feature_cols:
    # Mean of the boolean NaN mask per visit = fraction missing. observed=False
    # is spelled out so every configured visit gets a row even when it has no
    # data; rows whose visit is NaN (outside all windows) are dropped by groupby.
    out = (
        dfv[feature_cols].isna()
           .groupby(dfv["visit_assigned"], observed=False)
           .mean()
           .mul(100)
           .round(2)
           .reset_index()
           .rename(columns={"visit_assigned": "Visit"})
    )
    out.melt(id_vars=["Visit"], var_name="Feature", value_name="% Missing (Visit Window)") \
       .to_csv(DYN_VISIT_OUT, index=False)
    print(f"✅ Saved dynamic-by-visit missingness CSV → {DYN_VISIT_OUT}")
else:
    print("ℹ️ Skipped missingness export (no feature columns found).")

# ---------------- 4) SINGLE-SHEET EXCEL EXPORT ----------------
print("Writing single-sheet annotated intraday Excel…")
intraday_ordered = _order_columns(intraday)

# An xlsx worksheet holds at most 1,048,576 rows (header included); pandas raises
# when asked to write more, which would abort the cell after all the work above.
EXCEL_MAX_ROWS = 1_048_576
if len(intraday_ordered) + 1 > EXCEL_MAX_ROWS:
    print(f"⚠️ Skipped intraday Excel export: {len(intraday_ordered):,} rows exceed the "
          f"xlsx sheet limit; use {INTRADAY_OUT_CSV} instead.")
else:
    with pd.ExcelWriter(INTRADAY_OUT_XLSX, engine="openpyxl") as writer:
        intraday_ordered.to_excel(writer, index=False, sheet_name="Intraday_All")
    print(f"✅ Saved single-sheet intraday Excel → {INTRADAY_OUT_XLSX}")

# ---------------- 5) QUICK CHECKS (stdout) ----------------
print("\n— Quick checks —")
# Diagnostics only: a failure here is reported but must not fail the cell,
# because every output file has already been written.
try:
    print("Date coverage:", intraday["date"].min(), "→", intraday["date"].max(), "| rows:", len(intraday))
    print("\nVisit assignment counts:")
    print(intraday["visit_assigned"].value_counts(dropna=False))
    print("\nMain period counts:")
    print(intraday["period_main"].value_counts(dropna=False))
except Exception as e:
    print("Sanity checks skipped:", e)


# 3. Overview of the static and dynamic features

Unified Missingness Pipeline (Qatar edition, percent-aware)
- Normalized name alignment (cleaned + aliases + percent-aware)
- Separate Static vs Dynamic outputs
- Ramadan/Shawwal rollups using visit groups
- QC conditional formatting (>30 warn, >50 alert)
- Patient-level missingness per visit
- Robust dictionary merge (canonical-first, strict percent-safe fallback)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import re
import unicodedata

# =============== CONFIG ===============
# Inputs
MASTER_PATH = Path("/kaggle/working/final_master_sheet_clean_with_huawei.xlsx")
DICT_PATH   = Path("/kaggle/input/static-variables/Categorized_Data_Dictionary.xlsx")  # xlsx or csv
INTRADAY_CSV_PATH = Path("/kaggle/input/intraday/intraday.csv")  # optional dynamic

# Patient ID column in the master visit sheets (after your mapping step)
PATIENT_ID_COL = "PatientID (Huawei Data)"

# Which sheets to treat as visits (the script will read all except SKIP)
SKIP_SHEETS = {"HMC_map_patientID"}

# Visit groups for rollups (edit as needed). Entries are sheet names; a name
# that matches no sheet is skipped by the rollup.
# NOTE(review): no "Visit 5" or "Visit 7" entry appears in either group —
# confirm whether those sheets should contribute to a rollup.
RAMADAN_VISITS = {"Ramadan", "Visit 1", "Visit 2", "Visit 3", "Visit 4 (whole Ramadan)"}
SHAWWAL_VISITS = {"Visit 6 (Shawal)"}  # add Visit 5 here if it belongs to Shawwal

# Intraday options
INCLUDE_INTRADAY = True
# Inclusive date bounds used to split intraday rows into Ramadan / Shawwal.
# NOTE(review): the annotation cell earlier in this notebook uses
# 2023-03-22 → 2023-04-19 (Ramadan) and 2023-04-20 → 2023-05-19 (Shawwal);
# the bounds below differ by 1–2 days — confirm which set is correct.
RAMADAN_START = pd.to_datetime("2023-03-23")
RAMADAN_END   = pd.to_datetime("2023-04-21")
SHAWWAL_START = pd.to_datetime("2023-04-22")
SHAWWAL_END   = pd.to_datetime("2023-05-21")
# keep patientID to align; exclude only non-feature fields
# NOTE(review): because "patientID" is not excluded here, it is also evaluated
# as a dynamic feature wherever this set is the only filter — verify that is intended.
INTRADAY_EXCLUDE_COLS = {"Unnamed: 0", "huaweiID", "date", "start"}

# Optional stage-2 CSV that (if present) we add as a sheet
DYN_VISIT_OUT = Path("/kaggle/working/missingness_dynamic_by_visit.csv")

# Outputs
OUT_XLSX = Path("/kaggle/working/missingness_report_full.xlsx")
OUT_CSV_STATIC  = Path("/kaggle/working/missingness_static.csv")
OUT_CSV_DYNAMIC = Path("/kaggle/working/missingness_dynamic.csv")
OUT_CSV_PATIENT = Path("/kaggle/working/missingness_patient_level.csv")

# QC thresholds (%): defaults for the warn / alert highlighting in qc_format_excel
WARN_THRES = 30.0
ALERT_THRES = 50.0
# =====================================


# ===== Percent-aware normalization =====
def _strip_accents(s: str) -> str:
    """Return *s* NFKD-decomposed with every combining mark removed."""
    decomposed = unicodedata.normalize("NFKD", s)
    # combining() is 0 for base characters and non-zero for accents/diacritics.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def clean_name_base(s: str) -> str:
    """
    Reduce a raw name to a bare alphanumeric key.

    The value is stringified, lower-cased, trimmed and accent-stripped; then
    everything except a-z and 0-9 is removed. None and float NaN give "".
    """
    is_missing = s is None or (isinstance(s, float) and pd.isna(s))
    if is_missing:
        return ""
    text = _strip_accents(str(s).lower().strip())
    collapsed = re.sub(r"\s+", " ", text)
    return re.sub(r"[^a-z0-9]", "", collapsed)

def has_percent(s: str) -> bool:
    """
    Return True when the name contains a percent sign.

    Covers every variant such as 'HbA1c (%)', 'hba1c%' and 'hba1c%%', because
    each of them contains '%'. None and float NaN count as "no percent";
    other non-string values are stringified first.
    """
    if s is None or (isinstance(s, float) and pd.isna(s)):
        return False
    # A single containment test is enough: any "(%)" necessarily contains "%",
    # and case does not affect the percent sign.
    # If you want the word 'percent' to count as well, also test
    # re.search(r"\bpercent\b", str(s).lower()).
    return "%" in str(s)

def canonical_key(raw_name: str) -> str:
    """Return the cleaned base key, suffixed with '_pct' when the raw name has a percent sign."""
    base = clean_name_base(raw_name)
    if has_percent(raw_name):
        return f"{base}_pct"
    return base

# Canonical targets (extend as needed). Every canonical name maps to itself;
# "hba1c" / "hba1c_pct" are an example of a non-percent / percent pair.
CANON = {
    name: name
    for name in (
        "meals_per_day",
        "carbs_per_day",
        "active_insulin_time_hours",
        "average_sg_mgdl",
        "icr_1",
        "icr_2",
        "sg_sd_mgdl",
        "total_daily_dose_units",
        "hba1c",
        "hba1c_pct",
    )
}

# Raw spellings grouped by the canonical name they resolve to.
_ALIAS_SOURCES = {
    # meals
    "meals_per_day": ["meals", "meal", "meals/day", "meals per day", "number of meals"],
    # carbs
    "carbs_per_day": ["carb", "carbs", "carb/day", "carbs/day", "carbohydrates per day"],
    # specifics
    "active_insulin_time_hours": [
        "Active insulin time (hours)", "Active insulin time hours", "AIT (hours)",
    ],
    "average_sg_mgdl": [
        "Average SG mg/dL", "Avg SG mg/dL", "Average sensor glucose (mg/dL)",
    ],
    "icr_1": ["ICR-1", "ICR 1", "insulin carb ratio 1"],
    "icr_2": ["ICR-2", "ICR 2", "insulin carb ratio 2"],
    "sg_sd_mgdl": ["SG SD mg/dL", "sensor glucose sd (mg/dL)"],
    "total_daily_dose_units": [
        "Total daily dose (Unit)", "Total daily dose (Units)", "TDD (units)", "TDD",
    ],
    # HbA1c (non-percent vs percent — DO NOT MERGE)
    "hba1c": ["HbA1c", "HBA1C"],
    "hba1c_pct": ["HbA1c (%)", "hba1c%", "hba1c%%"],
}

# Aliases on canonical_key(raw) -> canonical name. Several spellings of one
# group can collapse to the same key; they all carry the same target.
ALIASES = {
    canonical_key(raw): target
    for target, raw_names in _ALIAS_SOURCES.items()
    for raw in raw_names
}

def normalize_to_canonical(raw_name: str) -> str:
    """Resolve a raw name to its canonical name; unknown names fall back to their percent-safe key."""
    key = canonical_key(raw_name)
    if key in ALIASES:
        return ALIASES[key]
    return key


# =========================
# ====== HELPERS ==========
# =========================
def _clean_df_strings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of *df* whose object columns are stringified, trimmed and
    have blank / NaN-like text ("", "nan", "NaN", "NA", "None") turned into NaN.

    Columns of any other dtype are left as they are; *df* itself is not modified.
    """
    missing_tokens = {"": np.nan, "nan": np.nan, "NaN": np.nan, "NA": np.nan, "None": np.nan}
    cleaned = df.copy()
    object_columns = [name for name in cleaned.columns if cleaned[name].dtype == object]
    for name in object_columns:
        trimmed = cleaned[name].astype(str).str.strip()
        cleaned[name] = trimmed.replace(missing_tokens)
    return cleaned

def percent_missing_by_column(df: pd.DataFrame, visit_name: str) -> pd.DataFrame:
    """
    Percentage of missing cells per column of *df*, rounded to 2 decimals.

    Missingness is measured after `_clean_df_strings`. The result has one row
    per column, in the columns "Feature" and "% Missing <visit_name>".
    """
    cleaned = _clean_df_strings(df)
    pct_missing = (cleaned.isna().mean() * 100).round(2)
    table = pct_missing.reset_index()
    table.columns = ["Feature", f"% Missing {visit_name}"]
    return table

def missing_counts(df: pd.DataFrame, visit_name: str) -> pd.DataFrame:
    """
    Missing-cell count and row total per column of *df*.

    Missingness is measured after `_clean_df_strings`. The result has the
    columns "Feature", "missing_<visit_name>" and "total_<visit_name>"; the
    total is the number of rows and is the same for every feature.
    """
    cleaned = _clean_df_strings(df)
    n_rows = len(cleaned)
    n_missing = cleaned.isna().sum()
    columns = {
        "Feature": n_missing.index,
        f"missing_{visit_name}": n_missing.values,
        f"total_{visit_name}": n_rows,
    }
    return pd.DataFrame(columns)

def availability_for_features(features, visit_dfs):
    """
    For each feature, list the visits whose DataFrame has a column of that name.

    Returns a Series named "Availability (Visits)" with one list per feature, in
    the order of *features* and on a default integer index. Visit names follow
    the iteration order of *visit_dfs*.
    """
    visits_per_feature = [
        [visit for visit, frame in visit_dfs.items() if feature in frame.columns]
        for feature in features
    ]
    return pd.Series(visits_per_feature, name="Availability (Visits)")

def _pick_feature_col(cols):
    """Return the first recognised feature-name header present in *cols*, or None."""
    # Listed in priority order: the earliest candidate found in cols wins.
    candidates = ("Feature", "Feature Name", "Variable", "Field", "Name")
    return next((label for label in candidates if label in cols), None)

def load_dictionary(dict_path: Path) -> pd.DataFrame | None:
    """
    Load the data dictionary and prepare it for the percent-aware merge.

    A ".csv" path is read as CSV. Any other path is read as an Excel workbook:
    the first sheet that has a recognised feature-name column and a "Category"
    column is used, falling back to the first sheet.

    Returns a DataFrame with "Feature", "Category", whichever optional metadata
    columns exist, and the merge keys "Feature_base", "Feature_is_pct" and
    "Feature_canonical". Returns None (after printing why) when the file is
    missing, cannot be read, or lacks a feature-name or "Category" column.
    """
    if not dict_path.exists():
        print(f"[INFO] Dictionary not found at {dict_path}. Skipping metadata merge.")
        return None

    # The dictionary is optional metadata, so any read failure downgrades to
    # "no dictionary" instead of aborting the whole report.
    try:
        if dict_path.suffix.lower() == ".csv":
            raw = pd.read_csv(dict_path)
        else:
            # Open the workbook once; the context manager closes the file
            # handle even when a sheet fails to parse.
            with pd.ExcelFile(dict_path) as xls:
                raw = None
                for sheet in xls.sheet_names:
                    candidate = xls.parse(sheet)
                    if _pick_feature_col(candidate.columns) and "Category" in candidate.columns:
                        raw = candidate
                        break
                if raw is None:
                    raw = xls.parse(0)  # first sheet, same default as pd.read_excel
    except Exception as e:
        print(f"[WARN] Failed to read dictionary: {e}. Skipping metadata merge.")
        return None

    fcol = _pick_feature_col(raw.columns)
    if fcol is None:
        print("[WARN] No recognizable feature-name column in dictionary. Skipping metadata merge.")
        return None

    df = raw.rename(columns={fcol: "Feature"})
    if "Definition" in df.columns and "Definition & Unit" not in df.columns:
        df = df.rename(columns={"Definition": "Definition & Unit"})
    keep_cols = [c for c in [
        "Feature", "Category", "Definition & Unit", "Unit/Type", "Feature Type", "Subtype"
    ] if c in df.columns]
    if "Feature" not in keep_cols or "Category" not in keep_cols:
        print("[WARN] Dictionary missing 'Feature' or 'Category' after normalization. Skipping metadata merge.")
        return None

    # Rows without a feature name describe nothing and would all share the empty
    # merge key, so drop them before de-duplicating.
    df = df[keep_cols].dropna(subset=["Feature"]).drop_duplicates()
    # percent-aware merge keys for the dictionary side
    df["Feature_base"]      = df["Feature"].map(clean_name_base)
    df["Feature_is_pct"]    = df["Feature"].map(has_percent)
    df["Feature_canonical"] = df["Feature"].map(normalize_to_canonical)
    return df

def weighted_overall_pct(merged_counts: pd.DataFrame) -> pd.Series:
    """
    Pooled percentage of missing values per row across all visits.

    Sums every "missing_*" column and every "total_*" column row-wise and
    returns 100 * missing / total rounded to 2 decimals, as a NumPy array
    aligned with the rows of *merged_counts*. Rows whose total is 0 give NaN.
    """
    column_names = list(merged_counts.columns)
    numerator = merged_counts[[c for c in column_names if c.startswith("missing_")]].sum(axis=1)
    denominator = merged_counts[[c for c in column_names if c.startswith("total_")]].sum(axis=1)

    # The ratio is evaluated for every row (0/0 included) and masked afterwards,
    # hence the suppressed divide/invalid warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = (numerator / denominator) * 100
        pooled = np.where(denominator > 0, ratio, np.nan)
    return np.round(pooled, 2)

def qc_format_excel(writer, sheet_name, warn=WARN_THRES, alert=ALERT_THRES):
    """
    Highlight high-missingness cells on one sheet of an openpyxl-backed writer.

    Every column whose header text contains "Missing" gets two conditional
    formatting rules over its data rows: values above *alert* are filled red,
    values above *warn* (but not above *alert*) are filled yellow.

    Formatting is best-effort: any failure (sheet not found, openpyxl missing, …)
    is printed and swallowed so the report is still written.
    """
    try:
        from openpyxl.styles import PatternFill
        from openpyxl.formatting.rule import CellIsRule
        wb = writer.book
        ws = wb[sheet_name]
        max_row = ws.max_row
        if max_row < 2:
            return  # header only: there are no data cells to format

        header = [cell.value for cell in ws[1]]
        pct_cols = [i for i, h in enumerate(header, start=1) if isinstance(h, str) and "Missing" in h]
        yellow = PatternFill(start_color="FFF3CD", end_color="FFF3CD", fill_type="solid")  # warn
        red    = PatternFill(start_color="F8D7DA", end_color="F8D7DA", fill_type="solid")  # alert
        for col in pct_cols:
            col_letter = ws.cell(row=1, column=col).column_letter
            rng = f"{col_letter}2:{col_letter}{max_row}"
            # Rule priority follows insertion order and the first matching fill
            # wins, so the stricter alert rule must be added first; otherwise a
            # value above `alert` (which is also above `warn`) renders yellow.
            # stopIfTrue keeps the warn rule from being evaluated for such cells.
            ws.conditional_formatting.add(
                rng,
                CellIsRule(operator="greaterThan", formula=[str(alert)], stopIfTrue=True, fill=red),
            )
            ws.conditional_formatting.add(
                rng,
                CellIsRule(operator="greaterThan", formula=[str(warn)], fill=yellow),
            )
    except Exception as e:
        print(f"[INFO] QC formatting skipped on {sheet_name}: {e}")


# ------------- Main -------------
def main():
    # Ensure output dir exists
    OUT_XLSX.parent.mkdir(parents=True, exist_ok=True)

    # Load visit sheets
    xls = pd.ExcelFile(MASTER_PATH)
    visit_sheet_names = [s for s in xls.sheet_names if s not in SKIP_SHEETS]
    visits = {name: pd.read_excel(MASTER_PATH, sheet_name=name) for name in visit_sheet_names}

    # ---- Static: per-visit missingness ----
    miss_tables = [percent_missing_by_column(df, name) for name, df in visits.items()]
    from functools import reduce
    merged_miss = reduce(lambda L, R: pd.merge(L, R, on="Feature", how="outer"), miss_tables) if miss_tables else pd.DataFrame(columns=["Feature"])

    count_tables = [missing_counts(df, name) for name, df in visits.items()]
    merged_counts = reduce(lambda L, R: pd.merge(L, R, on="Feature", how="outer"), count_tables) if count_tables else pd.DataFrame(columns=["Feature"])
    for col in merged_counts.columns:
        if col.startswith("missing_") or col.startswith("total_"):
            merged_counts[col] = merged_counts[col].fillna(0)

    final_static = merged_miss.copy()
    if not merged_counts.empty:
        final_static["Overall % Missing"] = weighted_overall_pct(merged_counts)
    else:
        final_static["Overall % Missing"] = np.nan

    final_static["Availability (Visits)"] = availability_for_features(final_static["Feature"], visits).apply(lambda x: json.dumps(x))

    # ---- Merge dictionary (percent-aware) ----
    dict_core = load_dictionary(DICT_PATH)
    final_static["Feature_base"]      = final_static["Feature"].map(clean_name_base)
    final_static["Feature_is_pct"]    = final_static["Feature"].map(has_percent)
    final_static["Feature_canonical"] = final_static["Feature"].map(normalize_to_canonical)

    if dict_core is not None:
        # Pass 1: canonical join
        merged = final_static.merge(
            dict_core.add_suffix("_dict"),
            left_on="Feature_canonical",
            right_on="Feature_canonical_dict",
            how="left"
        )
        # Pass 2 (strict): fallback by (base, is_pct) for unmatched rows
        need_fb = merged["Category_dict"].isna() if "Category_dict" in merged.columns else pd.Series(False, index=merged.index)
        if need_fb.any():
            fb_left = final_static.loc[need_fb, ["Feature", "Feature_base", "Feature_is_pct"]].copy()
            fb_right = dict_core.add_suffix("_dict")[
                ["Feature_dict", "Feature_base_dict", "Feature_is_pct_dict",
                 "Category_dict", "Definition & Unit_dict", "Unit/Type_dict", "Feature Type_dict", "Subtype_dict"]
            ]
            fb_joined = fb_left.merge(
                fb_right,
                left_on=["Feature_base", "Feature_is_pct"],
                right_on=["Feature_base_dict", "Feature_is_pct_dict"],
                how="left"
            )
            idx = merged.index[need_fb]
            for col in fb_joined.columns:
                if col.endswith("_dict") and col in merged.columns:
                    merged.loc[idx, col] = fb_joined[col].values

        # Prefer dictionary metadata where available
        for c in ["Category", "Definition & Unit", "Unit/Type", "Feature Type", "Subtype"]:
            lc, rc = c, f"{c}_dict"
            if lc not in merged.columns:
                merged[lc] = np.nan
            if rc in merged.columns:
                merged[lc] = merged[lc].combine_first(merged[rc])

        merged["Matched Dictionary Feature"] = merged.get("Feature_dict", np.nan)
        # Cleanup helper cols
        drop_helpers = [c for c in merged.columns if c.endswith("_dict")] + \
                       ["Feature_base", "Feature_base_dict", "Feature_is_pct", "Feature_is_pct_dict",
                        "Feature_canonical", "Feature_canonical_dict"]
        final_static = merged.drop(columns=[c for c in drop_helpers if c in merged.columns])
    else:
        for c in ["Category", "Definition & Unit", "Unit/Type", "Feature Type", "Subtype", "Matched Dictionary Feature"]:
            if c not in final_static.columns:
                final_static[c] = np.nan

    # Order columns
    visit_cols = [c for c in final_static.columns if c.startswith("% Missing ")]
    final_static = final_static[
        ["Category", "Feature", "Matched Dictionary Feature", "Definition & Unit",
         "Unit/Type", "Feature Type", "Subtype"]
        + visit_cols
        + ["Overall % Missing", "Availability (Visits)"]
    ]

    # ---- Ramadan/Shawwal rollups (static) ----
    counts_map = {name: missing_counts(df, name).set_index("Feature") for name, df in visits.items()}

    def pooled_group_pct(features, group_visits):
        if len(features) == 0:
            return np.array([])
        miss_total = pd.Series(0, index=features, dtype=float)
        tot_total  = pd.Series(0, index=features, dtype=float)
        for v in group_visits:
            if v not in counts_map:
                continue
            mc = counts_map[v]
            miss_total = miss_total.add(mc.get(f"missing_{v}", pd.Series(0, index=features)).reindex(features).fillna(0), fill_value=0)
            tot_total  = tot_total.add(mc.get(f"total_{v}", pd.Series(0, index=features)).reindex(features).fillna(0),   fill_value=0)
        with np.errstate(divide="ignore", invalid="ignore"):
            pct = np.where(tot_total > 0, (miss_total / tot_total) * 100, np.nan)
        return np.round(pct, 2)

    features_list = final_static["Feature"]
    final_static["% Missing Ramadan (rollup)"] = pooled_group_pct(features_list, RAMADAN_VISITS)
    final_static["% Missing Shawwal (rollup)"] = pooled_group_pct(features_list, SHAWWAL_VISITS)

    # ---- Patient-level missingness per visit (row-wise) ----
    plm_list = []
    for vname, vdf in visits.items():
        if PATIENT_ID_COL not in vdf.columns:
            continue
        df2 = _clean_df_strings(vdf)
        cols_eval = [c for c in df2.columns if c != PATIENT_ID_COL]
        if not cols_eval:
            continue
        row_pct = df2[cols_eval].isna().mean(axis=1).mul(100).round(2)
        tmp = pd.DataFrame({
            "Visit": vname,
            PATIENT_ID_COL: df2[PATIENT_ID_COL],
            "Row % Missing (all fields)": row_pct,
            "Missing Cells": df2[cols_eval].isna().sum(axis=1),
            "Total Cells": len(cols_eval)
        })
        plm_list.append(tmp)
    patient_level_missing = pd.concat(plm_list, ignore_index=True) if plm_list else pd.DataFrame()

    # ---- Dynamic (intraday) Ramadan/Shawwal ----
    final_dynamic = pd.DataFrame()
    if INCLUDE_INTRADAY and INTRADAY_CSV_PATH.exists():
        try:
            intraday = pd.read_csv(INTRADAY_CSV_PATH)
            if "date" in intraday.columns:
                intraday["date"] = pd.to_datetime(intraday["date"], errors="coerce")
                ram = intraday[(intraday["date"] >= RAMADAN_START) & (intraday["date"] <= RAMADAN_END)]
                shw = intraday[(intraday["date"] >= SHAWWAL_START) & (intraday["date"] <= SHAWWAL_END)]

                dyn_cols = [c for c in intraday.columns if c not in INTRADAY_EXCLUDE_COLS]

                rows = []
                for feat in dyn_cols:
                    r_miss = ram[feat].isna().mean() * 100 if len(ram) else np.nan
                    s_miss = shw[feat].isna().mean() * 100 if len(shw) else np.nan
                    both = pd.concat([ram[feat], shw[feat]]) if len(ram) or len(shw) else pd.Series(dtype=float)
                    overall = both.isna().mean() * 100 if len(both) else np.nan
                    rows.append({
                        "Feature": feat,
                        "% Missing Ramadan": round(r_miss, 2) if pd.notna(r_miss) else np.nan,
                        "% Missing Shawwal": round(s_miss, 2) if pd.notna(s_miss) else np.nan,
                        "Overall % Missing": round(overall, 2) if pd.notna(overall) else np.nan,
                        "Availability (Visits)": json.dumps(["Ramadan (intraday)", "Shawwal (intraday)"])
                    })
                final_dynamic = pd.DataFrame(rows)

                # Dictionary enrichment (percent-aware)
                dict_core_dyn = dict_core  # reuse if available
                if dict_core_dyn is not None and not final_dynamic.empty:
                    final_dynamic["Feature_base"]      = final_dynamic["Feature"].map(clean_name_base)
                    final_dynamic["Feature_is_pct"]    = final_dynamic["Feature"].map(has_percent)
                    final_dynamic["Feature_canonical"] = final_dynamic["Feature"].map(normalize_to_canonical)

                    merged_dyn = final_dynamic.merge(
                        dict_core_dyn.add_suffix("_dict"),
                        left_on="Feature_canonical",
                        right_on="Feature_canonical_dict",
                        how="left"
                    )
                    # strict fallback on (base, is_pct)
                    need_fb = merged_dyn["Category_dict"].isna() if "Category_dict" in merged_dyn.columns else pd.Series(False, index=merged_dyn.index)
                    if need_fb.any():
                        fb_left = final_dynamic.loc[need_fb, ["Feature", "Feature_base", "Feature_is_pct"]].copy()
                        fb_right = dict_core_dyn.add_suffix("_dict")[
                            ["Feature_dict", "Feature_base_dict", "Feature_is_pct_dict",
                             "Category_dict", "Definition & Unit_dict", "Unit/Type_dict", "Feature Type_dict", "Subtype_dict"]
                        ]
                        fb_joined = fb_left.merge(
                            fb_right,
                            left_on=["Feature_base", "Feature_is_pct"],
                            right_on=["Feature_base_dict", "Feature_is_pct_dict"],
                            how="left"
                        )
                        idx = merged_dyn.index[need_fb]
                        for col in fb_joined.columns:
                            if col.endswith("_dict") and col in merged_dyn.columns:
                                merged_dyn.loc[idx, col] = fb_joined[col].values

                    for c in ["Category", "Definition & Unit", "Unit/Type", "Feature Type", "Subtype"]:
                        lc, rc = c, f"{c}_dict"
                        if lc not in merged_dyn.columns:
                            merged_dyn[lc] = np.nan
                        if rc in merged_dyn.columns:
                            merged_dyn[lc] = merged_dyn[lc].combine_first(merged_dyn[rc])

                    merged_dyn["Matched Dictionary Feature"] = merged_dyn.get("Feature_dict", np.nan)
                    drop_helpers = [c for c in merged_dyn.columns if c.endswith("_dict")] + \
                                   ["Feature_base", "Feature_base_dict", "Feature_is_pct", "Feature_is_pct_dict",
                                    "Feature_canonical", "Feature_canonical_dict"]
                    final_dynamic = merged_dyn.drop(columns=[c for c in drop_helpers if c in merged_dyn.columns])
                else:
                    for c in ["Category", "Definition & Unit", "Unit/Type", "Feature Type", "Subtype", "Matched Dictionary Feature"]:
                        if c not in final_dynamic.columns:
                            final_dynamic[c] = np.nan
            else:
                print("[INFO] intraday has no 'date' column; dynamic missingness skipped.")
        except Exception as e:
            print(f"[WARN] Intraday integration skipped: {e}")

    # ---- Save everything to Excel with QC formatting ----
    with pd.ExcelWriter(OUT_XLSX, engine="openpyxl") as writer:
        # Static (always create at least one sheet to avoid openpyxl 'no visible sheet' error)
        final_static.to_excel(writer, index=False, sheet_name="Missingness_Static")
        qc_format_excel(writer, "Missingness_Static")

        # Dynamic (Ramadan/Shawwal)
        if not final_dynamic.empty:
            dyn_cols_order = ["Category", "Feature", "Matched Dictionary Feature", "Definition & Unit",
                              "Unit/Type", "Feature Type", "Subtype",
                              "% Missing Ramadan", "% Missing Shawwal", "Overall % Missing", "Availability (Visits)"]
            dyn_cols_order = [c for c in dyn_cols_order if c in final_dynamic.columns] + \
                             [c for c in final_dynamic.columns if c not in dyn_cols_order]
            final_dynamic[dyn_cols_order].to_excel(writer, index=False, sheet_name="Missingness_Dynamic")
            qc_format_excel(writer, "Missingness_Dynamic")

        # Dynamic-by-Visit (optional stage 2 CSV)
        if DYN_VISIT_OUT.exists():
            try:
                dyn_visit_df = pd.read_csv(DYN_VISIT_OUT)
                dyn_visit_df.to_excel(writer, index=False, sheet_name="Missingness_Dynamic_By_Visit")
                qc_format_excel(writer, "Missingness_Dynamic_By_Visit")
                print("✅ Added sheet: Missingness_Dynamic_By_Visit")
            except Exception as e:
                print(f"[INFO] Skipped Missingness_Dynamic_By_Visit: {e}")

        # Patient-level
        if not patient_level_missing.empty:
            patient_level_missing.to_excel(writer, index=False, sheet_name="Patient_Level_Missingness")

        # Unmapped audits
        if "Category" in final_static.columns:
            unmapped_static = (
                final_static[final_static["Category"].isna()][["Feature"]]
                .drop_duplicates().sort_values("Feature")
            )
            unmapped_static.to_excel(writer, index=False, sheet_name="Unmapped_Static")

        if not final_dynamic.empty and "Category" in final_dynamic.columns:
            unmapped_dyn = (
                final_dynamic[final_dynamic["Category"].isna()][["Feature"]]
                .drop_duplicates().sort_values("Feature")
            )
            unmapped_dyn.to_excel(writer, index=False, sheet_name="Unmapped_Dynamic")

        # Name mapping audits
        if "Matched Dictionary Feature" in final_static.columns:
            mapping_static = (
                final_static[["Feature", "Matched Dictionary Feature"]]
                .drop_duplicates().sort_values(["Feature", "Matched Dictionary Feature"])
            )
            mapping_static.to_excel(writer, index=False, sheet_name="Name_Mapping_Static")

        if not final_dynamic.empty and "Matched Dictionary Feature" in final_dynamic.columns:
            mapping_dyn = (
                final_dynamic[["Feature", "Matched Dictionary Feature"]]
                .drop_duplicates().sort_values(["Feature", "Matched Dictionary Feature"])
            )
            mapping_dyn.to_excel(writer, index=False, sheet_name="Name_Mapping_Dynamic")

    # CSVs
    final_static.to_csv(OUT_CSV_STATIC, index=False)
    if not final_dynamic.empty:
        final_dynamic.to_csv(OUT_CSV_DYNAMIC, index=False)
    if not patient_level_missing.empty:
        patient_level_missing.to_csv(OUT_CSV_PATIENT, index=False)

    print(f"Saved Excel report → {OUT_XLSX}")
    print(f"Saved static CSV   → {OUT_CSV_STATIC}")
    if not final_dynamic.empty:
        print(f"Saved dynamic CSV  → {OUT_CSV_DYNAMIC}")
    if not patient_level_missing.empty:
        print(f"Saved patient CSV  → {OUT_CSV_PATIENT}")


# Script entry point: call main() only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()


# 4. CGM with Dynamic Features 

1. Correlation heatmap → checks pairwise correlations of features vs CGM.
2. VIF (Variance Inflation Factor) → detects collinearity (e.g., steps ↔ distance ↔ calories).
3. Random Forest regression → finds the most important predictors of CGM.
4. Partial Dependence + ICE plots with smoothing → visualizes how the top predictors (heart_rate, calories, steps, distance, spo2 in your example) influence CGM, both on average and at the individual sample level.
      Blue = ICE curves (individual sample effects).
      Orange = average PDP.
      Red = smoothed PDP (to reduce jaggedness).
5. PCA (95% variance) → reduces 10 correlated features to the minimum number of orthogonal components that still explain ≥95% of the total variance.

## 1. VIF (Multicollinearity check)

Both periods show moderate collinearity between steps, calories, and distance (VIF ~3–5). That’s expected — people who walk more burn more calories and cover more distance.

Other features (heart rate, sleep stages, SpO₂, awake, nap) are clean (VIF ~1–1.3).

👉 Interpretation: nothing is screaming "drop me immediately", but steps/calories/distance are partially redundant. If you want a leaner model, you could collapse them (e.g., PCA or just pick one).

## 2. Random Forest Feature Importance

Heart rate + calories + steps are the big three drivers for CGM in both Ramadan & Shawwal.

Distance still matters, but less once steps/calories are in.

SpO₂ shows some relevance (~7–8%). Sleep features (deep, light, REM, nap, awake) barely move the needle (<3%).

👉 Interpretation: CGM is being explained more by physical activity + cardio strain than sleep metrics.

## 4. Partial Dependence + ICE plots with smoothing 

→ visualizes how the top predictors (heart_rate, calories, steps, distance, spo2 in your example) influence CGM, both on average and at the individual sample level.

Blue = ICE curves (individual sample effects). 
Orange = average PDP. 
Red = smoothed PDP (to reduce jaggedness).

## 5. PCA (Dimensionality reduction)

Ramadan: 10 features → 3 principal components capture 98.5% variance.
Shawwal: 10 features → 3 components capture 97.9% variance.

👉 Interpretation: data is very compressible. Three latent “axes” explain basically everything. Likely:
   Activity/energy (steps + calories + distance).
   Physiological (heart rate + SpO₂).
   Sleep/rest cycle (deep, REM, light, awake, nap).

## 6. Correlation differences

You mentioned a file, "Live_CGM_Correlation_Ramadan_vs_Shawwal.xlsx", that you want sorted properly. Most likely you want to line the features up side by side and order them consistently (e.g., by Ramadan strength or by absolute correlation).

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ---------------- CONFIG ----------------
# Workbook read by pd.read_excel in the LOAD DATA section below.
PATH = "intraday_with_visits.xlsx"  # ← Update path if needed

# Predictor columns, written in the lower-case / underscore form that the
# loader produces when it normalises the column headers.
FEATURES = [
    "calories", "deep", "distance", "heart_rate",
    "light", "nap", "rem", "spo2", "steps", "awake"
]

# Response column for the correlation and random-forest analysis.
TARGET = "cgm"
# Column names tried in order by find_period_col; the first one present wins.
PERIOD_COL_CANDIDATES = ["period_main", "period", "ramadan_shawwal", "season", "phase"]
# Number of top random-forest features that get PDP/ICE plots.
TOP_K = 5

# ---------------- HELPERS ----------------
def find_period_col(df, candidates):
    """Return the first name in `candidates` that is a column of `df`, else None."""
    return next((name for name in candidates if name in df.columns), None)

def normalize_period(s):
    """Map a raw period value to "Ramadan" or "Shawwal"; anything else becomes NaN.

    Matching is a case-insensitive substring test on the stripped string form
    of `s` ("ramad" is checked before "shaw"). Missing input returns NaN.
    """
    if pd.isna(s):
        return np.nan
    text = str(s).strip().lower()
    for needle, canonical in (("ramad", "Ramadan"), ("shaw", "Shawwal")):
        if needle in text:
            return canonical
    return np.nan

def to_numeric(df, cols):
    """Return `df[cols]` with every column coerced to numeric (unparseable values → NaN)."""
    selected = df[cols]
    return selected.apply(pd.to_numeric, errors='coerce')

def safe_heatmap(mat, title):
    """Show an annotated seaborn heatmap of `mat`, masking its NaN cells.

    `mat` must expose `.isna()` (e.g. a correlation DataFrame); `title` is
    used as the figure title.
    """
    nan_cells = mat.isna()
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        mat,
        mask=nan_cells,
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
    )
    plt.title(title)
    plt.tight_layout()
    plt.show()

def compute_vif(X_df):
    """Return a DataFrame of per-column VIFs for `X_df`, sorted by VIF descending.

    A column whose VIF computation raises gets NaN instead of aborting the
    whole table.

    NOTE(review): `X_df.values` is passed to variance_inflation_factor as-is,
    i.e. without an intercept column — confirm that this is intended.
    """
    def _vif_or_nan(col_idx):
        # Best-effort: any failure for one column is reported as NaN.
        try:
            return variance_inflation_factor(X_df.values, col_idx)
        except Exception:
            return np.nan

    scores = [_vif_or_nan(col_idx) for col_idx in range(X_df.shape[1])]
    table = pd.DataFrame({'Feature': X_df.columns, 'VIF': scores})
    return table.sort_values('VIF', ascending=False)

def ensure_min_samples(df, nmin=10):
    """Return True when `df` has at least `nmin` rows; otherwise print a warning and return False."""
    n_rows = len(df)
    enough = n_rows >= nmin
    if not enough:
        print(f"⚠️ Not enough samples: {n_rows} < {nmin}")
    return enough

# ---------------- LOAD DATA ----------------
# Read the workbook at PATH (no sheet_name is passed to pd.read_excel).
df = pd.read_excel(PATH)
# Normalise headers to lower case with underscores so they can be compared
# against FEATURES, TARGET and PERIOD_COL_CANDIDATES.
df.columns = df.columns.str.lower().str.replace(" ", "_")
period_col = find_period_col(df, PERIOD_COL_CANDIDATES)

# find_period_col returns None when none of the candidate columns exist.
if not period_col:
    raise ValueError("No valid period column found.")

# Collapse free-text period values to "Ramadan" / "Shawwal"; other values become NaN.
df[period_col] = df[period_col].apply(normalize_period)

# One frame per period: only FEATURES + TARGET, coerced to numeric, with rows
# that lack a target value dropped.
df_r = to_numeric(df[df[period_col] == "Ramadan"].copy(), FEATURES + [TARGET]).dropna(subset=[TARGET])
df_s = to_numeric(df[df[period_col] == "Shawwal"].copy(), FEATURES + [TARGET]).dropna(subset=[TARGET])

# ---------------- ANALYSIS FUNCTION ----------------
def analyze_period(df_period, label):
    """Run the correlation, VIF, random-forest, PDP/ICE and PCA analysis for one period.

    Parameters
    ----------
    df_period : pandas.DataFrame
        Rows of a single period containing the FEATURES columns and the TARGET column.
    label : str
        Period name used in the printed headers and plot titles.

    Returns
    -------
    dict or None
        None when `ensure_min_samples` rejects the frame. Otherwise a dict with
        the keys "corr", "vif", "rf_importance", "pca" and "pca_loadings".
    """
    print(f"\n=== {label} Analysis ===")
    if not ensure_min_samples(df_period):
        return None

    # --- Correlation ---
    corr = df_period[FEATURES + [TARGET]].corr()
    safe_heatmap(corr, f"{label} – Feature Correlation")

    # --- VIF ---
    # Median-impute the features. The rebuilt frame has a fresh positional
    # index, which is fine because `y` below is taken positionally (.values).
    X = pd.DataFrame(
        SimpleImputer(strategy="median").fit_transform(df_period[FEATURES]),
        columns=FEATURES
    )
    vif_df = compute_vif(X)
    print("\nVIF:")
    print(vif_df)

    # --- Random Forest ---
    y = df_period[TARGET].values
    rf = RandomForestRegressor(n_estimators=300, random_state=42)
    rf.fit(X, y)
    importances = pd.DataFrame({
        "Feature": FEATURES,
        "Importance (%)": 100 * rf.feature_importances_ / rf.feature_importances_.sum()
    }).sort_values("Importance (%)", ascending=False)
    print("\nRandom Forest Importances:")
    print(importances)

    # --- PDP + ICE with smoothing ---
    from sklearn.inspection import partial_dependence
    top_feats = importances["Feature"].head(TOP_K).tolist()
    # Only features with more than two distinct values are plotted.
    valid_feats = [f for f in top_feats if X[f].nunique() > 2]

    if valid_feats:
        try:
            fig, ax = plt.subplots(1, len(valid_feats), figsize=(5 * len(valid_feats), 4))
            if len(valid_feats) == 1:
                # plt.subplots returns a bare Axes for a single panel; wrap it
                # so the indexing below works uniformly.
                ax = [ax]

            for i, feat in enumerate(valid_feats):
                PartialDependenceDisplay.from_estimator(
                    rf, X, [feat],
                    kind="both", subsample=200,
                    random_state=42, ax=ax[i],
                    percentiles=(0.01, 0.99)
                )

                # Smoothing
                pd_results = partial_dependence(
                    rf, X, [feat], kind="average", percentiles=(0.01, 0.99)
                )
                # Newer scikit-learn releases expose the grid as "grid_values";
                # the older "values" key was deprecated and then removed.
                # Support both so the smoothed curve is not silently lost.
                grid_key = "grid_values" if "grid_values" in pd_results else "values"
                xx = pd_results[grid_key][0]
                yy = pd_results['average'][0]
                smooth_y = pd.Series(yy).rolling(window=10, min_periods=1, center=True).mean()

                ax[i].plot(xx, smooth_y, color="red", linewidth=2, label="Smoothed PDP")
                ax[i].legend()

            plt.suptitle(f"{label} – PDP + ICE (Smoothed)")
            plt.tight_layout()
            plt.show()

        except Exception as e:
            # Plotting is best-effort; the numeric results below are still returned.
            print(f"PDP failed even after filtering: {e}")
    else:
        print(f"⚠️ No valid features for PDP in {label}")

    # --- PCA ---
    # A float n_components keeps as many components as are needed to reach
    # 95% explained variance; the features are used unscaled ("Raw Features").
    pca = PCA(n_components=0.95, svd_solver='full')
    pca.fit(X)

    print(f"\nPCA – 95% Variance (Raw Features): {X.shape[1]} → {pca.n_components_}")
    print("Cumulative Variance:", np.round(np.cumsum(pca.explained_variance_ratio_), 4))

    # PCA loadings
    loadings = pd.DataFrame(
        pca.components_.T,
        index=FEATURES,
        columns=[f"PC{i+1}" for i in range(pca.n_components_)]
    )
    print("\nPCA Loadings (feature contributions per component):")
    print(loadings.round(3))

    # --- Plot PCA loadings as heatmap ---
    plt.figure(figsize=(8, 6))
    sns.heatmap(loadings, annot=True, cmap="coolwarm", center=0, fmt=".2f")
    plt.title(f"{label} – PCA Loadings Heatmap")
    plt.xlabel("Principal Components")
    plt.ylabel("Features")
    plt.tight_layout()
    plt.show()

    return {
        "corr": corr,
        "vif": vif_df,
        "rf_importance": importances,
        "pca": pca,
        "pca_loadings": loadings
    }



# ---------------- RUN ANALYSIS ----------------
res_r = analyze_period(df_r, "Ramadan")
res_s = analyze_period(df_s, "Shawwal")



# ---------------- OPTIONAL: EXPORT OR PLOT DIFFERENCE ----------------
import pandas as pd
import matplotlib.pyplot as plt

# analyze_period returns None when a period has too few samples, so the
# comparison is only possible when both periods were actually analysed.
if res_r is None or res_s is None:
    print("⚠️ Skipping Ramadan vs. Shawwal correlation comparison: at least one period was not analysed.")
else:
    # Extract correlation of each feature with the target from the results
    corr_r = res_r["corr"][TARGET].drop(TARGET)   # Ramadan correlations
    corr_s = res_s["corr"][TARGET].drop(TARGET)   # Shawwal correlations

    # Build dataframe
    df_corr = pd.DataFrame({
        "Ramadan": corr_r,
        "Shawwal": corr_s
    })

    # Use absolute values so all bars are positive
    df_corr = df_corr.abs()

    # --- Plot Ramadan only ---
    df_corr["Ramadan"].sort_values().plot(
        kind="barh", color="firebrick", figsize=(8,6)
    )
    plt.title("Correlation with CGM – Ramadan")
    plt.xlabel("Absolute Pearson Correlation")
    plt.tight_layout()
    plt.show()

    # --- Plot Shawwal only ---
    df_corr["Shawwal"].sort_values().plot(
        kind="barh", color="royalblue", figsize=(8,6)
    )
    plt.title("Correlation with CGM – Shawwal")
    plt.xlabel("Absolute Pearson Correlation")
    plt.tight_layout()
    plt.show()

    # --- Combined Comparison ---
    # Rows are ordered by the Ramadan value so both bars of a feature sit together.
    df_corr.sort_values("Ramadan", ascending=True)[["Ramadan", "Shawwal"]].plot(
        kind="barh",
        color=["firebrick", "royalblue"],
        figsize=(10,6)
    )
    plt.title("Correlation with CGM – Ramadan vs. Shawwal")
    plt.xlabel("Absolute Pearson Correlation")
    plt.tight_layout()
    plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'intraday_with_visits.xlsx'

In [None]:
# Assemble a three-panel summary figure: (A) a previously saved image,
# (B) hard-coded VIF values, (C) hard-coded random-forest importances.
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# A: Correlation heatmap (already saved as .png)
# NOTE(review): the file name ends in "Ramadanl.png" — confirm it matches the
# file on disk, since imread fails if the name is off by one character.
img1 = mpimg.imread("Correlation with CGM – Ramadanl.png")
axes[0].imshow(img1)
axes[0].axis("off")
axes[0].set_title("(A) Feature Correlation")

# B: VIF bar chart
# Values are typed in by hand here, not read from a compute_vif result.
vif = {"steps":4.99,"distance":2.99,"calories":2.70,"heart_rate":1.26,"spo2":1.17,"light":1.07,
       "rem":1.04,"deep":1.04,"nap":1.01,"awake":1.01}
axes[1].barh(list(vif.keys()), list(vif.values()), color="skyblue")
# NOTE(review): the line is given label="threshold" but no legend() call
# follows in this cell, so the label is not displayed.
axes[1].axvline(5, color="r", linestyle="--", label="threshold")
axes[1].invert_yaxis()  # first dict entry at the top
axes[1].set_title("(B) VIF < 5 – Acceptable collinearity")
axes[1].set_xlabel("VIF")

# C: Random Forest importance
# Values are likewise hard-coded percentages.
rf_imp = {"heart_rate":29.35,"calories":26.78,"steps":17.13,"distance":11.03,"spo2":7.68,
          "light":2.61,"rem":2.10,"deep":1.26,"awake":1.22,"nap":0.84}
axes[2].barh(list(rf_imp.keys()), list(rf_imp.values()), color="orange")
axes[2].invert_yaxis()
axes[2].set_title("(C) Random Forest Importance")
axes[2].set_xlabel("Importance (%)")

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig("Feature_Stability_Importance_Figure_Sx.png", dpi=300)
plt.show()


## PCA Components Conceptual Diagram 

In [None]:
# ================= PCA Components Conceptual Diagram ==================
import matplotlib.pyplot as plt

# Conceptual components: label → (x, y) position of the triangle vertex
points = {
    "PC1: Activity / Energy\n(steps, distance, calories)": (0, 1),
    "PC2: Physiology\n(heart_rate, SpO₂)": (-0.87, -0.5),
    "PC3: Sleep / Rest Pattern\n(deep, light, REM, nap, awake)": (0.87, -0.5)
}

fig, ax = plt.subplots(figsize=(6,6))

# One marker per component, with its label placed just above it
for label, (x, y) in points.items():
    ax.scatter(x, y, s=500, marker="o", color="skyblue", edgecolors="k", zorder=3)
    ax.text(x, y+0.1, label, ha="center", va="bottom", fontsize=10, weight="bold")

# Triangle edges: pair every vertex with the next one, wrapping back to the first
coords = list(points.values())
for (x1, y1), (x2, y2) in zip(coords, coords[1:] + coords[:1]):
    ax.plot([x1, x2], [y1, y2], "k-", lw=1.5)

# Fixed limits with hidden axes so only the triangle and the labels show
ax.set_xlim(-1.2, 1.2)
ax.set_ylim(-1, 1.2)
ax.axis("off")
ax.set_title("PCA Components from CGM Features", fontsize=14, weight="bold")
plt.show()


Great—steps 1–4 are done and you now have:

final_master_sheet_clean_with_visits.xlsx

intraday_with_visits.csv

The next milestone is to materialize PATH_HOURLY (the hourly ML-ready dataset) and also produce the two helper tables PATH_VISIT (per‑visit features) and PATH_BASE (static baseline).

Kaggle note: you cannot write to /kaggle/input. Write to /kaggle/working and, if needed, create a dataset from those files afterward.

Below are drop‑in cells you can paste into your Kaggle notebook. They will:

Build hourly CGM features (+ context) from intraday_with_visits.csv.

Extract per‑visit features (carb, meals, TDD, fasting%) from the master workbook.

Extract static baseline variables (Age, Gender, BMI, HbA1C, Cholesterol, LDL, HDL, Triglycerides, eGFR, Creatinine, Insulin_units_per_kg, SmartGuard_percent).

Merge them (patientID & visit‑aware) into a single hourly ML table.

Save the three CSVs under /kaggle/working/… and expose feature lists you’ll pass to the XGB/BiLSTM cells.

Leakage‑safe PCA: we do not compute pca_cgm1–3 or lifestyle PCs here. You’ll compute those inside the modeling pipeline on the training fold only, as we set up earlier. The hourly CSV will carry the raw columns (CGM dynamics + activity/sleep/HR) that PCA will be derived from.

# Cell A — Paths & setup

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# ----- Inputs produced in your earlier steps -----
# Annotated intraday CSV and the master workbook, both expected under /kaggle/working.
INTRADAY_ANN = Path("/kaggle/working/intraday_with_visits.csv")
MASTER_XLSX  = Path("/kaggle/working/final_master_sheet_clean_with_visits.xlsx")

# ----- Outputs (write to /kaggle/working) -----
PATH_HOURLY = Path("/kaggle/working/hourly_features_ml.csv")   # merged hourly ML table
PATH_VISIT  = Path("/kaggle/working/per_visit_features.csv")   # per-visit features
PATH_BASE   = Path("/kaggle/working/static_baseline.csv")      # static baseline per patient

# Prints two booleans: whether each input file exists.
print("Reading:", INTRADAY_ANN.exists(), MASTER_XLSX.exists())


# Cell B — Build hourly CGM features (+ context)

In [None]:
# Robust loader
intraday = pd.read_csv(INTRADAY_ANN)
# Map lower-cased header → original header so lookups are case-insensitive.
cols = {c.lower(): c for c in intraday.columns}

# unify ID: take the first candidate name that exists in the file
pid_col = None
for k in ["patientid", "patient_id", "patientid_(huawei_data)", "patientid_(huawei data)", "huaweiid"]:
    if k in cols:
        pid_col = cols[k]
        break
if pid_col is None:
    raise ValueError("No patient ID column found in intraday_with_visits.csv")

# unify time → 'start' and 'hour'
time_col = None
for k in ["start", "datetime", "timestamp", "time"]:
    if k in cols:
        time_col = cols[k]
        break

if time_col is not None:
    intraday["start"] = pd.to_datetime(intraday[time_col], errors="coerce")
elif "date" in cols:
    # Only a calendar date is available: every row is treated as a midnight
    # timestamp, so all readings of a day fall into the same "hour" bucket.
    print("[WARN] No timestamp column found; using 'date' as midnight timestamps.")
    intraday["start"] = pd.to_datetime(intraday[cols["date"]], errors="coerce")
else:
    # Fail with an explicit message instead of an opaque KeyError on intraday[None].
    raise ValueError(
        "No time column found in intraday_with_visits.csv "
        "(expected one of: start, datetime, timestamp, time, date)."
    )

# Truncate each timestamp to the start of its hour; unparseable values stay NaT.
intraday["hour"] = intraday["start"].dt.floor("h")

# unify CGM column (sensor glucose)
# `cols` maps lower-cased header → original header; the first candidate present wins.
cgm_col = None
for k in ["cgm", "sg", "sensor_glucose", "glucose"]:
    if k in cols:
        cgm_col = cols[k]; break
if cgm_col is None:
    raise ValueError("No CGM column found in intraday_with_visits.csv (expected one of: cgm, sg, sensor_glucose, glucose).")

# Optional lifestyle columns at intraday grain (aggregate later if present)
lifestyle_candidates = [
    "steps","calories","distance","heart_rate","spo2",
    "deep","light","rem","nap","awake"
]
# Original-case names of the lifestyle columns that actually exist.
present_dyn = [cols[k] for k in lifestyle_candidates if k in cols]

# Keep period/visit context if present
visit_col  = cols.get("visit_assigned", None)
period_col = cols.get("period_main", None)

# Working frame: ID, hour bucket, CGM, optional context, optional lifestyle columns.
keep_cols = [pid_col, "hour", cgm_col] + [c for c in [visit_col, period_col] if c] + present_dyn
df = intraday[keep_cols].copy()

# --- hourly CGM stats per patient-hour
g = df.groupby([pid_col, "hour"], as_index=False)

hourly_cgm = g.agg(
    cgm_mean=(cgm_col, "mean"),
    cgm_min =(cgm_col, "min"),
    cgm_max =(cgm_col, "max"),
    cgm_std =(cgm_col, "std")
)

# A missing std is treated as 0 here, so both band columns then equal cgm_mean.
hourly_cgm["cgm_mean_plus_std"]  = hourly_cgm["cgm_mean"] + hourly_cgm["cgm_std"].fillna(0)
hourly_cgm["cgm_mean_minus_std"] = hourly_cgm["cgm_mean"] - hourly_cgm["cgm_std"].fillna(0)

# Label: hypoglycemia within this hour (detection). (Forecast label is created later in modeling if you choose.)
# An hour is positive when any reading in it is below 70.
# NOTE(review): the threshold presumably assumes mg/dL — confirm the CGM unit.
hypo_hour = (df[cgm_col] < 70).groupby([df[pid_col], df["hour"]]).any().reset_index()
hypo_hour.columns = [pid_col, "hour", "hypo_label"]
hourly_cgm = hourly_cgm.merge(hypo_hour, on=[pid_col, "hour"], how="left")
# Patient-hours without a match in hypo_hour are labelled 0.
hourly_cgm["hypo_label"] = hourly_cgm["hypo_label"].fillna(False).astype(int)

# Re-attach visit/period context (take the most frequent within the hour if needed)
def mode_or_first(s):
    """Return the most frequent non-null value of `s`.

    If taking the mode fails (for example because no non-null value exists),
    fall back to the first non-null value, or NaN when there is none.
    """
    try:
        modes = s.mode(dropna=True)
        return modes.iloc[0]
    except Exception:
        if s.notna().any():
            return s.dropna().iloc[0]
        return np.nan

# Per patient-hour context. When a context column is absent, a placeholder
# aggregation (row count of the CGM column) is used so the agg call stays
# valid; the placeholder column is dropped right below.
ctx = df.groupby([pid_col, "hour"]).agg(
    visit_assigned=(visit_col, mode_or_first) if visit_col else (cgm_col, "size"),
    period_main=(period_col, mode_or_first) if period_col else (cgm_col, "size")
).reset_index()

if visit_col is None:
    ctx = ctx.drop(columns=["visit_assigned"])
if period_col is None:
    ctx = ctx.drop(columns=["period_main"])

# Attach the context columns to the hourly CGM statistics.
hourly = hourly_cgm.merge(ctx, on=[pid_col, "hour"], how="left")

# hour-of-day (0–23)
hourly["hour_of_day"] = pd.to_datetime(hourly["hour"]).dt.hour

# rename patientID column consistently
hourly = hourly.rename(columns={pid_col: "patientID"})

print("Hourly CGM rows:", len(hourly), "| patients:", hourly["patientID"].nunique())
hourly.head(3)


# Cell C — Aggregate lifestyle to hourly (if present)

In [None]:
# For activity-like streams, use sums per hour; for vitals, use means.
# Both lists hold canonical lower-case names; cols[name] gives the original header.
SUM_COLS  = [c for c in ["steps","calories","distance","deep","light","rem","nap","awake"] if c in cols]
MEAN_COLS = [c for c in ["heart_rate","spo2"] if c in cols]

agg_parts = []
if SUM_COLS:
    # NOTE(review): groupby(...).sum() uses min_count=0 by default, so a
    # patient-hour whose values are all missing is reported as 0 rather than
    # NaN — confirm that this is intended for the missingness-aware pipeline.
    agg_sum = df.groupby([pid_col,"hour"])[[cols[c] for c in SUM_COLS]].sum().reset_index()
    # restore canonical names
    agg_sum = agg_sum.rename(columns={cols[c]: c for c in SUM_COLS})
    agg_parts.append(agg_sum)

if MEAN_COLS:
    agg_mean = df.groupby([pid_col,"hour"])[[cols[c] for c in MEAN_COLS]].mean().reset_index()
    agg_mean = agg_mean.rename(columns={cols[c]: c for c in MEAN_COLS})
    agg_parts.append(agg_mean)

if agg_parts:
    # Combine the sum and mean tables on the patient-hour key, then attach
    # them to `hourly`, whose ID column is already named "patientID".
    agg_all = agg_parts[0]
    for k in agg_parts[1:]:
        agg_all = agg_all.merge(k, on=[pid_col, "hour"], how="outer")
    agg_all = agg_all.rename(columns={pid_col: "patientID"})
    hourly = hourly.merge(agg_all, on=["patientID","hour"], how="left")

print("Added lifestyle columns:", [c for c in ["steps","calories","distance","deep","light","rem","nap","awake","heart_rate","spo2"] if c in hourly.columns])
hourly.head(3)


## Cell D — Build per‑visit table (PATH_VISIT)

In [None]:
import re
from pandas import ExcelFile

def normalize_visit_name(x: str) -> str:
    """Canonicalise a sheet or visit label.

    Strings containing "visit <n>" (any case, optional whitespace) become
    "Visit <n>"; strings mentioning "ramadan" or "shaw" become "Ramadan" or
    "Shawwal". Any other string is returned stripped, and non-strings are
    returned unchanged.
    """
    if not isinstance(x, str):
        return x
    trimmed = x.strip()
    numbered = re.search(r"visit\s*(\d+)", trimmed, flags=re.I)
    if numbered is not None:
        return "Visit " + str(int(numbered.group(1)))
    lowered = trimmed.lower()
    for needle, canonical in (("ramadan", "Ramadan"), ("shaw", "Shawwal")):
        if needle in lowered:
            return canonical
    return trimmed

# Columns we want (robust fuzzy find)
# Output key → case-insensitive regex searched in each sheet's column names;
# the first matching column is used.
want_keys = {
    "carb": r"carb",                # carb(s)
    "meals": r"meal",               # meals
    "tdd_u": r"(total\s*daily\s*dose|tdd)",  # total daily dose units
    "fasting_pct_29": r"fasting.*(29|%)"     # fasting % (out of 29 days)
}

vis_rows = []
# `xls` is only used for the sheet names here; each sheet is read by path below.
xls = ExcelFile(MASTER_XLSX)
for sheet in xls.sheet_names:
    # Skip the ID-mapping sheet.
    if sheet in {"HMC_map_patientID"}: 
        continue
    dfv = pd.read_excel(MASTER_XLSX, sheet_name=sheet)
    # locate patient id
    pidc = None
    for cand in ["PatientID (Huawei Data)","patientID","PatientID","huaweiID"]:
        if cand in dfv.columns: pidc = cand; break
    if pidc is None: 
        # Sheets without a recognised patient-ID column are ignored.
        continue
    # pick columns if present
    sel = { "Visit": normalize_visit_name(sheet), "SourceSheet": sheet }
    ok = False  # becomes True once at least one wanted column is found
    for key, pat in want_keys.items():
        hit = [c for c in dfv.columns if re.search(pat, str(c), flags=re.I)]
        if hit:
            sel[key] = dfv[hit[0]]
            ok = True
        else:
            # Scalar NaN; it is broadcast over all rows when the frame is built.
            sel[key] = np.nan
    if ok:
        # The visit label comes from the sheet name, not from a column.
        tmp = pd.DataFrame({
            "patientID": dfv[pidc].values,
            "visit": sel["Visit"],
            "carb": sel["carb"],
            "meals": sel["meals"],
            "total_daily_dose_u": sel["tdd_u"],
            "fasting_pct_29": sel["fasting_pct_29"]
        })
        vis_rows.append(tmp)

# Fall back to an empty frame with the expected columns when nothing was collected.
visit_tbl = pd.concat(vis_rows, ignore_index=True) if vis_rows else pd.DataFrame(
    columns=["patientID","visit","carb","meals","total_daily_dose_u","fasting_pct_29"]
)

# Clean types
# Non-numeric entries become NaN.
for c in ["carb","meals","total_daily_dose_u","fasting_pct_29"]:
    if c in visit_tbl.columns:
        visit_tbl[c] = pd.to_numeric(visit_tbl[c], errors="coerce")

# Save (and also keep to merge later)
visit_tbl.to_csv(PATH_VISIT, index=False)
print(f"Saved per-visit features → {PATH_VISIT} | rows:", len(visit_tbl), "| visits:", visit_tbl["visit"].nunique())
visit_tbl.head(3)


# Cell E — Build static baseline table (PATH_BASE)

In [None]:
# We’ll scan all sheets and pick the best-available columns per patient.
# Output name → ordered list of case-insensitive regexes. Patterns are tried
# in order and the first one that matches any column name wins.
STATIC_WANT = {
    "Age": [r"^age$"],
    "Gender": [r"^gender$", r"^sex$"],
    "BMI": [r"^bmi$"],
    "HbA1C": [r"^hba1c\b(?!.*%)", r"^a1c$"],
    # r"chol(?!\w)" only matches the abbreviation ("Chol", "Total Chol (…)")
    # because the lookahead rejects the "e" of "Cholesterol". The anchored
    # fallback picks up "Cholesterol" / "Total Cholesterol" without matching
    # "HDL Cholesterol" or "LDL Cholesterol".
    "Cholesterol": [r"chol(?!\w)", r"^(total[\s_]*)?cholesterol\b"],
    "LDL": [r"^ldl$"],
    "HDL": [r"^hdl$"],
    "Triglycerides": [r"^trig(lycerides)?$", r"^tg$"],
    "eGFR": [r"^egfr$"],
    "Creatinine": [r"creat"],
    "Insulin_units_per_kg": [r"units.*kg", r"tdd.*kg", r"^iu/kg$"],
    "SmartGuard_percent": [r"smart.*guard", r"sg.*%"]
}

def first_match(df, pats):
    """Return the first column of `df` whose name matches one of `pats`, else None.

    Patterns are tried in order (case-insensitive `re.search` on the string
    form of each column name); within a pattern, column order decides.
    """
    for pattern in pats:
        for column in df.columns:
            if re.search(pattern, str(column), flags=re.I):
                return column
    return None

# Collect one partial static table per sheet that has a patient-ID column and
# at least one of the STATIC_WANT columns.
stat_rows = []
for sheet in xls.sheet_names:
    # Skip the ID-mapping sheet.
    if sheet in {"HMC_map_patientID"}:
        continue
    d = pd.read_excel(MASTER_XLSX, sheet_name=sheet)
    pidc = None
    for cand in ["PatientID (Huawei Data)","patientID","PatientID","huaweiID"]:
        if cand in d.columns:
            pidc = cand
            break
    if pidc is None:
        continue
    rec = {"patientID": d[pidc]}
    found_any = False
    for out_name, patterns in STATIC_WANT.items():
        col = first_match(d, patterns)
        if col is not None:
            # Object-typed columns (e.g. text such as Gender) are kept as-is;
            # everything else is coerced to numeric.
            rec[out_name] = pd.to_numeric(d[col], errors="coerce") if d[col].dtype != object else d[col]
            found_any = True
    if found_any:
        stat_rows.append(pd.DataFrame(rec))

static_tbl = (pd.concat(stat_rows, ignore_index=True)
              if stat_rows else pd.DataFrame({"patientID": []}))

# collapse duplicates by patient (prefer last non-null)
# "Last" means last in workbook sheet order (the concat order above). The
# default sort algorithm is not stable and could shuffle rows of the same
# patient, so a stable sort is requested explicitly.
static_tbl = (
    static_tbl.sort_values("patientID", kind="mergesort")
    .groupby("patientID", as_index=False)
    .agg(lambda s: s.dropna().iloc[-1] if s.notna().any() else np.nan)
)

# Optional: derive Insulin_units_per_kg if missing and TDD/weight available (commented here; enable if you have columns)
# if "Insulin_units_per_kg" not in static_tbl.columns and {"total_daily_dose_u","Weight_kg"}.issubset(set(static_tbl.columns)):
#     static_tbl["Insulin_units_per_kg"] = static_tbl["total_daily_dose_u"] / static_tbl["Weight_kg"]

# Save
static_tbl.to_csv(PATH_BASE, index=False)
print(f"Saved static baseline → {PATH_BASE} | patients:", static_tbl["patientID"].nunique())
static_tbl.head(3)


# Cell F — Merge everything → PATH_HOURLY

In [None]:
# Left join hourly ← per-visit by (patientID, visit_assigned)
def _tidy_visit_labels(labels):
    """Strip and collapse whitespace in visit labels, keeping missing values as NaN.

    A plain astype(str) would turn a missing label into the literal string "nan".
    """
    tidy = labels.astype(str).str.strip().str.replace(r"\s+", " ", regex=True)
    return tidy.where(labels.notna())


hourly_merged = hourly.copy()
if "visit_assigned" in hourly_merged.columns:
    # normalize visit name in visit_tbl
    visit_tbl_norm = visit_tbl.copy()
    visit_tbl_norm["visit"] = _tidy_visit_labels(visit_tbl_norm["visit"])
    hourly_merged["visit_assigned"] = _tidy_visit_labels(hourly_merged["visit_assigned"])

    # A repeated (patientID, visit) key on the right side would duplicate every
    # matching hourly row (and its label). Keep the first occurrence so the
    # merge cannot change the number of hourly rows.
    dup_keys = visit_tbl_norm.duplicated(subset=["patientID", "visit"], keep="first")
    if dup_keys.any():
        print(f"⚠️ Dropping {int(dup_keys.sum())} duplicate (patientID, visit) rows from the per-visit table before merging.")
        visit_tbl_norm = visit_tbl_norm.loc[~dup_keys]

    hourly_merged = hourly_merged.merge(
        visit_tbl_norm.rename(columns={"visit":"visit_assigned"}),
        on=["patientID","visit_assigned"], how="left"
    )
else:
    print("⚠️ No visit_assigned in hourly — skipping per-visit merge.")

# Left join hourly ← static baseline by patientID
hourly_merged = hourly_merged.merge(static_tbl, on="patientID", how="left")

# Final ordering: identifiers, context, label, CGM stats, lifestyle,
# per-visit and static columns; anything else is kept at the end.
front = ["patientID","hour","hour_of_day"]
ctx   = [c for c in ["visit_assigned","period_main"] if c in hourly_merged.columns]
target= ["hypo_label"]
cgm   = ["cgm_min","cgm_max","cgm_mean","cgm_std","cgm_mean_plus_std","cgm_mean_minus_std"]
lifestyle = [c for c in ["steps","calories","distance","deep","light","rem","nap","awake","heart_rate","spo2"] if c in hourly_merged.columns]
pervisit  = [c for c in ["carb","meals","total_daily_dose_u","fasting_pct_29"] if c in hourly_merged.columns]
static    = [c for c in ["Age","Gender","BMI","HbA1C","Cholesterol","LDL","HDL","Triglycerides","eGFR","Creatinine","Insulin_units_per_kg","SmartGuard_percent"] if c in hourly_merged.columns]

col_order = front + ctx + target + cgm + lifestyle + pervisit + static
col_order += [c for c in hourly_merged.columns if c not in col_order]  # keep any extras at end

hourly_merged = hourly_merged[col_order]
hourly_merged.to_csv(PATH_HOURLY, index=False)

print(f"✅ Saved hourly ML table → {PATH_HOURLY}")
print("Rows:", len(hourly_merged), "patients:", hourly_merged['patientID'].nunique())
hourly_merged.head(3)


# Cell G — Feature lists for modeling (to reuse exactly in the XGB/BiLSTM cells)

In [None]:
# These lists will feed directly into the modeling cells you have.
# Note: this rebinds TARGET, which the earlier correlation cell set to "cgm".
TARGET = "hypo_label"

# Each list keeps only the names that are actually present in hourly_merged.
dyn_cols = [c for c in [
    # CGM dynamics (raw; PCA comes later in the pipeline, train-only)
    "cgm_min","cgm_max","cgm_mean","cgm_std",
    "cgm_mean_plus_std","cgm_mean_minus_std",
    # Lifestyle raw streams aggregated hourly (PCs later, train-only)
    "steps","calories","distance","deep","light","rem","nap","awake","heart_rate","spo2"
] if c in hourly_merged.columns]

visit_cols = [c for c in ["carb","meals","total_daily_dose_u","fasting_pct_29"] if c in hourly_merged.columns]
static_cols = [c for c in ["Age","Gender","BMI","HbA1C","Cholesterol","LDL","HDL","Triglycerides","eGFR","Creatinine","Insulin_units_per_kg","SmartGuard_percent"] if c in hourly_merged.columns]
context_cols = [c for c in ["hour_of_day","visit_assigned","period_main"] if c in hourly_merged.columns]

print("dyn_cols:", dyn_cols)
print("visit_cols:", visit_cols)
print("static_cols:", static_cols)
print("context_cols:", context_cols)

# Load the final CSV as your canonical "hourly" frame for the next cells:
# This replaces the in-memory `hourly` frame with the file written to
# PATH_HOURLY, parsing the "hour" column as datetimes.
hourly = pd.read_csv(PATH_HOURLY, parse_dates=["hour"])
print("hourly shape:", hourly.shape)
