## DTW Feature Engineering – Cohort 2 (POLYPHARMACY_ED, non_opioid_ed), Configurable Age Band

This notebook builds DTW-based trajectory features for the `non_opioid_ed` (polypharmacy ED) cohort,
one configurable older age band at a time (**65-74**, **75-84**, or **85-94**), using the TRAIN window (2016–2018).

- **Source events**: `model_data/cohort_name=non_opioid_ed/age_band={AGE_BAND}/model_events.parquet`
- **High-signal activity alphabet**: FP-Growth TRAIN target-only itemsets
  `4_fpgrowth_analysis/outputs/non_opioid_ed/target/{AGE_BAND_FNAME}/train/*_itemsets_target_only.json`.
- **Activities**: `DRUG:<code>`, `ICD:<code>`, `CPT:<code>` from `drug_name`, all ICD columns, and `procedure_code`,
  with an emphasis on **DRUG:** trajectories for polypharmacy burden.
- **Outputs**: per-patient DTW distance features
  `non_opioid_ed_{AGE_BAND_FNAME}_train_target_dtw_features.csv` saved under
  `6_dtw_analysis/outputs/non_opioid_ed/{AGE_BAND_FNAME}/features/` (and optionally uploaded to S3).

For each age band we:
- Build per-patient ordered DRUG/ICD/CPT sequences (target-only, filtered by FP-Growth itemsets).
- Map activities to integer IDs.
- Compute DTW distances to a small set of prototype trajectories (the first patients with a non-empty sequence, up to 20).
- Export a **patient-level DTW feature table** aligned with the final modeling pipeline.



In [None]:
# ---- Config for Cohort 2 (POLYPHARMACY_ED, non_opioid_ed), configurable age band ----

from pathlib import Path
import json
import math
import numpy as np
import pandas as pd

# All paths are derived from the current working directory, which is expected
# to be the project root when the notebook is launched.
PROJECT_ROOT = Path.cwd().resolve()

COHORT_NAME = "non_opioid_ed"

# Valid polypharmacy age bands (cohorts 6–8)
VALID_AGE_BANDS = ["65-74", "75-84", "85-94"]
AGE_BAND = "65-74"  # <-- change this to run for another age band

if AGE_BAND not in VALID_AGE_BANDS:
    raise ValueError(f"Invalid AGE_BAND: {AGE_BAND}. Choose one of: {VALID_AGE_BANDS}")

TRAIN_YEARS = [2016, 2017, 2018]

# Underscore form of the age band ("65_74"), used for FP-Growth / DTW folder and
# file names; the model_data partition below keeps the hyphenated form.
AGE_BAND_FNAME = "_".join(AGE_BAND.split("-"))

# Event-level input parquet for this cohort and age band.
MODEL_DATA_PATH = PROJECT_ROOT.joinpath(
    "model_data",
    f"cohort_name={COHORT_NAME}",
    f"age_band={AGE_BAND}",
    "model_events.parquet",
)

# FP-Growth TRAIN target-only itemset files that define the activity alphabet.
FPGROWTH_ROOT = PROJECT_ROOT.joinpath("4_fpgrowth_analysis", "outputs", COHORT_NAME)
TARGET_DIR_TRAIN = FPGROWTH_ROOT.joinpath("target", AGE_BAND_FNAME, "train")

ITEMSETS_DRUG_PATH = TARGET_DIR_TRAIN.joinpath("drug_name_itemsets_target_only.json")
ITEMSETS_ICD_PATH = TARGET_DIR_TRAIN.joinpath("icd_code_itemsets_target_only.json")
ITEMSETS_MEDICAL_PATH = TARGET_DIR_TRAIN.joinpath("medical_code_itemsets_target_only.json")

for label, value in (
    ("Project root:", PROJECT_ROOT),
    ("Cohort:", COHORT_NAME),
    ("Age band:", AGE_BAND),
    ("Model data path:", MODEL_DATA_PATH),
    ("FP-Growth target TRAIN dir:", TARGET_DIR_TRAIN),
):
    print(label, value)

# Fail early, before any output folder is created, when the input is missing.
if not MODEL_DATA_PATH.exists():
    raise FileNotFoundError(f"model_data parquet not found: {MODEL_DATA_PATH}")

# Output locations; the feature folder is created on demand.
DTW_OUTPUT_ROOT = PROJECT_ROOT.joinpath("6_dtw_analysis", "outputs", COHORT_NAME, AGE_BAND_FNAME)
DTW_FEATURE_ROOT = DTW_OUTPUT_ROOT.joinpath("features")
DTW_FEATURE_ROOT.mkdir(parents=True, exist_ok=True)

print("DTW output root:", DTW_OUTPUT_ROOT)
print("DTW feature root:", DTW_FEATURE_ROOT)


def save_dtw_csv(df: pd.DataFrame, filename: str, upload_to_s3: bool = False) -> Path:
    """Write a DTW feature table to the local feature folder and optionally copy it to S3.

    Args:
        df: Table to persist; it is written as CSV without the index.
        filename: File name created inside ``DTW_FEATURE_ROOT``.
        upload_to_s3: When true, also copy the file to
            ``s3://pgxdatalake/gold/dtw/{COHORT_NAME}/{AGE_BAND}/{filename}``
            using the ``aws`` command-line tool.

    Returns:
        Local path of the CSV that was written.

    The upload is best-effort: a missing ``aws`` executable or a non-zero exit
    status is reported with a warning and never raises.
    """
    out_path = DTW_FEATURE_ROOT / filename
    df.to_csv(out_path, index=False)
    print(f"[DTW] Saved {len(df)} rows to {out_path}")

    if upload_to_s3:
        import subprocess

        # NOTE(review): the S3 key uses the hyphenated AGE_BAND while local folders
        # use AGE_BAND_FNAME — confirm this matches the expected bucket layout.
        s3_key = f"gold/dtw/{COHORT_NAME}/{AGE_BAND}/{filename}"
        s3_uri = f"s3://pgxdatalake/{s3_key}"
        cmd = f"aws s3 cp \"{out_path}\" \"{s3_uri}\""
        print("[DTW] Uploading to S3 with command:\n  ", cmd)

        # Run with an argument list (no shell) so spaces or quotes in the path
        # cannot be misparsed, and check the outcome so a failed upload is visible.
        try:
            completed = subprocess.run(["aws", "s3", "cp", str(out_path), s3_uri], check=False)
        except OSError as exc:
            print(f"[DTW][WARN] Could not run the AWS CLI: {exc}")
        else:
            if completed.returncode != 0:
                print(
                    f"[DTW][WARN] S3 upload failed (exit code {completed.returncode}): {s3_uri}"
                )

    return out_path



In [None]:
# ---- Load model_data and build target patient activity sequences ----

import duckdb

# Read only the TRAIN years. The parquet path and the year list are inlined into
# the SQL text; both come from this notebook's config constants.
query = f"""
SELECT *
FROM read_parquet('{MODEL_DATA_PATH}')
WHERE event_year IN ({', '.join(str(y) for y in TRAIN_YEARS)})
"""

con = duckdb.connect()
try:
    pgx_df = con.execute(query).df()
finally:
    # Close the connection even when the query fails (e.g. unreadable parquet),
    # so a failed cell run does not leave it open.
    con.close()

print("Loaded", len(pgx_df), "events from model_data for", COHORT_NAME, "age_band=", AGE_BAND,
      "years", TRAIN_YEARS)

# Target-only for DTW trajectories: keep events whose `target` column equals 1.
pgx_df_target1 = pgx_df[pgx_df["target"] == 1].copy()
print("Target=1 rows:", len(pgx_df_target1))

# Collect the codes that may appear in DTW activities: every entry found under the
# "itemsets" key of each record in the FP-Growth TRAIN target-only JSON files
# (drug, icd, medical_code).
# NOTE(review): assumes each "itemsets" value is a flat list of code strings —
# confirm against the FP-Growth export format.
allowed_codes: set[str] = set()

for path in (ITEMSETS_DRUG_PATH, ITEMSETS_ICD_PATH, ITEMSETS_MEDICAL_PATH):
    if not path.exists():
        # A missing file is tolerated: that source simply contributes no codes.
        print(f"[WARN] Itemsets file not found: {path}")
        continue

    with open(path, "r") as fh:
        records = json.load(fh)

    allowed_codes.update(
        code
        for record in records
        for code in record.get("itemsets", [])
    )
    print(f"Loaded {len(records)} itemsets from {path.name}")

print("Total allowed codes from itemsets:", len(allowed_codes))

# (source column, activity prefix) pairs, listed in the order activities are emitted
# for a single event: the drug first (primary focus for polypharmacy), then the ten
# ICD diagnosis slots, then the procedure code as contextual activities.
ACTIVITY_SOURCES: list[tuple[str, str]] = [
    ("drug_name", "DRUG"),
    ("primary_icd_diagnosis_code", "ICD"),
    ("two_icd_diagnosis_code", "ICD"),
    ("three_icd_diagnosis_code", "ICD"),
    ("four_icd_diagnosis_code", "ICD"),
    ("five_icd_diagnosis_code", "ICD"),
    ("six_icd_diagnosis_code", "ICD"),
    ("seven_icd_diagnosis_code", "ICD"),
    ("eight_icd_diagnosis_code", "ICD"),
    ("nine_icd_diagnosis_code", "ICD"),
    ("ten_icd_diagnosis_code", "ICD"),
    ("procedure_code", "CPT"),
]

# A code column that is absent from the events table contributes no activities.
present_sources = [
    (col, prefix) for col, prefix in ACTIVITY_SOURCES if col in pgx_df_target1.columns
]

# One (patient, event date, activity) tuple per retained code.
activity_rows: list[tuple[object, object, str]] = []

# Walk plain column values with zip() instead of DataFrame.iterrows(): iterrows()
# builds a Series for every event, which is very slow on an event-level table.
# Events are visited in table order and codes in ACTIVITY_SOURCES order.
for pid, event_date, *codes in zip(
    pgx_df_target1["mi_person_key"],
    pgx_df_target1["event_date"],
    *(pgx_df_target1[col] for col, _ in present_sources),
):
    for (_, prefix), code in zip(present_sources, codes):
        # Keep only real string codes (not missing, "" or "NA") that belong to the
        # FP-Growth alphabet in `allowed_codes`.
        if isinstance(code, str) and code not in ("", "NA") and code in allowed_codes:
            activity_rows.append((pid, event_date, f"{prefix}:{code}"))

activity_df = pd.DataFrame(activity_rows, columns=["mi_person_key", "event_date", "activity"])
activity_df["event_date"] = pd.to_datetime(activity_df["event_date"])

print("Activity table shape:", activity_df.shape)

# Order activities by patient and then by event date so that each patient's list
# below reads chronologically.
# NOTE(review): the order of activities sharing the same patient and date is left
# to the sort — confirm whether same-day ordering matters for the DTW features.
activity_df = activity_df.sort_values(by=["mi_person_key", "event_date"])

# Collapse to one row per patient; `activity_sequence` is that patient's list of
# activity labels in the sorted order.
grouped_activities = activity_df.groupby("mi_person_key")["activity"]
sequences = grouped_activities.apply(list).reset_index()
sequences = sequences.rename(columns={"activity": "activity_sequence"})

print("Built", len(sequences), "patient sequences for DTW")



In [None]:
# ---- DTW distance computation to prototype trajectories ----

from typing import List, Dict

# Activity vocabulary: every distinct label seen in any patient sequence, sorted so
# that the integer encoding is reproducible between runs.
activity_vocab = set()
for labels in sequences["activity_sequence"]:
    activity_vocab.update(labels)
unique_activities = sorted(activity_vocab)

# Label -> position in the sorted vocabulary.
activity_to_id: Dict[str, int] = dict(zip(unique_activities, range(len(unique_activities))))

print("Unique activities:", len(unique_activities))

# Patient key -> that patient's sequence re-expressed as integer activity IDs.
int_sequences: Dict[str, List[int]] = {
    patient_key: [activity_to_id[label] for label in labels]
    for patient_key, labels in zip(sequences["mi_person_key"], sequences["activity_sequence"])
}


def dtw_distance(s1: List[int], s2: List[int]) -> float:
    """Return the unit-cost DTW distance between two integer-encoded sequences.

    Aligning two equal symbols costs 0 and two different symbols costs 1; each
    alignment step may advance either sequence or both. The result is the
    cheapest total cost of aligning all of ``s1`` with all of ``s2``.

    Returns ``math.inf`` when either sequence is empty.
    """
    rows, cols = len(s1), len(s2)
    if not (rows and cols):
        return math.inf

    # Only the previous row of the cost table is needed to fill in the next one.
    # Column 0 is the "nothing of s2 consumed" border: free before any symbol of
    # s1 has been consumed, unreachable afterwards.
    previous = [0.0] + [math.inf] * cols
    for left in s1:
        current = [math.inf]
        for j, right in enumerate(s2, start=1):
            step = 0.0 if left == right else 1.0
            current.append(step + min(previous[j], current[j - 1], previous[j - 1]))
        previous = current

    return float(previous[-1])


MAX_PROTOTYPES = 20

# Prototypes are the first MAX_PROTOTYPES patients, in `int_sequences` iteration
# order, whose sequence is non-empty.
prototype_ids: List[str] = [
    patient_key for patient_key, encoded in int_sequences.items() if encoded
][:MAX_PROTOTYPES]

print("Selected", len(prototype_ids), "prototype patients for DTW features")

# Look the prototype sequences up once instead of once per patient.
prototype_sequences = [int_sequences[proto_pid] for proto_pid in prototype_ids]

# One feature row per patient: the patient key plus the DTW distance to each
# prototype, in columns dtw_dist_proto_1 .. dtw_dist_proto_K.
feature_rows: List[Dict[str, float]] = []
for patient_key, encoded in int_sequences.items():
    distances = {
        f"dtw_dist_proto_{k}": dtw_distance(encoded, proto_seq)
        for k, proto_seq in enumerate(prototype_sequences, start=1)
    }
    feature_rows.append({"mi_person_key": patient_key, **distances})

features_df = pd.DataFrame(feature_rows)
print("DTW feature table shape:", features_df.shape)

out_filename = f"{COHORT_NAME}_{AGE_BAND_FNAME}_train_target_dtw_features.csv"
save_dtw_csv(features_df, out_filename, upload_to_s3=False)

