## Cohort 2 (POLYPHARMACY_ED, `non_opioid_ed`) – bupaR Pipeline (Configurable Age Band)

This notebook configures the bupaR pipeline to use the **model_data** and **FP-Growth outputs**
for cohort 2 (`non_opioid_ed` – polypharmacy ED visits), with a configurable older age band
(e.g. **65-74**, **75-84**, **85-94**).

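In the config cell below, set `age_band` to one of the valid polypharmacy age bands. The notebook will
then read the corresponding `model_data` and FP-Growth TRAIN outputs and produce event logs and features
for that age cohort. The FP-Growth and bupaR output directories use `age_band_fname`, which is `age_band`
with `-` replaced by `_` (e.g. `65-74` becomes `65_74`).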

- **Source events**: `model_data/cohort_name=non_opioid_ed/age_band={age_band}/model_events.parquet`
- **Training window**: event_year in {2016, 2017, 2018} (labeled as `train` in FP-Growth outputs)
- **Target ICD codes**: HCG* (e.g. pregnancy-related HCG codes) – we focus on **events up to HCG**.
- **Target-only patterns**: `4_fpgrowth_analysis/outputs/non_opioid_ed/target/{age_band_fname}/train/*_itemsets_target_only.json`
- **Combined patterns**: `4_fpgrowth_analysis/outputs/non_opioid_ed/combined/{age_band_fname}/train/*_itemsets.json`

Below we:
- Build target-only and combined (target+control) event logs.
- Analyze **before HCG** (pre-target) sequences only.
- Export pre-HCG and time-to-HCG per-patient features, plus target-only traces/process matrices,
  for integration into tabular modeling datasets.

**Cohort emphasis**: for `non_opioid_ed` (polypharmacy ED), we treat **drug_name events and their
sequences over time** as primary signals (medication burden/combinations), with ICD/CPT codes
used mainly for contextual diagnoses and procedures.



In [None]:
# ---- Cohort 2 configuration (POLYPHARMACY_ED / non_opioid_ed) ----
# Every path below is derived from the working directory, the cohort name
# and the selected age band.

# The notebook is expected to be launched from the project root.
project_root <- getwd()

cohort_name <- "non_opioid_ed"

# Age bands available for the polypharmacy cohort; `age_band` must be one of them.
valid_age_bands <- c("65-74", "75-84", "85-94")
age_band <- "65-74"  # <-- change this for other age cohorts

if (!(age_band %in% valid_age_bands)) {
  stop("Invalid age_band. Choose one of: ", paste(valid_age_bands, collapse = ", "))
}

# Directory-friendly spelling of the band ("65-74" -> "65_74").
age_band_fname <- gsub("-", "_", age_band, fixed = TRUE)
train_years <- 2016:2018

# Target definition for this cohort: each entry is a regular expression that
# is later matched against activity labels with grepl().
target_icd_patterns <- c("HCG")

# Hive-style partition folders: cohort_name=<cohort>/age_band=<band>.
model_data_path <- file.path(
  project_root,
  "model_data",
  sprintf("cohort_name=%s", cohort_name),
  sprintf("age_band=%s", age_band),
  "model_events.parquet"
)

fpgrowth_root <- file.path(project_root, "4_fpgrowth_analysis", "outputs", cohort_name)

# FP-Growth results for the target group, TRAIN split.
target_dir_train <- file.path(fpgrowth_root, "target", age_band_fname, "train")

itemsets_drug_target_path <- file.path(
  target_dir_train, "drug_name_itemsets_target_only.json"
)
itemsets_icd_target_path <- file.path(
  target_dir_train, "icd_code_itemsets_target_only.json"
)
itemsets_medical_target_path <- file.path(
  target_dir_train, "medical_code_itemsets_target_only.json"
)

cat(paste0("Using model_data from:\n ", model_data_path, "\n"))
cat(paste0("Using FP-Growth TRAIN outputs from:\n ", target_dir_train, "\n"))

suppressPackageStartupMessages({
  library(duckdb)
  library(arrow)
  library(dplyr)
  library(tidyr)
  library(jsonlite)
  library(readr)
  library(bupaR)
})

# Root folder for everything this notebook writes locally.
bup_ar_output_root <- file.path(project_root, "5_bupaR_analysis", "outputs")

#' Write a table to the local bupaR feature folder and copy it to S3.
#'
#' The CSV is written to
#' `<bup_ar_output_root>/<cohort_name>/<age_band_fname>/features/<filename>`
#' (the folder is created if needed) and then copied with `aws s3 cp` to
#' `s3://pgxdatalake/gold/bupar/<cohort_name>/<age_band>/<filename>`.
#' Note that the local folder uses `age_band_fname` while the S3 key uses
#' `age_band`.
#'
#' Reads the globals `bup_ar_output_root`, `cohort_name`, `age_band_fname`
#' and `age_band`.
#'
#' @param df Data frame to export.
#' @param filename File name (no directory) of the CSV.
#' @return The local path of the written CSV, invisibly. A warning is raised
#'   when the `aws` command exits with a non-zero status; the local file is
#'   kept in that case.
save_bupar_csv <- function(df, filename) {
  out_dir <- file.path(bup_ar_output_root, cohort_name, age_band_fname, "features")
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  local_path <- file.path(out_dir, filename)
  readr::write_csv(df, local_path)

  s3_key <- sprintf("gold/bupar/%s/%s/%s", cohort_name, age_band, filename)
  s3_uri <- paste0("s3://pgxdatalake/", s3_key)
  cmd <- sprintf("aws s3 cp \"%s\" \"%s\"", local_path, s3_uri)
  cat("Uploading to S3 with command:\n  ", cmd, "\n", sep = "")

  # system() returns the command's exit status; surface failures instead of
  # silently continuing with a missing S3 object.
  status <- system(cmd)
  if (!isTRUE(status == 0L)) {
    warning(
      "S3 upload failed (exit status ", status, ") for ", local_path,
      call. = FALSE
    )
  }
  invisible(local_path)
}

# Fail early when the input parquet for this cohort/age band is missing.
if (!file.exists(model_data_path)) {
  stop("model_data parquet not found: ", model_data_path,
       "\nRun 3_feature_importance/create_model_data.py for this cohort/age band first.")
}

# DuckDB connection with default settings; it is reused further down to read
# the control cohort.
con <- dbConnect(duckdb::duckdb())

# The path is embedded in a SQL string literal, so any single quote in it
# (e.g. a project folder like "O'Brien") must be doubled to keep the query valid.
query <- sprintf(
  "SELECT * FROM read_parquet('%s') WHERE event_year IN (%s)",
  gsub("'", "''", model_data_path, fixed = TRUE),
  paste(train_years, collapse = ",")
)

# All events of the cohort/age band restricted to the training years.
pgx_df <- dbGetQuery(con, query)

cat("Loaded ", nrow(pgx_df), " events for ", cohort_name, " age_band=", age_band,
    " across years ", paste(train_years, collapse=","), "\n", sep="")

# Keep only rows flagged target == 1 for the target-only analyses.
pgx_df_target1 <- pgx_df %>%
  filter(target == 1L)

cat("Target=1 rows: ", nrow(pgx_df_target1), "\n", sep = "")

# Collect the codes that occur in any FP-Growth target-only itemset file.
# Each of the three files (drug / ICD / medical) is optional: a missing file
# only produces a warning and contributes no codes.

allowed_codes <- character(0)

# Drug-name itemsets
if (!file.exists(itemsets_drug_target_path)) {
  warning("Drug target-only itemsets not found at ", itemsets_drug_target_path)
} else {
  drug_itemsets_target <- fromJSON(itemsets_drug_target_path, simplifyDataFrame = TRUE)
  drug_codes <- drug_itemsets_target$itemsets %>% unlist() %>% unique()
  allowed_codes <- union(allowed_codes, drug_codes)
  cat(paste0("Loaded ", length(drug_codes), " unique drug codes from target-only itemsets.\n"))
}

# ICD itemsets
if (!file.exists(itemsets_icd_target_path)) {
  warning("ICD target-only itemsets not found at ", itemsets_icd_target_path)
} else {
  icd_itemsets_target <- fromJSON(itemsets_icd_target_path, simplifyDataFrame = TRUE)
  icd_codes <- icd_itemsets_target$itemsets %>% unlist() %>% unique()
  allowed_codes <- union(allowed_codes, icd_codes)
  cat(paste0("Loaded ", length(icd_codes), " unique ICD codes from target-only itemsets.\n"))
}

# Medical (ICD+CPT) itemsets
if (!file.exists(itemsets_medical_target_path)) {
  warning("Medical target-only itemsets not found at ", itemsets_medical_target_path)
} else {
  medical_itemsets_target <- fromJSON(itemsets_medical_target_path, simplifyDataFrame = TRUE)
  medical_codes <- medical_itemsets_target$itemsets %>% unlist() %>% unique()
  allowed_codes <- union(allowed_codes, medical_codes)
  cat(paste0("Loaded ", length(medical_codes), " unique medical (ICD+CPT) codes from target-only itemsets.\n"))
}

cat(paste0("Total unique allowed codes from FP-Growth itemsets: ", length(allowed_codes), "\n"))

# Reshape target rows to one row per (patient, date, code) and label each
# code with its origin (DRUG: / ICD: / CPT:). When FP-Growth codes were
# loaded, only those codes are kept; otherwise every non-empty code is kept.
pgx_df_target1_long <- local({
  # Columns that carry a code; listed once and reused by every step below.
  code_cols <- c(
    "drug_name",
    "primary_icd_diagnosis_code",
    "two_icd_diagnosis_code",
    "three_icd_diagnosis_code",
    "four_icd_diagnosis_code",
    "five_icd_diagnosis_code",
    "six_icd_diagnosis_code",
    "seven_icd_diagnosis_code",
    "eight_icd_diagnosis_code",
    "nine_icd_diagnosis_code",
    "ten_icd_diagnosis_code",
    "procedure_code"
  )

  pgx_df_target1 %>%
    select(mi_person_key, event_date, all_of(code_cols)) %>%
    # pivot_longer needs a common type across the pivoted columns
    mutate(across(all_of(code_cols), as.character)) %>%
    pivot_longer(
      cols = all_of(code_cols),
      names_to = "source",
      values_to = "code"
    ) %>%
    # drop missing / blank codes, including the literal string "NA"
    filter(!(is.na(code) | code == "" | code == "NA")) %>%
    # an empty allow-list means "no filtering"
    filter(length(allowed_codes) == 0 | code %in% allowed_codes) %>%
    mutate(
      activity = dplyr::case_when(
        source == "drug_name" ~ paste0("DRUG:", code),
        grepl("icd_diagnosis_code", source) ~ paste0("ICD:", code),
        source == "procedure_code" ~ paste0("CPT:", code),
        TRUE ~ code
      ),
      timestamp = as.POSIXct(event_date)
    )
})

# Target-only event log: one case per patient, one event per long-format row.
# Every row is its own activity instance with a constant lifecycle ("complete")
# and a constant resource ("Patient").

target_eventlog <- eventlog(
  transmute(
    pgx_df_target1_long,
    case_id = mi_person_key,
    activity,
    timestamp,
    activity_instance_id = seq_len(n()),
    lifecycle_id = "complete",
    resource_id = "Patient"
  ),
  case_id = "case_id",
  activity_id = "activity",
  activity_instance_id = "activity_instance_id",
  lifecycle_id = "lifecycle_id",
  resource_id = "resource_id",
  timestamp = "timestamp"
)

print(target_eventlog)

# Combined TARGET + CONTROL eventlog for Sankey
# The control group is read from the same partition layout as the target
# cohort (same age band, same training years).

control_cohort_name <- "opioid_ed"  # polypharmacy control is opioid_ed in this context

control_model_data_path <- file.path(
  project_root,
  "model_data",
  paste0("cohort_name=", control_cohort_name),
  paste0("age_band=", age_band),
  "model_events.parquet"
)

if (file.exists(control_model_data_path)) {
  # The path is embedded in a SQL string literal: double any single quote so
  # a quote in the project path cannot break the query.
  query_control <- sprintf(
    "SELECT * FROM read_parquet('%s') WHERE event_year IN (%s)",
    gsub("'", "''", control_model_data_path, fixed = TRUE),
    paste(train_years, collapse = ",")
  )
  pgx_df_control <- dbGetQuery(con, query_control)
  cat("Loaded ", nrow(pgx_df_control), " control events for ", control_cohort_name,
      " age_band=", age_band, " across years ", paste(train_years, collapse=","), "\n", sep="")
} else {
  warning("Control model_data parquet not found: ", control_model_data_path)
  # Zero-row frame with the target schema so the bind below still works.
  pgx_df_control <- pgx_df[0, ]
}

# Stack target rows and all control-cohort rows, tagging each with its group.
pgx_df_all <- bind_rows(
  pgx_df_target1 %>% mutate(group = "target"),
  pgx_df_control %>% mutate(group = "control")
)

# Long-format version of the combined target + control rows: one row per
# (patient, date, group, code), labelled DRUG: / ICD: / CPT:. The FP-Growth
# allow-list is applied when it is non-empty.
pgx_df_all_long <- local({
  # Columns that carry a code; listed once and reused by every step below.
  code_cols <- c(
    "drug_name",
    "primary_icd_diagnosis_code",
    "two_icd_diagnosis_code",
    "three_icd_diagnosis_code",
    "four_icd_diagnosis_code",
    "five_icd_diagnosis_code",
    "six_icd_diagnosis_code",
    "seven_icd_diagnosis_code",
    "eight_icd_diagnosis_code",
    "nine_icd_diagnosis_code",
    "ten_icd_diagnosis_code",
    "procedure_code"
  )

  pgx_df_all %>%
    select(mi_person_key, event_date, group, all_of(code_cols)) %>%
    # pivot_longer needs a common type across the pivoted columns
    mutate(across(all_of(code_cols), as.character)) %>%
    pivot_longer(
      cols = all_of(code_cols),
      names_to = "source",
      values_to = "code"
    ) %>%
    # drop missing / blank codes, including the literal string "NA"
    filter(!(is.na(code) | code == "" | code == "NA")) %>%
    # an empty allow-list means "no filtering"
    filter(length(allowed_codes) == 0 | code %in% allowed_codes) %>%
    mutate(
      activity = dplyr::case_when(
        source == "drug_name" ~ paste0("DRUG:", code),
        grepl("icd_diagnosis_code", source) ~ paste0("ICD:", code),
        source == "procedure_code" ~ paste0("CPT:", code),
        TRUE ~ code
      ),
      timestamp = as.POSIXct(event_date)
    )
})

# Event log over target + control rows; `group` is carried along as an extra
# attribute column. Each row is its own activity instance.
sankey_eventlog <- eventlog(
  transmute(
    pgx_df_all_long,
    case_id = mi_person_key,
    activity,
    timestamp,
    group,
    activity_instance_id = seq_len(n()),
    lifecycle_id = "complete",
    resource_id = "Patient"
  ),
  case_id = "case_id",
  activity_id = "activity",
  activity_instance_id = "activity_instance_id",
  lifecycle_id = "lifecycle_id",
  resource_id = "resource_id",
  timestamp = "timestamp"
)

print(sankey_eventlog)



### Before HCG (Pre-Target Sequences)

For polypharmacy ED (`non_opioid_ed`), we only use **events up to the first HCG-coded event**
for modeling. This section:

- Identifies the first HCG-coded ICD event per case, using `target_icd_patterns`.
- Builds a `pre_target_eventlog` containing all DRUG/ICD/CPT activities up to and including that event.
- Extracts sequence summaries and per-patient counts.
- Computes **time-to-HCG** and 30/90/180-day windowed event counts for use as predictive features.



In [None]:
# ---- Pre-HCG (before first HCG ICD) sequences + time-to-event features ----

suppressPackageStartupMessages({
  library(dplyr)
  library(bupaR)
  library(edeaR)
  library(lubridate)
})

# Use configurable target_icd_patterns (here, HCG-style ICD codes)
#
# Each pattern is matched with grepl() against the full activity label, so a
# DRUG:/CPT: label containing the pattern is flagged as well.
# as.data.frame() yields the plain event table so the dplyr verbs below
# operate on an ordinary data frame.

ev_all <- as.data.frame(target_eventlog) %>%
  arrange(case_id, timestamp) %>%
  group_by(case_id) %>%
  mutate(
    # position of the event within the patient's time-ordered sequence
    event_index = row_number(),
    # TRUE when any of the target patterns matches the activity label
    is_target_icd = Reduce(`|`, lapply(target_icd_patterns, function(p) grepl(p, activity))),
    has_target   = any(is_target_icd),
    # index of the first matching event; NA for patients without a match
    first_target_index = ifelse(has_target,
                                min(event_index[is_target_icd]),
                                NA_integer_)
  ) %>%
  ungroup()

# Events up to and including the first HCG-coded event per case
# (patients without any matching event are dropped).

events_pre_target <- ev_all %>%
  filter(!is.na(first_target_index),
         event_index <= first_target_index)

# The instance / lifecycle / resource columns are inherited from
# target_eventlog; map them explicitly, as done when target_eventlog is built.
pre_target_eventlog <- events_pre_target %>%
  eventlog(
    case_id              = "case_id",
    activity_id          = "activity",
    activity_instance_id = "activity_instance_id",
    lifecycle_id         = "lifecycle_id",
    resource_id          = "resource_id",
    timestamp            = "timestamp"
  )

cat("Pre-HCG eventlog summary:\n")
print(pre_target_eventlog)

# 1) Trace explorer: pre-HCG trajectories
# Namespace-qualified because this cell does not attach processmapR.

processmapR::trace_explorer(pre_target_eventlog, coverage = 0.8)

# 2) Per-patient pre-HCG sequence features
# One row per patient: total events, events per code family (by label
# prefix), and number of distinct activities.

pre_patient_features <- as.data.frame(pre_target_eventlog) %>%
  arrange(case_id, timestamp) %>%
  group_by(case_id) %>%
  summarise(
    pre_n_events            = n(),
    pre_n_drug_events       = sum(grepl("^DRUG:", activity)),
    pre_n_icd_events        = sum(grepl("^ICD:", activity)),
    pre_n_cpt_events        = sum(grepl("^CPT:", activity)),
    pre_n_unique_activities = n_distinct(activity),
    .groups = "drop"
  )

save_bupar_csv(
  pre_patient_features,
  sprintf("%s_%s_train_target_pre_hcg_patient_features_bupar.csv", cohort_name, age_band_fname)
)

# 3) Time-to-HCG and windowed counts (30/90/180 days)

# Compute target_time and first_time per case
# target_time = timestamp of the earliest event matching a target pattern,
# first_time  = timestamp of the patient's earliest event in target_eventlog.
# Patients without a matching event are excluded.

target_times <- as.data.frame(target_eventlog) %>%
  arrange(case_id, timestamp) %>%
  group_by(case_id) %>%
  mutate(
    is_target_icd = Reduce(`|`, lapply(target_icd_patterns, function(p) grepl(p, activity))),
    has_target    = any(is_target_icd)
  ) %>%
  filter(has_target) %>%
  summarise(
    target_time = min(timestamp[is_target_icd]),
    first_time  = min(timestamp),
    .groups = "drop"
  )

# dt_days: days from each pre-target event to the patient's target_time.
pre_events_with_t <- as.data.frame(pre_target_eventlog) %>%
  inner_join(target_times, by = "case_id") %>%
  mutate(
    dt_days = as.numeric(difftime(target_time, timestamp, units = "days"))
  )

# Per patient: the largest gap to the target (time_to_HCG_days) and event
# counts within 30/90/180 days of it, overall and per label prefix.
hcg_time_features <- pre_events_with_t %>%
  group_by(case_id, target_time, first_time) %>%
  summarise(
    time_to_HCG_days        = as.numeric(max(dt_days, na.rm = TRUE)),
    n_events_30d            = sum(dt_days <= 30),
    n_events_90d            = sum(dt_days <= 90),
    n_events_180d           = sum(dt_days <= 180),
    n_drug_events_30d       = sum(dt_days <= 30 & grepl("^DRUG:", activity)),
    n_drug_events_90d       = sum(dt_days <= 90 & grepl("^DRUG:", activity)),
    n_drug_events_180d      = sum(dt_days <= 180 & grepl("^DRUG:", activity)),
    n_icd_events_30d        = sum(dt_days <= 30 & grepl("^ICD:", activity)),
    n_icd_events_90d        = sum(dt_days <= 90 & grepl("^ICD:", activity)),
    n_icd_events_180d       = sum(dt_days <= 180 & grepl("^ICD:", activity)),
    n_cpt_events_30d        = sum(dt_days <= 30 & grepl("^CPT:", activity)),
    n_cpt_events_90d        = sum(dt_days <= 90 & grepl("^CPT:", activity)),
    n_cpt_events_180d       = sum(dt_days <= 180 & grepl("^CPT:", activity)),
    .groups = "drop"
  )

save_bupar_csv(
  hcg_time_features,
  sprintf("%s_%s_train_target_time_to_hcg_features_bupar.csv", cohort_name, age_band_fname)
)



### Target-only Process Mining Views (Polypharmacy ED)

This section summarizes global target-only patterns for the selected polypharmacy cohort/age band:

- **Trace explorer** for the most frequent trajectories (coverage ~80%).
- **Process matrix** and **process map** for DRUG/ICD/CPT transitions.
- Tabular exports of traces and process matrix for feature engineering and QC.



In [None]:
suppressPackageStartupMessages({
  library(bupaR)
  library(bupaverse)
  library(processmapR)
  library(edeaR)
  library(ggplot2)
})

# Local folder for figures of this cohort/age band.
fig_root <- file.path(bup_ar_output_root, cohort_name, age_band_fname, "figures")
dir.create(fig_root, recursive = TRUE, showWarnings = FALSE)

# 1) Trace Explorer: most frequent target trajectories

p_trace <- trace_explorer(target_eventlog, coverage = 0.8)

# Save trace summary as tabular output
traces_target <- bupaR::traces(target_eventlog)
save_bupar_csv(
  traces_target,
  sprintf("%s_%s_train_target_traces_bupar.csv", cohort_name, age_band_fname)
)

# Save trace explorer plot to PNG + S3
# Best effort: a failure is reported as a warning and the notebook continues.
trace_png <- file.path(fig_root, sprintf("%s_%s_train_target_traces_bupar.png", cohort_name, age_band_fname))
tryCatch({
  ggsave(trace_png, p_trace, width = 8, height = 6, dpi = 300)
  trace_s3_key <- sprintf("gold/bupar/%s/%s/%s", cohort_name, age_band, basename(trace_png))
  trace_s3_uri <- paste0("s3://pgxdatalake/", trace_s3_key)
  cmd <- sprintf("aws s3 cp \"%s\" \"%s\"", trace_png, trace_s3_uri)
  cat("Uploading trace explorer plot to S3 with command:\n  ", cmd, "\n", sep = "")
  status <- system(cmd)
  if (!isTRUE(status == 0L)) {
    warning("S3 upload failed (exit status ", status, ") for ", trace_png, call. = FALSE)
  }
}, error = function(e) {
  warning("Could not save/upload trace explorer plot: ", conditionMessage(e), call. = FALSE)
})

# 2) Process Matrix and CSV export
# `type` must be a type object built with processmapR::frequency(), not a
# plain string; "absolute" requests raw transition counts.

pm_target <- process_matrix(target_eventlog, type = processmapR::frequency("absolute"))
pm_target_df <- as.data.frame(pm_target)
save_bupar_csv(
  pm_target_df,
  sprintf("%s_%s_train_target_process_matrix_bupar.csv", cohort_name, age_band_fname)
)

# Simple heatmap visualization of the process matrix (activity-to-activity transitions)
# Transition counts are derived directly from the event table: within each
# patient, every event is paired with the next one in timestamp order.

ev_target <- as.data.frame(target_eventlog) %>%
  arrange(case_id, timestamp) %>%
  group_by(case_id) %>%
  mutate(next_activity = lead(activity)) %>%
  ungroup() %>%
  filter(!is.na(next_activity)) %>%  # the last event of a patient has no successor
  count(activity, next_activity, name = "n")

pm_heat <- ggplot(ev_target, aes(x = activity, y = next_activity, fill = n)) +
  geom_tile() +
  scale_fill_viridis_c(option = "plasma") +
  labs(
    title = sprintf("Process Matrix (frequency) – %s %s", cohort_name, age_band),
    x = "From activity",
    y = "To activity",
    fill = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    plot.title = element_text(hjust = 0.5)
  )

# Best effort: a failure is reported as a warning and the notebook continues.
pm_png <- file.path(fig_root, sprintf("%s_%s_train_target_process_matrix_bupar.png", cohort_name, age_band_fname))
tryCatch({
  ggsave(pm_png, pm_heat, width = 10, height = 8, dpi = 300)
  pm_s3_key <- sprintf("gold/bupar/%s/%s/%s", cohort_name, age_band, basename(pm_png))
  pm_s3_uri <- paste0("s3://pgxdatalake/", pm_s3_key)
  cmd <- sprintf("aws s3 cp \"%s\" \"%s\"", pm_png, pm_s3_uri)
  cat("Uploading process matrix heatmap to S3 with command:\n  ", cmd, "\n", sep = "")
  status <- system(cmd)
  if (!isTRUE(status == 0L)) {
    warning("S3 upload failed (exit status ", status, ") for ", pm_png, call. = FALSE)
  }
}, error = function(e) {
  warning("Could not save/upload process matrix heatmap: ", conditionMessage(e), call. = FALSE)
})

# 3) Process Map visualization
# `type` must be a type object built with processmapR::frequency(), not a
# plain string; "absolute" annotates the map with raw counts.

process_map(target_eventlog, type = processmapR::frequency("absolute"))



In [None]:
# ---- Time-windowed trace explorer (15/30/60/90 days before HCG) ----

suppressPackageStartupMessages({
  library(dplyr)
  library(bupaR)
})

# This cell builds on `pre_target_eventlog` from the pre-HCG section.
# `target_times` is rebuilt below when it is missing from the session.

if (!exists("pre_target_eventlog")) {
  stop("pre_target_eventlog not found. Run the pre-HCG section first.")
}

if (!exists("target_times")) {
  # Per patient: time of the first event matching a target pattern and time
  # of the first event overall; patients without a match are excluded.
  target_times <- as.data.frame(target_eventlog) %>%
    arrange(case_id, timestamp) %>%
    group_by(case_id) %>%
    mutate(
      is_target_icd = Reduce(`|`, lapply(target_icd_patterns, function(p) grepl(p, activity))),
      has_target    = any(is_target_icd)
    ) %>%
    filter(has_target) %>%
    summarise(
      target_time = min(timestamp[is_target_icd]),
      first_time  = min(timestamp),
      .groups = "drop"
    )
}

# dt_days: days from each pre-target event to the patient's target_time.
pre_events_with_t <- as.data.frame(pre_target_eventlog) %>%
  inner_join(target_times, by = "case_id") %>%
  mutate(
    dt_days = as.numeric(difftime(target_time, timestamp, units = "days"))
  )

# Look-back windows, in days before the target event.
windows <- c(15, 30, 60, 90)

for (w in windows) {
  cat("\n--- Trace explorer for last ", w, " days before HCG ---\n", sep = "")
  ev_w <- pre_events_with_t %>%
    filter(dt_days <= w)

  if (nrow(ev_w) == 0) {
    cat("No events within ", w, " days before HCG. Skipping.\n", sep = "")
    next
  }

  # The instance / lifecycle / resource columns are inherited from
  # target_eventlog; map them explicitly, as done when target_eventlog is built.
  pre_eventlog_w <- ev_w %>%
    eventlog(
      case_id              = "case_id",
      activity_id          = "activity",
      activity_instance_id = "activity_instance_id",
      lifecycle_id         = "lifecycle_id",
      resource_id          = "resource_id",
      timestamp            = "timestamp"
    )

  # Plots are not auto-printed inside a loop, so print explicitly.
  # Namespace-qualified because this cell does not attach processmapR.
  print(processmapR::trace_explorer(pre_eventlog_w, coverage = 0.8))
}



In [None]:
# ---- Drug-to-HCG features and visualizations ----

suppressPackageStartupMessages({
  library(dplyr)
  library(ggplot2)
})

# Require pre_events_with_t from the time-to-HCG section

if (!exists("pre_events_with_t")) {
  stop("pre_events_with_t not found. Run the time-to-HCG section first.")
}

# 1) Per-patient drug-to-HCG features
# Only DRUG: activities among the pre-target events are considered.

pre_drug_with_t <- pre_events_with_t %>%
  filter(grepl("^DRUG:", activity))

if (nrow(pre_drug_with_t) == 0) {
  warning("No DRUG: events found before/at HCG for this cohort/age band.")
} else {
  # Per patient: the last DRUG: activity in timestamp order, the smallest gap
  # (in days) between a drug event and the target, and the number of distinct
  # drugs within 30/90/180 days of the target.
  drug_to_hcg_features <- pre_drug_with_t %>%
    arrange(case_id, timestamp) %>%
    group_by(case_id, target_time, first_time) %>%
    summarise(
      last_drug_before_HCG    = dplyr::last(activity),
      time_from_last_drug_days  = as.numeric(min(dt_days, na.rm = TRUE)),
      n_unique_drugs_30d        = n_distinct(activity[dt_days <= 30]),
      n_unique_drugs_90d        = n_distinct(activity[dt_days <= 90]),
      n_unique_drugs_180d       = n_distinct(activity[dt_days <= 180]),
      .groups = "drop"
    ) %>%
    mutate(
      # drug code without the "DRUG:" label prefix
      last_drug_code = sub("^DRUG:", "", last_drug_before_HCG)
    )

  save_bupar_csv(
    drug_to_hcg_features,
    sprintf("%s_%s_train_target_drug_to_hcg_features_bupar.csv", cohort_name, age_band_fname)
  )

  fig_root <- file.path(bup_ar_output_root, cohort_name, age_band_fname, "figures")
  dir.create(fig_root, recursive = TRUE, showWarnings = FALSE)

  # 2) Visualization: distribution of time_from_last_drug_days

  p_time_last_drug <- ggplot(drug_to_hcg_features, aes(x = time_from_last_drug_days)) +
    geom_histogram(bins = 30, fill = "steelblue", color = "white") +
    theme_minimal() +
    labs(
      title = sprintf("Time from last drug to HCG – %s %s", cohort_name, age_band),
      x = "Days from last DRUG: to HCG",
      y = "Number of patients"
    )

  time_last_drug_png <- file.path(
    fig_root,
    sprintf("%s_%s_train_target_time_from_last_drug_to_hcg_hist.png", cohort_name, age_band_fname)
  )

  # Best effort: a failure is reported as a warning and the notebook continues.
  tryCatch({
    ggsave(time_last_drug_png, p_time_last_drug, width = 8, height = 6, dpi = 300)
    s3_key <- sprintf("gold/bupar/%s/%s/%s", cohort_name, age_band, basename(time_last_drug_png))
    s3_uri <- paste0("s3://pgxdatalake/", s3_key)
    cmd <- sprintf("aws s3 cp \"%s\" \"%s\"", time_last_drug_png, s3_uri)
    cat("Uploading time-from-last-drug histogram to S3 with command:\n  ", cmd, "\n", sep = "")
    status <- system(cmd)
    if (!isTRUE(status == 0L)) {
      warning("S3 upload failed (exit status ", status, ") for ", time_last_drug_png, call. = FALSE)
    }
  }, error = function(e) {
    warning("Could not save/upload time-from-last-drug histogram: ", conditionMessage(e), call. = FALSE)
  })

  # 3) Visualization: top last_drug_before_HCG codes (20 most frequent)

  top_last_drugs <- drug_to_hcg_features %>%
    count(last_drug_code, sort = TRUE) %>%
    slice_head(n = 20)

  p_top_last_drugs <- ggplot(top_last_drugs, aes(x = reorder(last_drug_code, n), y = n)) +
    geom_col(fill = "darkorange") +
    coord_flip() +
    theme_minimal() +
    labs(
      title = sprintf("Top last drugs before HCG – %s %s", cohort_name, age_band),
      x = "Last DRUG code before HCG",
      y = "Number of patients"
    )

  top_last_drugs_png <- file.path(
    fig_root,
    sprintf("%s_%s_train_target_top_last_drugs_before_hcg.png", cohort_name, age_band_fname)
  )

  # Best effort: a failure is reported as a warning and the notebook continues.
  tryCatch({
    ggsave(top_last_drugs_png, p_top_last_drugs, width = 8, height = 6, dpi = 300)
    s3_key <- sprintf("gold/bupar/%s/%s/%s", cohort_name, age_band, basename(top_last_drugs_png))
    s3_uri <- paste0("s3://pgxdatalake/", s3_key)
    cmd <- sprintf("aws s3 cp \"%s\" \"%s\"", top_last_drugs_png, s3_uri)
    cat("Uploading top-last-drugs plot to S3 with command:\n  ", cmd, "\n", sep = "")
    status <- system(cmd)
    if (!isTRUE(status == 0L)) {
      warning("S3 upload failed (exit status ", status, ") for ", top_last_drugs_png, call. = FALSE)
    }
  }, error = function(e) {
    warning("Could not save/upload top-last-drugs plot: ", conditionMessage(e), call. = FALSE)
  })
}

