### R Setup

In [None]:
R.version.string
getRversion()

# Setup
library(here)
library(dplyr)
library(readr)
library(survival)
library(ranger)
library(aorsf)
library(catboost)
library(tidyr)
library(purrr)
library(recipes)
library(tibble)
library(janitor)
library(haven)
library(riskRegression)  # For Score() function used in original study
library(prodlim) 

# Source required functions
source(here("scripts", "R", "clean_phts.R"))
source(here("scripts", "R", "make_final_features.R"))
source(here("scripts", "R", "select_rsf.R"))
source(here("scripts", "R", "make_recipe.R"))
source(here("scripts", "R", "make_labels.R"))

### Load Data

In [None]:
# Configuration
# Global settings read by the cells below. Values marked "original study"
# follow the notebook author's notes; they are not verified here.
n_predictors <- 20  # Target: 20 features as in original study
n_trees_rsf <- 500  # Number of trees for RSF (matching original study)
n_trees_aorsf <- 100  # Number of trees for AORSF
# NOTE(review): horizon must be in the same unit as the `time` column
# (presumably years) - confirm against the data.
horizon <- 1  # 1-year prediction horizon

# Create output directory
# recursive = TRUE creates missing parent folders; showWarnings = FALSE keeps
# re-runs quiet when the directory already exists.
output_dir <- here("feature_importance", "replicate_20_features_output")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

cat("=== Replicating 20-Feature Selection ===\n")
cat("Output directory:", output_dir, "\n\n")

# Load and prepare base data
cat("Loading base data...\n")

# Feature selection needs ALL variables (matching the original study workflow).
# The raw phts_txpl_ml.sas7bdat file is read directly (per the author's notes it
# carries the censoring implementation used by the original study) because
# clean_phts() is too aggressive for this purpose: it removes columns with >30%
# missingness and drops the 'outcome' columns. Only minimal cleaning is done
# here; clean_phts() is kept as the fallback when the raw load fails.
phts_base <- tryCatch({
  # Locate the SAS file: prefer the local data folder, else the external copy
  sas_path_local <- here("data", "phts_txpl_ml.sas7bdat")
  sas_path_external <- here("graft-loss-parallel-processing", "data", "phts_txpl_ml.sas7bdat")
  sas_path <- if (file.exists(sas_path_local)) sas_path_local else sas_path_external

  if (!file.exists(sas_path)) {
    stop("SAS file not found: phts_txpl_ml.sas7bdat. Tried: ", sas_path_local, " and ", sas_path_external)
  }

  cat("Reading raw SAS file:", sas_path, "\n")
  cat("Using: phts_txpl_ml.sas7bdat (matching original study)\n")

  # Read raw SAS file
  out <- haven::read_sas(sas_path) %>%
    # Filter on the raw (upper-case) column name, before clean_names() runs
    dplyr::filter(TXPL_YEAR >= 2010) %>%
    # Clean column names (snake_case)
    janitor::clean_names() %>%
    # Rename outcome columns to match expected names
    dplyr::rename(
      outcome_int_graft_loss = int_graft_loss,
      outcome_graft_loss = graft_loss
    ) %>%
    dplyr::mutate(
      # Row identifier. seq_len() (not 1:n()) so an empty result after the
      # year filter yields a zero-length ID instead of c(1, 0), which would
      # make mutate() fail and silently divert to the fallback loader.
      ID = seq_len(dplyr::n()),
      # Clean character columns: treat placeholder strings as missing
      dplyr::across(
        .cols = where(is.character),
        ~ ifelse(.x %in% c("", "unknown", "missing"), NA_character_, .x)
      ),
      # Convert to factors
      dplyr::across(.cols = where(is.character), as.factor),
      # Create tx_mcsd from txnomcsd if needed
      tx_mcsd = if ('txnomcsd' %in% names(.)) {
        if_else(txnomcsd == 'yes', 0, 1)  # 'yes' = no support, so 0; otherwise 1
      } else if ('txmcsd' %in% names(.)) {
        txmcsd
      } else {
        NA_real_
      }
    ) %>%
    # Rename to time/status for survival analysis
    dplyr::rename(
      time = outcome_int_graft_loss,
      status = outcome_graft_loss
    )

  cat("Raw SAS file loaded:", nrow(out), "rows,", ncol(out), "columns\n")

  # Create derived variables (matching original study preprocessing).
  # Each one is only created when it is absent and its inputs are present.
  cat("Creating derived variables...\n")

  # BMI at transplant (US formula: weight_lbs / height_in^2 * 703)
  # NOTE(review): assumes weight_txpl is in pounds and height_txpl in inches -
  # confirm against the data dictionary.
  if (!"bmi_txpl" %in% names(out) && "weight_txpl" %in% names(out) && "height_txpl" %in% names(out)) {
    out$bmi_txpl <- (out$weight_txpl / (out$height_txpl^2)) * 703
    cat("  Created bmi_txpl (US formula)\n")
  }

  # eGFR at transplant (Pediatric Schwartz formula: 0.413 * height_cm / creatinine_mg_dL)
  # Note: height_txpl is in inches, but the Schwartz formula uses cm. The
  # original study uses height_txpl directly, so that is replicated here
  # without a unit conversion.
  # Non-positive or missing creatinine gives NA rather than Inf/negative eGFR.
  if (!"egfr_tx" %in% names(out) && "txcreat_r" %in% names(out) && "height_txpl" %in% names(out)) {
    out$egfr_tx <- ifelse(
      out$txcreat_r <= 0 | is.na(out$txcreat_r),
      NA_real_,
      0.413 * out$height_txpl / out$txcreat_r
    )
    cat("  Created egfr_tx (Schwartz formula)\n")
  }

  # PRA at listing (from lsfprat - PRA T-cell at listing)
  if (!"pra_listing" %in% names(out) && "lsfprat" %in% names(out)) {
    out$pra_listing <- out$lsfprat
    cat("  Created pra_listing from lsfprat\n")
  } else if (!"pra_listing" %in% names(out) && "lsfprab" %in% names(out)) {
    # Fallback to lsfprab if lsfprat not available
    out$pra_listing <- out$lsfprab
    cat("  Created pra_listing from lsfprab (fallback)\n")
  }

  # Listing year (from age difference: txpl_year - (age_txpl - age_listing))
  if (!"listing_year" %in% names(out) && "txpl_year" %in% names(out) &&
      "age_txpl" %in% names(out) && "age_listing" %in% names(out)) {
    out$listing_year <- as.integer(floor(out$txpl_year - (out$age_txpl - out$age_listing)))
    cat("  Created listing_year from age difference\n")
  } else if (!"listing_year" %in% names(out) && "txpl_year" %in% names(out)) {
    # Fallback: assume listing year is transplant year minus 1
    out$listing_year <- out$txpl_year - 1L
    cat("  Created listing_year (fallback: txpl_year - 1)\n")
  }

  # Return with all columns (no aggressive filtering)
  out

}, error = function(e) {
  # Any failure above (missing file, missing column, read error) lands here
  cat("Error loading raw SAS file:", conditionMessage(e), "\n")
  cat("Falling back to clean_phts()...\n")

  # Fallback to clean_phts if raw loading fails
  clean_phts(
    min_txpl_year = 2010,
    predict_horizon = horizon,
    time = outcome_int_graft_loss,
    status = outcome_graft_loss,
    case = 'snake',
    set_to_na = c("", "unknown", "missing")
  )
})

cat("Base data loaded:", nrow(phts_base), "rows,", ncol(phts_base), "columns\n")
cat("Column names:", paste(head(names(phts_base), 20), collapse = ", "), "...\n")


### Prepare Data

In [None]:
# Diagnostic: print every loaded column, flag expected columns that are absent,
# and swap in the wider saved RDS dataset when the loaded one looks truncated.
cat("All column names (", length(names(phts_base)), "):", paste(names(phts_base), collapse = ", "), "\n")

# Columns the downstream cells rely on (Wisotzkey variables plus outcomes)
expected_cols <- c(
  "prim_dx", "tx_mcsd", "chd_sv", "hxsurg", "txsa_r", "txbun_r",
  "txecmo", "txpl_year", "weight_txpl", "txalt", "bmi_txpl",
  "pra_listing", "egfr_tx", "hxmed", "listing_year", "time", "status"
)
missing_cols <- setdiff(expected_cols, names(phts_base))
if (length(missing_cols) > 0) {
  cat("WARNING: Missing expected columns:", paste(missing_cols, collapse = ", "), "\n")
}

# A saved RDS copy may carry more columns than the freshly loaded data.
# It is only consulted when fewer than 50 columns were loaded, and only
# adopted when it is strictly wider.
rds_path_full <- here("graft-loss-parallel-processing", "model_data", "phts_all.rds")
if (file.exists(rds_path_full) && ncol(phts_base) < 50) {
  cat("WARNING: Only", ncol(phts_base), "columns detected. Checking if full dataset exists...\n")
  cat("Checking:", rds_path_full, "\n")

  phts_full <- readRDS(rds_path_full)
  cat("Full dataset found:", nrow(phts_full), "rows,", ncol(phts_full), "columns\n")

  if (ncol(phts_full) > ncol(phts_base)) {
    cat("Using full dataset instead (", ncol(phts_full), "columns vs", ncol(phts_base), ")\n")
    phts_base <- phts_full
  }
}


In [None]:
# Prepare data for modeling (use ALL variables, matching original study workflow)
# Original study: Feature selection from ALL variables → Top 20 → Then identify Wisotzkey variables
#
# Steps, in order:
#   1. Locate the time/status columns (several naming conventions) and rename
#      them to `time` / `status`.
#   2. Drop identifier and post-event (leakage) columns.
#   3. Keep only rows with non-missing time/status, time > 0, status in {0, 1}.
#   4. Impute numeric NAs with the column median (0 if no finite median).
#   5. Replace factor NAs with an explicit "Missing" level.
#   6. Drop zero-variance columns (never time, status or ID).
#
# Args:
#   data: data frame (or tibble) with one row per patient.
# Returns:
#   The cleaned data frame, including `time`, `status` and (if present) `ID`.
# Raises:
#   An error when no time or status column can be found.
prepare_modeling_data <- function(data) {
  # First candidate name that exists in `data`, or NULL when none does
  first_present <- function(candidates) {
    hit <- candidates[candidates %in% names(data)]
    if (length(hit) > 0) hit[1] else NULL
  }

  # Find time and status columns (handle different naming conventions)
  time_col <- first_present(
    c("time", "outcome_int_graft_loss", "int_graft_loss", "ev_time")
  )
  status_col <- first_present(
    c("status", "outcome_graft_loss", "graft_loss", "ev_type", "outcome")
  )

  if (is.null(time_col) || is.null(status_col)) {
    stop("Cannot find time/status columns. Available columns: ",
         paste(names(data), collapse = ", "))
  }

  # Rename to standard names (only if different). This happens BEFORE the
  # leakage exclusion so that a status column called "graft_loss" is kept.
  if (time_col != "time") {
    names(data)[names(data) == time_col] <- "time"
  }
  if (status_col != "status") {
    names(data)[names(data) == status_col] <- "status"
  }

  # ============================================================================
  # DATA LEAKAGE PREVENTION: Excluded Variables
  # ============================================================================
  # The variables below hold information that would not be available at
  # prediction time and would artificially inflate model performance:
  #   - identifiers (ptid_e)
  #   - outcome/event indicators and death intervals
  #   - cause-of-death, age-at-death and post-mortem pathology variables
  #   - donor/transplant post-event and complication variables
  #   - every column starting with "dtx_", "cc_", "dcon", "dpri", "dsec",
  #     "dmaj" or "sd"
  # ============================================================================

  # Variables to exclude by exact name
  exclude_exact <- c(
    "ptid_e",  # Patient ID
    # Exclude obvious outcome/leakage variables (matching original repository)
    "int_dead", "int_death", "graft_loss", "txgloss", "death", "event",
    # Exclude post-event cause of death and age at death (target leakage)
    "dpricaus",  # Primary cause of death (post-event)
    "age_death", # Age at death (post-event)
    # Additional cause of death / death-related variables (post-event)
    "deathspc",  # Death specific cause (post-event)
    "concod",    # Cause of death (post-event)
    # Patient support/status variables (post-event)
    "patsupp",  # Death: VAD/ECMO (post-event)
    "pmorexam",  # Death: Post Mortem Exam (post-event)
    "papooth",  # Death: Cardiac pathology: Other (post-event)
    "pacuref",  # Death: Cardiac pathology: Acute rejection (post-event)
    "pishltgr",  # Death: ACR grading (post-event)
    # Death: Cardiac pathology variables (post-event - autopsy/post-mortem findings)
    "pathero",  # Death: Cardiac pathology: Graftatherosclerosis (post-event)
    "pcadrec",  # Death: Cardiac pathology: CAD, recent infarction (post-event)
    "pcadrem",  # Death: Cardiac pathology: CAD, remote infarction (post-event)
    "pdiffib",  # Death: Cardiac pathology: Diffuse fibrosis, no acute rej (post-event)
    # Pathology variables (post-event)
    "cpathneg",  # Death: Cardiac pathology: No cardiac pathology found (post-event)
    # Donor complications (post-transplant)
    "dcardiac",  # Donor cardiac complication (post-transplant)
    "dneuro",    # Donor neurological complication (post-transplant)
    "dreject",   # Donor rejection (post-transplant)
    "dsecaccs",  # Donor secondary access (post-transplant)
    "dpriaccs",  # Donor primary access (post-transplant)
    "dconmbld",  # Donor complication major bleeding (post-transplant)
    "dconmal",   # Donor complication malignancy (post-transplant)
    "dconcard",  # Donor complication cardiac (post-transplant)
    "dconneur",  # Donor complication neurological (post-transplant)
    "dconrej",   # Donor complication rejection (post-transplant)
    "dmajbld",   # Donor major bleeding (post-transplant)
    "dmalcanc"   # Donor malignancy/cancer (post-transplant)
  )

  # Variables to exclude by name prefix (post-event variables)
  exclude_prefixes <- c(
    "dtx_",   # Donor/transplant post-event
    "cc_",    # Complication categories (post-event)
    "dcon",   # Donor complications (post-transplant)
    "dpri",   # Donor primary (post-transplant)
    "dsec",   # Donor secondary (post-transplant)
    "dmaj",   # Donor major (post-transplant)
    "sd"      # SD variables (post-event, matching survival_helpers.R)
  )

  # Match each prefix once; the result feeds both the exclusion and the log
  prefix_hits <- lapply(exclude_prefixes, function(prefix) {
    names(data)[startsWith(names(data), prefix)]
  })
  names(prefix_hits) <- exclude_prefixes
  exclude_by_prefix <- unlist(prefix_hits, use.names = FALSE)

  # Combine all exclusions
  exclude_all <- unique(c(exclude_exact, exclude_by_prefix))

  # Log what's being excluded
  if (length(exclude_all) > 0) {
    cat("  Excluding", length(exclude_all), "leakage variables:\n")
    cat("    Exact matches:", length(exclude_exact), "variables\n")
    if (length(exclude_by_prefix) > 0) {
      cat("    Prefix matches:", length(exclude_by_prefix), "variables")
      # Show breakdown by prefix
      for (prefix in exclude_prefixes) {
        prefix_vars <- prefix_hits[[prefix]]
        if (length(prefix_vars) > 0) {
          cat("\n      ", prefix, "*:", length(prefix_vars), "variables (e.g.,",
              paste(head(prefix_vars, 3), collapse = ", "))
          if (length(prefix_vars) > 3) cat(" ...")
          cat(")")
        }
      }
      cat("\n")
    }
  }

  # Remove excluded variables (names that are not present are simply ignored)
  data <- data[, !(names(data) %in% exclude_all), drop = FALSE]

  # Filter out invalid survival data. which() drops NA comparisons, so rows
  # whose condition cannot be evaluated are removed rather than kept as NA rows.
  valid_rows <- !is.na(data[["time"]]) & !is.na(data[["status"]]) &
    data[["time"]] > 0 & data[["status"]] %in% c(0, 1)
  data <- data[which(valid_rows), , drop = FALSE]
  # Plain data frames keep their old row names after subsetting; reset them so
  # the result is numbered 1..n (tibbles carry no row names).
  if (!inherits(data, "tbl_df")) {
    rownames(data) <- NULL
  }

  cat("  Initial variables:", ncol(data), "(including time, status)\n")

  cat("  Applying original study preprocessing (median imputation + remove zero-variance)...\n")

  protected_cols <- c("time", "status", "ID")  # never imputed or dropped

  # 1. Median imputation for numeric columns (matching make_recipe step_impute_median)
  is_num <- vapply(data, is.numeric, logical(1))
  numeric_cols <- setdiff(names(data)[is_num], protected_cols)
  if (length(numeric_cols) > 0) {
    cat("    Imputing", length(numeric_cols), "numeric columns with median\n")
    for (col in numeric_cols) {
      if (anyNA(data[[col]])) {
        median_val <- median(data[[col]], na.rm = TRUE)
        if (!is.finite(median_val)) median_val <- 0  # Fallback if all NA
        data[[col]][is.na(data[[col]])] <- median_val
      }
    }
  }

  # 2. Categorical columns: missing values become an explicit "Missing" level.
  # NOTE(review): make_recipe is described as using step_impute_mode; this
  # function has always written "Missing" instead (the mode was computed but
  # never used). Behaviour is kept; only the log message was corrected.
  is_fct <- vapply(data, is.factor, logical(1))
  factor_cols <- setdiff(names(data)[is_fct], protected_cols)
  if (length(factor_cols) > 0) {
    cat("    Imputing", length(factor_cols), "factor columns with explicit \"Missing\" level\n")
    for (col in factor_cols) {
      values <- data[[col]]
      # Factors without any level (entirely NA) are left alone here and are
      # removed by the zero-variance step below.
      if (anyNA(values) && nlevels(values) > 0) {
        if (!"Missing" %in% levels(values)) {
          values <- factor(values, levels = c(levels(values), "Missing"))
        }
        values[is.na(values)] <- "Missing"
        data[[col]] <- values
      }
    }
  }

  # 3. Remove zero-variance columns (matching make_recipe step_nzv, but simpler: only true zero-variance)
  # Note: make_recipe uses step_nzv with freq_cut=1000, unique_cut=0.025, but we'll just remove true zero-variance
  is_constant <- vapply(data, function(x) {
    n_distinct <- length(unique(na.omit(x)))
    if (is.numeric(x)) {
      var_val <- var(x, na.rm = TRUE)
      is.na(var_val) || var_val == 0 || n_distinct <= 1
    } else {
      n_distinct <= 1
    }
  }, logical(1))
  # Don't remove time, status, or ID
  zero_var_cols <- setdiff(names(data)[is_constant], protected_cols)

  if (length(zero_var_cols) > 0) {
    cat("    Removing", length(zero_var_cols), "zero-variance columns:", paste(head(zero_var_cols, 10), collapse = ", "))
    if (length(zero_var_cols) > 10) cat(" ...")
    cat("\n")
    data <- data[, !(names(data) %in% zero_var_cols), drop = FALSE]
  }

  # ID is bookkeeping, not a predictor, so it is excluded from the count
  n_predictor_vars <- length(setdiff(names(data), protected_cols))

  cat("  Final variables:", ncol(data), "(including time, status)\n")
  cat("  Using", n_predictor_vars, "predictor variables (all available variables after preprocessing)\n")

  data
}


#### Add Wisotzkey Variables

In [None]:
# Define Wisotzkey variables (15 core variables from original study)
# Canonical (cleaned, snake_case) column names. The helper functions below
# read this vector as a global and also accept alternative names for
# "pra_listing" and "tx_mcsd".
wisotzkey_variables <- c(
  "prim_dx",           # Primary Etiology
  "tx_mcsd",           # MCSD at Transplant (with underscore - derived column!)
  "chd_sv",            # Single Ventricle CHD
  "hxsurg",            # Surgeries Prior to Listing
  "txsa_r",            # Serum Albumin at Transplant
  "txbun_r",           # BUN at Transplant
  "txecmo",            # ECMO at Transplant
  "txpl_year",         # Transplant Year
  "weight_txpl",       # Recipient Weight at Transplant
  "txalt",             # ALT at Transplant (cleaned name, not txalt_r)
  "bmi_txpl",          # BMI at Transplant
  "pra_listing",       # PRA at Listing (or lsfprat/lsfprab)
  "egfr_tx",           # eGFR at Transplant
  "hxmed",             # Medical History at Listing
  "listing_year"       # Listing Year
)

# Reduce `data` to the outcome/ID columns plus the Wisotzkey variables that are
# available, then drop rows with invalid survival data.
#
# Each entry of the global `wisotzkey_variables` is looked up under its
# canonical name first and then under its known alternative names; the first
# name found in `data` is the one that is kept.
#
# Args:
#   data: data frame containing `time` and `status` columns.
# Returns:
#   A data frame with the columns time, status, ID, ptid_e (those that exist)
#   and the resolved Wisotzkey columns; rows with missing time/status,
#   time <= 0 or status outside {0, 1} are removed.
subset_wisotzkey_variables <- function(data) {
  # Alternative name mappings
  wisotzkey_alternatives <- list(
    "pra_listing" = c("pra_listing", "lsfprat", "lsfprab"),
    "tx_mcsd"     = c("tx_mcsd", "txmcsd")
  )

  # Resolve every canonical variable to the column that actually carries it
  # (NA when neither the canonical name nor any alternative is present).
  resolved <- vapply(wisotzkey_variables, function(var) {
    candidates <- unique(c(var, wisotzkey_alternatives[[var]]))
    hit <- candidates[candidates %in% names(data)]
    if (length(hit) > 0) hit[1] else NA_character_
  }, character(1), USE.NAMES = FALSE)

  was_found <- !is.na(resolved)
  available_wisotzkey <- resolved[was_found]

  # A variable counts as missing only when no name for it was found. Comparing
  # canonical names against the resolved names would wrongly flag a variable
  # that is present under an alternative name (e.g. lsfprat for pra_listing).
  missing_wisotzkey <- wisotzkey_variables[!was_found]

  if (length(missing_wisotzkey) > 0) {
    cat("  Warning: Missing Wisotzkey variables:",
        paste(missing_wisotzkey, collapse = ", "), "\n")
  }

  cat("  Using", length(available_wisotzkey), "Wisotzkey variables:",
      paste(available_wisotzkey, collapse = ", "), "\n")

  # Keep only time/status/ID plus available Wisotzkey vars
  keep_vars <- c("time", "status", "ID", "ptid_e", available_wisotzkey)
  keep_vars <- intersect(keep_vars, names(data))

  data %>%
    dplyr::select(dplyr::all_of(keep_vars)) %>%
    dplyr::filter(!is.na(time), !is.na(status),
                  time > 0, status %in% c(0, 1))
}

# Report which selected features are Wisotzkey variables.
# Mirrors the original study's order of operations: the top features are
# selected first, and only afterwards matched against the Wisotzkey list.
#
# Args:
#   feature_names: character vector of selected feature names.
# Returns:
#   Character vector with one entry per matched Wisotzkey variable, in the
#   order of the global `wisotzkey_variables`. The entry is the canonical name
#   when present, otherwise the first alternative name found. NULL when
#   nothing matches.
identify_wisotzkey_features <- function(feature_names) {
  # Known alternative spellings, tried in the order listed
  alt_names <- list(
    "pra_listing" = c("pra_listing", "lsfprat", "lsfprab"),  # PRA at listing
    "tx_mcsd" = c("tx_mcsd", "txmcsd")  # MCSD at transplant
  )

  matched <- lapply(wisotzkey_variables, function(canonical) {
    # Canonical name first, then its alternatives (none for most variables)
    candidates <- unique(c(canonical, alt_names[[canonical]]))
    present <- candidates[candidates %in% feature_names]
    if (length(present) == 0) {
      return(NULL)
    }
    present[1]
  })

  # unlist() drops the NULL entries and yields NULL when every entry is NULL
  unlist(matched)
}


In [None]:
# Split the data into the three analysis cohorts by transplant year.
#
# Args:
#   data: data frame, ideally with a `txpl_year` column.
# Returns:
#   A list with elements `original` (2010-2019), `full` (2010 onwards) and
#   `full_no_covid` (2010 onwards without 2020-2023). When `txpl_year` is
#   absent a warning is raised and all three elements are the unfiltered data.
define_time_periods <- function(data) {
  periods <- list()

  if ("txpl_year" %in% names(data)) {
    # Original study period: 2010-2019
    periods$original <- dplyr::filter(data, txpl_year >= 2010, txpl_year <= 2019)

    # Full study: everything from 2010 onwards
    periods$full <- dplyr::filter(data, txpl_year >= 2010)

    # Full study without COVID: drop the 2020-2023 window
    periods$full_no_covid <- dplyr::filter(
      data,
      txpl_year >= 2010,
      txpl_year < 2020 | txpl_year > 2023
    )
  } else {
    warning("txpl_year not found - using all data for all periods")
    for (label in c("original", "full", "full_no_covid")) {
      periods[[label]] <- data
    }
  }

  return(periods)
}

# NOTE: prepare_modeling_data() is already defined in cell 7 above
# This duplicate definition is removed to avoid conflicts


### C Index Calculations - Helper Functions

In [None]:
# Helper function to calculate C-index
#
# Computes two concordance measures from observed survival data and a risk
# score (higher score = higher predicted risk):
#   * cindex_ti: Harrell's time-independent C. A pair (i, j) is comparable when
#     i had an event and time[i] < time[j].
#   * cindex_td: concordance at `horizon`. Cases are subjects with an event at
#     or before the horizon; controls are subjects still under observation
#     after it. Subjects censored before the horizon are in neither group.
# Tied risk scores count as half-concordant. Both values are "orientation-safe":
# max(C, 1 - C) is returned, so the result is never below 0.5 and an inverted
# score is indistinguishable from a correctly oriented one.
#
# Args:
#   time:        numeric follow-up times.
#   status:      event indicator (1 = event).
#   risk_scores: numeric risk scores, one per subject.
#   horizon:     optional single positive number; when NULL (or not a single
#                finite positive value) cindex_td is NA.
# Returns:
#   list(cindex_td = <numeric>, cindex_ti = <numeric>). Both are NA when fewer
#   than 10 valid rows or no events remain; both are 0.5 when all risk scores
#   are identical.
calculate_cindex <- function(time, status, risk_scores, horizon = NULL) {
  n_total <- length(time)

  # Remove missing / invalid
  valid_idx <- !is.na(time) & !is.na(status) & !is.na(risk_scores) &
               is.finite(time) & is.finite(risk_scores) & time > 0

  time   <- as.numeric(time[valid_idx])
  status <- as.numeric(status[valid_idx])
  risk   <- as.numeric(risk_scores[valid_idx])

  n <- length(time)
  events <- sum(status == 1)
  # "n" is the number of rows supplied, "valid" the number kept after filtering
  cat("  [cindex] n =", n_total,
      " valid =", n,
      " events =", events)

  if (!is.null(horizon)) {
    cat(", horizon =", horizon)
  }
  cat("\n")

  if (n < 10 || events < 1) {
    return(list(cindex_td = NA_real_, cindex_ti = NA_real_))
  }
  if (length(unique(risk)) == 1) {
    return(list(cindex_td = 0.5, cindex_ti = 0.5))
  }

  # Tally concordant / discordant / tied pairs between one subject's risk and
  # a vector of comparison risks
  tally <- function(own_risk, other_risk) {
    c(
      conc = sum(own_risk > other_risk),
      disc = sum(own_risk < other_risk),
      ties = sum(own_risk == other_risk)
    )
  }

  # Turn pair counts into an orientation-safe concordance (NA with no pairs)
  concordance <- function(counts) {
    denom <- sum(counts)
    if (denom == 0) {
      return(NA_real_)
    }
    c_raw <- (counts[["conc"]] + 0.5 * counts[["ties"]]) / denom
    max(c_raw, 1 - c_raw)  # Orientation-safe
  }

  # Always calculate time-independent Harrell's C-index.
  # For each event i, every subject with a strictly later time is comparable;
  # the inner comparison is vectorised, the counts match a pairwise loop.
  counts_ti <- c(conc = 0, disc = 0, ties = 0)
  for (i in which(status == 1)) {
    counts_ti <- counts_ti + tally(risk[i], risk[time > time[i]])
  }
  cindex_ti <- concordance(counts_ti)

  # Calculate time-dependent C-index if a usable horizon is provided
  cindex_td <- NA_real_
  if (!is.null(horizon) && length(horizon) == 1 &&
      is.finite(horizon) && horizon > 0) {
    # Cases: event at or before the horizon
    event_before_horizon <- (status == 1) & (time <= horizon)

    # Controls: still under observation after the horizon, regardless of
    # eventual status. Subjects censored before the horizon are excluded
    # because it is unknown whether they would have had an event by then.
    at_risk_at_horizon <- (time > horizon)

    n_events_before <- sum(event_before_horizon)
    n_at_risk <- sum(at_risk_at_horizon)

    cat("  [cindex] Events before horizon:", n_events_before,
        ", At risk at horizon:", n_at_risk, "\n")

    if (n_events_before > 0 && n_at_risk > 0) {
      # Higher risk for the case than for the control is concordant
      control_risk <- risk[at_risk_at_horizon]
      counts_td <- c(conc = 0, disc = 0, ties = 0)
      for (i in which(event_before_horizon)) {
        counts_td <- counts_td + tally(risk[i], control_risk)
      }
      cindex_td <- concordance(counts_td)
    } else {
      cat("  [cindex] Warning: Insufficient events for time-dependent C-index\n")
    }
  }

  list(cindex_td = as.numeric(cindex_td), cindex_ti = as.numeric(cindex_ti))
}


# Predict risk at given times from a ranger survival model
#
# Args:
#   object:  fitted survival forest; must expose `unique.death.times` and a
#            predict() method whose result has a `$survival` matrix.
#   newdata: data to predict on.
#   times:   evaluation time(s).
# Returns:
#   Matrix of risks (1 - survival) with one column per requested time.
# Raises:
#   An error, listing why each attempt failed, when predict() does not yield
#   a survival matrix under any of the tried argument names.
ranger_predictrisk <- function(object, newdata, times) {
  # Try several predict() interfaces for ranger, in this order:
  #   1) new_data (sometimes used via tidymodels wrappers)
  #   2) data     (ranger's own argument name)
  #   3) newdata  (legacy / generic convention)
  attempts <- list(
    new_data = function() predict(object, new_data = newdata, type = "response")$survival,
    data     = function() predict(object, data = newdata, type = "response")$survival,
    newdata  = function() predict(object, newdata = newdata, type = "response")$survival
  )

  ptemp <- NULL
  failures <- character(0)  # kept so the final error explains what went wrong
  for (iface in names(attempts)) {
    res <- tryCatch(attempts[[iface]](), error = function(e) e)
    if (inherits(res, "error")) {
      failures <- c(failures, paste0(iface, ": ", conditionMessage(res)))
    } else if (is.null(res)) {
      failures <- c(failures, paste0(iface, ": no $survival in prediction"))
    } else {
      ptemp <- res
      break
    }
  }

  if (is.null(ptemp)) {
    stop(
      "Could not call predict() on ranger object with any known interface",
      if (length(failures) > 0) {
        paste0(" (", paste(failures, collapse = "; "), ")")
      }
    )
  }

  # Log times and unique.death.times for debugging
  cat("    [ranger_predictrisk] times parameter:", times, "\n")
  cat("    [ranger_predictrisk] unique.death.times range:",
      paste(range(object$unique.death.times, na.rm=TRUE), collapse=" to "), "\n")

  # Map requested eval time(s) to survival index
  pos <- prodlim::sindex(
    jump.times = object$unique.death.times,
    eval.times = times
  )

  cat("    [ranger_predictrisk] sindex pos:", paste(pos, collapse=", "), "\n")

  # survival matrix is n x T; the prepended column of 1s covers times before
  # the first event (pos == 0), where survival is still 1
  p <- cbind(1, ptemp)[, pos + 1, drop = FALSE]

  cat("    [ranger_predictrisk] Final risk matrix dim:", paste(dim(p), collapse="x"), "\n")

  # Return risk = 1 - survival at specified time(s)
  1 - p
}
                      
# Compute CatBoost C-index using riskRegression::Score
#
# Args:
#   predictions: CatBoost signed-time predictions (higher = longer survival).
#   time:        observed follow-up times.
#   status:      event indicator.
#   horizon:     evaluation time passed to Score() and calculate_cindex().
# Returns:
#   list(cindex_td, cindex_ti). cindex_td is the AUC reported by Score() at the
#   horizon, or the manual calculate_cindex() value when Score() errors;
#   cindex_ti always comes from calculate_cindex().
# Raises:
#   An error when riskRegression or survival is not installed.
catboost_cindex_score <- function(predictions, time, status, horizon) {

  # Both packages are required up front
  for (pkg in c("riskRegression", "survival")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop(pkg, " package not installed.")
    }
  }

  # Higher predicted signed-time means longer survival, i.e. lower risk,
  # so the sign is flipped to obtain a risk score.
  risk <- -as.numeric(predictions)

  # Log CatBoost risk score conversion
  cat("  [CatBoost DEBUG] Risk score conversion:\n")
  cat("    Original predictions range:", paste(range(predictions, na.rm=TRUE), collapse=" to "), "\n")
  cat("    Risk scores range:", paste(range(risk, na.rm=TRUE), collapse=" to "), "\n")
  cat("    Risk scores length:", length(risk), "\n")
  cat("    Risk scores class:", paste(class(risk), collapse=", "), "\n")

  # Outcome frame handed to Score(); column names must match the formula below
  surv_df <- data.frame(
    time   = as.numeric(time),
    status = as.numeric(status)
  )

  # Log before Score() call
  cat("  [CatBoost DEBUG] Before Score() call:\n")
  cat("    score_data rows:", nrow(surv_df), "\n")
  cat("    risk_scores length:", length(risk), "\n")
  risk_mat <- as.matrix(risk)
  cat("    as.matrix() dim:", paste(dim(risk_mat), collapse="x"), "\n")
  cat("    as.matrix() class:", paste(class(risk_mat), collapse=", "), "\n")
  cat("    score_data$time range:", paste(range(surv_df$time, na.rm=TRUE), collapse=" to "), "\n")
  cat("    score_data$status sum:", sum(surv_df$status, na.rm=TRUE), "\n")
  cat("    horizon:", horizon, "\n")

  # Manual concordance: supplies the time-independent value and the fallback
  # for the time-dependent one
  manual <- calculate_cindex(time, status, risk, horizon = horizon)

  # Preferred time-dependent value: AUC from riskRegression::Score
  # (matching original study)
  td_value <- tryCatch({
    score_out <- riskRegression::Score(
      object  = list(CatBoost = risk_mat),  # must be n × 1 matrix
      formula = survival::Surv(time, status) ~ 1,
      data    = surv_df,
      times   = horizon,
      summary = "risks",
      metrics = "auc",
      se.fit  = FALSE
    )

    auc_scores <- score_out$AUC$score

    # With several evaluation times, take the row closest to the horizon
    row_idx <- 1L
    if ("times" %in% names(auc_scores)) {
      row_idx <- which.min(abs(auc_scores$times - horizon))
    }

    as.numeric(auc_scores$AUC[row_idx])

  }, error = function(e) {
    # Fallback to calculate_cindex if Score() fails
    cat("  Warning: Score() failed, using manual calculate_cindex():", e$message, "\n")
    manual$cindex_td
  })

  return(list(cindex_td = td_value, cindex_ti = manual$cindex_ti))
}


### Random Survival Forest (RSF) Feature Importance - Helper Function

In [None]:
# Select the top `n_predictors` features with a ranger random survival forest.
#
# Fits an extratrees survival forest on the recipe-processed predictors, ranks
# the features by permutation importance, and reports the apparent
# (training-data) discrimination of the forest at `horizon`.
#
# Args:
#   data:         data frame with `time` and `status` columns plus predictors.
#                 Rows with missing/non-positive time or status outside {0, 1}
#                 are dropped before fitting.
#   n_predictors: number of top-ranked features to keep.
#   n_trees:      number of trees passed to ranger::ranger().
#   horizon:      prediction horizon, in the same units as `data$time`.
#
# Returns:
#   A tibble with columns `feature`, `importance`, `cindex_td`, `cindex_ti`.
#   The two C-index columns repeat a single scalar on every row and are NA when
#   prediction or scoring failed.
#
# Requires make_recipe(), ranger_predictrisk() and calculate_cindex() to be
# defined in the calling session; stops with an error otherwise.
select_features_rsf <- function(data,
                                n_predictors = 20,
                                n_trees      = 500,
                                horizon      = 1) {
  cat("  Running RSF feature selection (permutation importance)...\n")
  cat("  [RSF] Horizon parameter:", horizon, "\n")

  if (!requireNamespace("survival", quietly = TRUE)) {
    stop("The 'survival' package is required for RSF.")
  }
  if (!requireNamespace("prodlim", quietly = TRUE)) {
    stop("The 'prodlim' package is required for RSF risk predictions.")
  }
  if (!requireNamespace("riskRegression", quietly = TRUE)) {
    # Not fatal: the Score() call below is wrapped in tryCatch() and falls back
    # to the manual time-dependent value from calculate_cindex().
    cat("  [RSF] 'riskRegression' not available; will skip Score() and use manual C-index only.\n")
  }
  if (!exists("ranger_predictrisk", mode = "function")) {
    stop("ranger_predictrisk() must be defined before calling select_features_rsf().")
  }
  if (!exists("calculate_cindex", mode = "function")) {
    stop("calculate_cindex() must be defined before calling select_features_rsf().")
  }

  # Outcome as supplied by the caller (used for logging and to build rsf_data)
  time_vec   <- data$time
  status_vec <- data$status

  cat("  [RSF] Time range:",
      paste(range(time_vec, na.rm = TRUE), collapse = " to "), "\n")
  cat("  [RSF] Time units check: If time is in years, horizon should be ~1; if in days, ~365.\n")
  cat("  [RSF] Status table:\n")
  print(table(status_vec, useNA = "ifany"))

  # Prepare predictors (no ID / time / status)
  recipe_prep <- make_recipe(data, dummy_code = FALSE) %>% recipes::prep()

  prepared_data <- recipes::juice(recipe_prep) %>%
    dplyr::select(-dplyr::any_of(c("ID", "ptid_e", "time", "status")))

  # Align rows; seq_len() stays empty when there are zero rows (1:0 would not)
  n_rows  <- min(nrow(prepared_data), length(time_vec), length(status_vec))
  row_idx <- seq_len(n_rows)

  rsf_data <- prepared_data[row_idx, , drop = FALSE]
  rsf_data$time   <- as.numeric(time_vec[row_idx])
  rsf_data$status <- as.integer(status_vec[row_idx])

  # Filter invalid survival data
  valid_rows <- !is.na(rsf_data$time) & !is.na(rsf_data$status) &
    rsf_data$time > 0 & rsf_data$status %in% c(0L, 1L)
  rsf_data <- rsf_data[valid_rows, , drop = FALSE]

  if (nrow(rsf_data) < 10) {
    stop("Not enough valid rows for RSF after filtering")
  }

  cat("  RSF data dims:", nrow(rsf_data), "x", ncol(rsf_data), "\n")
  cat("  RSF time range:", range(rsf_data$time, na.rm = TRUE), "\n")
  cat("  RSF status table:\n")
  print(table(rsf_data$status, useNA = "ifany"))

  # Fit RSF
  rsf_model <- ranger::ranger(
    survival::Surv(time, status) ~ .,
    data              = rsf_data,
    num.trees         = n_trees,
    importance        = "permutation",
    min.node.size     = 20,
    splitrule         = "extratrees",
    num.random.splits = 10
  )

  # 1) Risk predictions at horizon using ranger_predictrisk
  cat("  [RSF DEBUG] Calling ranger_predictrisk with horizon =", horizon, "\n")
  rsf_predictions <- tryCatch({
    risk_pred <- ranger_predictrisk(
      object  = rsf_model,
      newdata = rsf_data,
      times   = horizon
    )

    cat("  [RSF DEBUG] Raw prediction object:\n")
    cat("    Class:", paste(class(risk_pred), collapse = ", "), "\n")
    cat("    Type :", typeof(risk_pred), "\n")
    if (is.matrix(risk_pred)) {
      cat("    Dim  :", paste(dim(risk_pred), collapse = " x "), "\n")
      cat("    Head :", paste(round(head(risk_pred[, 1], 5), 4), collapse = ", "), "\n")
      cat("    Range:", paste(range(risk_pred[, 1], na.rm = TRUE), collapse = " to "), "\n")
      pred_vec <- as.numeric(risk_pred[, 1])
    } else {
      cat("    Length:", length(risk_pred), "\n")
      cat("    Head  :", paste(round(head(risk_pred, 5), 4), collapse = ", "), "\n")
      cat("    Range :", paste(range(risk_pred, na.rm = TRUE), collapse = " to "), "\n")
      pred_vec <- as.numeric(risk_pred)
    }

    cat("  [RSF DEBUG] After extraction:\n")
    cat("    Class :", paste(class(pred_vec), collapse = ", "), "\n")
    cat("    Length:", length(pred_vec), "\n")
    cat("    Any NA:", any(is.na(pred_vec)), "(", sum(is.na(pred_vec)), ")\n")
    cat("    Any Inf:", any(is.infinite(pred_vec)), "(", sum(is.infinite(pred_vec)), ")\n")
    cat("    Range :", paste(range(pred_vec, na.rm = TRUE), collapse = " to "), "\n")

    pred_vec
  }, error = function(e) {
    cat("  Warning: RSF risk prediction failed:", e$message, "\n")
    NULL
  })

  # 2) C-index: manual + riskRegression::Score()
  cindex_td <- NA_real_
  cindex_ti <- NA_real_

  if (!is.null(rsf_predictions) && length(rsf_predictions) > 0) {
    # Predictions were produced from rsf_data, i.e. AFTER invalid rows were
    # dropped, so the outcome must be read from rsf_data too. Indexing the
    # unfiltered input vectors here would pair predictions with the wrong
    # subjects as soon as a single row had been filtered out.
    n_use   <- min(length(rsf_predictions), nrow(rsf_data))
    use_idx <- seq_len(n_use)

    rsf_time_vec   <- rsf_data$time[use_idx]
    rsf_status_vec <- rsf_data$status[use_idx]
    rsf_pred_vec   <- rsf_predictions[use_idx]

    valid_idx <- !is.na(rsf_time_vec) & !is.na(rsf_status_vec) &
      !is.na(rsf_pred_vec) &
      is.finite(rsf_time_vec) & is.finite(rsf_pred_vec) &
      rsf_time_vec > 0

    if (sum(valid_idx) < 10) {
      cat("  Warning: Too few valid observations for C-index\n")
    } else {
      score_data <- data.frame(
        time   = as.numeric(rsf_time_vec[valid_idx]),
        status = as.integer(rsf_status_vec[valid_idx]),
        row.names = NULL
      )
      rsf_predictions_clean <- rsf_pred_vec[valid_idx]

      # Log what we're passing to calculate_cindex / Score()
      cat("  [RSF DEBUG] Before C-index computation:\n")
      cat("    score_data rows:", nrow(score_data), "\n")
      cat("    predictions length:", length(rsf_predictions_clean), "\n")
      cat("    predictions class:", paste(class(rsf_predictions_clean), collapse=", "), "\n")
      # Score() is given the risks as a one-column matrix
      pred_matrix <- as.matrix(rsf_predictions_clean)
      cat("    as.matrix() dim:", paste(dim(pred_matrix), collapse="x"), "\n")
      cat("    as.matrix() class:", paste(class(pred_matrix), collapse=", "), "\n")
      cat("    score_data$time range:", paste(range(score_data$time, na.rm=TRUE), collapse=" to "), "\n")
      cat("    score_data$status sum:", sum(score_data$status, na.rm=TRUE), "\n")
      cat("    horizon:", horizon, "\n")

      # Manual C-index (time-independent + our time-dependent surrogate)
      cindex_result <- calculate_cindex(
        time    = score_data$time,
        status  = score_data$status,
        risk    = rsf_predictions_clean,
        horizon = horizon
      )
      cindex_ti <- cindex_result$cindex_ti

      # Try riskRegression::Score for time-dependent C-index
      cindex_td <- tryCatch({
        evaluation <- riskRegression::Score(
          object  = list(RSF = pred_matrix),
          formula = survival::Surv(time, status) ~ 1,
          data    = score_data,
          times   = horizon,
          summary = "risks",
          metrics = "auc",
          se.fit  = FALSE
        )

        auc_tab <- evaluation$AUC$score
        # If several time points come back, use the one closest to horizon
        if ("times" %in% names(auc_tab)) {
          this_row <- which.min(abs(auc_tab$times - horizon))
        } else {
          this_row <- 1L
        }
        as.numeric(auc_tab$AUC[this_row])
      }, error = function(e) {
        cat("  Warning: Score() failed, using manual calculate_cindex():", e$message, "\n")
        cindex_result$cindex_td
      })

      # Logging C-index values
      if (is.na(cindex_td)) {
        cat("  Warning: RSF time-dependent C-index is NA\n")
        cat("    Time range:", paste(range(score_data$time, na.rm = TRUE), collapse = " to "), "\n")
        cat("    Status sum:", sum(score_data$status, na.rm = TRUE), "\n")
        cat("    Prediction range:", paste(range(rsf_predictions_clean, na.rm = TRUE), collapse = " to "), "\n")
      } else {
        cat("  RSF time-dependent C-index:", round(cindex_td, 4), "\n")
      }

      if (!is.na(cindex_ti)) {
        cat("  RSF time-independent C-index:", round(cindex_ti, 4), "\n")
      }
    }
  } else {
    cat("  Warning: RSF predictions are NULL or empty\n")
  }

  # 3) Feature importance table (top n_predictors by permutation importance;
  #    slice_head() simply returns every row when fewer are available)
  importance_df <- tibble::enframe(rsf_model$variable.importance) %>%
    dplyr::arrange(dplyr::desc(value)) %>%
    dplyr::slice_head(n = n_predictors) %>%
    dplyr::rename(feature = name, importance = value)

  # Attach C-index values (same for all rows, convenient for downstream inspection)
  importance_df$cindex_td <- cindex_td
  importance_df$cindex_ti <- cindex_ti

  cat("  RSF selected", nrow(importance_df), "features\n")

  return(importance_df)
}

### CatBoost Feature Importance - Helper Function

In [None]:
# Select the top `n_predictors` features with a CatBoost regression model.
#
# The survival outcome is encoded as a "signed time" label (+time for events,
# -time for censored rows) and fitted with an RMSE loss; features are then
# ranked by CatBoost's "FeatureImportance" scores. Apparent discrimination is
# computed by catboost_cindex_score() on the training rows.
#
# Args:
#   data:         data frame with `time` and `status` columns plus predictors.
#                 Rows with missing/non-positive time or status outside {0, 1}
#                 are dropped before fitting (same rule as the RSF and AORSF
#                 selectors in this file).
#   n_predictors: number of top-ranked features to keep.
#   iterations:   number of boosting iterations.
#   horizon:      horizon forwarded to catboost_cindex_score().
#
# Returns:
#   A data frame with columns `feature`, `importance`, `cindex_td`,
#   `cindex_ti`, or NULL when catboost is unavailable, no valid rows remain,
#   or pool creation / training / importance extraction fails.
#
# Requires make_recipe() and catboost_cindex_score() to be defined in the
# calling session.
select_features_catboost <- function(data,
                                     n_predictors = 20,
                                     iterations   = 2000,
                                     horizon      = 1) {
  cat("  Running CatBoost feature importance...\n")

  # Check dependencies
  if (!requireNamespace("catboost", quietly = TRUE)) {
    warning("CatBoost package not available - skipping CatBoost feature importance")
    return(NULL)
  }
  if (!requireNamespace("riskRegression", quietly = TRUE)) {
    stop("The 'riskRegression' package is required for CatBoost C-index but is not installed.")
  }
  if (!requireNamespace("survival", quietly = TRUE)) {
    stop("The 'survival' package is required for CatBoost C-index but is not installed.")
  }
  if (!exists("catboost_cindex_score", mode = "function")) {
    stop("catboost_cindex_score() must be defined before calling select_features_catboost().")
  }

  id_and_outcome_cols <- c("ID", "ptid_e", "time", "status")

  # Create recipe (no dummy coding for CatBoost)
  recipe_prep <- tryCatch({
    make_recipe(data, dummy_code = FALSE) %>% recipes::prep()
  }, error = function(e) {
    cat("  Warning: Recipe preparation failed, using raw data:", e$message, "\n")
    NULL
  })

  if (is.null(recipe_prep)) {
    # Fallback: raw features with ID / outcome columns removed
    prepared_data <- data %>%
      dplyr::select(-dplyr::any_of(id_and_outcome_cols))
  } else {
    prepared_data <- recipes::juice(recipe_prep) %>%
      # drop IDs and outcome columns
      dplyr::select(-dplyr::any_of(id_and_outcome_cols))
  }

  # Ensure we have matching rows; seq_len() stays empty for zero rows
  n_rows  <- min(nrow(prepared_data), nrow(data))
  row_idx <- seq_len(n_rows)
  prepared_data <- prepared_data[row_idx, , drop = FALSE]
  time_vec   <- data$time[row_idx]
  status_vec <- data$status[row_idx]

  # Drop rows that cannot be encoded as a signed-time label: NA time/status
  # would put NA into the label, and time == 0 loses the sign that separates
  # events from censored rows (-0 == 0).
  valid_rows <- !is.na(time_vec) & !is.na(status_vec) &
    time_vec > 0 & status_vec %in% c(0, 1)
  if (!all(valid_rows)) {
    cat("  [CatBoost] Dropping", sum(!valid_rows),
        "rows with invalid time/status\n")
    prepared_data <- prepared_data[valid_rows, , drop = FALSE]
    time_vec   <- time_vec[valid_rows]
    status_vec <- status_vec[valid_rows]
  }
  if (length(time_vec) == 0) {
    cat("  Error: no valid rows for CatBoost after filtering\n")
    return(NULL)
  }

  # Prepare time and status for CatBoost (signed-time label)
  # +time for events, -time for censored
  signed_time <- ifelse(status_vec == 1, time_vec, -time_vec)

  # Quick log
  cat("  [CatBoost] Time range:",
      paste(range(time_vec, na.rm = TRUE), collapse = " to "), "\n")
  cat("  [CatBoost] Status table:\n")
  print(table(status_vec, useNA = "ifany"))

  # Convert character columns to factors for CatBoost
  prepared_data <- prepared_data %>%
    dplyr::mutate(dplyr::across(where(is.character), as.factor))

  # Create CatBoost pool from the data frame (character columns were turned
  # into factors above)
  train_pool <- tryCatch({
    catboost::catboost.load_pool(
      data  = prepared_data,
      label = signed_time
    )
  }, error = function(e) {
    cat("  Error creating CatBoost pool:", e$message, "\n")
    NULL
  })

  if (is.null(train_pool)) {
    return(NULL)
  }

  # Train CatBoost model
  catboost_params <- list(
    loss_function  = "RMSE",  # Using signed-time as regression proxy
    depth          = 6,
    learning_rate  = 0.05,
    iterations     = iterations,
    l2_leaf_reg    = 3.0,
    random_seed    = 42,
    verbose        = 0
  )

  catboost_model <- tryCatch({
    catboost::catboost.train(train_pool, params = catboost_params)
  }, error = function(e) {
    cat("  Error training CatBoost model:", e$message, "\n")
    NULL
  })

  if (is.null(catboost_model)) {
    return(NULL)
  }

  # Get predictions for C-index calculation (signed-time)
  catboost_predictions <- tryCatch({
    catboost::catboost.predict(catboost_model, train_pool)
  }, error = function(e) {
    cat("  Warning: Could not get CatBoost predictions:", e$message, "\n")
    NULL
  })

  # Calculate C-index using original study method (riskRegression::Score, wrapped in catboost_cindex_score)
  cindex_td <- NA_real_
  cindex_ti <- NA_real_

  if (!is.null(catboost_predictions)) {
    # time_vec / status_vec are the filtered vectors, so they line up with the
    # rows of train_pool that the predictions were made for.
    cindex_result <- catboost_cindex_score(
      predictions = catboost_predictions,  # signed-time
      time        = time_vec,
      status      = status_vec,
      horizon     = horizon
    )

    cindex_td <- cindex_result$cindex_td
    cindex_ti <- cindex_result$cindex_ti

    if (is.na(cindex_td)) {
      cat("  Warning: CatBoost time-dependent C-index calculation returned NA\n")
      cat("    Time range:", paste(range(time_vec, na.rm = TRUE), collapse = " to "), "\n")
      cat("    Status sum:", sum(status_vec, na.rm = TRUE), "\n")
    } else {
      cat("  CatBoost time-dependent C-index:", round(cindex_td, 4), "\n")
    }

    if (!is.na(cindex_ti)) {
      cat("  CatBoost time-independent C-index:", round(cindex_ti, 4), "\n")
    }
  } else {
    cat("  Warning: CatBoost predictions are NULL; skipping C-index\n")
  }

  # Extract feature importance
  importance_raw <- tryCatch({
    catboost::catboost.get_feature_importance(
      catboost_model,
      pool = train_pool,
      type = "FeatureImportance"
    )
  }, error = function(e) {
    cat("  Error extracting CatBoost importance:", e$message, "\n")
    NULL
  })

  if (is.null(importance_raw)) {
    return(NULL)
  }

  # Create importance data frame; importance values are paired with the pool's
  # column names by position. slice_head() returns every row when fewer than
  # n_predictors are available.
  importance_df <- data.frame(
    feature    = names(prepared_data),
    importance = as.numeric(importance_raw),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::arrange(dplyr::desc(importance)) %>%
    dplyr::slice_head(n = n_predictors)

  # Add C-index columns (scalar, same for all rows)
  importance_df$cindex_td <- cindex_td
  importance_df$cindex_ti <- cindex_ti

  cat("  CatBoost selected", nrow(importance_df), "features\n")

  return(importance_df)
}


### AORSF Model

In [None]:
# AORSF Feature Importance
#
# Select the top `n_predictors` features with an accelerated oblique random
# survival forest (aorsf::orsf) and negation variable importance.
#
# Args:
#   data:         data frame with `time` and `status` columns plus predictors.
#                 Rows with missing/non-positive time or status outside {0, 1}
#                 are dropped before fitting.
#   n_predictors: number of top-ranked features to keep.
#   n_trees:      number of trees passed to aorsf::orsf().
#   horizon:      prediction horizon, in the same units as `data$time`.
#
# Returns:
#   A tibble with columns `feature`, `importance`, `cindex_td`, `cindex_ti`
#   (C-index columns repeat one scalar per row; NA when scoring failed), or
#   NULL when aorsf is unavailable or fitting / importance extraction fails.
#
# Requires make_recipe() and calculate_cindex() to be defined in the calling
# session.
select_features_aorsf <- function(data, n_predictors = 20, n_trees = 100, horizon = 1) {
  cat("  Running AORSF feature importance...\n")

  # Check if AORSF is available (self-contained)
  if (!requireNamespace("aorsf", quietly = TRUE)) {
    warning("aorsf package not available - skipping AORSF feature importance")
    return(NULL)
  }

  outcome_cols        <- c("time", "status")
  id_and_outcome_cols <- c("ID", "ptid_e", outcome_cols)

  # Extract time and status before processing
  time_vec   <- data$time
  status_vec <- data$status

  # Use same preprocessing as RSF and CatBoost: make_recipe with dummy_code = FALSE
  # This ensures feature names are consistent across all methods
  recipe_prep <- tryCatch({
    make_recipe(data, dummy_code = FALSE) %>% recipes::prep()
  }, error = function(e) {
    cat("  Warning: Recipe preparation failed, using raw data:", e$message, "\n")
    NULL
  })

  if (is.null(recipe_prep)) {
    # Fallback: raw features with ID / outcome columns removed
    prepared_data <- data %>%
      dplyr::select(-dplyr::any_of(id_and_outcome_cols))
  } else {
    # Use juice() to get processed data (matching RSF/CatBoost)
    prepared_data <- recipes::juice(recipe_prep) %>%
      dplyr::select(-dplyr::any_of(id_and_outcome_cols))
  }

  # Ensure we have matching rows; seq_len() stays empty for zero rows
  n_rows  <- min(nrow(prepared_data), length(time_vec), length(status_vec))
  row_idx <- seq_len(n_rows)

  # Create AORSF data with time and status properly added (matching RSF structure)
  aorsf_data <- prepared_data[row_idx, , drop = FALSE]
  aorsf_data$time   <- as.numeric(time_vec[row_idx])
  aorsf_data$status <- as.integer(status_vec[row_idx])

  # Remove any rows with invalid survival data
  valid_rows <- !is.na(aorsf_data$time) & !is.na(aorsf_data$status) &
    aorsf_data$time > 0 & aorsf_data$status %in% c(0, 1)
  aorsf_data <- aorsf_data[valid_rows, , drop = FALSE]

  if (nrow(aorsf_data) < 10) {
    stop("Not enough valid rows for AORSF after filtering")
  }

  # Remove constant predictors. Only predictors are screened: the outcome
  # columns must stay in place for the Surv(time, status) formula below.
  predictor_cols <- setdiff(names(aorsf_data), outcome_cols)
  is_constant <- vapply(
    aorsf_data[predictor_cols],
    function(x) length(unique(na.omit(x))) == 1,
    logical(1)
  )
  constant_cols <- predictor_cols[is_constant]

  if (length(constant_cols) > 0) {
    cat("  Removing constant columns:", paste(constant_cols, collapse = ", "), "\n")
    aorsf_data <- aorsf_data %>% dplyr::select(-dplyr::all_of(constant_cols))
  }

  # Convert character columns to factors for AORSF
  aorsf_data <- aorsf_data %>%
    dplyr::mutate(dplyr::across(where(is.character), as.factor))

  # Fit AORSF model
  aorsf_model <- tryCatch({
    set.seed(42)
    aorsf::orsf(
      data = aorsf_data,
      formula = Surv(time, status) ~ .,
      n_tree = n_trees,
      na_action = 'impute_meanmode'
    )
  }, error = function(e) {
    cat("  Error fitting AORSF model:", e$message, "\n")
    NULL
  })

  if (is.null(aorsf_model)) {
    return(NULL)
  }

  # Get predictions for C-index calculation
  aorsf_predictions <- tryCatch({
    risk_pred <- predict(aorsf_model, new_data = aorsf_data, pred_type = 'risk', pred_horizon = horizon)

    # Log AORSF prediction structure and flatten to a plain numeric vector
    cat("  [AORSF DEBUG] Raw prediction object:\n")
    cat("    Class:", paste(class(risk_pred), collapse=", "), "\n")
    cat("    Type:", typeof(risk_pred), "\n")
    if (is.matrix(risk_pred)) {
      cat("    Dimensions:", paste(dim(risk_pred), collapse="x"), "\n")
      cat("    First few values:", paste(head(risk_pred[, 1], 5), collapse=", "), "\n")
      cat("    Range:", paste(range(risk_pred[, 1], na.rm=TRUE), collapse=" to "), "\n")
      pred_vec <- as.numeric(risk_pred[, 1])
    } else {
      cat("    Length:", length(risk_pred), "\n")
      cat("    First few values:", paste(head(risk_pred, 5), collapse=", "), "\n")
      cat("    Range:", paste(range(risk_pred, na.rm=TRUE), collapse=" to "), "\n")
      pred_vec <- as.numeric(risk_pred)
    }

    # Log extracted vector
    cat("  [AORSF DEBUG] After extraction:\n")
    cat("    Class:", paste(class(pred_vec), collapse=", "), "\n")
    cat("    Length:", length(pred_vec), "\n")
    cat("    Any NA:", any(is.na(pred_vec)), "(", sum(is.na(pred_vec)), ")\n")
    cat("    Any Inf:", any(is.infinite(pred_vec)), "(", sum(is.infinite(pred_vec)), ")\n")
    cat("    Range:", paste(range(pred_vec, na.rm=TRUE), collapse=" to "), "\n")

    pred_vec
  }, error = function(e) {
    cat("  Warning: AORSF risk prediction failed:", e$message, "\n")
    NULL
  })

  # Calculate C-index using riskRegression::Score (consistent with RSF and CatBoost)
  cindex_td <- NA_real_
  cindex_ti <- NA_real_
  if (!is.null(aorsf_predictions)) {
    # Predictions come from aorsf_data, so the outcome is read from the same
    # (already filtered) data frame to keep rows aligned.
    n_use   <- min(length(aorsf_predictions), nrow(aorsf_data))
    use_idx <- seq_len(n_use)

    aorsf_time_vec   <- aorsf_data$time[use_idx]
    aorsf_status_vec <- aorsf_data$status[use_idx]
    aorsf_pred_vec   <- aorsf_predictions[use_idx]

    # Ensure no missing values
    valid_idx <- !is.na(aorsf_time_vec) & !is.na(aorsf_status_vec) &
      !is.na(aorsf_pred_vec) &
      is.finite(aorsf_time_vec) & is.finite(aorsf_pred_vec) &
      aorsf_time_vec > 0

    if (sum(valid_idx) < 10) {
      cat("  Warning: Too few valid observations for C-index\n")
    } else {
      # Build a clean scoring dataset
      score_data <- data.frame(
        time   = as.numeric(aorsf_time_vec[valid_idx]),
        status = as.integer(aorsf_status_vec[valid_idx]),
        row.names = NULL
      )
      aorsf_predictions_clean <- aorsf_pred_vec[valid_idx]

      # Log what we're passing to Score()
      cat("  [AORSF DEBUG] Before Score() call:\n")
      cat("    score_data rows:", nrow(score_data), "\n")
      cat("    predictions length:", length(aorsf_predictions_clean), "\n")
      # Score() is given the risks as a one-column matrix
      pred_matrix <- as.matrix(aorsf_predictions_clean)
      cat("    as.matrix() dim:", paste(dim(pred_matrix), collapse="x"), "\n")
      cat("    score_data$time range:", paste(range(score_data$time, na.rm=TRUE), collapse=" to "), "\n")
      cat("    score_data$status sum:", sum(score_data$status, na.rm=TRUE), "\n")
      cat("    horizon:", horizon, "\n")

      # Calculate both time-dependent and time-independent C-index
      cindex_result <- calculate_cindex(score_data$time, score_data$status, aorsf_predictions_clean, horizon = horizon)
      cindex_ti <- cindex_result$cindex_ti

      # Try riskRegression::Score for time-dependent (matching original study)
      cindex_td <- tryCatch({
        evaluation <- riskRegression::Score(
          object  = list(AORSF = pred_matrix),
          formula = survival::Surv(time, status) ~ 1,
          data    = score_data,
          times   = horizon,
          summary = "risks",
          metrics = "auc",
          se.fit  = FALSE
        )

        auc_tab <- evaluation$AUC$score
        # If several time points come back, use the one closest to horizon
        if ("times" %in% names(auc_tab)) {
          this_row <- which.min(abs(auc_tab$times - horizon))
        } else {
          this_row <- 1L
        }
        as.numeric(auc_tab$AUC[this_row])
      }, error = function(e) {
        cat("  Warning: Score() failed, using manual calculate_cindex():", e$message, "\n")
        # Fallback to manual time-dependent C-index calculation
        cindex_result$cindex_td
      })
    }

    if (is.na(cindex_td)) {
      cat("  Warning: AORSF time-dependent C-index is NA\n")
    } else {
      cat("  AORSF time-dependent C-index:", round(cindex_td, 4), "\n")
    }

    if (!is.na(cindex_ti)) {
      cat("  AORSF time-independent C-index:", round(cindex_ti, 4), "\n")
    }
  } else {
    cat("  Warning: AORSF predictions are NULL or empty\n")
  }

  # Extract feature importance using negate method (most common)
  importance_raw <- tryCatch({
    aorsf::orsf_vi_negate(aorsf_model)
  }, error = function(e) {
    cat("  Error extracting AORSF importance:", e$message, "\n")
    NULL
  })

  if (is.null(importance_raw)) {
    return(NULL)
  }

  # Compare the names reported by the model with the predictor columns that
  # were passed in, and log any difference.
  feature_names_from_data <- setdiff(names(aorsf_data), outcome_cols)
  importance_names <- names(importance_raw)
  if (!identical(sort(importance_names), sort(feature_names_from_data))) {
    cat("  [AORSF WARNING] Feature name mismatch detected!\n")
    cat("    Importance names:", paste(head(importance_names, 10), collapse = ", "), "...\n")
    cat("    Data names:", paste(head(feature_names_from_data, 10), collapse = ", "), "...\n")
    cat("    Missing from importance:", paste(setdiff(feature_names_from_data, importance_names), collapse = ", "), "\n")
    cat("    Extra in importance:", paste(setdiff(importance_names, feature_names_from_data), collapse = ", "), "\n")

    # Do NOT rename by position: the importance vector is documented as being
    # sorted by importance value, not by data-column order, so positional
    # renaming would attach importance scores to the wrong features. The
    # model's own names are the authoritative labels for its values.
    cat("    [INFO] Keeping feature names as reported by the model (no positional re-mapping)\n")
  }

  # Create importance data frame; slice_head() returns every row when fewer
  # than n_predictors are available.
  importance_df <- tibble::enframe(importance_raw, name = "feature", value = "importance") %>%
    dplyr::arrange(dplyr::desc(importance)) %>%
    dplyr::slice_head(n = n_predictors) %>%
    dplyr::mutate(cindex_td = cindex_td, cindex_ti = cindex_ti)

  cat("  AORSF selected", nrow(importance_df), "features\n")
  cat("  [AORSF] Feature names match RSF/CatBoost:",
      all(importance_df$feature %in% feature_names_from_data), "\n")

  return(importance_df)
}


In [None]:
# Build final model with Wisotzkey variables
#
# Fits one model (AORSF, RSF or CatBoost) using only the requested Wisotzkey
# variables and reports its apparent (training-data) C-index at `horizon`.
#
# Args:
#   data:           data frame with `time`, `status` and candidate predictors.
#   wisotzkey_vars: character vector of predictor names to use; names absent
#                   from `data` are reported and ignored.
#   method:         "AORSF", "RSF" or "CatBoost". Any other value fits nothing
#                   and yields NA C-index values.
#   n_trees:        number of trees for AORSF / RSF (CatBoost uses a fixed
#                   2000 iterations).
#   horizon:        prediction horizon, in the same units as `data$time`.
#
# Returns:
#   list(cindex_td, cindex_ti, n_features, features). Both C-index entries are
#   NA_real_ whenever the model could not be fitted or scored; `features` holds
#   the variables that were actually available (minus constant ones).
#
# RSF needs ranger_predictrisk() and scoring needs calculate_cindex() to be
# defined in the calling session.
fit_final_model_wisotzkey <- function(data, wisotzkey_vars, method = "AORSF", n_trees = 100, horizon = 1) {
  cat("  Building final", method, "model with", length(wisotzkey_vars), "Wisotzkey variables...\n")

  # Result returned on every "could not fit / could not score" path
  na_result <- function(features) {
    list(
      cindex_td = NA_real_,
      cindex_ti = NA_real_,
      n_features = length(features),
      features = features
    )
  }

  if (length(wisotzkey_vars) == 0) {
    cat("  Warning: No Wisotzkey variables provided - skipping final model\n")
    return(na_result(character(0)))
  }

  # Check method-specific dependencies (NULL for an unrecognised method)
  required_pkg <- switch(method,
    AORSF    = "aorsf",
    CatBoost = "catboost",
    RSF      = "ranger",
    NULL
  )
  if (!is.null(required_pkg) && !requireNamespace(required_pkg, quietly = TRUE)) {
    cat("  Warning:", required_pkg, "package not available - skipping final model\n")
    return(na_result(wisotzkey_vars))
  }

  # Check which Wisotzkey variables are actually available in the data
  available_vars <- intersect(wisotzkey_vars, names(data))
  missing_vars   <- setdiff(wisotzkey_vars, names(data))

  if (length(missing_vars) > 0) {
    cat("  Warning: Missing Wisotzkey variables:", paste(missing_vars, collapse = ", "), "\n")
  }

  if (length(available_vars) == 0) {
    cat("  Warning: No Wisotzkey variables available in data - skipping final model\n")
    return(na_result(character(0)))
  }

  # Create data frame with only Wisotzkey variables, time, and status
  final_data <- data %>%
    dplyr::select(dplyr::all_of(c("time", "status", available_vars)))

  # Filter invalid survival data
  valid_rows <- !is.na(final_data$time) & !is.na(final_data$status) &
    final_data$time > 0 & final_data$status %in% c(0, 1)
  final_data <- final_data[valid_rows, , drop = FALSE]

  if (nrow(final_data) < 10) {
    cat("  Warning: Not enough valid rows for final model\n")
    return(na_result(available_vars))
  }

  # Remove constant predictors. Only predictors are screened: dropping a
  # constant `status` or `time` column here would make the outcome coercion
  # below fail with an uncaught error.
  predictor_cols <- setdiff(available_vars, c("time", "status"))
  is_constant <- vapply(
    final_data[predictor_cols],
    function(x) length(unique(na.omit(x))) == 1,
    logical(1)
  )
  constant_cols <- predictor_cols[is_constant]

  if (length(constant_cols) > 0) {
    cat("  Removing constant columns:", paste(constant_cols, collapse = ", "), "\n")
    final_data <- final_data %>% dplyr::select(-dplyr::all_of(constant_cols))
    # Remove from available_vars if they were constant
    available_vars <- setdiff(available_vars, constant_cols)
  }

  if (length(available_vars) == 0) {
    cat("  Warning: No non-constant Wisotzkey variables left - skipping final model\n")
    return(na_result(character(0)))
  }

  # Convert character columns to factors
  final_data <- final_data %>%
    dplyr::mutate(dplyr::across(where(is.character), as.factor))

  # Ensure time and status are numeric/integer
  final_data$time   <- as.numeric(final_data$time)
  final_data$status <- as.integer(final_data$status)

  # Flatten a prediction object (matrix or vector) to a numeric vector
  as_risk_vector <- function(risk_pred) {
    if (is.matrix(risk_pred)) {
      as.numeric(risk_pred[, 1])
    } else {
      as.numeric(risk_pred)
    }
  }
  on_fit_error <- function(e) {
    cat("  Error fitting final", method, "model:", e$message, "\n")
    NULL
  }
  on_predict_error <- function(e) {
    cat("  Warning: Final", method, "model prediction failed:", e$message, "\n")
    NULL
  }

  # Fit model based on method
  final_model <- NULL
  final_predictions <- NULL

  if (method == "AORSF") {
    final_model <- tryCatch({
      set.seed(42)
      aorsf::orsf(
        data = final_data,
        formula = survival::Surv(time, status) ~ .,
        n_tree = n_trees,
        na_action = 'impute_meanmode'
      )
    }, error = on_fit_error)

    if (!is.null(final_model)) {
      # Get predictions for C-index calculation
      final_predictions <- tryCatch({
        as_risk_vector(
          predict(final_model, new_data = final_data, pred_type = 'risk', pred_horizon = horizon)
        )
      }, error = on_predict_error)
    }

  } else if (method == "RSF") {
    final_model <- tryCatch({
      set.seed(42)
      ranger::ranger(
        survival::Surv(time, status) ~ .,
        data = final_data,
        num.trees = n_trees,
        importance = "permutation",
        min.node.size = 20,
        splitrule = "extratrees",
        num.random.splits = 10
      )
    }, error = on_fit_error)

    if (!is.null(final_model)) {
      if (exists("ranger_predictrisk", mode = "function")) {
        # Get predictions using ranger_predictrisk
        final_predictions <- tryCatch({
          as_risk_vector(
            ranger_predictrisk(
              object = final_model,
              newdata = final_data,
              times = horizon
            )
          )
        }, error = on_predict_error)
      } else {
        cat("  Warning: ranger_predictrisk() is not defined - cannot score final RSF model\n")
      }
    }

  } else if (method == "CatBoost") {
    # Prepare CatBoost data (signed-time label): +time for events, -time for censored
    signed_time <- ifelse(final_data$status == 1, final_data$time, -final_data$time)

    # Remove time and status for features (characters are already factors)
    catboost_features <- final_data %>%
      dplyr::select(-dplyr::all_of(c("time", "status")))

    # Create CatBoost pool
    train_pool <- tryCatch({
      catboost::catboost.load_pool(
        data = catboost_features,
        label = signed_time
      )
    }, error = function(e) {
      cat("  Error creating CatBoost pool:", e$message, "\n")
      NULL
    })

    if (!is.null(train_pool)) {
      # Train CatBoost model
      catboost_params <- list(
        loss_function = "RMSE",
        depth = 6,
        learning_rate = 0.05,
        iterations = 2000,
        l2_leaf_reg = 3.0,
        random_seed = 42,
        verbose = 0
      )

      final_model <- tryCatch({
        catboost::catboost.train(train_pool, params = catboost_params)
      }, error = function(e) {
        cat("  Error training final", method, "model:", e$message, "\n")
        NULL
      })

      if (!is.null(final_model)) {
        # Get predictions (signed-time, will convert to risk)
        final_predictions <- tryCatch({
          signed_pred <- catboost::catboost.predict(final_model, train_pool)
          # Convert signed-time to risk: risk = -signed_time
          -as.numeric(signed_pred)
        }, error = on_predict_error)
      }
    }

  } else {
    cat("  Warning: Unsupported method:", method, "- no final model fitted\n")
  }

  if (is.null(final_model) || is.null(final_predictions)) {
    return(na_result(available_vars))
  }

  # Calculate C-index
  cindex_td <- NA_real_
  cindex_ti <- NA_real_

  if (length(final_predictions) > 0) {
    # Predictions come from final_data, so the outcome is read from it as well
    n_use   <- min(length(final_predictions), nrow(final_data))
    use_idx <- seq_len(n_use)

    final_time_vec   <- final_data$time[use_idx]
    final_status_vec <- final_data$status[use_idx]
    final_pred_vec   <- final_predictions[use_idx]

    valid_idx <- !is.na(final_time_vec) & !is.na(final_status_vec) &
      !is.na(final_pred_vec) &
      is.finite(final_time_vec) & is.finite(final_pred_vec) &
      final_time_vec > 0

    if (sum(valid_idx) >= 10) {
      score_data <- data.frame(
        time   = as.numeric(final_time_vec[valid_idx]),
        status = as.integer(final_status_vec[valid_idx]),
        row.names = NULL
      )
      final_predictions_clean <- final_pred_vec[valid_idx]

      # Calculate both time-dependent and time-independent C-index
      cindex_result <- calculate_cindex(
        score_data$time,
        score_data$status,
        final_predictions_clean,
        horizon = horizon
      )
      cindex_ti <- cindex_result$cindex_ti

      # Try riskRegression::Score for time-dependent; fall back to the manual
      # value from calculate_cindex() if Score() errors
      cindex_td <- tryCatch({
        pred_matrix <- as.matrix(final_predictions_clean)
        evaluation <- riskRegression::Score(
          object  = list(Final = pred_matrix),
          formula = survival::Surv(time, status) ~ 1,
          data    = score_data,
          times   = horizon,
          summary = "risks",
          metrics = "auc",
          se.fit  = FALSE
        )

        auc_tab <- evaluation$AUC$score
        # If several time points come back, use the one closest to horizon
        if ("times" %in% names(auc_tab)) {
          this_row <- which.min(abs(auc_tab$times - horizon))
        } else {
          this_row <- 1L
        }
        as.numeric(auc_tab$AUC[this_row])
      }, error = function(e) {
        cindex_result$cindex_td
      })

      cat("  Final", method, "model C-index (time-dependent):", round(cindex_td, 4), "\n")
      cat("  Final", method, "model C-index (time-independent):", round(cindex_ti, 4), "\n")
    }
  }

  return(list(
    cindex_td = cindex_td,
    cindex_ti = cindex_ti,
    n_features = length(available_vars),
    features = available_vars
  ))
}

# Helper function to get all available Wisotzkey variables from data
#
# For every requested Wisotzkey variable the column itself is kept when it is
# present in `data`; otherwise the first alternative column name listed in
# `wisotzkey_alternatives` that is present in `data` is used instead.
# Variables with neither a direct nor an alternative match are dropped.
#
# Args:
#   data:      object whose names() are the available column names
#              (e.g. a data.frame).
#   variables: character vector of Wisotzkey variable names to resolve.
#              Defaults to the `wisotzkey_variables` vector defined elsewhere
#              in this file, so existing one-argument calls are unchanged.
#
# Returns:
#   Character vector of column names found in `data`, in the order of
#   `variables`, without duplicates (a duplicate could otherwise appear when
#   two requested variables resolve to the same column).
get_all_available_wisotzkey <- function(data, variables = wisotzkey_variables) {
  wisotzkey_alternatives <- list(
    "pra_listing" = c("pra_listing", "lsfprat", "lsfprab"),
    "tx_mcsd" = c("tx_mcsd", "txmcsd")
  )

  data_names <- names(data)

  # Resolve one variable to zero or one column name
  resolve_one <- function(var) {
    if (var %in% data_names) {
      return(var)
    }
    if (var %in% names(wisotzkey_alternatives)) {
      # Alternatives are tried in their listed order; the first hit wins
      alts <- wisotzkey_alternatives[[var]]
      alt_hit <- alts[alts %in% data_names]
      if (length(alt_hit) > 0) {
        return(alt_hit[1])
      }
    }
    character(0)
  }

  resolved <- unlist(lapply(variables, resolve_one), use.names = FALSE)

  # as.character() turns the NULL produced by an empty input into character(0)
  as.character(unique(resolved))
}


### Combined Analysis - Helper Function

In [None]:
# Main analysis function
#
# Runs the complete workflow for one time period:
#   1. logs cohort size and event rate, skipping (NULL + warning) cohorts with
#      fewer than 100 rows, or fewer than 50 rows after preparation;
#   2. ranks features with RSF, CatBoost and AORSF (a failing method yields
#      NULL and a logged error, the others still run);
#   3. extracts the C-index columns reported by each selector;
#   4. identifies which selected features are Wisotzkey variables;
#   5. fits final models on (a) the Wisotzkey variables each method selected
#      and (b) all Wisotzkey variables available in the prepared data.
#
# Args:
#   period_name: label used in log output and stored in the result.
#   period_data: data for the period; must provide a `status` column.
#
# Returns:
#   NULL when the period is skipped, otherwise a named list holding the
#   period label, cohort size, event rate, the three feature tables, the
#   Wisotzkey overlaps, the selector C-indices and, for each final model,
#   `final_<method>_<selected|all>_{cindex_td,cindex_ti,n_features,features}`.
#
# Relies on objects defined elsewhere in this file: n_predictors, n_trees_rsf,
# n_trees_aorsf, horizon, wisotzkey_variables and the helper functions called.
analyze_time_period <- function(period_name, period_data) {
  cat("\n=== Analyzing:", period_name, "===\n")
  cat("  Sample size:", nrow(period_data), "patients\n")
  cat("  Event rate:", round(mean(period_data$status, na.rm = TRUE) * 100, 2), "%\n")

  # Guard: raw cohort too small
  if (nrow(period_data) < 100) {
    warning(paste("Sample size too small for", period_name, "- skipping"))
    return(NULL)
  }

  # Guard: too little left after preparation
  prepared_data <- prepare_modeling_data(period_data)
  if (nrow(prepared_data) < 50) {
    warning(paste("Too few valid rows after preparation for", period_name, "- skipping"))
    return(NULL)
  }

  # ---- Local helpers -------------------------------------------------------

  # Error handler factory: log the failure with the given prefix, yield NULL
  log_and_null <- function(prefix) {
    function(e) {
      cat(prefix, e$message, "\n")
      NULL
    }
  }

  # Number of names two feature vectors have in common
  n_shared <- function(a, b) {
    length(intersect(a, b))
  }

  # First value of a C-index column, or NA when the selector failed or did
  # not report that column
  first_cindex <- function(selection, column) {
    if (is.null(selection) || !(column %in% names(selection))) {
      return(NA_real_)
    }
    selection[[column]][1]
  }

  # Wisotzkey variables among a selector's features (none if it failed)
  wisotzkey_among <- function(selection) {
    if (is.null(selection)) {
      return(character(0))
    }
    identify_wisotzkey_features(selection$feature)
  }

  # Fit a final model on `vars`, or return an all-NA placeholder when there
  # is nothing to fit on
  fit_or_placeholder <- function(vars, method, n_trees) {
    if (length(vars) > 0) {
      fit_final_model_wisotzkey(
        data = prepared_data,
        wisotzkey_vars = vars,
        method = method,
        n_trees = n_trees,
        horizon = horizon
      )
    } else {
      list(cindex_td = NA_real_, cindex_ti = NA_real_, n_features = 0, features = character(0))
    }
  }

  # Spread one final-model result into four prefixed list entries
  flatten_fit <- function(prefix, fit) {
    stats::setNames(
      list(fit$cindex_td, fit$cindex_ti, fit$n_features, fit$features),
      paste0(prefix, c("_cindex_td", "_cindex_ti", "_n_features", "_features"))
    )
  }

  # ---- How many predictors can actually be requested -----------------------

  # Everything except outcome and ID columns counts as a predictor
  predictor_vars <- setdiff(names(prepared_data), c("time", "status", "ID", "ptid_e"))
  n_available <- length(predictor_vars)
  n_predictors_adj <- min(n_predictors, n_available)

  if (n_predictors_adj < n_predictors) {
    cat("  Note: Requested", n_predictors, "predictors but only", n_available,
        "variables available. Using", n_predictors_adj, "predictors.\n")
  }

  # ---- Feature selection (RSF, then CatBoost, then AORSF) ------------------

  rsf_features <- tryCatch(
    select_features_rsf(prepared_data, n_predictors = n_predictors_adj, n_trees = n_trees_rsf, horizon = horizon),
    error = log_and_null("  ERROR in RSF feature selection:")
  )

  catboost_features <- tryCatch(
    select_features_catboost(prepared_data, n_predictors = n_predictors_adj, iterations = 2000, horizon = horizon),
    error = log_and_null("  ERROR in CatBoost feature importance:")
  )

  aorsf_features <- tryCatch(
    select_features_aorsf(prepared_data, n_predictors = n_predictors_adj, n_trees = n_trees_aorsf, horizon = horizon),
    error = log_and_null("  ERROR in AORSF feature importance:")
  )

  # ---- Feature-name consistency (only when all three methods succeeded) ----

  all_methods_ok <- !is.null(rsf_features) &&
    !is.null(catboost_features) &&
    !is.null(aorsf_features)

  if (all_methods_ok) {
    rsf_names <- rsf_features$feature
    catboost_names <- catboost_features$feature
    aorsf_names <- aorsf_features$feature

    rsf_vs_catboost <- n_shared(rsf_names, catboost_names)
    rsf_vs_aorsf <- n_shared(rsf_names, aorsf_names)
    catboost_vs_aorsf <- n_shared(catboost_names, aorsf_names)

    cat("  Feature name consistency check:\n")
    cat("    RSF-CatBoost overlap:", rsf_vs_catboost, "features\n")
    cat("    RSF-AORSF overlap:", rsf_vs_aorsf, "features\n")
    cat("    CatBoost-AORSF overlap:", catboost_vs_aorsf, "features\n")

    # Zero overlap with AORSF hints at a naming mismatch rather than a
    # genuine disagreement between methods
    if (rsf_vs_aorsf == 0 || catboost_vs_aorsf == 0) {
      cat("    [WARNING] AORSF feature names don't match RSF/CatBoost!\n")
      cat("    RSF features:", paste(head(rsf_names, 5), collapse = ", "), "...\n")
      cat("    AORSF features:", paste(head(aorsf_names, 5), collapse = ", "), "...\n")
    }
  }

  # ---- Selector C-indices (time-dependent and time-independent) ------------

  rsf_cindex_td <- first_cindex(rsf_features, "cindex_td")
  rsf_cindex_ti <- first_cindex(rsf_features, "cindex_ti")
  catboost_cindex_td <- first_cindex(catboost_features, "cindex_td")
  catboost_cindex_ti <- first_cindex(catboost_features, "cindex_ti")
  aorsf_cindex_td <- first_cindex(aorsf_features, "cindex_td")
  aorsf_cindex_ti <- first_cindex(aorsf_features, "cindex_ti")

  # ---- Wisotzkey variables among the selected features ---------------------

  rsf_wisotzkey <- wisotzkey_among(rsf_features)
  catboost_wisotzkey <- wisotzkey_among(catboost_features)
  aorsf_wisotzkey <- wisotzkey_among(aorsf_features)

  cat("  Wisotzkey variables in top 20:\n")
  hits_by_method <- list(
    RSF = rsf_wisotzkey,
    CatBoost = catboost_wisotzkey,
    AORSF = aorsf_wisotzkey
  )
  for (method_label in names(hits_by_method)) {
    hits <- hits_by_method[[method_label]]
    cat(paste0("    ", method_label, ":"), length(hits), "of", length(wisotzkey_variables), "\n")
    if (length(hits) > 0) {
      cat("      ", paste(hits, collapse = ", "), "\n")
    }
  }

  # ---- Final models --------------------------------------------------------

  cat("\n  === Building Final Models with Wisotzkey Variables ===\n")

  all_available_wisotzkey <- get_all_available_wisotzkey(prepared_data)

  # Step 1: only the Wisotzkey variables each method selected itself.
  # CatBoost receives its iteration count through `n_trees`.
  final_rsf_selected <- fit_or_placeholder(rsf_wisotzkey, "RSF", n_trees_rsf)
  final_catboost_selected <- fit_or_placeholder(catboost_wisotzkey, "CatBoost", 2000)
  final_aorsf_selected <- fit_or_placeholder(aorsf_wisotzkey, "AORSF", n_trees_aorsf)

  # Step 2: every Wisotzkey variable available in the prepared data
  final_rsf_all <- fit_or_placeholder(all_available_wisotzkey, "RSF", n_trees_rsf)
  final_catboost_all <- fit_or_placeholder(all_available_wisotzkey, "CatBoost", 2000)
  final_aorsf_all <- fit_or_placeholder(all_available_wisotzkey, "AORSF", n_trees_aorsf)

  cat("  Final model summary:\n")
  cat("    Selected Wisotzkey (from top 20):\n")
  cat("      RSF:", final_rsf_selected$n_features, "variables\n")
  cat("      CatBoost:", final_catboost_selected$n_features, "variables\n")
  cat("      AORSF:", final_aorsf_selected$n_features, "variables\n")
  cat("    All Wisotzkey (all 15):\n")
  cat("      RSF:", final_rsf_all$n_features, "variables\n")
  cat("      CatBoost:", final_catboost_all$n_features, "variables\n")
  cat("      AORSF:", final_aorsf_all$n_features, "variables\n")

  # ---- Assemble the result -------------------------------------------------

  c(
    list(
      period = period_name,
      n_patients = nrow(prepared_data),
      event_rate = mean(prepared_data$status, na.rm = TRUE),
      rsf_features = rsf_features,
      catboost_features = catboost_features,
      aorsf_features = aorsf_features,
      rsf_wisotzkey = rsf_wisotzkey,
      catboost_wisotzkey = catboost_wisotzkey,
      aorsf_wisotzkey = aorsf_wisotzkey,
      rsf_cindex_td = rsf_cindex_td,
      rsf_cindex_ti = rsf_cindex_ti,
      catboost_cindex_td = catboost_cindex_td,
      catboost_cindex_ti = catboost_cindex_ti,
      aorsf_cindex_td = aorsf_cindex_td,
      aorsf_cindex_ti = aorsf_cindex_ti
    ),
    # Final models with selected Wisotzkey variables
    flatten_fit("final_rsf_selected", final_rsf_selected),
    flatten_fit("final_catboost_selected", final_catboost_selected),
    flatten_fit("final_aorsf_selected", final_aorsf_selected),
    # Final models with all available Wisotzkey variables
    flatten_fit("final_rsf_all", final_rsf_all),
    flatten_fit("final_catboost_all", final_catboost_all),
    flatten_fit("final_aorsf_all", final_aorsf_all)
  )
}



### Run Analysis

In [None]:
# Run analysis for all time periods
cat("\n=== Defining Time Periods ===\n")
time_periods <- define_time_periods(phts_base)

cat("Original study (2010-2019):", nrow(time_periods$original), "patients\n")
cat("Full study (2010-2024):", nrow(time_periods$full), "patients\n")
cat("Full study without COVID (exclude 2020-2023):", nrow(time_periods$full_no_covid), "patients\n")

# Analyze each period (using ALL variables, matching original study workflow).
#
# The list is built with a single list() call on purpose:
# analyze_time_period() returns NULL for a skipped period, and assigning NULL
# with `all_results$name <- NULL` removes (or never creates) the element.
# A skipped period would then disappear from names(all_results), and the
# code that iterates over names(all_results) and checks for NULL entries
# would never see it. list() keeps NULL entries, so every period stays
# visible downstream. Arguments are evaluated left to right, so the periods
# are still analyzed in the same order as before.
all_results <- list(
  original = analyze_time_period("original_study_2010_2019", time_periods$original),
  full = analyze_time_period("full_study_2010_2024", time_periods$full),
  full_no_covid = analyze_time_period(
    "full_study_no_covid_2010_2024_excl_2020_2023",
    time_periods$full_no_covid
  )
)

# Save individual results
#
# For every analyzed period (entries that are NULL are skipped) this writes,
# into `output_dir`:
#   <period>_rsf_top20.csv / _catboost_top20.csv / _aorsf_top20.csv
#       the per-method feature tables (only for methods that produced one),
#   <period>_wisotzkey_overlap.csv
#       how many / which Wisotzkey variables each method selected,
#   <period>_final_models_wisotzkey.csv
#       C-indices and feature lists of the six final models.
cat("\n=== Saving Results ===\n")
for (period_name in names(all_results)) {
  if (is.null(all_results[[period_name]])) {
    next
  }

  results <- all_results[[period_name]]

  # --- Per-method top-feature tables ---
  if (!is.null(results$rsf_features)) {
    rsf_file <- file.path(output_dir, sprintf("%s_rsf_top20.csv", period_name))
    readr::write_csv(results$rsf_features, rsf_file)
    cat("  Saved:", rsf_file, "\n")
  }

  if (!is.null(results$catboost_features)) {
    catboost_file <- file.path(output_dir, sprintf("%s_catboost_top20.csv", period_name))
    readr::write_csv(results$catboost_features, catboost_file)
    cat("  Saved:", catboost_file, "\n")
  }

  if (!is.null(results$aorsf_features)) {
    aorsf_file <- file.path(output_dir, sprintf("%s_aorsf_top20.csv", period_name))
    readr::write_csv(results$aorsf_features, aorsf_file)
    cat("  Saved:", aorsf_file, "\n")
  }

  # --- Wisotzkey overlap: one row per method ---
  wisotzkey_overlap <- data.frame(
    method = c("RSF", "CatBoost", "AORSF"),
    n_wisotzkey_in_top20 = vapply(
      list(results$rsf_wisotzkey, results$catboost_wisotzkey, results$aorsf_wisotzkey),
      length,
      integer(1)
    ),
    wisotzkey_features = vapply(
      list(results$rsf_wisotzkey, results$catboost_wisotzkey, results$aorsf_wisotzkey),
      paste,
      character(1),
      collapse = "; "
    ),
    stringsAsFactors = FALSE
  )
  wisotzkey_file <- file.path(output_dir, sprintf("%s_wisotzkey_overlap.csv", period_name))
  readr::write_csv(wisotzkey_overlap, wisotzkey_file)
  cat("  Saved:", wisotzkey_file, "\n")

  # --- Final models: three "selected" rows followed by three "all" rows ---
  final_models_summary <- data.frame(
    method = rep(c("RSF", "CatBoost", "AORSF"), times = 2),
    model_type = rep(c("Selected_Wisotzkey", "All_Wisotzkey"), each = 3),
    n_features = c(
      results[["final_rsf_selected_n_features"]],
      results[["final_catboost_selected_n_features"]],
      results[["final_aorsf_selected_n_features"]],
      results[["final_rsf_all_n_features"]],
      results[["final_catboost_all_n_features"]],
      results[["final_aorsf_all_n_features"]]
    ),
    cindex_td = c(
      results[["final_rsf_selected_cindex_td"]],
      results[["final_catboost_selected_cindex_td"]],
      results[["final_aorsf_selected_cindex_td"]],
      results[["final_rsf_all_cindex_td"]],
      results[["final_catboost_all_cindex_td"]],
      results[["final_aorsf_all_cindex_td"]]
    ),
    cindex_ti = c(
      results[["final_rsf_selected_cindex_ti"]],
      results[["final_catboost_selected_cindex_ti"]],
      results[["final_aorsf_selected_cindex_ti"]],
      results[["final_rsf_all_cindex_ti"]],
      results[["final_catboost_all_cindex_ti"]],
      results[["final_aorsf_all_cindex_ti"]]
    ),
    # Each model's feature vector collapsed into one "; "-separated string
    features = vapply(
      list(
        results[["final_rsf_selected_features"]],
        results[["final_catboost_selected_features"]],
        results[["final_aorsf_selected_features"]],
        results[["final_rsf_all_features"]],
        results[["final_catboost_all_features"]],
        results[["final_aorsf_all_features"]]
      ),
      paste,
      character(1),
      collapse = "; "
    ),
    stringsAsFactors = FALSE
  )
  final_models_file <- file.path(output_dir, sprintf("%s_final_models_wisotzkey.csv", period_name))
  readr::write_csv(final_models_summary, final_models_file)
  cat("  Saved:", final_models_file, "\n")
}


# Create comparison tables
#
# For each method the per-period feature tables are stacked into one long
# table (period, rank, feature, importance, cindex_td, cindex_ti), where
# `rank` is the row position within the period's table. When at least one
# period contributed rows, the long table and a wide table (one row per rank,
# one column per period holding the feature name) are written to `output_dir`.
cat("\n=== Creating Comparison Tables ===\n")

# RSF comparison across periods
rsf_comparison <- map_dfr(names(all_results), function(period_name) {
  # `$` on a NULL period result is NULL too, so one check covers both cases
  top_features <- all_results[[period_name]]$rsf_features
  if (is.null(top_features)) {
    return(NULL)
  }
  ranked <- mutate(top_features, period = period_name, rank = row_number())
  select(ranked, period, rank, feature, importance, cindex_td, cindex_ti)
})

if (nrow(rsf_comparison) > 0) {
  rsf_comparison_file <- file.path(output_dir, "rsf_comparison_all_periods.csv")
  write_csv(rsf_comparison, rsf_comparison_file)
  cat("  Saved:", rsf_comparison_file, "\n")

  # Wide format: one column of feature names per period
  rsf_wide <- pivot_wider(
    select(rsf_comparison, period, rank, feature),
    names_from = period,
    values_from = feature,
    values_fill = NA
  )

  rsf_wide_file <- file.path(output_dir, "rsf_comparison_wide.csv")
  write_csv(rsf_wide, rsf_wide_file)
  cat("  Saved:", rsf_wide_file, "\n")
}

# CatBoost comparison across periods
catboost_comparison <- map_dfr(names(all_results), function(period_name) {
  top_features <- all_results[[period_name]]$catboost_features
  if (is.null(top_features)) {
    return(NULL)
  }
  ranked <- mutate(top_features, period = period_name, rank = row_number())
  select(ranked, period, rank, feature, importance, cindex_td, cindex_ti)
})

if (nrow(catboost_comparison) > 0) {
  catboost_comparison_file <- file.path(output_dir, "catboost_comparison_all_periods.csv")
  write_csv(catboost_comparison, catboost_comparison_file)
  cat("  Saved:", catboost_comparison_file, "\n")

  # Wide format: one column of feature names per period
  catboost_wide <- pivot_wider(
    select(catboost_comparison, period, rank, feature),
    names_from = period,
    values_from = feature,
    values_fill = NA
  )

  catboost_wide_file <- file.path(output_dir, "catboost_comparison_wide.csv")
  write_csv(catboost_wide, catboost_wide_file)
  cat("  Saved:", catboost_wide_file, "\n")
}


# AORSF comparison across periods
aorsf_comparison <- map_dfr(names(all_results), function(period_name) {
  top_features <- all_results[[period_name]]$aorsf_features
  if (is.null(top_features)) {
    return(NULL)
  }
  ranked <- mutate(top_features, period = period_name, rank = row_number())
  select(ranked, period, rank, feature, importance, cindex_td, cindex_ti)
})

if (nrow(aorsf_comparison) > 0) {
  aorsf_comparison_file <- file.path(output_dir, "aorsf_comparison_all_periods.csv")
  write_csv(aorsf_comparison, aorsf_comparison_file)
  cat("  Saved:", aorsf_comparison_file, "\n")

  # Wide format: one column of feature names per period
  aorsf_wide <- pivot_wider(
    select(aorsf_comparison, period, rank, feature),
    names_from = period,
    values_from = feature,
    values_fill = NA
  )

  aorsf_wide_file <- file.path(output_dir, "aorsf_comparison_wide.csv")
  write_csv(aorsf_wide, aorsf_wide_file)
  cat("  Saved:", aorsf_wide_file, "\n")
}


# Feature overlap analysis
# For each method (RSF, CatBoost, AORSF): collect the feature names per
# period from the long comparison table, intersect them across all periods
# and write the shared features to "<method>_feature_overlap.csv".
# A method is skipped when its comparison table is empty, and nothing is
# written when only a single period contributed rows.
cat("\n=== Feature Overlap Analysis ===\n")

# RSF overlap
if (nrow(rsf_comparison) > 0) {
  # One row per period; `features` is a list-column with that period's features
  rsf_features_by_period <- rsf_comparison %>%
    group_by(period) %>%
    summarise(features = list(feature), .groups = 'drop')
  
  # An intersection is only computed when there are at least two periods
  if (nrow(rsf_features_by_period) > 1) {
    # Find common features across all periods
    all_rsf_features <- Reduce(intersect, rsf_features_by_period$features)
    cat("RSF features common to all periods:", length(all_rsf_features), "\n")
    if (length(all_rsf_features) > 0) {
      # Only the first 10 shared features are printed; the CSV holds all
      cat("  ", paste(head(all_rsf_features, 10), collapse = ", "), "\n")
    }
    
    # Save overlap analysis
    overlap_file <- file.path(output_dir, "rsf_feature_overlap.csv")
    write_csv(data.frame(feature = all_rsf_features), overlap_file)
    cat("  Saved:", overlap_file, "\n")
  }
}

# CatBoost overlap
if (nrow(catboost_comparison) > 0) {
  # One row per period; `features` is a list-column with that period's features
  catboost_features_by_period <- catboost_comparison %>%
    group_by(period) %>%
    summarise(features = list(feature), .groups = 'drop')
  
  # An intersection is only computed when there are at least two periods
  if (nrow(catboost_features_by_period) > 1) {
    # Find common features across all periods
    all_catboost_features <- Reduce(intersect, catboost_features_by_period$features)
    cat("CatBoost features common to all periods:", length(all_catboost_features), "\n")
    if (length(all_catboost_features) > 0) {
      # Only the first 10 shared features are printed; the CSV holds all
      cat("  ", paste(head(all_catboost_features, 10), collapse = ", "), "\n")
    }
    
    # Save overlap analysis (`overlap_file` is reused for each method)
    overlap_file <- file.path(output_dir, "catboost_feature_overlap.csv")
    write_csv(data.frame(feature = all_catboost_features), overlap_file)
    cat("  Saved:", overlap_file, "\n")
  }
}

# AORSF overlap
if (nrow(aorsf_comparison) > 0) {
  # One row per period; `features` is a list-column with that period's features
  aorsf_features_by_period <- aorsf_comparison %>%
    group_by(period) %>%
    summarise(features = list(feature), .groups = 'drop')
  
  # An intersection is only computed when there are at least two periods
  if (nrow(aorsf_features_by_period) > 1) {
    # Find common features across all periods
    all_aorsf_features <- Reduce(intersect, aorsf_features_by_period$features)
    cat("AORSF features common to all periods:", length(all_aorsf_features), "\n")
    if (length(all_aorsf_features) > 0) {
      # Only the first 10 shared features are printed; the CSV holds all
      cat("  ", paste(head(all_aorsf_features, 10), collapse = ", "), "\n")
    }
    
    # Save overlap analysis (`overlap_file` is reused for each method)
    overlap_file <- file.path(output_dir, "aorsf_feature_overlap.csv")
    write_csv(data.frame(feature = all_aorsf_features), overlap_file)
    cat("  Saved:", overlap_file, "\n")
  }
}


# Summary statistics
# Builds one row per entry of `all_results` with cohort size, event rate (in
# percent), number of selected features and Wisotzkey hits per method, the
# selector C-indices and the C-indices / feature counts of the six final
# models. The table is written to "summary_statistics.csv" and printed.
cat("\n=== Summary Statistics ===\n")
summary_stats <- map_dfr(names(all_results), function(period_name) {
  # Period without a result: emit a placeholder row so that the period still
  # appears in the table, with every statistic set to NA
  if (is.null(all_results[[period_name]])) {
    return(data.frame(
      period = period_name,
      n_patients = NA,
      event_rate = NA,
      n_rsf_features = NA,
      n_catboost_features = NA,
      n_aorsf_features = NA,
      n_rsf_wisotzkey = NA,
      n_catboost_wisotzkey = NA,
      n_aorsf_wisotzkey = NA,
      rsf_cindex_td = NA_real_,
      rsf_cindex_ti = NA_real_,
      catboost_cindex_td = NA_real_,
      catboost_cindex_ti = NA_real_,
      aorsf_cindex_td = NA_real_,
      aorsf_cindex_ti = NA_real_,
      final_rsf_selected_cindex_td = NA_real_,
      final_rsf_selected_cindex_ti = NA_real_,
      final_rsf_selected_n_features = NA,
      final_catboost_selected_cindex_td = NA_real_,
      final_catboost_selected_cindex_ti = NA_real_,
      final_catboost_selected_n_features = NA,
      final_aorsf_selected_cindex_td = NA_real_,
      final_aorsf_selected_cindex_ti = NA_real_,
      final_aorsf_selected_n_features = NA,
      final_rsf_all_cindex_td = NA_real_,
      final_rsf_all_cindex_ti = NA_real_,
      final_rsf_all_n_features = NA,
      final_catboost_all_cindex_td = NA_real_,
      final_catboost_all_cindex_ti = NA_real_,
      final_catboost_all_n_features = NA,
      final_aorsf_all_cindex_td = NA_real_,
      final_aorsf_all_cindex_ti = NA_real_,
      final_aorsf_all_n_features = NA
    ))
  }
  results <- all_results[[period_name]]
  # Every field below is guarded with ifelse(is.null(...), <fallback>, <value>):
  # counts fall back to 0 (feature tables, Wisotzkey hits) or NA (final-model
  # feature counts), C-indices fall back to NA_real_ and are rounded to 4
  # decimals. Because the test is a single logical, ifelse() yields a
  # length-one value for each column.
  data.frame(
    period = period_name,
    n_patients = results$n_patients,
    # Stored as a fraction in `results`; reported here as a percentage
    event_rate = round(results$event_rate * 100, 2),
    n_rsf_features = ifelse(is.null(results$rsf_features), 0, nrow(results$rsf_features)),
    n_catboost_features = ifelse(is.null(results$catboost_features), 0, nrow(results$catboost_features)),
    n_aorsf_features = ifelse(is.null(results$aorsf_features), 0, nrow(results$aorsf_features)),
    n_rsf_wisotzkey = ifelse(is.null(results$rsf_wisotzkey), 0, length(results$rsf_wisotzkey)),
    n_catboost_wisotzkey = ifelse(is.null(results$catboost_wisotzkey), 0, length(results$catboost_wisotzkey)),
    n_aorsf_wisotzkey = ifelse(is.null(results$aorsf_wisotzkey), 0, length(results$aorsf_wisotzkey)),
    rsf_cindex_td = round(ifelse(is.null(results$rsf_cindex_td), NA_real_, results$rsf_cindex_td), 4),
    rsf_cindex_ti = round(ifelse(is.null(results$rsf_cindex_ti), NA_real_, results$rsf_cindex_ti), 4),
    catboost_cindex_td = round(ifelse(is.null(results$catboost_cindex_td), NA_real_, results$catboost_cindex_td), 4),
    catboost_cindex_ti = round(ifelse(is.null(results$catboost_cindex_ti), NA_real_, results$catboost_cindex_ti), 4),
    aorsf_cindex_td = round(ifelse(is.null(results$aorsf_cindex_td), NA_real_, results$aorsf_cindex_td), 4),
    aorsf_cindex_ti = round(ifelse(is.null(results$aorsf_cindex_ti), NA_real_, results$aorsf_cindex_ti), 4),
    final_rsf_selected_cindex_td = round(ifelse(is.null(results$final_rsf_selected_cindex_td), NA_real_, results$final_rsf_selected_cindex_td), 4),
    final_rsf_selected_cindex_ti = round(ifelse(is.null(results$final_rsf_selected_cindex_ti), NA_real_, results$final_rsf_selected_cindex_ti), 4),
    final_rsf_selected_n_features = ifelse(is.null(results$final_rsf_selected_n_features), NA, results$final_rsf_selected_n_features),
    final_catboost_selected_cindex_td = round(ifelse(is.null(results$final_catboost_selected_cindex_td), NA_real_, results$final_catboost_selected_cindex_td), 4),
    final_catboost_selected_cindex_ti = round(ifelse(is.null(results$final_catboost_selected_cindex_ti), NA_real_, results$final_catboost_selected_cindex_ti), 4),
    final_catboost_selected_n_features = ifelse(is.null(results$final_catboost_selected_n_features), NA, results$final_catboost_selected_n_features),
    final_aorsf_selected_cindex_td = round(ifelse(is.null(results$final_aorsf_selected_cindex_td), NA_real_, results$final_aorsf_selected_cindex_td), 4),
    final_aorsf_selected_cindex_ti = round(ifelse(is.null(results$final_aorsf_selected_cindex_ti), NA_real_, results$final_aorsf_selected_cindex_ti), 4),
    final_aorsf_selected_n_features = ifelse(is.null(results$final_aorsf_selected_n_features), NA, results$final_aorsf_selected_n_features),
    final_rsf_all_cindex_td = round(ifelse(is.null(results$final_rsf_all_cindex_td), NA_real_, results$final_rsf_all_cindex_td), 4),
    final_rsf_all_cindex_ti = round(ifelse(is.null(results$final_rsf_all_cindex_ti), NA_real_, results$final_rsf_all_cindex_ti), 4),
    final_rsf_all_n_features = ifelse(is.null(results$final_rsf_all_n_features), NA, results$final_rsf_all_n_features),
    final_catboost_all_cindex_td = round(ifelse(is.null(results$final_catboost_all_cindex_td), NA_real_, results$final_catboost_all_cindex_td), 4),
    final_catboost_all_cindex_ti = round(ifelse(is.null(results$final_catboost_all_cindex_ti), NA_real_, results$final_catboost_all_cindex_ti), 4),
    final_catboost_all_n_features = ifelse(is.null(results$final_catboost_all_n_features), NA, results$final_catboost_all_n_features),
    final_aorsf_all_cindex_td = round(ifelse(is.null(results$final_aorsf_all_cindex_td), NA_real_, results$final_aorsf_all_cindex_td), 4),
    final_aorsf_all_cindex_ti = round(ifelse(is.null(results$final_aorsf_all_cindex_ti), NA_real_, results$final_aorsf_all_cindex_ti), 4),
    final_aorsf_all_n_features = ifelse(is.null(results$final_aorsf_all_n_features), NA, results$final_aorsf_all_n_features)
  )
})

# Persist the table and echo it to the console
summary_file <- file.path(output_dir, "summary_statistics.csv")
write_csv(summary_stats, summary_file)
cat("  Saved:", summary_file, "\n")
print(summary_stats)

# Create combined C-index comparison table (both time-dependent and time-independent)
#
# Derived from `summary_stats`:
#   cindex_comparison_all_methods.csv  long table: period, method, cindex, cindex_type
#   cindex_td_comparison_wide.csv      period x method, time-dependent
#   cindex_ti_comparison_wide.csv      period x method, time-independent
#   cindex_comparison_wide.csv         all selector and final-model C-indices
cat("\n=== Creating Combined C-index Comparison ===\n")

# Time-dependent C-index comparison (long): the three selector columns become
# rows, and the column name is translated into a method label
cindex_td_comparison <- summary_stats %>%
  select(period, all_of(c("rsf_cindex_td", "catboost_cindex_td", "aorsf_cindex_td"))) %>%
  pivot_longer(cols = -period, names_to = "method", values_to = "cindex") %>%
  mutate(
    # Lookup with fallback: names without a label are kept unchanged
    method = coalesce(
      unname(c(
        rsf_cindex_td = "RSF",
        catboost_cindex_td = "CatBoost",
        aorsf_cindex_td = "AORSF"
      )[method]),
      method
    ),
    cindex_type = "time_dependent"
  )

# Time-independent C-index comparison (long), same shape as above
cindex_ti_comparison <- summary_stats %>%
  select(period, all_of(c("rsf_cindex_ti", "catboost_cindex_ti", "aorsf_cindex_ti"))) %>%
  pivot_longer(cols = -period, names_to = "method", values_to = "cindex") %>%
  mutate(
    method = coalesce(
      unname(c(
        rsf_cindex_ti = "RSF",
        catboost_cindex_ti = "CatBoost",
        aorsf_cindex_ti = "AORSF"
      )[method]),
      method
    ),
    cindex_type = "time_independent"
  )

# Combine both (time-dependent rows first)
cindex_comparison <- bind_rows(list(cindex_td_comparison, cindex_ti_comparison))

cindex_comparison_file <- file.path(output_dir, "cindex_comparison_all_methods.csv")
write_csv(cindex_comparison, cindex_comparison_file)
cat("  Saved:", cindex_comparison_file, "\n")

# Wide format (time-dependent): one column per method
cindex_td_wide <- pivot_wider(
  select(cindex_td_comparison, period, method, cindex),
  names_from = method,
  values_from = cindex
)

cindex_td_wide_file <- file.path(output_dir, "cindex_td_comparison_wide.csv")
write_csv(cindex_td_wide, cindex_td_wide_file)
cat("  Saved:", cindex_td_wide_file, "\n")

# Wide format (time-independent): one column per method
cindex_ti_wide <- pivot_wider(
  select(cindex_ti_comparison, period, method, cindex),
  names_from = method,
  values_from = cindex
)

cindex_ti_wide_file <- file.path(output_dir, "cindex_ti_comparison_wide.csv")
write_csv(cindex_ti_wide, cindex_ti_wide_file)
cat("  Saved:", cindex_ti_wide_file, "\n")

# Combined wide format: selector C-indices followed by the final-model
# C-indices ("selected" models first, then "all" models)
cindex_wide <- select(
  summary_stats,
  period,
  all_of(c(
    "rsf_cindex_td", "rsf_cindex_ti",
    "catboost_cindex_td", "catboost_cindex_ti",
    "aorsf_cindex_td", "aorsf_cindex_ti",
    "final_rsf_selected_cindex_td", "final_rsf_selected_cindex_ti",
    "final_catboost_selected_cindex_td", "final_catboost_selected_cindex_ti",
    "final_aorsf_selected_cindex_td", "final_aorsf_selected_cindex_ti",
    "final_rsf_all_cindex_td", "final_rsf_all_cindex_ti",
    "final_catboost_all_cindex_td", "final_catboost_all_cindex_ti",
    "final_aorsf_all_cindex_td", "final_aorsf_all_cindex_ti"
  ))
)

cindex_wide_file <- file.path(output_dir, "cindex_comparison_wide.csv")
write_csv(cindex_wide, cindex_wide_file)
cat("  Saved:", cindex_wide_file, "\n")

# Create final model comparison table
#
# Reshapes the final-model C-index columns of `summary_stats` into long format
# (one row per period x metric column) and derives three labels from each
# metric column name: `method`, `model_type` and `cindex_type`. The result is
# written to "final_model_comparison.csv" in `output_dir`.
cat("\n=== Creating Final Model Comparison ===\n")
final_model_comparison <- summary_stats %>%
  select(period,
         final_rsf_selected_cindex_td, final_rsf_selected_cindex_ti,
         final_catboost_selected_cindex_td, final_catboost_selected_cindex_ti,
         final_aorsf_selected_cindex_td, final_aorsf_selected_cindex_ti,
         final_rsf_all_cindex_td, final_rsf_all_cindex_ti,
         final_catboost_all_cindex_td, final_catboost_all_cindex_ti,
         final_aorsf_all_cindex_td, final_aorsf_all_cindex_ti) %>%
  # After the select() above, every column except `period` is a metric column,
  # so the column list does not need to be repeated here.
  pivot_longer(cols = -period,
               names_to = "metric",
               values_to = "cindex") %>%
  mutate(
    # Patterns are anchored on the "final_<method>_" prefix. An unanchored
    # "rsf" also matches "aorsf", and case_when() uses the first condition
    # that is TRUE, so AORSF rows would otherwise be labelled "RSF".
    method = case_when(
      grepl("^final_aorsf_", metric) ~ "AORSF",
      grepl("^final_rsf_", metric) ~ "RSF",
      grepl("^final_catboost_", metric) ~ "CatBoost",
      TRUE ~ "Unknown"
    ),
    model_type = case_when(
      grepl("_selected_", metric) ~ "Selected_Wisotzkey",
      grepl("_all_", metric) ~ "All_Wisotzkey",
      TRUE ~ "Unknown"
    ),
    # The C-index variant is encoded as the column-name suffix.
    cindex_type = case_when(
      grepl("_td$", metric) ~ "time_dependent",
      grepl("_ti$", metric) ~ "time_independent",
      TRUE ~ "unknown"
    )
  ) %>%
  select(period, method, model_type, cindex_type, cindex)

final_model_comparison_file <- file.path(output_dir, "final_model_comparison.csv")
write_csv(final_model_comparison, final_model_comparison_file)
cat("  Saved:", final_model_comparison_file, "\n")

# Wide final-model table: for each final model, its feature count and both
# C-index columns, alongside the period. Saved as a CSV in the output directory.
final_model_wide_file <- file.path(output_dir, "final_model_comparison_wide.csv")

final_model_wide <- select(
  summary_stats,
  period,
  # final_<method>_selected_* columns
  final_rsf_selected_n_features,
  final_rsf_selected_cindex_td,
  final_rsf_selected_cindex_ti,
  final_catboost_selected_n_features,
  final_catboost_selected_cindex_td,
  final_catboost_selected_cindex_ti,
  final_aorsf_selected_n_features,
  final_aorsf_selected_cindex_td,
  final_aorsf_selected_cindex_ti,
  # final_<method>_all_* columns
  final_rsf_all_n_features,
  final_rsf_all_cindex_td,
  final_rsf_all_cindex_ti,
  final_catboost_all_n_features,
  final_catboost_all_cindex_td,
  final_catboost_all_cindex_ti,
  final_aorsf_all_n_features,
  final_aorsf_all_cindex_td,
  final_aorsf_all_cindex_ti
)

write_csv(final_model_wide, final_model_wide_file)
cat("  Saved:", final_model_wide_file, "\n")

# Closing banner: report where the results were written, then print a short
# summary of the compared methods and what each one provides.
cat("\n=== Analysis Complete ===\n")
cat("All results saved to:", output_dir, "\n")

# sep = "" so the pieces are emitted back to back, exactly as separate cat()
# calls would print them.
cat(
  "\nMethods compared:\n",
  "  - RSF: Random Survival Forest with permutation importance\n",
  "  - CatBoost: Gradient boosting with feature importance\n",
  "  - AORSF: Accelerated Oblique Random Survival Forest with negate importance\n",
  sep = ""
)

# NOTE(review): "Top 20" is hard-coded text; the configuration section sets
# n_predictors <- 20 -- confirm the two are kept in sync if that changes.
cat(
  "\nAll three methods provide:\n",
  "  - Top 20 feature rankings\n",
  "  - Feature importance scores\n",
  "  - C-index (Concordance Index) performance metrics\n",
  sep = ""
)
