### R Setup

In [None]:
R.version.string
getRversion()

# Setup
library(here)
library(dplyr)
library(readr)
library(survival)
library(ranger)
library(catboost)
library(tidyr)
library(purrr)
library(recipes)
library(tibble)
library(haven)
library(riskRegression)  # For Score() function used in original study
library(prodlim) 

# Source required functions
source(here("scripts", "R", "clean_phts.R"))
source(here("scripts", "R", "make_final_features.R"))
source(here("scripts", "R", "select_rsf.R"))
source(here("scripts", "R", "make_recipe.R"))
source(here("scripts", "R", "make_labels.R"))

[1] ‘4.4.3’

here() starts at /home/pgx3874/graft-loss


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘recipes’


The following object is masked from ‘package:stats’:

    step


riskRegression version 2025.09.17



### Load Data

In [2]:
# Configuration
n_predictors <- 20  # Target: 20 features as in original study
n_trees_rsf <- 500  # Number of trees for RSF (matching original study)
horizon <- 1  # 1-year prediction horizon

# Create output directory
output_dir <- here("feature_importance", "replicate_20_features_output")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

cat("=== Replicating 20-Feature Selection ===\n")
cat("Output directory:", output_dir, "\n\n")

# Load and prepare base data
cat("Loading base data...\n")

# Primary path: build the analysis table with clean_phts().
# Fallback path: read a previously saved RDS if clean_phts() raises an error.
# NOTE(review): the saved RDS may have been built with different filters than
# the clean_phts() arguments below - confirm it matches before relying on it.
phts_base <- tryCatch({
  clean_phts(
    min_txpl_year = 2010,
    predict_horizon = horizon,
    time = outcome_int_graft_loss,
    status = outcome_graft_loss,
    case = 'snake',
    set_to_na = c("", "unknown", "missing")
  )
}, error = function(e) {
  # Always report why the primary path failed; otherwise the switch to the
  # saved file is silent and the root cause is lost.
  cat("clean_phts() failed:", conditionMessage(e), "\n")

  rds_path <- here("graft-loss-parallel-processing", "model_data", "phts_simple.rds")
  if (!file.exists(rds_path)) {
    stop("Cannot load data: ", conditionMessage(e))
  }
  cat("Loading from RDS:", rds_path, "\n")
  readRDS(rds_path)
})

cat("Base data loaded:", nrow(phts_base), "rows,", ncol(phts_base), "columns\n")
cat("Column names:", paste(head(names(phts_base), 20), collapse = ", "), "...\n")


=== Replicating 20-Feature Selection ===
Output directory: /home/pgx3874/graft-loss/feature_importance/replicate_20_features_output 

Loading base data...
[clean_phts] raw 'time' arg (deparse): outcome_int_graft_loss
[clean_phts] raw 'status' arg (deparse): outcome_graft_loss
[clean_phts] SAS file path: /home/pgx3874/graft-loss/data/phts_txpl_ml.sas7bdat
[clean_phts] File exists: TRUE
[clean_phts] File size: 26738688 bytes
[clean_phts] File permissions: 0
[clean_phts] Successfully read SAS file: 5835 rows, 476 columns
[clean_phts] Starting data processing pipeline...
[clean_phts] Completed mutate() transformations
[clean_phts] Filtering PRA columns...
[clean_phts] Running clean_names()...
Loading from RDS: /home/pgx3874/graft-loss/graft-loss-parallel-processing/model_data/phts_simple.rds 
Base data loaded: 5835 rows, 22 columns
Column names: prim_dx, tx_mcsd, chd_sv, hxsurg, txsa_r, txbun_r, txecmo, txpl_year, weight_txpl, txalt, hxmed, age_txpl, height_txpl, txcreat_r, age_listing, bm

### Prepare Data

In [3]:
# Define time period filters
# Split a cohort into the transplant-year windows compared in this notebook.
#
# Returns a named list with elements `original`, `full` and `full_no_covid`
# (in that order), each a subset of `data` selected on `txpl_year`.
define_time_periods <- function(data) {
  # No transplant year available: no window can be applied, so every period
  # is the complete data set (and the caller is warned).
  has_year <- "txpl_year" %in% names(data)
  if (!has_year) {
    warning("txpl_year not found - using all data for all periods")
    return(list(original = data, full = data, full_no_covid = data))
  }

  list(
    # Original study period: 2010-2019
    original = dplyr::filter(data, txpl_year >= 2010, txpl_year <= 2019),
    # Full study: 2010 onwards
    full = dplyr::filter(data, txpl_year >= 2010),
    # Full study without COVID: 2010 onwards, excluding 2020-2023
    full_no_covid = dplyr::filter(
      data,
      txpl_year >= 2010,
      txpl_year < 2020 | txpl_year > 2023
    )
  )
}

# Prepare data for modeling (remove leakage variables, prepare features)
# Standardise the outcome columns and strip outcome-derived variables.
#
# Locates the follow-up time and event indicator under any of several known
# column names, renames them to `time` and `status`, drops other
# outcome/leakage columns, and keeps only rows with usable survival data
# (non-missing, time > 0, status of 0 or 1).
# Stops with an error listing the available columns when either outcome
# column cannot be found.
prepare_modeling_data <- function(data) {
  available <- names(data)

  # First candidate (in priority order) that is an actual column, else NULL
  first_present <- function(candidates) {
    hits <- candidates[candidates %in% available]
    if (length(hits) == 0) NULL else hits[[1]]
  }

  time_col <- first_present(
    c("time", "outcome_int_graft_loss", "int_graft_loss", "ev_time")
  )
  status_col <- first_present(
    c("status", "outcome_graft_loss", "graft_loss", "ev_type", "outcome")
  )

  if (is.null(time_col) || is.null(status_col)) {
    stop("Cannot find time/status columns. Available columns: ", 
         paste(available, collapse = ", "))
  }

  # Bring both outcome columns to their standard names (no-op if already so)
  if (time_col != "time") {
    data <- dplyr::rename(data, time = dplyr::all_of(time_col))
  }
  if (status_col != "status") {
    data <- dplyr::rename(data, status = dplyr::all_of(status_col))
  }

  # Outcome-derived columns that must not be used as predictors; the columns
  # chosen as time/status are protected from removal.
  protected <- c(time_col, status_col, "time", "status")
  drop_cols <- setdiff(
    c(
      'int_dead', 'int_death', 'txgloss', 
      'death', 'event', 'outcome_int_graft_loss', 'outcome_graft_loss',
      'int_graft_loss', 'graft_loss', 'ev_type', 'ev_time'
    ),
    protected
  )
  data <- dplyr::select(data, -dplyr::any_of(drop_cols))

  # Keep only rows with valid survival information
  dplyr::filter(
    data,
    !is.na(time),
    !is.na(status),
    time > 0,
    status %in% c(0, 1)
  )
}


### C Index Calculations - Helper Functions

In [None]:
# Helper function to calculate C-index (robust with multiple fallbacks)
# Estimate a concordance statistic for a vector of scores, trying several
# implementations in turn and returning the first one that yields a value.
#
# Arguments:
#   time         numeric follow-up times (only finite values > 0 are used)
#   status       event indicator; it is summed to count events, so 1 = event
#                and 0 = censored is expected
#   risk_scores  numeric score per subject
#
# Returns a single numeric, or NA_real_ when fewer than 10 usable rows
# remain, when there are no events, or when every method fails. Returns 0.5
# when all usable scores are identical.
#
# Orientation: steps 1-2 report the larger of the concordance for `score` and
# for `-score`, so those steps never report below 0.5 even for a predictor
# that ranks subjects the wrong way round. Step 4 counts a pair as concordant
# only when the subject with the earlier event has the higher score.
# NOTE(review): confirm that mixing these two conventions is intended.
#
# Side effects: prints a one-line diagnostic via cat(); step 4 calls
# set.seed(42) when it has to subsample, which resets the global RNG stream.
calculate_cindex <- function(time, status, risk_scores) {
  # Remove missing / invalid
  valid_idx <- !is.na(time) & !is.na(status) & !is.na(risk_scores) &
               is.finite(time) & is.finite(risk_scores) & time > 0
  
  cat("  [cindex] n =", length(time),
      " valid =", sum(valid_idx),
      " events =", sum(status[valid_idx]), "\n")
  
  if (sum(valid_idx) < 10) {
    return(NA_real_)
  }
  
  time_clean   <- time[valid_idx]
  status_clean <- status[valid_idx]
  score_clean  <- risk_scores[valid_idx]
  
  if (sum(status_clean) == 0) {
    return(NA_real_)
  }
  
  if (length(unique(score_clean)) == 1) {
    return(0.5)  # random discrimination
  }
  
  # 1) Try survival::concordance (Harrell's C) for score and -score
  c_try <- tryCatch({
    c1 <- survival::concordance(survival::Surv(time_clean, status_clean) ~ score_clean)$concordance
    c2 <- survival::concordance(survival::Surv(time_clean, status_clean) ~ -score_clean)$concordance
    as.numeric(max(c1, c2))
  }, error = function(e) {
    NA_real_
  })
  if (!is.na(c_try)) return(c_try)
  
  # 2) Try survConcordance (different internals / older interface)
  c_try2 <- tryCatch({
    sc1 <- survival::survConcordance(survival::Surv(time_clean, status_clean) ~ score_clean)
    sc2 <- survival::survConcordance(survival::Surv(time_clean, status_clean) ~ -score_clean)
    # Read the element with [[ ]] (exact name matching). `$conc` would
    # partial-match `concordance`, and dividing that by `n` would return
    # concordance / n instead of the concordance itself.
    c1 <- sc1[["concordance"]]
    c2 <- sc2[["concordance"]]
    if (is.null(c1) || is.null(c2)) {
      NA_real_
    } else {
      as.numeric(max(c1, c2))
    }
  }, error = function(e) NA_real_)
  if (!is.na(c_try2)) return(c_try2)
  
  # 3) Try riskRegression::Score if available and applicable (optional).
  #    This yields an AUC evaluated at the median follow-up time, not
  #    Harrell's C.
  c_try3 <- tryCatch({
    if (requireNamespace("riskRegression", quietly = TRUE)) {
      score_data <- data.frame(time = as.numeric(time_clean), status = as.integer(status_clean))
      eval_obj <- tryCatch({
        riskRegression::Score(object = list(PRED = as.matrix(score_clean)),
                              formula = survival::Surv(time, status) ~ 1,
                              data = score_data,
                              times = median(time_clean, na.rm = TRUE),
                              summary = "risks",
                              metrics = "auc",
                              se.fit = FALSE)
      }, error = function(e) NULL)
      if (!is.null(eval_obj) && !is.null(eval_obj$AUC$score)) {
        auc_tab <- eval_obj$AUC$score
        if ("times" %in% names(auc_tab)) {
          this_row <- which.min(abs(auc_tab$times - median(time_clean, na.rm = TRUE)))
        } else {
          this_row <- 1L
        }
        return(as.numeric(auc_tab$AUC[this_row]))
      }
    }
    NA_real_
  }, error = function(e) NA_real_)
  if (!is.na(c_try3)) return(c_try3)
  
  # 4) Final fallback: sampled pairwise Harrell's C (exact O(n^2) is expensive),
  #    so sample up to 2000 observations to estimate C-index if dataset is large.
  harrell_sample_c <- function(timev, statusv, scorev, max_sample = 2000) {
    n <- length(timev)
    if (n > max_sample) {
      set.seed(42)
      idx <- sample(seq_len(n), max_sample)
      timev <- timev[idx]; statusv <- statusv[idx]; scorev <- scorev[idx]
      n <- max_sample
    }
    usable <- 0L; concordant <- 0L; ties <- 0L
    for (i in seq_len(n - 1)) {
      for (j in seq.int(i + 1, n)) {
        # A pair is usable when the subject with the shorter time had an event
        if (statusv[i] == 1 && timev[i] < timev[j]) {
          usable <- usable + 1L
          if (scorev[i] > scorev[j]) concordant <- concordant + 1L
          else if (scorev[i] == scorev[j]) ties <- ties + 1L
        } else if (statusv[j] == 1 && timev[j] < timev[i]) {
          usable <- usable + 1L
          if (scorev[j] > scorev[i]) concordant <- concordant + 1L
          else if (scorev[i] == scorev[j]) ties <- ties + 1L
        }
      }
    }
    if (usable == 0L) return(NA_real_)
    (concordant + 0.5 * ties) / usable
  }
  
  c_final <- tryCatch({
    harrell_sample_c(time_clean, status_clean, score_clean, max_sample = 2000)
  }, error = function(e) NA_real_)
  
  if (is.na(c_final)) return(NA_real_)
  return(as.numeric(c_final))
}

# Predict risk at given times from a ranger survival model
# Predicted risk (1 - survival probability) at the requested time(s) from a
# fitted ranger survival model.
#
# Arguments:
#   object   fitted model; its `unique.death.times` element supplies the time
#            grid of the predicted survival curves
#   newdata  data to predict on
#   times    evaluation time(s)
#
# Returns a matrix with one row per observation and one column per entry of
# `times`. Stops if no predict() call form produces a `$survival` element.
ranger_predictrisk <- function(object, newdata, times) {
  # predict() is tried with three different names for the data argument,
  # in this order: new_data, data, newdata.
  predict_calls <- list(
    function() predict(object, new_data = newdata, type = "response")$survival,
    function() predict(object, data = newdata, type = "response")$survival,
    function() predict(object, newdata = newdata, type = "response")$survival
  )

  surv_prob <- NULL
  for (attempt in predict_calls) {
    surv_prob <- tryCatch(attempt(), error = function(e) NULL)
    if (!is.null(surv_prob)) {
      break
    }
  }

  if (is.null(surv_prob)) {
    stop("Could not call predict() on ranger object with any known interface")
  }

  # Position of each evaluation time within the model's event-time grid
  time_index <- prodlim::sindex(
    jump.times = object$unique.death.times,
    eval.times = times
  )

  # Prepend a column of 1s so that index 0 (evaluation time before the first
  # event time) maps to a survival probability of 1.
  surv_at_times <- cbind(1, surv_prob)[, time_index + 1, drop = FALSE]

  1 - surv_at_times
}

# Compute CatBoost C-index using riskRegression::Score
# Discrimination of CatBoost predictions at a fixed horizon.
#
# Arguments:
#   predictions  numeric model output, one value per subject
#   time         follow-up times
#   status       event indicator
#   horizon      evaluation time passed to riskRegression::Score()
#
# Returns a single numeric: the AUC reported by riskRegression::Score() at
# `horizon` (despite the function name this is an AUC, not Harrell's C).
# If Score() fails, falls back to survival::concordance() and emits a
# warning; returns NA_real_ if that fails too.
# Stops if riskRegression or survival is not installed.
catboost_cindex_score <- function(predictions, time, status, horizon) {
  
  # Dependencies
  if (!requireNamespace("riskRegression", quietly = TRUE))
    stop("riskRegression package not installed.")
  if (!requireNamespace("survival", quietly = TRUE))
    stop("survival package not installed.")
  
  # Predictions are negated to obtain the score handed to Score().
  # NOTE(review): this assumes a higher prediction means lower risk. The
  # training label used elsewhere in this file is +time for events and -time
  # for censored rows, so confirm that this orientation is correct.
  risk_scores <- -as.numeric(predictions)
  
  # Construct data frame for Score()
  score_data <- data.frame(
    time   = as.numeric(time),
    status = as.numeric(status)
  )
  
  # Run Score()
  cindex <- tryCatch({
    evaluation <- riskRegression::Score(
      object  = list(CatBoost = as.matrix(risk_scores)),  # must be n × 1 matrix
      formula = survival::Surv(time, status) ~ 1,
      data    = score_data,
      times   = horizon,
      summary = "risks",
      metrics = "auc",
      se.fit  = FALSE
    )
    
    auc_tab <- evaluation$AUC$score
    
    # If multiple rows (multiple times), pick the closest to horizon
    if ("times" %in% names(auc_tab)) {
      this_row <- which.min(abs(auc_tab$times - horizon))
    } else {
      this_row <- 1L
    }
    
    as.numeric(auc_tab$AUC[this_row])
    
  }, error = function(e) {
    # Fallback to concordance if Score() fails
    warning("Score() failed, using survival::concordance(): ", conditionMessage(e))
    tryCatch({
      # reverse = TRUE: larger risk scores predict shorter survival. Without
      # it concordance() treats larger values as predicting longer survival
      # and would report 1 - C for a risk score.
      concord <- survival::concordance(
        survival::Surv(time, status) ~ risk_scores,
        reverse = TRUE
      )
      as.numeric(concord$concordance)
    }, error = function(e2) NA_real_)
  })
  
  return(cindex)
}


### Random Survival Forest (RSF) Feature Importance - Helper Function

In [5]:
# RSF Feature Selection with Permutation Importance and C-index
# RSF feature selection: fit a ranger survival forest, rank predictors by
# permutation importance and report the model's discrimination.
#
# Arguments:
#   data          data frame with `time` and `status` columns plus predictors
#   n_predictors  number of top-ranked features to return
#   n_trees       number of trees in the forest
#   horizon       evaluation time for the risk predictions and for Score();
#                 it must be on the same scale as `data$time`
#
# Returns a tibble with columns `feature`, `importance` and `cindex` (the
# same scalar repeated on every row). The `cindex` value is the AUC reported
# by riskRegression::Score() at `horizon`, or the result of
# calculate_cindex() if Score() fails, or NA if prediction fails.
#
# NOTE(review): discrimination is computed on the same rows the forest was
# fitted on, so it is an in-sample figure.
select_features_rsf <- function(data,
                                n_predictors = 20,
                                n_trees      = 500,
                                horizon      = 365) {
  cat("  Running RSF feature selection (permutation importance)...\n")
  
  # Dependencies
  if (!requireNamespace("survival", quietly = TRUE))
    stop("The 'survival' package is required for RSF.")
  if (!requireNamespace("riskRegression", quietly = TRUE))
    stop("The 'riskRegression' package is required for RSF C-index.")
  if (!requireNamespace("prodlim", quietly = TRUE))
    stop("The 'prodlim' package is required for RSF risk predictions.")
  
  if (!exists("ranger_predictrisk", mode = "function")) {
    stop("ranger_predictrisk() must be defined before calling select_features_rsf().")
  }
  
  # Create recipe and prepare predictors (no time/status/IDs as features)
  recipe_prep <- make_recipe(data, dummy_code = FALSE) %>% recipes::prep()
  
  prepared_data <- recipes::juice(recipe_prep) %>%
    dplyr::select(-dplyr::any_of(c("ID", "ptid_e", "time", "status")))
  
  # Predictors and outcomes are re-joined by row position. If the recipe
  # changed the number of rows, that pairing cannot be trusted, so say so
  # instead of truncating silently.
  if (nrow(prepared_data) != nrow(data)) {
    warning(
      "Recipe output has ", nrow(prepared_data), " rows but input has ",
      nrow(data), "; predictors and outcomes are aligned by position ",
      "and may be mismatched.",
      call. = FALSE
    )
  }
  
  # Align rows (seq_len() is safe when n_rows is 0, unlike 1:n_rows)
  n_rows  <- min(nrow(prepared_data), nrow(data))
  row_idx <- seq_len(n_rows)
  
  rsf_data <- prepared_data[row_idx, , drop = FALSE] %>%
    dplyr::bind_cols(
      tibble::tibble(
        time   = data$time[row_idx],
        status = data$status[row_idx]
      )
    )
  
  # Fit RSF
  rsf_model <- ranger::ranger(
    survival::Surv(time, status) ~ .,
    data              = rsf_data,
    num.trees         = n_trees,
    importance        = "permutation",
    min.node.size     = 20,
    splitrule         = "extratrees",
    num.random.splits = 10
  )
  
  # 1) Risk predictions at horizon using original-study ranger_predictrisk
  rsf_predictions <- tryCatch({
    risk_pred <- ranger_predictrisk(
      object  = rsf_model,
      newdata = rsf_data,
      times   = horizon
    )
    if (is.matrix(risk_pred)) as.numeric(risk_pred[, 1]) else as.numeric(risk_pred)
  }, error = function(e) {
    cat("  Warning: RSF risk prediction failed:", conditionMessage(e), "\n")
    NULL
  })
  
  # 2) Discrimination via riskRegression::Score (original study style)
  cindex <- NA_real_
  if (!is.null(rsf_predictions)) {
    # Build a clean scoring dataset with proper types
    score_data <- data.frame(
      time   = as.numeric(rsf_data$time),
      status = as.integer(rsf_data$status)
    )
    
    cindex <- tryCatch({
      evaluation <- riskRegression::Score(
        object  = list(RSF = as.matrix(rsf_predictions)),
        formula = survival::Surv(time, status) ~ 1,
        data    = score_data,
        times   = horizon,
        summary = "risks",
        metrics = "auc",
        se.fit  = FALSE
      )
      
      auc_tab <- evaluation$AUC$score
      if ("times" %in% names(auc_tab)) {
        this_row <- which.min(abs(auc_tab$times - horizon))
      } else {
        this_row <- 1L
      }
      as.numeric(auc_tab$AUC[this_row])
    }, error = function(e) {
      cat("  Warning: Score() failed, using concordance():", conditionMessage(e), "\n")
      calculate_cindex(score_data$time, score_data$status, rsf_predictions)
    })
    
    if (is.na(cindex)) {
      cat("  Warning: RSF C-index is NA\n")
      cat("    Time range:", range(score_data$time, na.rm = TRUE), "\n")
      cat("    Status sum:", sum(score_data$status, na.rm = TRUE), "\n")
      cat("    Prediction range:", range(rsf_predictions, na.rm = TRUE), "\n")
    } else {
      cat("  RSF C-index:", round(cindex, 4), "\n")
    }
  } else {
    cat("  Warning: RSF predictions are NULL or empty\n")
  }
  
  # 3) Feature importance table: the n_predictors highest-ranked features.
  #    slice_head() copes with n_predictors = 0 and with fewer features than
  #    requested.
  importance_df <- tibble::enframe(rsf_model$variable.importance) %>%
    dplyr::arrange(dplyr::desc(value)) %>%
    dplyr::slice_head(n = n_predictors) %>%
    dplyr::rename(feature = name, importance = value)
  
  # Add scalar C-index column (avoid mutate(cindex = cindex))
  importance_df$cindex <- cindex
  
  cat("  RSF selected", nrow(importance_df), "features\n")
  
  return(importance_df)
}


### CatBoost Feature Importance - Helper Function

In [6]:
# CatBoost Feature Importance
# CatBoost feature importance: fit a CatBoost regressor on a signed-time
# label, rank predictors by CatBoost's "FeatureImportance" and report the
# model's discrimination.
#
# Arguments:
#   data          data frame with `time` and `status` columns plus predictors
#   n_predictors  number of top-ranked features to return
#   iterations    number of boosting iterations
#   horizon       evaluation time passed to catboost_cindex_score(); it must
#                 be on the same scale as `data$time`
#
# Returns a data frame with columns `feature`, `importance` and `cindex`
# (the same scalar on every row), or NULL when the catboost package is
# missing or when pool creation, training or importance extraction fails.
#
# NOTE(review): discrimination is computed on the training rows, so it is an
# in-sample figure.
select_features_catboost <- function(data,
                                     n_predictors = 20,
                                     iterations   = 2000,
                                     horizon      = 365) {
  cat("  Running CatBoost feature importance...\n")
  
  # Check dependencies
  if (!requireNamespace("catboost", quietly = TRUE)) {
    warning("CatBoost package not available - skipping CatBoost feature importance")
    return(NULL)
  }
  if (!requireNamespace("riskRegression", quietly = TRUE)) {
    stop("The 'riskRegression' package is required for CatBoost C-index but is not installed.")
  }
  if (!requireNamespace("survival", quietly = TRUE)) {
    stop("The 'survival' package is required for CatBoost C-index but is not installed.")
  }
  
  # Columns that are identifiers or outcomes, never predictors
  non_feature_cols <- c("ID", "ptid_e", "time", "status")
  
  # Create recipe (no dummy coding for CatBoost)
  recipe_prep <- tryCatch({
    make_recipe(data, dummy_code = FALSE) %>% recipes::prep()
  }, error = function(e) {
    cat("  Warning: Recipe preparation failed, using raw data:", conditionMessage(e), "\n")
    NULL
  })
  
  if (is.null(recipe_prep)) {
    # Fallback: use the raw columns minus IDs and outcomes
    prepared_data <- data %>%
      dplyr::select(-dplyr::any_of(non_feature_cols))
  } else {
    prepared_data <- recipes::juice(recipe_prep) %>%
      # drop IDs and outcome columns
      dplyr::select(-dplyr::any_of(non_feature_cols))
  }
  
  # Predictors and outcomes are paired by row position. If the recipe changed
  # the number of rows, that pairing cannot be trusted, so say so instead of
  # truncating silently.
  if (nrow(prepared_data) != nrow(data)) {
    warning(
      "Recipe output has ", nrow(prepared_data), " rows but input has ",
      nrow(data), "; predictors and outcomes are aligned by position ",
      "and may be mismatched.",
      call. = FALSE
    )
  }
  
  # Ensure we have matching rows (seq_len() is safe when n_rows is 0)
  n_rows  <- min(nrow(prepared_data), nrow(data))
  row_idx <- seq_len(n_rows)
  prepared_data <- prepared_data[row_idx, , drop = FALSE]
  
  # Prepare time and status for CatBoost (signed-time label)
  # +time for events, -time for censored
  time_vec   <- data$time[row_idx]
  status_vec <- data$status[row_idx]
  signed_time <- ifelse(status_vec == 1, time_vec, -time_vec)
  
  # Convert character columns to factors for CatBoost
  prepared_data <- prepared_data %>%
    dplyr::mutate(dplyr::across(where(is.character), as.factor))
  
  # Create CatBoost pool (factors → categorical automatically in R)
  train_pool <- tryCatch({
    catboost::catboost.load_pool(
      data  = prepared_data,
      label = signed_time
    )
  }, error = function(e) {
    cat("  Error creating CatBoost pool:", conditionMessage(e), "\n")
    NULL
  })
  
  if (is.null(train_pool)) {
    return(NULL)
  }
  
  # Train CatBoost model
  catboost_params <- list(
    loss_function  = "RMSE",  # Using signed-time as regression proxy
    depth          = 6,
    learning_rate  = 0.05,
    iterations     = iterations,
    l2_leaf_reg    = 3.0,
    random_seed    = 42,
    verbose        = 0
  )
  
  catboost_model <- tryCatch({
    catboost::catboost.train(train_pool, params = catboost_params)
  }, error = function(e) {
    cat("  Error training CatBoost model:", conditionMessage(e), "\n")
    NULL
  })
  
  if (is.null(catboost_model)) {
    return(NULL)
  }
  
  # Get predictions for C-index calculation (signed-time)
  catboost_predictions <- tryCatch({
    catboost::catboost.predict(catboost_model, train_pool)
  }, error = function(e) {
    cat("  Warning: Could not get CatBoost predictions:", conditionMessage(e), "\n")
    NULL
  })
  
  # Calculate discrimination using original study method (riskRegression::Score)
  cindex <- NA_real_
  if (!is.null(catboost_predictions)) {
    cindex <- catboost_cindex_score(
      predictions = catboost_predictions,  # signed-time
      time        = time_vec,
      status      = status_vec,
      horizon     = horizon
    )
    
    if (is.na(cindex)) {
      cat("  Warning: CatBoost C-index calculation returned NA\n")
      cat("    Time range:", range(time_vec, na.rm = TRUE), "\n")
      cat("    Status sum:", sum(status_vec, na.rm = TRUE), "\n")
    } else {
      cat("  CatBoost C-index:", round(cindex, 4), "\n")
    }
  }
  
  # Extract feature importance
  importance_raw <- tryCatch({
    catboost::catboost.get_feature_importance(
      catboost_model,
      pool = train_pool,
      type = "FeatureImportance"
    )
  }, error = function(e) {
    cat("  Error extracting CatBoost importance:", conditionMessage(e), "\n")
    NULL
  })
  
  if (is.null(importance_raw)) {
    return(NULL)
  }
  
  # Create importance data frame: the n_predictors highest-ranked features.
  # slice_head() copes with n_predictors = 0 and with fewer features than
  # requested.
  importance_df <- data.frame(
    feature    = names(prepared_data),
    importance = as.numeric(importance_raw),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::arrange(dplyr::desc(importance)) %>%
    dplyr::slice_head(n = n_predictors)
  
  # Add C-index as a scalar column
  importance_df$cindex <- cindex
  
  cat("  CatBoost selected", nrow(importance_df), "features\n")
  
  return(importance_df)
}


### Combined Analysis - Helper Function

In [7]:
# Main analysis function
# Run both feature-selection methods on one time period.
#
# Arguments:
#   period_name  label used in log output and stored in the result
#   period_data  cohort subset for the period
#
# Reads the notebook-level settings `n_predictors`, `n_trees_rsf` and
# `horizon` from the calling environment.
#
# Returns NULL (with a warning) when the period has fewer than 100 rows or
# fewer than 50 rows after preparation; otherwise a list with elements
# period, n_patients, event_rate, rsf_features, catboost_features,
# rsf_cindex and catboost_cindex. A failing method is logged and yields NULL
# features and an NA C-index rather than aborting the whole period.
analyze_time_period <- function(period_name, period_data) {
  cat("\n=== Analyzing:", period_name, "===\n")
  cat("  Sample size:", nrow(period_data), "patients\n")
  
  if (nrow(period_data) < 100) {
    warning(paste("Sample size too small for", period_name, "- skipping"))
    return(NULL)
  }
  
  # Prepare data
  prepared_data <- prepare_modeling_data(period_data)
  
  # Report the event rate only after preparation: the outcome column is not
  # guaranteed to be called `status` before that, and this is the same figure
  # that is stored in the result below.
  cat("  Event rate:", round(mean(prepared_data$status, na.rm = TRUE) * 100, 2), "%\n")
  
  if (nrow(prepared_data) < 50) {
    warning(paste("Too few valid rows after preparation for", period_name, "- skipping"))
    return(NULL)
  }
  
  # RSF feature selection.
  # The configured horizon is passed explicitly; otherwise the selector's own
  # default (365) is used, which does not match the notebook's setting.
  rsf_features <- tryCatch({
    select_features_rsf(
      prepared_data,
      n_predictors = n_predictors,
      n_trees      = n_trees_rsf,
      horizon      = horizon
    )
  }, error = function(e) {
    cat("  ERROR in RSF feature selection:", conditionMessage(e), "\n")
    NULL
  })
  
  # CatBoost feature importance (same horizon as RSF)
  catboost_features <- tryCatch({
    select_features_catboost(
      prepared_data,
      n_predictors = n_predictors,
      horizon      = horizon
    )
  }, error = function(e) {
    cat("  ERROR in CatBoost feature importance:", conditionMessage(e), "\n")
    NULL
  })
  
  # Extract C-index values (stored as a constant column by both selectors)
  rsf_cindex <- if (!is.null(rsf_features) && "cindex" %in% names(rsf_features)) {
    rsf_features$cindex[1]
  } else {
    NA_real_
  }
  
  catboost_cindex <- if (!is.null(catboost_features) && "cindex" %in% names(catboost_features)) {
    catboost_features$cindex[1]
  } else {
    NA_real_
  }
  
  # Combine results
  results <- list(
    period = period_name,
    n_patients = nrow(prepared_data),
    event_rate = mean(prepared_data$status, na.rm = TRUE),
    rsf_features = rsf_features,
    catboost_features = catboost_features,
    rsf_cindex = rsf_cindex,
    catboost_cindex = catboost_cindex
  )
  
  return(results)
}


### Run Analysis

In [None]:
# Run analysis for all time periods
cat("\n=== Defining Time Periods ===\n")
time_periods <- define_time_periods(phts_base)

cat("Original study (2010-2019):", nrow(time_periods$original), "patients\n")
cat("Full study (2010-2024):", nrow(time_periods$full), "patients\n")
cat("Full study without COVID (exclude 2020-2023):", nrow(time_periods$full_no_covid), "patients\n")

# Analyze each period.
# The list is built in a single list(...) call so that a skipped period
# (analyze_time_period() returned NULL) keeps its named slot. Assigning NULL
# with `all_results$x <- NULL` would not create the element, and the period
# would then be missing from the summary table instead of showing as NA.
all_results <- list(
  original = analyze_time_period("original_study_2010_2019", time_periods$original),
  full = analyze_time_period("full_study_2010_2024", time_periods$full),
  full_no_covid = analyze_time_period("full_study_no_covid_2010_2024_excl_2020_2023", time_periods$full_no_covid)
)

# Save individual results
cat("\n=== Saving Results ===\n")
for (period_name in names(all_results)) {
  if (is.null(all_results[[period_name]])) next
  
  results <- all_results[[period_name]]
  
  # Save RSF features
  if (!is.null(results$rsf_features)) {
    rsf_file <- file.path(output_dir, paste0(period_name, "_rsf_top20.csv"))
    write_csv(results$rsf_features, rsf_file)
    cat("  Saved:", rsf_file, "\n")
  }
  
  # Save CatBoost features
  if (!is.null(results$catboost_features)) {
    catboost_file <- file.path(output_dir, paste0(period_name, "_catboost_top20.csv"))
    write_csv(results$catboost_features, catboost_file)
    cat("  Saved:", catboost_file, "\n")
  }
}

# Create comparison tables
cat("\n=== Creating Comparison Tables ===\n")

# RSF comparison across periods (periods without RSF results contribute no rows)
rsf_comparison <- map_dfr(names(all_results), function(period_name) {
  if (is.null(all_results[[period_name]]) || is.null(all_results[[period_name]]$rsf_features)) {
    return(NULL)
  }
  all_results[[period_name]]$rsf_features %>%
    mutate(period = period_name, rank = row_number()) %>%
    select(period, rank, feature, importance, cindex)
})

if (nrow(rsf_comparison) > 0) {
  rsf_comparison_file <- file.path(output_dir, "rsf_comparison_all_periods.csv")
  write_csv(rsf_comparison, rsf_comparison_file)
  cat("  Saved:", rsf_comparison_file, "\n")
  
  # Create wide format comparison: one row per rank, one column per period
  rsf_wide <- rsf_comparison %>%
    select(period, rank, feature) %>%
    pivot_wider(names_from = period, values_from = feature, values_fill = NA)
  
  rsf_wide_file <- file.path(output_dir, "rsf_comparison_wide.csv")
  write_csv(rsf_wide, rsf_wide_file)
  cat("  Saved:", rsf_wide_file, "\n")
}

# CatBoost comparison across periods
catboost_comparison <- map_dfr(names(all_results), function(period_name) {
  if (is.null(all_results[[period_name]]) || is.null(all_results[[period_name]]$catboost_features)) {
    return(NULL)
  }
  all_results[[period_name]]$catboost_features %>%
    mutate(period = period_name, rank = row_number()) %>%
    select(period, rank, feature, importance, cindex)
})

if (nrow(catboost_comparison) > 0) {
  catboost_comparison_file <- file.path(output_dir, "catboost_comparison_all_periods.csv")
  write_csv(catboost_comparison, catboost_comparison_file)
  cat("  Saved:", catboost_comparison_file, "\n")
  
  # Create wide format comparison: one row per rank, one column per period
  catboost_wide <- catboost_comparison %>%
    select(period, rank, feature) %>%
    pivot_wider(names_from = period, values_from = feature, values_fill = NA)
  
  catboost_wide_file <- file.path(output_dir, "catboost_comparison_wide.csv")
  write_csv(catboost_wide, catboost_wide_file)
  cat("  Saved:", catboost_wide_file, "\n")
}

# Feature overlap analysis
cat("\n=== Feature Overlap Analysis ===\n")

# RSF overlap
if (nrow(rsf_comparison) > 0) {
  rsf_features_by_period <- rsf_comparison %>%
    group_by(period) %>%
    summarise(features = list(feature), .groups = 'drop')
  
  if (nrow(rsf_features_by_period) > 1) {
    # Find common features across all periods
    all_rsf_features <- Reduce(intersect, rsf_features_by_period$features)
    cat("RSF features common to all periods:", length(all_rsf_features), "\n")
    if (length(all_rsf_features) > 0) {
      cat("  ", paste(head(all_rsf_features, 10), collapse = ", "), "\n")
    }
    
    # Save overlap analysis
    overlap_file <- file.path(output_dir, "rsf_feature_overlap.csv")
    write_csv(data.frame(feature = all_rsf_features), overlap_file)
    cat("  Saved:", overlap_file, "\n")
  }
}

# CatBoost overlap
if (nrow(catboost_comparison) > 0) {
  catboost_features_by_period <- catboost_comparison %>%
    group_by(period) %>%
    summarise(features = list(feature), .groups = 'drop')
  
  if (nrow(catboost_features_by_period) > 1) {
    # Find common features across all periods
    all_catboost_features <- Reduce(intersect, catboost_features_by_period$features)
    cat("CatBoost features common to all periods:", length(all_catboost_features), "\n")
    if (length(all_catboost_features) > 0) {
      cat("  ", paste(head(all_catboost_features, 10), collapse = ", "), "\n")
    }
    
    # Save overlap analysis
    overlap_file <- file.path(output_dir, "catboost_feature_overlap.csv")
    write_csv(data.frame(feature = all_catboost_features), overlap_file)
    cat("  Saved:", overlap_file, "\n")
  }
}

# Summary statistics: one row per period, with NA values for skipped periods
cat("\n=== Summary Statistics ===\n")
summary_stats <- map_dfr(names(all_results), function(period_name) {
  results <- all_results[[period_name]]
  if (is.null(results)) {
    return(data.frame(
      period = period_name,
      n_patients = NA_integer_,
      event_rate = NA_real_,
      n_rsf_features = NA_integer_,
      n_catboost_features = NA_integer_,
      rsf_cindex = NA_real_,
      catboost_cindex = NA_real_
    ))
  }
  
  # Plain if/else for these scalar choices (ifelse() is meant for vectors)
  n_rsf <- if (is.null(results$rsf_features)) 0L else nrow(results$rsf_features)
  n_cat <- if (is.null(results$catboost_features)) 0L else nrow(results$catboost_features)
  rsf_c <- if (is.null(results$rsf_cindex)) NA_real_ else results$rsf_cindex
  cat_c <- if (is.null(results$catboost_cindex)) NA_real_ else results$catboost_cindex
  
  data.frame(
    period = period_name,
    n_patients = results$n_patients,
    event_rate = round(results$event_rate * 100, 2),
    n_rsf_features = n_rsf,
    n_catboost_features = n_cat,
    rsf_cindex = round(rsf_c, 4),
    catboost_cindex = round(cat_c, 4)
  )
})

summary_file <- file.path(output_dir, "summary_statistics.csv")
write_csv(summary_stats, summary_file)
cat("  Saved:", summary_file, "\n")
print(summary_stats)

cat("\n=== Analysis Complete ===\n")
cat("All results saved to:", output_dir, "\n")




=== Defining Time Periods ===
Original study (2010-2019): 4036 patients
Full study (2010-2024): 5835 patients
Full study without COVID (exclude 2020-2023): 4196 patients
