# Feature selection RSF

In [2]:
setwd("/home/data/project_code/landstrom_core/prognostic_model_development_revision/r/notebooks")

library(tidyverse)
library(survival)
library(survminer)
library(glmnet)
library(WriteXLS)
library(ggfortify)
library(circlize)
library(ComplexHeatmap)
library(parallel)
library(broom)
library(survcomp)
library(survivalROC)
library(gtsummary)
source("../getTCGAData.R")
source("../preprocessTCGAData.R")
source("../KM_analysis.R")
source("../Heatmaps.R")
source("../enet.R")

Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:lubridate’:

    intersect, setdiff, union


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min



Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:Matrix’:

    expand, unname


The foll

In [3]:
# Define the cancer type 
cancer.type = "PRAD"

# Read in the table including the clinical features for each cancer type
clin.feat.tb = read.table("/lustre/projects/landstrom_core/data/clin_features_final.csv", sep = "\t", header = T)

# Get Clinical variables
clin.var = unlist(strsplit(clin.feat.tb$Features[clin.feat.tb$Ctype == cancer.type], split = ","))

# Ensembl id mapping file 
ens.id.mapping = "/home/organisms/Human/hg38/Homo_sapiens.GRCh38_March2022/ENSEMBLE_to_SYMBOL.csv"

# Output dir 
out.dir.data = file.path("/lustre/projects/landstrom_core/data/rdata/manuscript_work/revisions/tgfbeta", cancer.type)
dir.create(out.dir.data, recursive = T)

tcga.cn = getTCGACopyNumberData(cancer.type)

tcga.expr = getTCGAExpressionData(cancer.type, annotation.file = ens.id.mapping)

# Get cancer specific clinical data 
tcga.clin = getClinData(cancer.type)

# Get the end point related clinical data 
tcga.endpoints = getClinEndpointData(cancer.type) %>% dplyr::select(bcr_patient_barcode, OS, OS.time, DSS, DSS.time, DFI, DFI.time, PFI, PFI.time)

# Merge end point data to clinical data 
tcga.clin = dplyr::left_join(tcga.clin, tcga.endpoints, by = "bcr_patient_barcode")

write.csv(tcga.clin, file.path(out.dir.data, "clinical_data.csv"))

tcga.cn = selectPrimaryT(tcga.cn)

tcga.cn = dropDuplicateSamples(tcga.cn)

tcga.cn.datamat = prepareDataMatrix(tcga.cn)
saveRDS(tcga.cn.datamat, file = file.path(out.dir.data, "copy_number_status.rds"))

tcga.expr = selectPrimaryT(tcga.expr)

tcga.expr = dropDuplicateSamples(tcga.expr)

tcga.expr.datamat = prepareDataMatrix(tcga.expr)

saveRDS(tcga.expr.datamat, 
        file = file.path(out.dir.data, "raw_expressions.rds"))

tcga.dataset = mergeTCGAdata(clin.data = tcga.clin,
                                  data = list("CN" = tcga.cn.datamat, 
                                              "EXP" = tcga.expr.datamat), 
                                  data.suffixes = c("cn","exp"))


“'/lustre/projects/landstrom_core/data/rdata/manuscript_work/revisions/tgfbeta/PRAD' already exists”


In [4]:
# Define function for adding the clinical variables 
addClinVar = function(data, clin.var) {
    if ("Age" %in% clin.var) {
        data$CLIN$Age.clin <- data$CLIN$age_at_diagnosis.clin
    } 
    if ("Tumor.stage" %in% clin.var){
        data$CLIN$Tumor.stage.clin = factor(map_chr(data$CLIN$ajcc_pathologic_stage.clin, reformatTumorStage))
    }
    if ("Gender" %in% clin.var){
        data$CLIN$Gender.clin <- factor(data$CLIN$gender.clin)    
    } 
    if ("Gleason.group" %in% clin.var) {
        
        # Determine the Gleason group 
        data$CLIN$Gleason.group.clin = map2_chr(data$CLIN$primary_gleason_grade.clin, 
                                           data$CLIN$secondary_gleason_grade.clin, 
                                           determineGleasonGroup)

        # Set up the factor levels 
        data$CLIN$Gleason.group.clin = factor(data$CLIN$Gleason.group.clin, 
                                    levels = c("Gleason_group_1", "Gleason_group_2"))
    }
    return(data)
}

# Add clinical variables to dataset
tcga.dataset = addClinVar(tcga.dataset, clin.var)

saveRDS(tcga.dataset, file = file.path(out.dir.data, "tcga.dataset.rds"))

rm("tcga.expr")
rm("tcga.cn")
rm("tcga.clin")

In [5]:
# Try here converting "-" characters to 0 for compatibility 
colnames(tcga.dataset$EXP) = str_replace(colnames(tcga.dataset$EXP), "-", "0")

# Read in the preprocessed dataset if continued 
#tcga.dataset = readRDS(file.path(out.dir.data, "tcga.dataset.rds"))

# Raw expression data 
#tcga.expr.raw.datamat = readRDS(file.path(out.dir.data, "raw_expressions.rds"))

# Define and create the root directory for results 
dir.res.root = file.path("/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta/", cancer.type)
dir.create(dir.res.root, recursive = T)

# Define and create the results for the KM analysis 
dir.res.km = file.path(dir.res.root, "Kaplan_Meier_plots")
dir.create(dir.res.km)

tgfbeta.genes.df = read.csv("/lustre/projects/landstrom_core/data/TGFbeta_list_test.tsv", sep = "\t", header = F)

# Read in the original gene list 
customer.genes.df = read.csv("/lustre/projects/landstrom_core/data/Customer_genes.tsv", sep = "\t", header = F)
customer.genes = customer.genes.df$V1
print(customer.genes)

# Modified gene names "-" to 0 conversion for compatibility
gene.list  = unique(c(tgfbeta.genes.df$V1, customer.genes))

clinical.end.point.stats = tcga.dataset$CLIN %>% 
                                   dplyr::select(c("OS.clin","DSS.clin","DFI.clin","PFI.clin")) %>%
                                   pivot_longer(everything()) %>%
                                   mutate(value = factor(value)) %>%
                                   group_by(name, value) %>%
                                   summarise(N = n()) %>% 
                                   pivot_wider(names_from =  value,
                                               values_from = N)

“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots' already exists”


 [1] "APPL2"  "APPL1"  "AURKB"  "AURKA"  "VPS4A"  "VPS4B"  "KIF23"  "TRAF6" 
 [9] "PTEN"   "AR"     "TP53"   "MYC"    "TGFBR1"


[1m[22m`summarise()` has grouped output by 'name'. You can override using the
`.groups` argument.


In [6]:
# Here we store all the training and validation splits 
train_and_validation_ls = list()

# Variables selected 
variables_selected_ls = list()

# Number of samples in training and validation cohorts 
nsamples_step1_ls = list()

selectVariables = function(clinical.endpoint,
                           clinical.variables,
                           gene.list, 
                           data.suffixes){

    # Construct the clinical features   
    clinical.features =  c(paste0(clinical.endpoint, ".clin"),
                           paste0(clinical.endpoint, ".time.clin"),
                           paste0(clinical.variables, ".clin"))

    # Constructed the sequencing data features 
    seq.features = unlist(map(data.suffixes, 
                           .f = function(x, gene.list){paste(gene.list, x, sep = ".")}, 
                           gene.list = gene.list))

    # Construct the list of variables 
    feature.ls = c(clinical.features, seq.features)

    return(feature.ls)
}



#
# Function for splitting data randomly into 
# training and validation set 
#
splitCases = function(obj, split, variables, seed) {
    
    
  # keep track of statistics 
  ncases.initial = c()
    
  complete.ls = list()
    
  # We will only select the cases with complete data 
  for (i in 1:length(names(obj))){
      
      # Extract the data 
      data.type = names(obj)[i]
      data = obj[[data.type]]
      
      # Find the variables of interest 
      variables.dat = variables[variables %in% colnames(data)]
      
      
      # Check how many individuals there are with non missing data
      # Boolean
      complete = complete.cases(data[,variables.dat])
      
      complete.ls[[i]] = as.data.frame(complete)
      
      # Update the number of complete cases
      ncases.initial = c(ncases.initial, sum(complete))
      
      
  }
    
  # Select complete cases 
  complete.across.all = apply(bind_cols(complete.ls),1, all)
    
  print(paste0("Including ", sum(complete.across.all)  ," cases out of ", max(ncases.initial), " cases"))

  
  # Get the list of complete samples to be included 
  samples.included = obj$CLIN$Participant.ID[complete.across.all]
  
  if (split != 1.0) {
    
    set.seed(seed)
    trainIdx <- sample(length(samples.included), split*length(samples.included), replace = FALSE)
    
    samples.train <- samples.included[trainIdx]
    samples.valid <- samples.included[-trainIdx]
       
  } else {
    
    data.train <- samples.included
    data.valid <- samples.included
    
  } 
  result.split = list("train" = samples.train,
                      "validation" = samples.valid)
  return(result.split)
}



for (end.point in c("OS","DSS","DFI","PFI")){

    
    #
    # No need to change the first part 
    #

    #
    # MODIFICATION. HERE WE SELECT CLIN ENDPOINT SPECIFIC VARIABLES (DE GENES)
    # 
    #gene.list = gene.lists[[end.point]]
    
    # Selected variables 
    variables.selected = selectVariables(clinical.endpoint = end.point,
                                         clinical.variables = clin.var,
                                         gene.list = gene.list, 
                                         data.suffixes = c("cn","exp"))
    
    variables_selected_ls[[end.point]] = variables.selected
    
    
    #
    # Splitting function needs to be change to accommondate the 
    # altered data structure 
    #
    
    # Data set is split randomly into training and validation sets. Only complete cases 
    # are selected.
    train_and_validation = splitCases(obj = tcga.dataset, 
                                  split = 0.75, 
                                  variables = variables.selected, 
                                  seed = 42)
    
    # Update list
    train_and_validation_ls[[end.point]] = train_and_validation 
    
    
    # Store number of  
    nsamples.step1 = c(length(train_and_validation$train), length(train_and_validation$validation))
    names(nsamples.step1) = c("ntrain.step1", "nvalid.step1")
    nsamples_step1_ls[[end.point]] = nsamples.step1
}

splitDataset = function(obj, train_and_validation_ls){

    # 
    split.obs.by.endpoint = list()
    
    for (end.point in names(train_and_validation_ls)){
        
        train.samples = train_and_validation_ls[[end.point]]$train
        validation.samples = train_and_validation_ls[[end.point]]$validation
        
        # New entry 
        split.obs.by.endpoint[[end.point]] = list()
        
        for (data.type in names(obj)){
            
            split.obs.by.endpoint[[end.point]][["CLIN"]] = list()  
            split.obs.by.endpoint[[end.point]][["CLIN"]][["train"]] = obj$CLIN %>% 
                          dplyr::filter(Participant.ID %in% train.samples)
            split.obs.by.endpoint[[end.point]][["CLIN"]][["validation"]] = obj$CLIN %>% 
                          dplyr::filter(Participant.ID %in% validation.samples)
                    
            split.obs.by.endpoint[[end.point]][["EXP"]] = list()    
            split.obs.by.endpoint[[end.point]][["EXP"]][["train"]] = obj$EXP %>% 
                          dplyr::filter(Participant.ID %in% train.samples)
            split.obs.by.endpoint[[end.point]][["EXP"]][["validation"]] = obj$EXP %>% 
                          dplyr::filter(Participant.ID %in% validation.samples)
            
            split.obs.by.endpoint[[end.point]][["CN"]] = list()
            split.obs.by.endpoint[[end.point]][["CN"]][["train"]] = obj$CN %>% 
                          dplyr::filter(Participant.ID %in% train.samples)           
            split.obs.by.endpoint[[end.point]][["CN"]][["validation"]] = obj$CN %>% 
                          dplyr::filter(Participant.ID %in% validation.samples)
        }
    }
    return(split.obs.by.endpoint)
}

tcga.dataset.splitted = splitDataset(tcga.dataset, train_and_validation_ls)

convertAge = function(x){
    return(x/360)
}

[1m[22mNew names:
[36m•[39m `complete` -> `complete...1`
[36m•[39m `complete` -> `complete...2`
[36m•[39m `complete` -> `complete...3`


[1] "Including 481 cases out of 497 cases"


[1m[22mNew names:
[36m•[39m `complete` -> `complete...1`
[36m•[39m `complete` -> `complete...2`
[36m•[39m `complete` -> `complete...3`


[1] "Including 480 cases out of 497 cases"


[1m[22mNew names:
[36m•[39m `complete` -> `complete...1`
[36m•[39m `complete` -> `complete...2`
[36m•[39m `complete` -> `complete...3`


[1] "Including 330 cases out of 497 cases"


[1m[22mNew names:
[36m•[39m `complete` -> `complete...1`
[36m•[39m `complete` -> `complete...2`
[36m•[39m `complete` -> `complete...3`


[1] "Including 481 cases out of 497 cases"


In [7]:
prepareSummary = function(end.point, data){
    # Convert age 
    data[[end.point]]$CLIN$train$Age.clin = convertAge(data[[end.point]]$CLIN$train$Age.clin)
    data[[end.point]]$CLIN$validation$Age.clin = convertAge(data[[end.point]]$CLIN$validation$Age.clin)
    
    a = data[[end.point]]$CLIN$train %>%
          tbl_summary(include = paste0(clin.var, ".clin")) %>% as_tibble()
    
    b = data[[end.point]]$CLIN$validation %>%
          tbl_summary(include = paste0(clin.var, ".clin")) %>% as_tibble()
    
    test = cbind(a,b)
    test = test[,-3]
    test = t(test)
    test = as.data.frame(test)
    colnames(test) = test[1,]
    test = test[-1,]
    test$End.point = end.point
    return(test)
}

clin.summary.table = bind_rows(map(c("OS","DSS","PFI","DFI"), 
                prepareSummary, 
                data = tcga.dataset.splitted))

for (end.point in names(train_and_validation_ls)){
    
    # Counts and VST for training data
    counts.training = expDataToMatrix(tcga.dataset.splitted[[end.point]]$EXP$train)
    
    vst.transf.training.obj = performVSTtraining(counts.training)
    
    # Counts for evaluation 
    counts.validation = expDataToMatrix(tcga.dataset.splitted[[end.point]]$EXP$validation)
    vst.transf.validation.counts = performVSTtest(counts = counts.validation, 
                                                  disp.function.train = vst.transf.training.obj$disp.function)
         
    tcga.dataset.splitted[[end.point]]$EXP$train = MatrixToExpdata(vst.transf.training.obj$vst.counts)
    tcga.dataset.splitted[[end.point]]$EXP$validation = MatrixToExpdata(vst.transf.validation.counts)
    
}

saveRDS(tcga.dataset.splitted, file.path(out.dir.data, "tcga.dataset_splitted.rds"))

rm(tcga.dataset)

tcga.dataset.splitted = readRDS(file.path(out.dir.data, "tcga.dataset_splitted.rds"))

tcga.dataset.merged =  mergeDataTypes(tcga.dataset.splitted )

saveRDS(tcga.dataset.merged, file.path(out.dir.data, "tcga.dataset_merged.rds"))

rm(tcga.dataset.splitted)

tcga.dataset.merged = readRDS(file.path(out.dir.data, "tcga.dataset_merged.rds"))

converting counts to integer mode

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

converting counts to integer mode

variance of dispersion residuals not estimated (necessary only for differential expression calling)

converting counts to integer mode

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

converting counts to integer mode

variance of dispersion residuals not estimated (necessary only for differential expression calling)

converting counts to integer mode

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

converting counts to integer mode

variance of dispersion residuals not estimated (necessary only for differential expression calling)

converting counts to integer mode

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

converting counts to integer mode

variance of dispersion residuals not estimated (necessary

In [8]:
# Store summary statistics 
summary.stats.ls = list()

# Iterate over end points 
for (end.point in c("OS","DSS","DFI","PFI")){
    
    exp.summary.training = prepSummaryExp(x = tcga.dataset.merged[[end.point]]$train, 
                                          raw.data = tcga.expr.datamat,
                                          variables = variables.selected, type = "exp")

    cn.summary.training = prepSummaryCN(tcga.dataset.merged[[end.point]]$train, 
                                        variables = variables.selected, 
                                        type = "cn")
    
    summary.stats.ls[[end.point]] = list("exp.summary.training" = exp.summary.training,
                                         "cn.summary.training" = cn.summary.training)
}


# Store filtered variables 
variables.selected.filtered.ls = list()

# Iterate over end points 
for (end.point in c("OS","DSS","DFI","PFI")){

    exp.features.keep = summary.stats.ls[[end.point]]$exp.summary.training %>% 
                          filter(`Median expression` > 20, 
                                 `Fraction of zero expression` < 0.75)

    cn.features.keep = summary.stats.ls[[end.point]]$cn.summary.training %>% 
                          filter(`Maximum fraction of aberrations` > 0.15) 

    # Update the summary tables 
    summary.stats.ls[[end.point]]$exp.summary.training$Selected = ifelse(summary.stats.ls[[end.point]]$exp.summary.training$name %in% exp.features.keep$name, "Yes", "No")
    summary.stats.ls[[end.point]]$cn.summary.training$Selected = ifelse(summary.stats.ls[[end.point]]$cn.summary.training$name %in% cn.features.keep$name, "Yes", "No")

    # Collect the variables into vector 
    variables.selected.filtered.ls[[end.point]] = filterFeatures(variables_selected_ls[[end.point]], exp.features.keep$name, type = "exp")
    variables.selected.filtered.ls[[end.point]]= filterFeatures(variables_selected_ls[[end.point]], cn.features.keep$name, type = "cn")
    
}

saveRDS(variables.selected.filtered.ls,  
        file.path(out.dir.data,"variables_selected_filtered_ls.rds"))

In [9]:
variables.selected.filtered.ls = readRDS(file.path(out.dir.data,"variables_selected_filtered_ls.rds"))

# Store the KM tables 
km.pvalue.table.ls = list()

# Store the significant features 
significant.features.ls = list()

# Iterate over end points 
for (end.point in c("OS","DSS","DFI","PFI")){

    # Create dir for plots 
    dir.create(file.path(dir.res.km, end.point))
    
    if (nrow(tcga.dataset.merged[[end.point]]$train) > 0){
    
        # Run univariate KM
        km.pvalue.table = runUnivariateKM(input.data = tcga.dataset.merged[[end.point]], 
                                          variables = variables.selected.filtered.ls[[end.point]],
                                          clinical.endpoint = end.point,
                                          out.dir = file.path(dir.res.km, end.point),
                                          plots = T)
    
    
        # Sort the results based on the training p-value and write the results to output
        km.pvalue.table = km.pvalue.table %>% dplyr::arrange(pvalues.training)
        km.pvalue.table$Selected = ifelse(km.pvalue.table$pvalues.training < 0.05, "Yes", "No") 
        write.csv(km.pvalue.table, file.path(dir.res.km, paste0(end.point, "_LogRank_pvalues.csv")))
    
        km.pvalue.table.ls[[end.point]] = km.pvalue.table
    
        # Extract the significant features 
        significant.features = getSignificantFeatures(km.pvalue.table, pvalue.thresh = 0.05)

        # Store 
        significant.features.ls[[end.point]] = significant.features
        
    } else {
        significant.features.ls[[end.point]] = NULL
    }
}

saveRDS(significant.features.ls, file.path(dir.res.root, "significant.features.ls"))

“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/OS' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/OS/Expression/Training_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/OS/Expression/Validation_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/OS/CN/Training_data' already exists”


NULL
NULL


“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/OS/CN/Validation_data' already exists”


NULL
NULL
NULL


“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DSS' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DSS/Expression/Training_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DSS/Expression/Validation_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DSS/CN/Training_data' already exists”


NULL
NULL
NULL


“NaNs produced”
“NaNs produced”
“NaNs produced”
“NaNs produced”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DSS/CN/Validation_data' already exists”


NULL
NULL


“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DFI' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DFI/Expression/Training_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DFI/Expression/Validation_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DFI/CN/Training_data' already exists”


NULL
NULL


“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/DFI/CN/Validation_data' already exists”


NULL
NULL
NULL
NULL


“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/PFI' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/PFI/Expression/Training_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/PFI/Expression/Validation_data' already exists”
“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/PFI/CN/Training_data' already exists”


NULL
NULL


“'/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/tgfbeta//PRAD/Kaplan_Meier_plots/PFI/CN/Validation_data' already exists”


NULL
NULL
NULL


## Feature selection

Here we experiment with random survival forests

In [10]:
library(pec)
library(randomForestSRC)


 randomForestSRC 3.2.2 
 
 Type rfsrc.news() to see new features, changes, and bug fixes. 
 



Attaching package: ‘randomForestSRC’


The following object is masked from ‘package:purrr’:

    partial




In [11]:
#
# Function for tuning model parameters
#
#

tune_model <- function(data, formula, ntree, seed) {    
  
  set.seed(seed)
  tuning <- tune(formula, data, mtryStart = 2, ntreeTry = ntree, doBest = TRUE)
  node <- tuning$optimal[1]
  mtry <- tuning$optimal[2]
  
  error <- tuning$rf$err.rate[500]
  res <- tibble(nodesize = node, mtry = mtry, OOB_error = error)
  
  return(res)
}

In [39]:
#
# Main function for generating the RSF model 
# based on training set
#
generateRSFModel = function(training.data,
                            variables,
                            end.point.event,
                            end.point.time, 
                            model.name,
                            ntrees = c(50,100,200,500,1000),
                            seed) {
  
  
  # Store relevant data into an object 
  rsf.obj = list("model" = model.name)

  # Select only variables of interest 
  training.data = training.data  %>% dplyr::select(all_of(c(variables, end.point.event, end.point.time)))

  # Generate the formula for the model 
  surv.expression = paste0("Surv(", end.point.time, ", " , end.point.event, ")")
  f <- as.formula(paste(surv.expression, ".", sep = " ~ "))

  # Store formula
  rsf.obj$formula = f
 
  # First tune the parameters
  tune.res <- list()
  for(n in ntrees) {
    tune.res[[as.character(n)]] <- tune_model(training.data, f, n, seed) # Store optimal values for each ntree
  }
  
  # Generate a table 
  tune.res.df = dplyr::bind_rows(tune.res, .id = "ntree")
  
  # Store tuning results 
  rsf.obj$tuning = tune.res.df
  
  idx.min <- which.min(sapply(tune.res, function(x) x$OOB_error)) # Get ntree with smallest error
  ntree.optm <- ntrees[idx.min]
  nodesize.optm <- tune.res[[idx.min]]$nodesize # Get the best nodesize for the chosen ntree
  mtry.optm <- tune.res[[idx.min]]$mtry # Get the best mtry for the chosen ntree
  
  # Finally fit the model with optimised parameters
  set.seed(seed)
  rf.fit.train <- list(eval(bquote (rfsrc(.(f), training.data, 
                                          ntree = ntree.optm, 
                                          mtry = mtry.optm, 
                                          nodesize = nodesize.optm, 
                                          importance = TRUE, 
                                          block.size = 1, 
                                          seed = -13))))
  
  # Store the fitted model
  rsf.obj$fit.model = rf.fit.train
  
  return(rsf.obj)
}

PFI 

Iterative feature elimation

In [40]:
iterativeFeatElim = function(data, 
                             variables, 
                             end.point.event,
                             end.point.time,
                             model.name = "model2",
                             drop.out.perc = 0.2,
                             ntrees = c(50,100,200,500,1000),
                             seed){


  # Store results for testing 
  vimp.result.ls = list() 

  # Errors 
  oob.errors = list()
  
  # Num variables
  num.variables = list()
  
  # Store optimal parameters 
  params.ls = list("ntree" = list(),
                   "mtry" = list(),
                   "nodesize" = list())
  
  # Initialise the number of variables
  variables.left = variables[-c(1,2)]
  
  # Iteration
  i = 1
    
  # If the number of variables is less than 0.2 of the original 
  # data stop iteration
  n.var.stop = round(length(variables) * drop.out.perc)
  
  # Iterate 
  while (length(variables.left) > n.var.stop) {
   
    print(paste0("Iteration : ", i))

    # Create model 
    rsf.model = generateRSFModel(data,
                 variables = variables.left,
                 end.point.event = end.point.event,
                 end.point.time = end.point.time,
                 model.name = "model2",
                 ntrees = ntrees,
                 seed = seed)
    
    # Store the params 
    params.ls[["ntree"]][i] = rsf.model$fit.model[[1]]$ntree
    params.ls[["mtry"]][i] = rsf.model$fit.model[[1]]$mtry
    params.ls[["nodesize"]][i] = rsf.model$fit.model[[1]]$nodesize
  
    # Add the oob.error to list   
    oob.errors[[i]] = rsf.model$fit.model[[1]]$err.rate[rsf.model$fit.model[[1]]$ntree]
  
    # Add the number of variables 
    num.variables[[i]] = length(variables.left)
  
    # Calculate VIMP
    vimp.res = vimp(rsf.model$fit.model[[1]])
    
    # Prepare a table
    vimp.table = as.data.frame(vimp.res$importance) %>% 
                      tibble::rownames_to_column("Variable") 
    colnames(vimp.table) = c("Variable", "Importance")
    
    # Order by decreasing Importance such that the worst features
    # go bottom of the table
    vimp.table = dplyr::arrange(vimp.table, desc(Importance))

    # Store results 
    vimp.result.ls[[i]] = vimp.table
    
    # Remove the worst performing features 
    variables.left = vimp.table$Variable[1:((1 - drop.out.perc) * length(variables.left))]
    
    # Increment 
    i = i + 1
  }
  
  # Combine to table
  final.df = data.frame("ntree" = unlist(params.ls$ntree),
             "mtry" = unlist(params.ls$mtry),
             "nodesize" = unlist(params.ls$nodesize),
             "num_variable" = unlist(num.variables),
             "error" = unlist(oob.errors))


  
  # Find row with minumum error 
  min.idx = which(final.df$error == min(final.df$error))
  print(min.idx)
  
  # Select the last row having error equal to minimum. We want to minimize the 
  # number of features
  if (length(min.idx) > 1) {
    min.idx = max(min.idx)
  }
  print(min.idx)
  
  # Optimal parameters 
  optimal.set.param = final.df[min.idx,]
  optimal.set.variables = vimp.result.ls[[min.idx]]
  
  
  # Final results
  final.results = list("Errors" = final.df, 
                       "Features" = vimp.result.ls,
                       "Optimal.param" = optimal.set.param,
                       "Optimal.var" = optimal.set.variables)
  
  return(final.results)

}

In [None]:
end.point.time = "PFI.time.clin"


In [42]:
feature.elim.pfi = iterativeFeatElim(data = tcga.dataset.merged$PFI$train,
                  variables = variables.selected.filtered.ls[["PFI"]],
                  end.point.event = "PFI.clin",
                  end.point.time = "PFI.time.clin",
                  model.name = "model2",
                  drop.out.perc = 0.2,
                  ntrees = c(50,100,200,500,1000),
                  seed = 42)    
                  

In [45]:
feat.elim.res = readRDS(file.path(out.dir.data, "Feature_elimination_res_PFI.rds"))

In [59]:
feat.elim.res$Errors

ntree,mtry,nodesize,num_variable,error
<dbl>,<dbl>,<dbl>,<int>,<dbl>
100,1,8,35,0.3309813
500,1,25,28,0.3030503
50,7,8,22,0.3006983
100,1,8,17,0.2995222
500,4,2,13,0.3409776
500,5,1,10,0.3189269
50,6,2,8,0.3216465


In [49]:
vip.collected = bind_rows(feat.elim.res$Features, .id = "Iteration") %>% pivot_wider(names_from = Iteration, values_from = Importance)

In [57]:
vip.collected %>% arrange(desc(`7`), desc(`6`), desc(`5`), desc(`4`), desc(`3`))

Variable,1,2,3,4,5,6,7
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AURKB.exp,0.0112466516,0.024319146,0.025517688,0.0166860983,0.041220932,0.070169715,0.10987642
PTEN.exp,0.0153373213,0.027285531,0.022245246,0.0315808193,0.035677535,0.061739104,0.0984787
TGFB1.exp,0.0168543497,0.01661849,0.032298196,0.0235684475,0.028683854,0.03684703,0.07719605
KIF2C.exp,0.0078198308,0.025926463,0.047219532,0.0522887796,0.025678605,0.020767503,0.05683456
BIRC5.exp,0.0075572326,0.016653425,0.040349042,0.0116136525,0.016075173,0.018383435,0.0402661
GDF9.exp,0.0057092189,0.010278202,0.015200859,0.0309803598,0.012100446,0.047000375,0.03673613
FLCN.exp,0.0101900738,0.023873504,0.032028799,0.0398617795,0.031163148,0.029970969,0.02960134
ENG.exp,0.0087708146,0.012458479,0.028244611,0.0313942057,0.010719336,0.020268861,0.01514332
CDCA8.exp,0.0084615116,0.012342167,0.020615266,0.0182926493,0.014705465,0.011720581,
SMAD9.exp,0.0058374508,0.009179841,0.019560365,0.0233721015,0.009771761,0.003302699,
