# PRAD model extension: combined list of oncogenes, TGF-beta genes and cell cycle genes

Here we experiment with adding new features based on differentially expressed (DE) genes. The DE genes have been filtered and ranked by feature elimination using random survival forests (RSFs).

In [1]:
library(tidyverse)
library(survival)
library(survminer)
library(glmnet)
library(WriteXLS)
library(ggfortify)
library(circlize)
library(ComplexHeatmap)
library(parallel)
library(broom)
library(survcomp)
library(survivalROC)
library(gtsummary)
source("../getTCGAData.R")
source("../preprocessTCGAData.R")
source("../KM_analysis.R")
source("../Heatmaps.R")
source("../enet.R")
library("readxl")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: ggpubr


Attaching package: ‘survminer’


The following object is masked from ‘package:survival’:

    myeloma


Loading required package: Matr

## Functions 

In [2]:
#
#
#
# Helper function for fixing variable names
#
# Maps a covariate name taken from the fitted model back to the name of the
# column it refers to.
#
# Args:
#   x: a single covariate name (character scalar).
#
# Returns:
#   "Gender" or "Tumor.stage" for names containing "Gender.clin" /
#   "Tumor.stage.clin"; the "<name>.cn" prefix for names containing a literal
#   ".cn" (e.g. "TP53.cn-1" -> "TP53.cn"); "Gleason.group.clin" for names
#   containing that string; otherwise x unchanged.
fixVarNames <- function(x){
    # Echo the name being processed (kept so the notebook trace stays the same)
    print(x[1])
    # All patterns are matched literally. An unescaped "." is a regex wildcard,
    # so ".cn" would also hit names that merely contain "<any char>cn" and the
    # extraction below would then truncate them.
    if (grepl("Gender.clin", x, fixed = TRUE)) {
        return("Gender")
    } else if (grepl("Tumor.stage.clin", x, fixed = TRUE)) {
        return("Tumor.stage")
    } else if (grepl("\\w\\.cn", x)) {
        # Keep everything up to and including ".cn", dropping the level suffix
        return(regmatches(x, regexpr("\\w+\\.cn", x)))
    } else if (grepl("Gleason.group.clin", x, fixed = TRUE)) {
        return("Gleason.group.clin")
    } else {
        return(x)
    }
}


#
# Fit a Cox regression model (coxph) for one clinical end point
#
# Args:
#   data:      data frame containing the "<end.point>.time.clin" and
#              "<end.point>.clin" columns and every column named in features.
#   end.point: prefix of the end point columns, e.g. "PFI".
#   features:  character vector of covariate names; joined with " + " to form
#              the right-hand side of the model formula.
#
# Returns:
#   The model object returned by coxph().
#
fitCoxModel <- function(data, end.point, features){

    # Left-hand side: Surv(<time column>, <event column>)
    lhs <- sprintf("Surv(%s.time.clin, %s.clin)", end.point, end.point)

    # Right-hand side: additive combination of all covariates
    rhs <- paste(features, collapse = " + ")

    f <- as.formula(paste(lhs, "~", rhs))

    coxph(f, data = data)
}

## Load processed data

In [3]:
# Load the merged TCGA PRAD dataset from an .rds file. Later cells read its
# PFI$train and PFI$validation elements.
tcga.dataset.merged = readRDS("/lustre/projects/landstrom_core/data/rdata_revised/manuscript_work//PRAD//tcga.dataset_merged.rds")

Convert `-` to `_` in the column names of the training and validation data.

In [4]:
# Replace every "-" in the column names with "_", since a "-" inside a name
# would be read as an operator when the name is pasted into a model formula.
# str_replace_all() is used because str_replace() only changes the first
# occurrence in each name.
colnames(tcga.dataset.merged$PFI$train) = str_replace_all(colnames(tcga.dataset.merged$PFI$train), "-", "_")
colnames(tcga.dataset.merged$PFI$validation) = str_replace_all(colnames(tcga.dataset.merged$PFI$validation), "-", "_")

## Load the original model and extract its features

In [5]:
# Load the active covariates of the original model (lambda.min) from Excel.
# The first column has no header, so readxl names it `...1`; it holds the
# covariate names. The other columns are coef and HR.
original.model.features = read_excel("/lustre/projects/landstrom_core/results/prognostic_model_development_revised/models_by_cancer_type/PRAD/Penalized_Cox_risk_prediction/customer_features/With_clinical_features/PFI/Active_covariates_in_lambda.min_model.xlsx")

[1m[22mNew names:
[36m•[39m `` -> `...1`


In [6]:
# Show the covariates of the original model with their coefficients and hazard ratios
original.model.features

...1,coef,HR
<chr>,<dbl>,<dbl>
Age.clin,2.777648e-06,1.0000028
Gleason.group.clinGleason_group_2,0.9138624,2.4939365
KIF23.exp,0.384691,1.4691603
PTEN.exp,-0.4741249,0.6224295
VPS4B.exp,-0.2241524,0.7991933
AURKA.exp,-0.4645377,0.6284256
AURKB.exp,0.439466,1.5518783
TP53.cn-1,0.07668698,1.0797041


In [7]:
# Translate each covariate name of the original model into the name of the
# corresponding data column; the result is a plain (unnamed) character vector
selected.features <- original.model.features[, 1] %>%
    unlist() %>%
    map_chr(fixVarNames) %>%
    as.vector()

[1] "Age.clin"
[1] "Gleason.group.clinGleason_group_2"
[1] "KIF23.exp"
[1] "PTEN.exp"
[1] "VPS4B.exp"
[1] "AURKA.exp"
[1] "AURKB.exp"
[1] "TP53.cn-1"


## Refitting the original model using standard Cox regression

In [8]:
# Names of the PFI event and time columns
end_point_event <- paste("PFI", "clin", sep = ".")
end_point_time <- paste("PFI", "time.clin", sep = ".")

# Columns needed for the model: the end point columns followed by the features
selected.columns <- c(end_point_event, end_point_time, selected.features)

In [9]:
# Training data restricted to the end point columns and the original features
input.training <- tcga.dataset.merged$PFI$train %>%
    dplyr::select(all_of(selected.columns))

# Refit the original feature set with a standard Cox regression
pcox.ref.fit.pfi.original <- fitCoxModel(data = input.training,
                                         end.point = "PFI",
                                         features = selected.features)

# Relative risk of each validation sample under the refitted model
rel.risk <- predict(pcox.ref.fit.pfi.original,
                    newdata = tcga.dataset.merged[["PFI"]]$validation[, selected.features],
                    type = "risk")

#
# Evaluation on the validation data
#

# End point columns of the validation data, renamed to status / time
y.data <- setNames(
    tcga.dataset.merged[["PFI"]]$validation[paste0("PFI", c(".clin",".time.clin"))],
    c("status","time")
)

# Concordance index of the predicted risk (survcomp)
cindex.ref.valid <- concordance.index(x = rel.risk,
                                      surv.time = y.data$time,
                                      surv.event = y.data$status,
                                      na.rm = TRUE)

# Kaplan-Meier analysis by relative risk; the returned object is stored for later use
KM.valid.ref.by.risk <- plotKMbyRelativeRisk(data = y.data,
                                             rel.risk = rel.risk)

In [10]:
# Modify the labels: replace every "-" in the column names with "_".
# str_replace_all() handles names with more than one "-" (str_replace() only
# changes the first occurrence) and makes this cell safe to re-run.

colnames(tcga.dataset.merged[["PFI"]]$train) = str_replace_all(colnames(tcga.dataset.merged[["PFI"]]$train), "-", "_")
colnames(tcga.dataset.merged[["PFI"]]$validation) = str_replace_all(colnames(tcga.dataset.merged[["PFI"]]$validation), "-", "_")

## Load the feature elimination results

In [11]:
# Load the stored feature elimination result for the PFI end point. Later
# cells read its Errors, Features and Optimal.var elements.
feature.elim.res = readRDS("/lustre/projects/landstrom_core/data/rdata/manuscript_work/revisions/combined_list/PRAD/Feature_elimination_res_PFI_oncogenes.rds")

In [22]:
# Show the error of each elimination step together with the ntree / mtry /
# nodesize settings and the number of variables used in that step
feature.elim.res$Errors

ntree,mtry,nodesize,num_variable,error
<dbl>,<dbl>,<dbl>,<int>,<dbl>
500,2,9,538,0.2746049
50,2,8,430,0.2726204
100,1,3,344,0.2932745
200,1,9,275,0.2712238
200,1,3,220,0.2543183
1000,1,5,176,0.2588019
100,2,2,140,0.2345829
1000,1,6,112,0.2338846
500,1,6,89,0.232194
200,4,3,71,0.2518927


In [12]:
# Number of entries in the Features element of the elimination result
length(feature.elim.res$Features)

In [14]:
# Candidate genes, taken from the Variable column of the Optimal.var table
genes.to.add = feature.elim.res$Optimal.var$Variable
# Replace every "-" with "_" so the names agree with the renamed data columns.
# str_replace() would change only the first "-" in each name.
genes.to.add = str_replace_all(genes.to.add, "-", "_")

In [15]:
# Empty container for the stepwise results: zero rows, two named columns
step.wise.results <- matrix(nrow = 0, ncol = 2) %>%
    as.data.frame() %>%
    setNames(c("KM.pvalue","C.index"))

In [16]:
# Coerce every column of the result container to numeric
step.wise.results <- step.wise.results %>%
    lapply(as.numeric) %>%
    as.data.frame()

In [17]:
# Start the stepwise extension from the columns of the original model
# (event column, time column, then the original features)
selected.columns.stepwise = selected.columns

In [19]:
# Stepwise extension of the original model: the candidate genes are appended
# one at a time (cumulatively, in the order of genes.to.add). After each
# addition the Cox model is refitted on the training data and evaluated on the
# validation data (KM p-value and C-index).

# The end point columns of the validation data are the same in every
# iteration, so they are prepared once before the loop
y.data <- tcga.dataset.merged$PFI$validation[paste0("PFI", c(".clin",".time.clin"))]
colnames(y.data) = c("status","time")

# One result row per step, collected in a preallocated list and bound to
# step.wise.results once after the loop (avoids growing a data frame with rbind)
step.rows = vector("list", length(genes.to.add))

# seq_along() instead of 1:length(): an empty gene list gives zero iterations
for (i in seq_along(genes.to.add)){
    new.feature = genes.to.add[i]

    # Fail early with a clear message; otherwise the column would be silently
    # left out below and the model fit would fail on an unknown variable
    if (!(new.feature %in% colnames(tcga.dataset.merged[["PFI"]]$train))) {
        stop("Feature not found in the training data: ", new.feature, call. = FALSE)
    }

    # Test adding one gene
    selected.columns.stepwise = c(selected.columns.stepwise, new.feature)

    # Input data for cox-model
    input.training = tcga.dataset.merged[["PFI"]]$train[,colnames(tcga.dataset.merged[["PFI"]]$train) %in% selected.columns.stepwise]

    # The features: everything except the two leading end point columns
    selected.features.stepwise = selected.columns.stepwise[-c(1,2)]

    # Fit the cox-model
    pcox.ref.fit.pfi = fitCoxModel(input.training, "PFI", selected.features.stepwise)

    # Predict the risk scores for validation data
    rel.risk = predict(object = pcox.ref.fit.pfi ,
                   newdata = tcga.dataset.merged$PFI$validation[,selected.features.stepwise],
                   type = "risk")

    #
    # Evaluate
    #

    # C-index of the predicted risk on the validation data
    cindex.ref.valid = concordance.index(rel.risk,
                                        y.data$time,
                                        y.data$status,
                                        na.rm = TRUE)

    # Plot KM and extract the p-value
    KM.valid.ref.by.risk = plotKMbyRelativeRisk(data = y.data,
                                                     rel.risk = rel.risk)

    # Result row for this step. The interval column is named C.index.CI, which
    # is the name data.frame() derives from "C-index CI".
    res = data.frame(KM.pvalue = KM.valid.ref.by.risk$table$Pvalue.pval,
                     C.index = cindex.ref.valid$c.index,
                     C.index.CI = paste0("(", round(cindex.ref.valid$lower, 4), " - ",
                                                round(cindex.ref.valid$upper, 4), ")")
                     )

    step.rows[[i]] = res

    #print(cindex.ref.valid$c.index)
    #print(KM.valid.ref.by.risk$table)

}

# Append all collected rows to the result table in a single rbind
step.wise.results = do.call(rbind, c(list(step.wise.results), step.rows))

Find the minimal p-value and the corresponding C-index

In [21]:
# Show the KM p-value, C-index and C-index interval of every step
step.wise.results

KM.pvalue,C.index,C.index.CI
<dbl>,<dbl>,<chr>
0.03276572,0.608627,(0.3708 - 0.8041)
0.042654,0.6170437,(0.3782 - 0.8102)
0.29508967,0.5833772,(0.3487 - 0.7855)
0.15443495,0.5854813,(0.3505 - 0.7871)
0.29732351,0.5854813,(0.3505 - 0.7871)
0.27267591,0.5833772,(0.3487 - 0.7855)
0.16334542,0.5839032,(0.3492 - 0.7859)
0.25271704,0.5917938,(0.356 - 0.7917)
0.63538282,0.5796949,(0.3455 - 0.7828)
0.51216877,0.5986323,(0.362 - 0.7968)


In [20]:
# Row index (all indices, if tied) of the step with the smallest KM p-value
which(step.wise.results$KM.pvalue == min(step.wise.results$KM.pvalue))

In [25]:
# Show the results of step 1 (hard-coded index; presumably the row reported
# by the minimum p-value search - verify after re-running)
step.wise.results[1,]

Unnamed: 0_level_0,KM.pvalue,C.index,C.index.CI
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>
1,0.03276572,0.608627,(0.3708 - 0.8041)


In [26]:
# Gene that was added in step 1
genes.to.add[1]

In [35]:
# Export the Optimal.var table of the feature elimination result to a CSV
# file in the current working directory
write.csv(feature.elim.res$Optimal.var, "Feature_elimination_combined_temp.csv")