# PRAD model extension testing

Here we experiment with extending the model with new features based on differentially expressed (DE) genes. The DE genes have been filtered and ranked by feature elimination using random survival forests (RSFs).

In [22]:
library(tidyverse)
library(survival)
library(survminer)
library(glmnet)
library(WriteXLS)
library(ggfortify)
library(circlize)
library(ComplexHeatmap)
library(parallel)
library(broom)
library(survcomp)
library(survivalROC)
library(gtsummary)
source("../getTCGAData.R")
source("../preprocessTCGAData.R")
source("../KM_analysis.R")
source("../Heatmaps.R")
source("../enet.R")
library("readxl")

## Functions 

In [23]:
#
#
#
# Helper function for fixing variable names
#
# Maps the name of a model coefficient (for example "TP53.cn-1" or
# "Gleason.group.clinGleason_group_2", where extra text follows the variable
# name) to the label used for that variable.
#
# Arguments:
#   x : a single character string, the coefficient name
#
# Returns a single character string:
#   "Gender"              if x contains "Gender.clin"
#   "Tumor.stage"         if x contains "Tumor.stage.clin"
#   "<name>.cn"           if x contains a "<name>.cn" part
#   "Gleason.group.clin"  if x contains "Gleason.group.clin"
#   x unchanged           otherwise
fixVarNames = function(x){
    # The dots belong to the names, so every pattern is matched literally;
    # an unescaped "." would act as a wildcard and also accept e.g. "xcn".
    if (grepl("Gender.clin", x, fixed = TRUE)) {
        return("Gender")
    } else if (grepl("Tumor.stage.clin", x, fixed = TRUE)){
        return("Tumor.stage")
    } else if (grepl("\\w+\\.cn", x)){
        # Keep the "<name>.cn" part and drop whatever follows it
        return(regmatches(x, regexpr("\\w+\\.cn", x)))
    } else if (grepl("Gleason.group.clin", x, fixed = TRUE)){
        return("Gleason.group.clin")
    } else {
        return(x)
    }
}


# 
# Function fits a cox regression model
#
# Arguments:
#   data      : data frame holding the end point columns and the features
#   end.point : end point prefix, e.g. "PFI"; the columns
#               "<end.point>.time.clin" and "<end.point>.clin" are passed to
#               Surv() as time and event
#   features  : character vector of terms for the right-hand side of the
#               model formula; when empty, an intercept-only model is fitted
#
# Returns the object returned by coxph().
fitCoxModel = function(data, end.point, features){

    # Expand to variable name
    end_point_time = paste0(end.point, ".time.clin")
    end_point_event = paste0(end.point, ".clin")

    # Columns whose names are not syntactically valid R names (e.g. gene
    # symbols containing "-") have to be back-quoted, otherwise the formula
    # text cannot be parsed. Everything else is passed through untouched, so
    # terms that are expressions rather than column names keep working.
    needs.quoting = features %in% colnames(data) & features != make.names(features)
    rhs.terms = ifelse(needs.quoting, paste0("`", features, "`"), features)
    if (length(rhs.terms) == 0) {
        rhs.terms = "1"
    }

    # Generate a survival formula object
    survExpression = paste0("Surv(", end_point_time, ", " , end_point_event, ")")
    f <- as.formula(paste(survExpression, paste(rhs.terms, collapse = " + "), sep = " ~ "))

    model.fit = coxph(f, data = data)
    return(model.fit)
}

## Load processed data

In [24]:
# Read the merged TCGA data set for PRAD from its RDS file
tcga.dataset.merged <- readRDS(
    file = "/lustre/projects/landstrom_core/data/rdata_revised/manuscript_work//PRAD//tcga.dataset_merged.rds"
)

Replace hyphens (`-`) in the column names with underscores (`_`)

In [25]:
# Replace every hyphen in the column names of the PFI training and validation
# sets with an underscore. str_replace_all() is used because str_replace()
# only changes the first hyphen of a name; fixed("-") matches the character
# literally instead of as a regular expression.
colnames(tcga.dataset.merged$PFI$train) = str_replace_all(colnames(tcga.dataset.merged$PFI$train), fixed("-"), "_")
colnames(tcga.dataset.merged$PFI$validation) = str_replace_all(colnames(tcga.dataset.merged$PFI$validation), fixed("-"), "_")

## Load the original model and extract its features

In [26]:
# Read the active covariates of the original PFI model (lambda.min) from the
# Excel sheet
original.model.features <- read_excel(
    path = "/lustre/projects/landstrom_core/results/prognostic_model_development_revised/models_by_cancer_type/PRAD/Penalized_Cox_risk_prediction/customer_features/With_clinical_features/PFI/Active_covariates_in_lambda.min_model.xlsx"
)

[1m[22mNew names:
[36m•[39m `` -> `...1`


In [27]:
# Show the loaded table of covariates of the original model
original.model.features

...1,coef,HR
<chr>,<dbl>,<dbl>
Age.clin,2.777648e-06,1.0000028
Gleason.group.clinGleason_group_2,0.9138624,2.4939365
KIF23.exp,0.384691,1.4691603
PTEN.exp,-0.4741249,0.6224295
VPS4B.exp,-0.2241524,0.7991933
AURKA.exp,-0.4645377,0.6284256
AURKB.exp,0.439466,1.5518783
TP53.cn-1,0.07668698,1.0797041


In [28]:
# Feature names to be selected
# The first column holds the coefficient names; fixVarNames() maps each of
# them to a variable name. Several coefficients can map to the same variable
# (one coefficient per level of a factor), so duplicates are dropped while the
# original order is kept.
selected.features = map_chr(unlist(original.model.features[,1]), fixVarNames) %>% as.vector() %>% unique()

[1] "Age.clin"
[1] "Gleason.group.clinGleason_group_2"
[1] "KIF23.exp"
[1] "PTEN.exp"
[1] "VPS4B.exp"
[1] "AURKA.exp"
[1] "AURKB.exp"
[1] "TP53.cn-1"


## Refitting the original model using standard cox-regression

In [29]:
# Names of the clinical end point columns for PFI: follow-up time and event
end_point_time <- paste0("PFI", ".time.clin")
end_point_event <- paste0("PFI", ".clin")

# Columns used for modelling: event, time, then the model features
selected.columns <- c(end_point_event, end_point_time, selected.features)

In [30]:
# Training data restricted to the end point columns and the model features
input.training <- dplyr::select(tcga.dataset.merged[["PFI"]]$train,
                                all_of(selected.columns))

# Refit the original feature set with a standard Cox regression
pcox.ref.fit.pfi.original <- fitCoxModel(data = input.training,
                                         end.point = "PFI",
                                         features = selected.features)

# Relative risk of every validation sample under the refitted model
rel.risk <- predict(pcox.ref.fit.pfi.original,
                    newdata = tcga.dataset.merged$PFI$validation[,selected.features],
                    type = "risk")

#
# Evaluation on the validation data
#

# Observed outcome of the validation samples, renamed to status / time
y.data <- tcga.dataset.merged$PFI$validation[paste0("PFI", c(".clin",".time.clin"))]
names(y.data) <- c("status","time")

# C-index of the predicted risk against the observed outcome
cindex.ref.valid <- concordance.index(x = rel.risk,
                                      surv.time = y.data$time,
                                      surv.event = y.data$status,
                                      na.rm = TRUE)

# KM plot of the validation samples by relative risk; the returned object
# also carries the p-value
KM.valid.ref.by.risk <- plotKMbyRelativeRisk(data = y.data, rel.risk = rel.risk)

In [31]:
# Modify the labels
# Replace every hyphen in the column names with an underscore. Replacing all
# occurrences (rather than only the first one per name) makes this step
# idempotent: running it again leaves the names unchanged.
colnames(tcga.dataset.merged[["PFI"]]$train) = str_replace_all(colnames(tcga.dataset.merged[["PFI"]]$train), fixed("-"), "_")
colnames(tcga.dataset.merged[["PFI"]]$validation) = str_replace_all(colnames(tcga.dataset.merged[["PFI"]]$validation), fixed("-"), "_")

## Load the feature elimination results

In [32]:
# Read the feature elimination results (PFI, DE genes) from the RDS file
de.feat.elim.res <- readRDS(
    file = "/lustre/projects/landstrom_core/data/rdata/manuscript_work/revisions/PRAD/Feature_elimination_res_PFI_DE.rds"
)

In [33]:
# Show the Features element of the feature elimination results
# (printed below as a series of Variable / Importance tables)
de.feat.elim.res$Features

Variable,Importance
<chr>,<dbl>
CDC25C.exp,0.012259246
AMH.exp,0.011619466
RRM2.exp,0.008777480
KIF20A.exp,0.008335818
ESPL1.exp,0.007593893
DLGAP5.exp,0.007328550
AURKA.exp,0.007244444
FMOD.exp,0.007011241
CDC20.exp,0.006980378
NEIL3.exp,0.006834242

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.010582984
AMH.exp,0.009568775
CDC45.exp,0.009476841
DMBX1.exp,0.009372653
GTSE1.exp,0.009042136
ARHGEF39.exp,0.008592004
KIF2C.exp,0.008197516
OIP5.exp,0.008156268
SPAG5.exp,0.007456163
IGSF1.exp,0.007373233

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.015779729
AMH.exp,0.011886180
OIP5.exp,0.011454177
KIF4A.exp,0.009028762
DMBX1.exp,0.008962359
KRTAP5-1.exp,0.008778209
ESPL1.exp,0.007753775
RDM1.exp,0.007348833
SPAG5.exp,0.007035747
ENSG00000235978.exp,0.006944910

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.020305862
DMBX1.exp,0.019196767
CDC45.exp,0.018131051
OIP5.exp,0.016329484
INSYN2B.exp,0.014505867
AMH.exp,0.013812103
KRTAP5-1.exp,0.013120511
GTSE1.exp,0.013079122
ARHGEF39.exp,0.012404113
ENSG00000254548.exp,0.012142333

Variable,Importance
<chr>,<dbl>
OIP5.exp,0.030640058
IQGAP3.exp,0.026860618
ENSG00000254548.exp,0.018411023
IGSF1.exp,0.017787029
AMH.exp,0.016904427
CDC45.exp,0.015643886
TICRR.exp,0.014326284
RDM1.exp,0.013380137
TROAP.exp,0.013341694
SPC24.exp,0.013167399

Variable,Importance
<chr>,<dbl>
CDC45.exp,0.0229376304
RRM2.exp,0.019545913
PIMREG.exp,0.0184098225
OIP5.exp,0.0183750384
TROAP.exp,0.0173446784
ARHGEF39.exp,0.0167022457
KRTAP5-1.exp,0.0160302317
FAM72A.exp,0.0157484377
AMH.exp,0.0156863722
TICRR.exp,0.0150169187

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.0276121348
ENSG00000254548.exp,0.0218382297
CDC45.exp,0.0203802784
AMH.exp,0.0198028019
KRTAP5-1.exp,0.0182903692
MAB21L1.exp,0.0162888701
AURKA.exp,0.0154585206
FAM72A.exp,0.0152486889
ENSG00000225986.exp,0.0152470205
CSAG1.exp,0.0140647308

Variable,Importance
<chr>,<dbl>
AMH.exp,0.025562487
DMBX1.exp,0.023276656
FMOD.exp,0.022143845
CDC45.exp,0.021551223
ENSG00000254548.exp,0.020279427
ARHGEF39.exp,0.017563231
INSYN2B.exp,0.017372367
MAB21L1.exp,0.017024905
OIP5.exp,0.016415967
KRTAP5-1.exp,0.016232468

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.036986768
CDC45.exp,0.033469533
AMH.exp,0.029263239
KRTAP5-1.exp,0.028220619
OIP5.exp,0.025791174
DMBX1.exp,0.022986161
MAB21L1.exp,0.021374202
ENSG00000225986.exp,0.019699151
CSAG1.exp,0.018899968
ARHGEF39.exp,0.018231151

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.027590019
DMBX1.exp,0.020766861
ARHGEF39.exp,0.02030745
CSAG1.exp,0.020011323
KRTAP5-1.exp,0.019235663
ENSG00000235978.exp,0.018374238
RRM2.exp,0.018245914
SPC24.exp,0.018175057
TICRR.exp,0.01793734
INSYN2B.exp,0.01780616

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.062256043
INSYN2B.exp,0.059507633
ENSG00000254548.exp,0.050677767
OIP5.exp,0.042909415
KRTAP5-1.exp,0.042463571
RRM2.exp,0.03838878
AMH.exp,0.036991436
CSAG1.exp,0.035861072
CDC45.exp,0.027020639
ENSG00000235978.exp,0.024497832

Variable,Importance
<chr>,<dbl>
FMOD.exp,0.05650005
TICRR.exp,0.05501059
AMH.exp,0.05108191
OIP5.exp,0.04996688
DMBX1.exp,0.04752093
ENSG00000235978.exp,0.04554979
CSAG1.exp,0.04309766
ENSG00000254548.exp,0.04054405
INSYN2B.exp,0.0382536
ESPL1.exp,0.03160156

Variable,Importance
<chr>,<dbl>
ENSG00000254548.exp,0.05410565
FMOD.exp,0.05021335
AMH.exp,0.04961264
TICRR.exp,0.04498654
KRTAP5-1.exp,0.04270727
CSAG1.exp,0.04228056
OIP5.exp,0.03999588
INSYN2B.exp,0.039143
ESPL1.exp,0.0333282
ENSG00000235978.exp,0.0229957


In [34]:
# Candidate genes: the Variable column of the Optimal.var element of the
# feature elimination results
genes.to.add <- de.feat.elim.res$Optimal.var$Variable

Remove features that are already in the model

In [36]:
# Keep only the candidates that have no match among the model features
genes.to.add <- genes.to.add[is.na(match(genes.to.add, selected.features))]

In [37]:
# Replace every hyphen in the candidate names with an underscore so that they
# agree with the converted column names of the data sets. str_replace() would
# only change the first hyphen of each name.
genes.to.add = str_replace_all(genes.to.add, fixed("-"), "_")

In [38]:
# Store results
# Empty two-column table (KM p-value, C-index) to which the step-wise
# evaluation appends its rows
step.wise.results <- setNames(as.data.frame(matrix(nrow = 0, ncol = 2)),
                              c("KM.pvalue","C.index"))

In [39]:
# Convert every column of the result table to numeric
step.wise.results[] <- lapply(step.wise.results, as.numeric)

In [40]:
# Start the step-wise column set from the columns of the original model
selected.columns.stepwise <- selected.columns

In [41]:
# Step-wise model extension: the candidate genes are added one at a time, in
# the order of genes.to.add. After each addition the Cox model is refitted on
# the training data and evaluated on the validation data, giving one result
# row per model.

# Observed outcome of the validation samples. It is the same for every model,
# so it is built once instead of in every iteration.
y.data <- tcga.dataset.merged$PFI$validation[paste0("PFI", c(".clin",".time.clin"))]
colnames(y.data) = c("status","time")

# One slot per candidate gene; the rows are bound together once after the
# loop instead of growing the data frame in every iteration
step.wise.rows = vector("list", length(genes.to.add))

# seq_along() yields an empty sequence when there are no genes to add,
# whereas 1:length(x) would iterate over c(1, 0)
for (i in seq_along(genes.to.add)){
    new.feature = genes.to.add[i]

    # Test adding one gene
    selected.columns.stepwise = c(selected.columns.stepwise, new.feature)

    # Input data for cox-model
    input.training = tcga.dataset.merged[["PFI"]]$train[,colnames(tcga.dataset.merged[["PFI"]]$train) %in% selected.columns.stepwise]

    # The features: everything except the two leading end point columns
    selected.features.stepwise = selected.columns.stepwise[-c(1,2)]

    # Fit the cox-model
    pcox.ref.fit.pfi = fitCoxModel(input.training, "PFI", selected.features.stepwise)

    # Predict the risk scores for validation data
    rel.risk = predict(object = pcox.ref.fit.pfi ,
                   newdata = tcga.dataset.merged$PFI$validation[,selected.features.stepwise],
                   type = "risk")

    #
    # Evaluate
    #

    # C-index of the predicted risk against the observed outcome
    cindex.ref.valid = concordance.index(rel.risk,
                                        y.data$time,
                                        y.data$status,
                                        na.rm = TRUE)

    # Plot KM and extract the p-value
    KM.valid.ref.by.risk = plotKMbyRelativeRisk(data = y.data,
                                                     rel.risk = rel.risk)

    # Result row of this model; data.frame() turns the name "C-index CI"
    # into C.index.CI
    res = data.frame(KM.pvalue = KM.valid.ref.by.risk$table$Pvalue.pval,
                     C.index = cindex.ref.valid$c.index,
                     "C-index CI" = paste0("(", round(cindex.ref.valid$lower, 4), " - ",
                                                round(cindex.ref.valid$upper, 4), ")")
                     )

    step.wise.rows[[i]] = res
}

# Append the rows of all models to the result table
step.wise.results = rbind(step.wise.results, do.call(rbind, step.wise.rows))

Find the minimal p-value and the corresponding C-index

In [42]:
# Row index (or indices, in case of ties) of the smallest KM p-value
with(step.wise.results, which(KM.pvalue == min(KM.pvalue)))

In [45]:
# Show the row of the model with the smallest KM p-value. The index is derived
# from the results rather than hard-coded; which.min() returns the first such
# row if several models share the minimum.
step.wise.results[which.min(step.wise.results$KM.pvalue),]

Unnamed: 0_level_0,KM.pvalue,C.index,C.index.CI
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>
8,0.001966972,0.6249342,(0.3853 - 0.8158)


The feature list:

In [46]:
# Genes added up to and including the model with the smallest KM p-value.
# Row k of step.wise.results belongs to the model extended with the first k
# genes, so the count is derived from the results instead of hard-coded.
genes.to.add[seq_len(which.min(step.wise.results$KM.pvalue))]

In [44]:
# Show the KM p-value, C-index and C-index CI of every step-wise model
step.wise.results

KM.pvalue,C.index,C.index.CI
<dbl>,<dbl>,<chr>
0.022556489,0.6165176,(0.3778 - 0.8098)
0.010569691,0.621778,(0.3825 - 0.8136)
0.007750568,0.6196739,(0.3806 - 0.8121)
0.002008449,0.6170437,(0.3782 - 0.8102)
0.002116817,0.6223041,(0.3829 - 0.8139)
0.002522324,0.6280905,(0.3881 - 0.8181)
0.002522324,0.6338769,(0.3933 - 0.8222)
0.001966972,0.6249342,(0.3853 - 0.8158)
0.003726015,0.6249342,(0.3853 - 0.8158)
0.003726015,0.6428196,(0.4014 - 0.8285)
