# PRAD model extension testing

Here we test how the performance of the original models changes when additional features are added to them.

In [1]:
library(tidyverse)
library(survival)
library(survminer)
library(glmnet)
library(WriteXLS)
library(ggfortify)
library(circlize)
library(ComplexHeatmap)
library(parallel)
library(broom)
library(survcomp)
library(survivalROC)
library(gtsummary)
source("../getTCGAData.R")
source("../preprocessTCGAData.R")
source("../KM_analysis.R")
source("../Heatmaps.R")
source("../enet.R")
library("readxl")

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.2.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
Loading required package: ggpubr


Attaching package: ‘survminer’


The following object is masked from ‘package:survival’:

    myeloma


Loading required package: Matr

## Functions 

In [2]:
#
#
#
# Helper function for fixing variable names.
#
# Maps one coefficient label to the feature name used for column selection:
#   contains "Gender.clin"        -> "Gender"
#   contains "Tumor.stage.clin"   -> "Tumor.stage"
#   contains ".cn"                -> the "<word>.cn" part, e.g. "TP53.cn-1" -> "TP53.cn"
#   contains "Gleason.group.clin" -> "Gleason.group.clin"
#   anything else                 -> returned unchanged
#
# x: a single character string (the function is applied element-wise).
# Returns a single character string.
fixVarNames = function(x){
    # Echo the label being processed (side effect kept from the original)
    print(x[1])
    # All patterns are matched literally: with a regex, "." matches any
    # character, so e.g. "abcn.exp" would wrongly be treated as a ".cn" feature.
    if (grepl("Gender.clin", x, fixed = TRUE)) {
        return("Gender")
    } else if (grepl("Tumor.stage.clin", x, fixed = TRUE)) {
        return("Tumor.stage")
    } else if (grepl(".cn", x, fixed = TRUE)) {
        # Keep the word characters in front of ".cn" plus ".cn" itself
        cn.name <- regmatches(x, regexpr("\\w+\\.cn", x))
        if (length(cn.name) == 0) {
            # ".cn" without a preceding word: no name to extract
            return(NA_character_)
        }
        return(cn.name)
    } else if (grepl("Gleason.group.clin", x, fixed = TRUE)) {
        return("Gleason.group.clin")
    } else {
        return(x)
    }
}


# 
# Fit a Cox regression model for one clinical end point.
#
# data      : data frame containing "<end.point>.time.clin", "<end.point>.clin"
#             and the columns named in `features`
# end.point : end-point prefix, e.g. "PFI"
# features  : character vector of covariate names (joined with " + ")
#
# Returns the fitted coxph object.
fitCoxModel = function(data, end.point, features){

    # Left-hand side: Surv(<time column>, <event column>)
    response.term <- paste0("Surv(",
                            paste0(end.point, ".time.clin"),
                            ", ",
                            paste0(end.point, ".clin"),
                            ")")

    # Right-hand side: all features as additive terms
    covariate.terms <- paste(features, collapse = " + ")

    # The name `f` is kept on purpose: coxph stores the call it was given
    f <- as.formula(paste(response.term, covariate.terms, sep = " ~ "))

    coxph(f, data = data)
}

## Load processed data

In [3]:
# Load the processed, merged TCGA PRAD dataset from disk.
# Later cells access it as tcga.dataset.merged[["PFI"]]$train and
# tcga.dataset.merged$PFI$validation.
tcga.dataset.merged = readRDS("/lustre/projects/landstrom_core/data/rdata_revised/manuscript_work//PRAD//tcga.dataset_merged.rds")

## Load the original model and extract its features

In [4]:
# Load the covariates of the original PFI model from Excel (per the file name:
# the active covariates of the lambda.min model).
# The first column has no header in the sheet, so read_excel names it `...1`;
# that column is used below as the list of feature labels.
original.model.features = read_excel("/lustre/projects/landstrom_core/results/prognostic_model_development_revised/models_by_cancer_type/PRAD/Penalized_Cox_risk_prediction/customer_features/With_clinical_features/PFI/Active_covariates_in_lambda.min_model.xlsx")

[1m[22mNew names:
[36m•[39m `` -> `...1`


In [5]:
# Feature names to be selected: run every label in the first column through
# fixVarNames and keep the result as a plain, unnamed character vector
selected.features <- unname(map_chr(original.model.features[[1]], fixVarNames))

[1] "Age.clin"
[1] "Gleason.group.clinGleason_group_2"
[1] "KIF23.exp"
[1] "PTEN.exp"
[1] "VPS4B.exp"
[1] "AURKA.exp"
[1] "AURKB.exp"
[1] "TP53.cn-1"


## Refitting the original model using standard Cox regression

In [6]:
# Column names of the PFI end point: follow-up time and event indicator
end_point_time <- paste0("PFI", ".time.clin")
end_point_event <- paste0("PFI", ".clin")

# Columns needed for model fitting: event, time, then the model features
selected.columns <- c(end_point_event, end_point_time, selected.features)

In [7]:
# Training data restricted to the end-point columns and the original features
input.training <- dplyr::select(tcga.dataset.merged[["PFI"]]$train,
                                all_of(selected.columns))

# Refit the original feature set as a standard Cox model
pcox.ref.fit.pfi.original <- fitCoxModel(input.training, "PFI", selected.features)

# Relative risk of every validation sample under the refitted model
rel.risk <- predict(
    object = pcox.ref.fit.pfi.original,
    newdata = tcga.dataset.merged$PFI$validation[,selected.features],
    type = "risk"
)

#
# Evaluation on the validation data
#

# Outcome columns of the validation data, renamed to status / time
y.data <- setNames(
    tcga.dataset.merged$PFI$validation[paste0("PFI", c(".clin",".time.clin"))],
    c("status","time")
)

# C-index of the predicted risk against the observed outcome
cindex.ref.valid <- concordance.index(x = rel.risk,
                                      surv.time = y.data$time,
                                      surv.event = y.data$status,
                                      na.rm = TRUE)

# KM plot of the validation samples split by relative risk (also carries the p-value)
KM.valid.ref.by.risk <- plotKMbyRelativeRisk(data = y.data, rel.risk = rel.risk)

In [8]:
# Modify the labels: replace every "-" in the column names with "_".
# str_replace_all is required here; str_replace would change only the first
# hyphen, and any remaining "-" is parsed as a minus sign once the name is
# pasted into a model formula (see fitCoxModel).

colnames(tcga.dataset.merged[["PFI"]]$train) = str_replace_all(colnames(tcga.dataset.merged[["PFI"]]$train), "-", "_")
colnames(tcga.dataset.merged[["PFI"]]$validation) = str_replace_all(colnames(tcga.dataset.merged[["PFI"]]$validation), "-", "_")

## Load the KM results 

In [9]:
# Load the Kaplan-Meier results for PFI (per the file name: log-rank p-values).
# Only the Feature column is used in the cells below.
# NOTE(review): the "top 200" selection further down takes the first 200 rows,
# so this presumably expects the file to be sorted by p-value — confirm.
km.results.de = read.csv("/lustre/projects/landstrom_core/results/prognostic_model_development_revised_cell_cycle/combined/PRAD/Kaplan_Meier_plots/PFI_LogRank_pvalues.csv")

In [10]:
# Check whether the validation data contains a column named "NKX3_1.exp"
"NKX3_1.exp" %in% colnames(tcga.dataset.merged$PFI$validation)

In [11]:
# Relabel the KM feature "NKX301.exp" as "NKX3_1.exp", the column name checked above
km.results.de$Feature <- replace(km.results.de$Feature,
                                 which(km.results.de$Feature == "NKX301.exp"),
                                 "NKX3_1.exp")

In [12]:
# Convert every "-" in the feature names to "_".
# str_replace_all is used because str_replace would change only the first hyphen.
km.results.de$Feature = str_replace_all(km.results.de$Feature, "-", "_")

## Step-wise addition of features

In [13]:
# Start the step-wise column list from the original model's columns
# (event, time, original features); the loop below appends one feature per step.
selected.columns.stepwise = selected.columns

Pick the top 200 features

In [14]:
# Take the first 200 features. head() returns fewer when the table is shorter,
# whereas [1:200] would pad the vector with NA.
new.genes.exp = head(km.results.de$Feature, 200)

In [15]:
# Remove the first ".amp" and the first ".del" from each feature name.
# fixed() matches the text literally; as a regex the "." would match any
# character, so e.g. "xamp" would be removed as well.
new.genes.exp = str_replace(new.genes.exp, fixed(".amp"), "")
new.genes.exp = str_replace(new.genes.exp, fixed(".del"), "")

In [16]:
# Empty result table (0 rows) with one column per stored metric
step.wise.results <- as.data.frame(
    matrix(nrow = 0, ncol = 2,
           dimnames = list(NULL, c("KM.pvalue","C.index")))
)

In [17]:
# Coerce every column of the result table to numeric
step.wise.results <- data.frame(lapply(step.wise.results, as.numeric))

In [None]:
# Step-wise (cumulative) feature addition.
# In step i the i-th entry of new.genes.exp is appended to the feature set, the
# Cox model is refitted on the training data and evaluated on the validation
# data. One row (KM p-value, C-index, C-index CI) is stored per step, so row i
# of the new results corresponds to new.genes.exp[i].

# The outcome columns of the validation data are the same in every step
y.data <- tcga.dataset.merged$PFI$validation[paste0("PFI", c(".clin",".time.clin"))]
colnames(y.data) = c("status","time")

# Rows are collected in a list and bound once after the loop instead of growing
# the data frame with rbind() in every iteration. If the loop stops early, the
# rows computed so far are still available in step.wise.rows.
step.wise.rows = vector("list", length(new.genes.exp))

# seq_along() also handles an empty feature vector (1:length(x) would give 1, 0)
for (i in seq_along(new.genes.exp)){
    new.feature = new.genes.exp[i]

    # Test adding one gene
    selected.columns.stepwise = c(selected.columns.stepwise, new.feature)

    # Fail with a clear message if a requested column does not exist; otherwise
    # the error only surfaces inside coxph as "object not found"
    missing.columns = setdiff(selected.columns.stepwise,
                              colnames(tcga.dataset.merged[["PFI"]]$train))
    if (length(missing.columns) > 0) {
        stop("Step ", i, ": column(s) not found in the training data: ",
             paste(missing.columns, collapse = ", "), call. = FALSE)
    }

    # Input data for cox-model
    input.training = tcga.dataset.merged[["PFI"]]$train[,colnames(tcga.dataset.merged[["PFI"]]$train) %in% selected.columns.stepwise]

    # The features (everything except the event and time columns)
    selected.features.stepwise = selected.columns.stepwise[-c(1,2)]

    # Fit the cox-model
    pcox.ref.fit.pfi = fitCoxModel(input.training, "PFI", selected.features.stepwise)

    # Predict the risk scores for validation data
    rel.risk = predict(object = pcox.ref.fit.pfi,
                       newdata = tcga.dataset.merged$PFI$validation[,selected.features.stepwise],
                       type = "risk")

    #
    # Evaluate
    #

    # C-index of the predicted risk against the observed outcome
    cindex.ref.valid = concordance.index(rel.risk,
                                         y.data$time,
                                         y.data$status,
                                         na.rm = TRUE)

    # Plot KM and extract the p-value
    KM.valid.ref.by.risk = plotKMbyRelativeRisk(data = y.data,
                                                rel.risk = rel.risk)

    # The CI column is named C.index.CI directly (the name data.frame() used to
    # produce from "C-index CI" via check.names)
    res = data.frame(KM.pvalue = KM.valid.ref.by.risk$table$Pvalue.pval,
                     C.index = cindex.ref.valid$c.index,
                     C.index.CI = paste0("(", round(cindex.ref.valid$lower, 4), " - ",
                                         round(cindex.ref.valid$upper, 4), ")")
                     )
    step.wise.rows[[i]] = res
}

# Append all new rows to the result table in one go
step.wise.results = rbind(step.wise.results, do.call(rbind, step.wise.rows))


Find the minimal p-value

In [20]:
# Row index (or indices, on ties) of the smallest KM p-value.
# na.rm = TRUE: without it a single NA p-value makes min() return NA and
# which() return nothing.
which(step.wise.results$KM.pvalue == min(step.wise.results$KM.pvalue, na.rm = TRUE))

In [21]:
# Show the result row of step 5.
# NOTE(review): the index is hard-coded — presumably the value returned by the
# previous cell; confirm when the results are regenerated.
step.wise.results[5,]

Unnamed: 0_level_0,KM.pvalue,C.index,C.index.CI
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>
5,0.001586542,0.6207259,(0.3815 - 0.8128)


In [22]:
# The first five entries of new.genes.exp, i.e. the features added in steps 1-5
# of the loop (which iterates over new.genes.exp in order)
new.genes.exp[1:5]