# Model validation SU2C 2019

In [1]:
library(tidyverse)

── Attaching packages ──────────────────────────────────────── tidyverse 1.3.1 ──

✔ ggplot2 3.3.5     ✔ purrr   0.3.4
✔ tibble  3.1.7     ✔ dplyr   1.0.9
✔ tidyr   1.2.0     ✔ stringr 1.4.0
✔ readr   2.1.2     ✔ forcats 0.5.1

── Conflicts ─────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()



## Functions

In [15]:
# Modify the OS status to match the common convention
#
# Keeps only the part of each status string that precedes the first ":"
# (e.g. "1:DECEASED" -> "1"). The function is vectorised: one value is
# returned per element of `x`, so it can be applied to a whole column as well
# as to a single string (for a single string the result is unchanged).
#
# x : character vector of status strings
# Returns a character vector with one entry per element of `x`; empty or
# missing strings give NA.
modifyOSStatus <- function(x) {
    pieces <- strsplit(x, split = ":")
    # p[1] is NA when a string has no content, which keeps the output aligned
    status <- vapply(pieces, function(p) p[1], character(1), USE.NAMES = FALSE)
    return(status)
}

# Add patient id from sample clin data
#
# exp.data         : table with one row per sample and a `Sample` column
# sample.clin.data : sample-level clinical table with the columns
#                    `PATIENT_ID` and `SAMPLE_ID`
# Returns `exp.data` with a `PATIENT_ID` column appended (left join, so every
# row of `exp.data` is kept).
addPatientId <- function(exp.data, sample.clin.data) {
    # Two-column lookup; SAMPLE_ID is renamed so that it matches the join key
    id.lookup <- dplyr::select(sample.clin.data, PATIENT_ID, Sample = SAMPLE_ID)
    dplyr::left_join(exp.data, id.lookup, by = "Sample")
}

# Add survival data from patient clin data
#
# exp.data          : table with a `PATIENT_ID` column
# patient.clin.data : patient-level clinical table with the columns
#                     `PATIENT_ID`, `OS` and `OS.time`
# Returns `exp.data` with the `OS` and `OS.time` columns appended (left join,
# so every row of `exp.data` is kept).
addSurvivalData <- function(exp.data, patient.clin.data) {
    patient.clin.data.minimal <- patient.clin.data %>% dplyr::select(PATIENT_ID, OS, OS.time)
    exp.data <- dplyr::left_join(exp.data,
                                 patient.clin.data.minimal, by = "PATIENT_ID")
    # Return explicitly: ending on the assignment made the result invisible,
    # unlike addPatientId()
    return(exp.data)
}

# 
# Function fits a cox regression model
#
# data      : data frame holding the end point columns and all features
# end.point : name of the event column; the follow-up time is read from the
#             column "<end.point>.time"
# features  : character vector with the names of the covariate columns
# Returns the object produced by coxph().
#
fitCoxModel <- function(data, end.point, features) {

    # Left-hand side: Surv(<end.point>.time, <end.point>)
    time.column <- paste0(end.point, ".time")
    lhs <- sprintf("Surv(%s, %s)", time.column, end.point)

    # Right-hand side: all features joined additively
    rhs <- paste(features, collapse = " + ")

    # The formula keeps the name `f` because that name shows up in the printed
    # call of the fitted model
    f <- as.formula(paste(lhs, "~", rhs))

    coxph(f, data = data)
}

#
# Plot Kaplan-Meier curve by relative risk
#
# Splits the samples at the median of `rel.risk` ("Low" = below the median,
# "High" = everything else) and compares survival between the two groups.
#
# data     : data frame with the columns `time` and `status`
# rel.risk : predicted risk values, stored as a new column of `data`
# xlab     : x-axis label; the default keeps the previously hard-coded text,
#            pass another label when `time` is not measured in days
#
# Returns a list with `Plot` (the ggsurvplot() result) and `table` (number of
# samples plus the surv_pvalue() output), or NULL when the median split does
# not produce two groups.
#
plotKMbyRelativeRisk = function(data, rel.risk, xlab = "Time (days)") {

    # Assign relative risk
    data$pred <- rel.risk
    data$group <- ifelse(data$pred  < median(data$pred), "Low", "High")

    if (length(table(data$group)) == 2) {

        # Fit the survival function (this fit is the one that gets plotted)
        s.fit.train <- survfit(Surv(time, status) ~group, data = data)

        # Same model again, built from a formula object and fitted with
        # surv_fit(); this fit is the one handed to surv_pvalue() below
        survExpression = paste0("Surv(", "time", ", " , "status", ")")
        f <- as.formula(paste(survExpression, "group", sep = " ~ "))
        sFit <- surv_fit(f, data =  data)

        # Initialise results object
        sFit.res = list()

        # Prepare plot (TRUE/FALSE spelled out: T and F can be reassigned)
        sFit.res$Plot <- ggsurvplot(s.fit.train,
                           data = data, legend = "bottom",
                           title = paste0("Risk groups", " (n = ", nrow(data) ,")"),
                           legend.title = "Risk group", pval = TRUE, xlab = xlab,
                           font.family = "Helvetica", font.x = 15, font.y = 15, font.tickslab = 15, font.legend = 15,
                           conf.int = TRUE,
                           break.time.by = 10,
                           surv.plot.height = 0.65, risk.table = TRUE, cumevents = FALSE,
                           ggtheme = theme_classic(),
                           fontsize = 5, pval.size = 7, tables.font.tickslab = 5, tables.y.text.col = TRUE, tables.y.text = FALSE,
                           tables.theme = theme_survminer())


        # Store relevant information as table
        sFit.res$table = data.frame(N = nrow(data),
                                Pvalue = surv_pvalue(sFit))

    } else {
        # Based on the predicted risk it was impossible to group into two groups
        # we cannot get a p-value
        sFit.res = NULL
    }
    return(sFit.res)
}

## Load and preprocess data

In [16]:
# Folder from which all input tables of this notebook are read
data.dir <- "/workstation//project_data/landstrom_project_3/PRAD_SU2C_2019/"

### Clinical patient data

In [17]:
# Clinical data for patients (the file is read as tab-separated)
clinical.patients <- read.csv(file.path(data.dir, "data_clinical_patient.csv"), sep = "\t")

# Remove all patients without OS_STATUS.
# Missing values are tested explicitly: `!= ""` alone evaluates to NA for them,
# and an NA row index would add rows consisting only of NA to the table.
has.os.status <- !is.na(clinical.patients$OS_STATUS) & clinical.patients$OS_STATUS != ""
clinical.patients <- clinical.patients[has.os.status, ]

# Modify the OS_Status: keep the part before ":" and store it as a number
clinical.patients$OS <- as.numeric(unlist(map(clinical.patients$OS_STATUS, modifyOSStatus)))
clinical.patients$OS.time <- clinical.patients$OS_MONTHS

### Clinical sample data

In [18]:
# Clinical data for samples (the file is read as tab-separated)
clinical.samples <- read.csv(
    file.path(data.dir, "data_clinical_sample.csv"),
    sep = "\t"
)

# Keep only the samples whose patient is still present in clinical.patients
clinical.samples <- clinical.samples %>%
    dplyr::filter(PATIENT_ID %in% clinical.patients$PATIENT_ID)

In [19]:
# Number of samples recorded for each patient
sample.counts.per.patient <- as.data.frame(table(clinical.samples$PATIENT_ID))

In [20]:
# Give the count table readable headers, then list the patients with the most
# samples first
colnames(sample.counts.per.patient) <- c("PATIENT_ID", "FREQ")
sample.counts.per.patient <- sample.counts.per.patient %>%
    arrange(desc(FREQ))

Keep track of the patients with multiple samples. We need to deal with this somehow later.

In [21]:
# Ids of the patients that have more than one sample
patients.with.multi <- with(sample.counts.per.patient, PATIENT_ID[FREQ > 1])

### Expression data

We will load the scaled FPKM values

In [22]:
# Capture data (column names are kept exactly as they appear in the file)
expression.data.capture.fpkm <- read.csv(
    file.path(data.dir, "data_mrna_seq_fpkm_capture_zscores_ref_all_samples.csv"),
    sep = "\t",
    check.names = FALSE
)

# Poly-A data (same layout and options)
expression.data.polya.fpkm <- read.csv(
    file.path(data.dir, "data_mrna_seq_fpkm_polya_zscores_ref_all_samples.csv"),
    sep = "\t",
    check.names = FALSE
)

In [23]:
# Keep the gene symbol column plus the columns of samples listed in
# clinical.samples
expression.data.capture <- expression.data.capture.fpkm[
    , colnames(expression.data.capture.fpkm) %in% c("Hugo_Symbol", clinical.samples$SAMPLE_ID)
]

expression.data.polya <- expression.data.polya.fpkm[
    , colnames(expression.data.polya.fpkm) %in% c("Hugo_Symbol", clinical.samples$SAMPLE_ID)
]

## Merge the expression data and clinical data 

First transpose the expression tables

In [24]:
# Capture data: reshape to one row per sample and one column per gene.
# Gene symbols that occur more than once are averaged (NA values ignored).
expression.data.capture.long <- expression.data.capture %>%
    pivot_longer(-Hugo_Symbol,
                 values_to = "Expression",
                 names_to = "Sample")

expression.data.capture.wider <- expression.data.capture.long %>%
    pivot_wider(names_from = Hugo_Symbol,
                values_from = Expression,
                values_fn = ~mean(.x, na.rm = TRUE))

# Tag every column except the leading Sample column with ".exp".
# Negative indexing replaces 2:ncol(), which counts backwards (2, 1) when the
# table has a single column.
colnames(expression.data.capture.wider)[-1] <- paste0(colnames(expression.data.capture.wider)[-1], ".exp")

# Poly-A data: same reshaping
expression.data.polya.long <- expression.data.polya %>%
    pivot_longer(-Hugo_Symbol,
                 values_to = "Expression",
                 names_to = "Sample")

expression.data.polya.wider <- expression.data.polya.long %>%
    pivot_wider(names_from = Hugo_Symbol,
                values_from = Expression,
                values_fn = ~mean(.x, na.rm = TRUE))

colnames(expression.data.polya.wider)[-1] <- paste0(colnames(expression.data.polya.wider)[-1], ".exp")


Add survival data

In [25]:
# Attach the patient id to every sample row of both expression tables
expression.data.capture.with.patid <- addPatientId(exp.data = expression.data.capture.wider,
                                                   sample.clin.data = clinical.samples)
expression.data.polya.with.patid <- addPatientId(exp.data = expression.data.polya.wider,
                                                 sample.clin.data = clinical.samples)

In [26]:
# Attach OS and OS.time from the patient table to both expression tables
expression.data.capture.final <- addSurvivalData(exp.data = expression.data.capture.with.patid,
                                                 patient.clin.data = clinical.patients)
expression.data.polya.final <- addSurvivalData(exp.data = expression.data.polya.with.patid,
                                               patient.clin.data = clinical.patients)

## Fit the Cox-model including the model genes

Output directory :

In [27]:
# Folder that receives the figures written by this notebook
out.dir <- "/workstation/project_results/landstrom_project_3/manuscript_work/PRAD_validation_OS/SU2C"

In [28]:
library(survival)
library(survminer)
library(ggplot2)
library(survcomp)

Genes included in the final PFI model were AURKA, AURKB, KIF23 and PTEN

In [29]:
# Final variable selection: the end point columns plus the four model genes.
# NOTE(review): PATIENT_ID is dropped here, so patients with several samples
# contribute several rows to the model - confirm this is intended.
expression.data.capture.final <- dplyr::select(
    expression.data.capture.final,
    OS, OS.time, AURKA.exp, AURKB.exp, KIF23.exp, PTEN.exp
)

expression.data.polya.final <- dplyr::select(
    expression.data.polya.final,
    OS, OS.time, AURKA.exp, AURKB.exp, KIF23.exp, PTEN.exp
)

Fitting the model

In [30]:
# Cox model on the capture data: OS against the four model genes
cox.model.capture <- fitCoxModel(
    expression.data.capture.final,
    end.point = "OS",
    features = c("AURKA.exp", "AURKB.exp", "KIF23.exp", "PTEN.exp")
)

In [31]:
# Display the fitted capture model (coefficients and likelihood ratio test)
cox.model.capture

Call:
coxph(formula = f, data = data)

             coef exp(coef) se(coef)      z     p
AURKA.exp  0.5036    1.6547   0.3977  1.266 0.205
AURKB.exp -0.6349    0.5300   0.3909 -1.624 0.104
KIF23.exp  0.4673    1.5957   0.3952  1.183 0.237
PTEN.exp   0.3860    1.4711   0.2441  1.581 0.114

Likelihood ratio test=9.6  on 4 df, p=0.04766
n= 71, number of events= 45 

In [32]:
# Cox model on the poly-A data: OS against the four model genes
cox.model.polya <- fitCoxModel(
    expression.data.polya.final,
    end.point = "OS",
    features = c("AURKA.exp", "AURKB.exp", "KIF23.exp", "PTEN.exp")
)

Calculate C-index 

In [33]:
# Risk predictions of each model, obtained with predict() at its defaults.
# NOTE(review): no newdata is supplied, so the predictions refer to the rows
# used in the fit - confirm that no rows were dropped for missing values,
# otherwise they no longer line up with the outcome tables built below.
rel.risk.polya <- predict(cox.model.polya)
rel.risk.capture <- predict(cox.model.capture)

In [34]:
# Outcome table for the capture data: event indicator and follow-up time,
# renamed to the column names used by the plotting function
y.data.capture <- expression.data.capture.final %>%
    dplyr::select(status = OS, time = OS.time)


# Outcome table for the poly-A data
y.data.polya <- expression.data.polya.final %>%
    dplyr::select(status = OS, time = OS.time)

In [35]:
# Concordance index of the predicted risk against observed survival;
# na.rm = TRUE is passed to concordance.index() for both data sets
cindex.polya <- concordance.index(
    x = rel.risk.polya,
    surv.time = y.data.polya$time,
    surv.event = y.data.polya$status,
    na.rm = TRUE
)

cindex.capture <- concordance.index(
    x = rel.risk.capture,
    surv.time = y.data.capture$time,
    surv.event = y.data.capture$status,
    na.rm = TRUE
)

In [36]:
# Collect the results 
c.index.results = data.frame("dataset" = c("poly-A","Capture"), 
           "C.index" = c(cindex.polya$c.index, cindex.capture$c.index),
           "CI" = c(paste0(round(cindex.polya$lower,3),"-",round(cindex.polya$upper),3), 
                    paste0(round(cindex.capture$lower,3),"-",round(cindex.capture$upper),3)))

In [37]:
# Display the C-index summary table
c.index.results 

dataset,C.index,CI
<chr>,<dbl>,<chr>
poly-A,0.6869184,0.482-13
Capture,0.6565056,0.435-13


Prepare KM-plot

In [38]:
# Kaplan-Meier analysis of the capture samples, split at the median predicted risk
km.res.capture <- plotKMbyRelativeRisk(
    data = y.data.capture,
    rel.risk = rel.risk.capture
)

# Kaplan-Meier analysis of the poly-A samples
km.res.polya <- plotKMbyRelativeRisk(
    data = y.data.polya,
    rel.risk = rel.risk.polya
)

In [42]:
# Write the survival-curve panel of each KM result to a PDF in out.dir
ggsave(
    filename = file.path(out.dir, "KM_OS_capture_samples.pdf"),
    plot = km.res.capture$Plot$plot
)
ggsave(
    filename = file.path(out.dir, "KM_OS_polyA_samples.pdf"),
    plot = km.res.polya$Plot$plot
)

Saving 6.67 x 6.67 in image

Saving 6.67 x 6.67 in image

