# Investigating correlation between features and tumor stage (linear model)

In [1]:
setwd("/home/data/project_code/landstrom_core/prognostic_model_development/r/notebooks")
library(ggplot2)
library(tidyverse)
source("../getTCGAData.R")
source("../preprocessTCGAData.R")
source("../KM_analysis.R")
source("../Heatmaps.R")
source("../enet.R")
library(ggpubr)
library(rstatix)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mtibble [39m 3.1.7     [32m✔[39m [34mdplyr  [39m 1.0.9
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1
[32m✔[39m [34mpurrr  [39m 0.3.4     

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Loading required package: S4Vectors

Loading required package: stats4

Loading 

# Function

In [2]:
#
# Define function for adding the clinical variables 
#
addClinVar = function(data, clin.var) {
    if ("Age.oth" %in% clin.var) {
        data$Age.oth <- data$age_at_diagnosis.clin
    } 
    if ("Tumor.stage.oth" %in% clin.var){
        data$Tumor.stage.oth = factor(map_chr(data$ajcc_pathologic_stage.clin, reformatTumorStage))
    }
    if ("Gender.oth" %in% clin.var){
        data$Gender.oth <- factor(data$gender.clin)    
    } 
    if ("Gleason.group.oth" %in% clin.var) {
        
        # Determine the Gleason group 
        data$Gleason.group.oth = map2_chr(data$primary_gleason_grade.clin, 
                                           data$secondary_gleason_grade.clin, 
                                           determineGleasonGroup)

        # Set up the factor levels 
        data$Gleason.group.oth = factor(data$Gleason.group, 
                                    levels = c("Gleason group 1", "Gleason group 2"))
    }
    return(data)
}

# Recode Tumor stage to numeric
recodeTumorStageToNumeric = function(x){
    x.recoded = recode_factor(x, `Stage 1` = 1, `Stage 2` = 2, `Stage 3` = 3, `Stage 4` = 4)
    x.recoded = as.numeric(as.character(x.recoded))
    return(x.recoded)
}

# Setting up paths and variables

In [3]:
# Read in the table including the clinical features for each cancer type
clin.feat.tb = read.table("/workstation/project_data/landstrom_core/clin_features_final.csv", sep = "\t", header = T)

# Ensembl id mapping file 
ens.id.mapping = "/home/organisms/Human/hg38/Homo_sapiens.GRCh38_March2022/ENSEMBLE_to_SYMBOL.csv"

# Input dir data 
input.dir.data.path = file.path("/workstation/project_data/landstrom_core/rdata/manuscript_work/")

# Output directory
dir.res.root = file.path("/workstation/project_results/landstrom_core/Features_vs_tumor_stage/")
dir.create(dir.res.root)

# Gene list  
gene.list.file = read.table("/workstation/project_data/landstrom_core/Customer_genes.tsv", 
                            sep = "\t", header = F)
gene.list = gene.list.file$V1

“'/workstation/project_results/landstrom_core/Features_vs_tumor_stage' already exists”


# Prepare boxplots and test for significance

Output dir :

In [4]:
# Output dir :
boxplt.results.dir = file.path(dir.res.root, "Features_and_tumor_stage_boxplots/lm")
dir.create(boxplt.results.dir)

“'/workstation/project_results/landstrom_core/Features_vs_tumor_stage//Features_and_tumor_stage_boxplots/lm' already exists”


In [5]:
# Store all statistical test results to a table 
stat.results.ls = list()

In [6]:
calcCorr = function(x, y){
    res = cor.test(x, y)
    return(data.frame("Corr" = res$estimate, "P.value" = res$p.value))
}

In [7]:
final.results = list()
i = 1
for (cancer.type in clin.feat.tb$Ctype){
    
    # Get Clinical variables
    clin.var = unlist(strsplit(clin.feat.tb$Features[clin.feat.tb$Ctype == cancer.type], split = ","))
    
    # Add oth-suffix 
    clin.var = paste0(clin.var, ".oth")
    
    # Read in the preprocessed dataset if continued 
    tcga.dataset = readRDS(file.path(input.dir.data.path, cancer.type, "tcga.dataset.rds"))
    
    # Add clinical variables to dataset
    tcga.dataset = addClinVar(tcga.dataset, clin.var)
    
    # Selected variables 
    variables.selected = c(paste0(gene.list, ".exp"), "Tumor.stage.oth")
    
    # Selected variables         
    data.selected.exp = tcga.dataset %>% 
                        dplyr::select(one_of(variables.selected))
    
    if ("Tumor.stage.oth" %in% colnames(data.selected.exp)){
        
        # Recode the tumor stage 
        data.selected.exp$Tumor.stage.oth = recodeTumorStageToNumeric(data.selected.exp$Tumor.stage.oth)
        
        # Calculate correlation and p-value 
        res.ls = list()
        for (j in 1:ncol(data.selected.exp)){
            if (colnames(data.selected.exp[j]) != "Tumor.stage.oth"){
                var1 = data.selected.exp[,j]
                var2 = data.selected.exp$Tumor.stage.oth
                res.ls[[colnames(data.selected.exp[j])]] = calcCorr(var1, var2)
            }
        
        }
        corr.results = bind_rows(res.ls, .id = "Gene")
        final.results[[cancer.type]] = corr.results
    }
}

“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”
“Unknown columns: `Tumor.stage.oth`”


In [8]:
final.results.df = bind_rows(final.results, .id = "Cancer type")

In [9]:
write.csv(final.results.df, file.path(boxplt.results.dir, "Correlation_results.csv"))