# PGS analysis in TCGA/GTEX

## Import packages

In [None]:
library(edgeR)
library(readr)
library(readxl)
library(data.table)
library(dplyr)
library(tidyr)

## GTEX

### Import metadata

In [None]:
# Load the curated GTEx annotation workbooks. `sample_meta` is later joined
# on SAMPID and `subject_meta` on SUBJID.
sample_meta <- read_excel(
  "/mnt/grid/janowitz/rdata_norepl/gtex/counts/sampledata_gtex_SKclean.xlsx"
)
subject_meta <- read_excel(
  "/mnt/grid/janowitz/rdata_norepl/gtex/counts/subjectdata_gtex_SKclean.xlsx"
)

### Import PGS

In [None]:
# Read the per-individual polygenic score (PGS) table for GTEx.
pgs <- read_table2(
  "/grid/wsbs/home_norepl/skleeman/PGS/final/GTEX_UKB380_PGS_inner.sscore"
)

# Standardise a numeric vector: subtract the mean and divide by the standard
# deviation. Missing values are ignored in both statistics when `na.rm` is
# TRUE (the default); they remain NA in the output.
scale2 <- function(x, na.rm = TRUE) {
  centred <- x - mean(x, na.rm = na.rm)
  centred / sd(x, na.rm = na.rm)
}

pgs$score_scale <- scale2(pgs$SCORE1_AVG)

# Attach the standardised score to each subject; subjects whose SUBJID has no
# matching IID in the PGS table get NA.
m <- match(subject_meta$SUBJID, pgs$IID)
subject_meta$pgs_score <- pgs$score_scale[m]
subject_meta

### Import counts

In [None]:
# Read the GTEx v8 gene-level read counts. `Name` holds the Ensembl gene ID
# and `Description` the gene symbol.
dt <- fread(
  "/mnt/grid/janowitz/rdata_norepl/gtex/counts/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz"
)
genes <- data.frame(ensembl_id = dt$Name, gene_name = dt$Description)

# Exonic length per gene: merge overlapping exons within each gene, then sum
# the widths of the merged intervals.
library("GenomicFeatures")
gtf_txdb <- makeTxDbFromGFF(
  "/mnt/grid/janowitz/home/references/human_rna/gencode.v26.annotation.gtf.gz"
)
exons_list_per_gene <- exonsBy(gtf_txdb, by = "gene")
widths <- width(reduce(exons_list_per_gene))
totalexonlength <- vapply(widths, sum, numeric(1))

# Match genes to lengths on the Ensembl ID with everything from the first "."
# onwards (the version suffix) removed from both sides.
m <- match(
  gsub("\\..*", "", genes$ensembl_id),
  gsub("\\..*", "", names(totalexonlength))
)
genes$length <- totalexonlength[m]

# The first two columns of `dt` are dropped so that only the count columns
# enter the DGEList (assumes they are Name and Description - TODO confirm).
y <- DGEList(counts = as.matrix(dt[, -c(1:2)]), genes = genes)

# Arbitrary expression filter: keep genes with CPM > 1 in at least 15 samples,
# then recompute library sizes from the retained genes.
keep <- rowSums(cpm(y) > 1) >= 15
y <- y[keep, , keep.lib.sizes = FALSE]

y <- calcNormFactors(y)

In [None]:
# Log-transformed counts-per-million for the filtered, normalised DGEList.
logcpm <- cpm(y, log = TRUE)

# Disabled alternative: a TPM-style normalisation derived from RPKM. It is
# commented out, so everything downstream uses the log-CPM values above.
# RPKM <- rpkm(y)
# logcpm <- t( t(RPKM) / colSums(RPKM, na.rm=TRUE) ) * 1e6
# logcpm <- log2(logcpm+0.1)

# Index rows by gene symbol and keep only the four genes of interest.
row.names(logcpm) <- y$genes$gene_name
genes_of_interest <- c("CST3", "FKBP5", "SERPINA6", "SERPINA1")
logcpm <- logcpm[row.names(logcpm) %in% genes_of_interest, , drop = FALSE]

# Transpose to one row per sample, with the sample ID as a column.
logcpm <- as.data.frame(t(logcpm))
logcpm$SAMPID <- row.names(logcpm)

# Add sample annotation (samples without annotation are dropped), derive the
# subject ID from the first two "-"-separated fields of the sample ID, then
# add subject annotation (which carries `pgs_score`).
logcpm <- logcpm %>%
  inner_join(sample_meta, by = "SAMPID") %>%
  mutate(SUBJID = sub('^([^-]+-[^-]+).*', '\\1', SAMPID)) %>%
  left_join(subject_meta, by = "SUBJID")
logcpm

In [None]:
# Spearman correlation between the PGS and CST3 expression within each
# `sample_type`. Rows without a PGS are excluded, groups with 150 or fewer
# samples are dropped, and the result is sorted by descending correlation.
correl <- logcpm %>%
  drop_na(pgs_score) %>%
  group_by(sample_type) %>%
  dplyr::summarize(
    cor = cor(pgs_score, CST3, method = "spearman"),
    p = cor.test(pgs_score, CST3, method = "spearman")$p.value,
    n = n()
  ) %>%
  filter(n > 150) %>%
  arrange(desc(cor))

In [None]:
# Display the per-`sample_type` correlation table.
correl

In [None]:
library(ggplot2)
options(repr.plot.width = 7, repr.plot.height = 5)

# Volcano-style plot of the per-`sample_type` PGS ~ CST3 Spearman
# correlations: correlation coefficient on x, -log10(p) on y.
de <- correl

# Classify each group by the direction of a nominally significant
# correlation: "UP" if cor > 0 and p < 0.05, "DOWN" if cor < 0 and p < 0.05,
# otherwise "NO".
de$diffexpressed <- "NO"
de$diffexpressed[de$cor > 0 & de$p < 0.05] <- "UP"
de$diffexpressed[de$cor < 0 & de$p < 0.05] <- "DOWN"

# Only significant groups get a text label; the rest stay NA (unlabelled).
de$delabel <- NA
de$delabel[de$diffexpressed != "NO"] <- de$sample_type[de$diffexpressed != "NO"]

library(ggrepel)

# The colour vector is named so that every category keeps its colour even
# when one of DOWN / NO / UP does not occur in the data. With an unnamed
# vector the colours are assigned in level order and would shift.
# NOTE(review): xlim/ylim are hard-coded; points outside them will not be
# drawn - confirm these limits still suit the data.
ggplot(data = de, aes(x = cor, y = -log10(p), col = diffexpressed, label = delabel)) +
  geom_point() +
  theme_bw() +
  geom_text_repel(size = 5) +
  scale_color_manual(values = c(DOWN = "blue", NO = "black", UP = "red")) +
  xlim(-0.2, 0.2) +
  ylim(0, 2) +
  xlab("Spearman correlation coefficient") +
  ylab("-log10(p-value)") +
  geom_hline(yintercept = -log10(0.05)) +
  geom_vline(xintercept = 0)

ggsave('/mnt/grid/janowitz/home/skleeman/cystatinc/figure_prep/figure3a.pdf', width = 7, height = 5)

In [None]:
library(ggplot2)

# Smoothed CST3 expression against time of death, spleen samples only.
time <- subset(logcpm, uberon_string == "spleen")
# Disabled alternatives kept from the original analysis:
# time = logcpm
# time$time_death[time$time_death > 0.987] = 0

# Rescale to a 0-24 axis (presumably `time_death` is stored as a fraction of
# a day - TODO confirm).
time[["time_death"]] <- time[["time_death"]] * 24

ggplot(time, aes(x = time_death, y = CST3)) +
  geom_smooth() +
  theme_bw() +
  xlim(0, 24)

In [None]:
library(tidyr)
library(ggplot2)

# Restrict the expression table to spleen samples (rows whose
# `uberon_string` is missing are excluded).
logcpm2 <- logcpm[which(logcpm$uberon_string == "spleen"), ]

# Mean-centre a numeric vector (subtract the mean; no division by the SD).
#
# `scale()` returns a one-column matrix with a "scaled:center" attribute.
# The result is converted back to a plain numeric vector so it can be stored
# as an ordinary data-frame column and reshaped or modelled without carrying
# matrix dimensions along.
#
# x: numeric vector. Missing values stay NA and are ignored when the mean is
#    computed.
# Returns: numeric vector of the same length as `x`.
center_scale <- function(x) {
  as.numeric(scale(x, scale = FALSE))
}

# Mean-centre each gene within spleen so both can share one y-axis.
logcpm2$FKBP5 <- center_scale(logcpm2$FKBP5)
logcpm2$CST3 <- center_scale(logcpm2$CST3)

# Long format: one row per sample and gene, with `name` holding the gene and
# `value` the centred expression. `use` is reused by the cosinor cell.
use <- logcpm2 %>% pivot_longer(cols = c("FKBP5", "CST3"))

# Plotting copy with time of death rescaled to a 0-24 axis; `use` keeps the
# original `time_death` values.
timex <- use
timex$time_death <- timex$time_death * 24

# NOTE(review): the y-axis label says "Z-score", but center_scale() only
# subtracts the mean (scale = FALSE). Either divide by the SD as well or
# reword the label - confirm which was intended.
ggplot(timex, aes(x = time_death, y = value, color = name)) +
  geom_smooth(se = FALSE) +
  theme_bw() +
  xlim(0, 24) +
  xlab("Time of death (24h clock)") +
  ylab("Relative expression (Z-score)")

In [None]:
library(cosinor)

# Build the model input for one gene from the long-format `use` table.
#
# long_df: data frame with columns `name` (gene), `value` (expression) and
#          `time_death`.
# gene:    gene symbol to select from `name`.
# Returns: a plain data.frame restricted to `gene`, with rows lacking a time
#          of death removed and three derived columns added: `time_posix`,
#          `hour_of_sampling` and `hour2` (= time_death * 24, the covariate
#          actually used by the model).
prepare_cosinor_input <- function(long_df, gene) {
  gene_df <- long_df[which(long_df$name == gene), ]

  # Kept exactly as in the original analysis: an arbitrary anchor date plus a
  # 5 h offset plus the time of death expressed in seconds.
  # NOTE(review): `tzone` does not look like an argument of as.Date(), and
  # `hour_of_sampling` is only used to drop rows with missing time - confirm
  # whether the 5 h offset was meant to feed into the model.
  gene_df$time_posix <- as.POSIXct(as.Date("2011-02-01 00:00", tzone = "UTC")) +
    3600 * 5 + 3600 * 24 * gene_df$time_death
  gene_df$hour_of_sampling <- format(gene_df$time_posix, format = "%H")
  gene_df <- gene_df[!is.na(gene_df$hour_of_sampling), ]
  gene_df$hour_of_sampling <- as.numeric(gene_df$hour_of_sampling)

  gene_df$hour2 <- gene_df$time_death * 24
  as.data.frame(gene_df)
}

# 24 h-period cosinor regression of expression on time of death, one model
# per gene: `fit` is FKBP5 and `fit2` is CST3.
fkbp5 <- prepare_cosinor_input(use, "FKBP5")
fit <- cosinor.lm(value ~ time(hour2), data = fkbp5, period = 24)

cst3 <- prepare_cosinor_input(use, "CST3")
fit2 <- cosinor.lm(value ~ time(hour2), data = cst3, period = 24)


In [None]:
# Summarise the cosinor model fitted to CST3 (`fit2`).
summary(fit2)

In [None]:
# Predict the fitted curve of a cosinor model over one full period.
#
# object:   result of cosinor.lm(); uses `object$period` and the underlying
#           linear model `object$fit`.
# n_points: number of evenly spaced time points from 0 to the period.
# Returns:  data.frame with columns `time`, `rrr` (cosine term), `sss` (sine
#           term) and `Y.hat` (model prediction).
predict_cosinor_curve <- function(object, n_points = 200) {
  timeax <- seq(0, object$period, length.out = n_points)
  grid <- data.frame(
    time = timeax,
    rrr = cos(2 * pi * timeax / object$period),
    sss = sin(2 * pi * timeax / object$period)
  )
  grid$Y.hat <- predict(object$fit, newdata = grid)
  grid
}

newdata <- predict_cosinor_curve(fit)    # FKBP5 model
newdata2 <- predict_cosinor_curve(fit2)  # CST3 model

# Put both predicted curves in one table. The column is renamed by name
# rather than by position so a change in column order cannot mislabel it.
names(newdata)[names(newdata) == "Y.hat"] <- "FKBP5"
newdata$CST3 <- newdata2$Y.hat

newdata <- newdata %>% pivot_longer(cols = c("FKBP5", "CST3"))

# NOTE(review): the y-axis label says "log(TPM)", but the TPM normalisation
# is commented out upstream and the modelled values are mean-centred log-CPM.
# Confirm and reword the label if needed.
ggplot(newdata, aes(x = time, y = value, color = name)) +
  geom_line() +
  theme_bw() +
  xlim(0, 24) +
  xlab("Time of death (24h clock)") +
  ylab("Normalized expression - log(TPM)")

## TCGA

In [None]:
# TCGA expression, annotation, phenotype, immune and clinical tables.
tcga_count <- fread(
  "/mnt/grid/janowitz/rdata_norepl/tcga_germline/rna/tcga_gene_expected_count.gz"
)
tcga_genes <- fread(
  "/mnt/grid/janowitz/rdata_norepl/tcga_germline/rna/gencode.v23.annotation.gene.probemap"
)
tcga_samples <- fread(
  "/mnt/grid/janowitz/rdata_norepl/tcga_germline/rna/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"
)
tcga_immune <- read_excel(
  "/mnt/grid/janowitz/rdata_norepl/tcga_germline/rna/tcga_immunity.xlsx"
)
tcga_clinical <- read_excel(
  "/mnt/grid/janowitz/rdata_norepl/tcga_germline/rna/TCGA-CDR.xlsx"
)

# Polygenic score per TCGA individual, standardised to mean 0 / SD 1.
tcga_prs <- read_table2(
  "/grid/wsbs/home_norepl/skleeman/PGS/final/TCGA_UKB380_PGS_inner.sscore"
)
scale2 <- function(x, na.rm = TRUE) {
  centred <- x - mean(x, na.rm = na.rm)
  centred / sd(x, na.rm = na.rm)
}
tcga_prs$score_scale <- scale2(tcga_prs$SCORE1_AVG)

# Harmonise the subject identifier across tables: call it `subjid` and use
# "-" instead of "." as separator.
# NOTE(review): the renames are positional (column 2 of the PGS table, column
# 1 of the PC and clinical tables) - confirm the column order of the inputs.
names(tcga_prs)[2] <- "subjid"
tcga_prs$subjid <- gsub('\\.', '-', tcga_prs$subjid)

# Genetic principal components.
pcs <- fread("/mnt/grid/janowitz/rdata_norepl/tcga_germline/plink/tcga_eur_pcs.tsv")
pcs$IID <- gsub('\\.', '-', pcs$IID)
names(pcs)[1] <- "subjid"

names(tcga_clinical)[1] <- "subjid"

### PRS vs survival

In [None]:
# Keep only subjects present in all three tables: PGS, clinical data and
# genetic principal components.
prs_survival <- tcga_prs %>%
  inner_join(tcga_clinical, by = "subjid") %>%
  inner_join(pcs, by = "subjid")

In [None]:
# List the columns available after joining PGS, clinical data and PCs.
names(prs_survival)

In [None]:
# Number of rows retained by the inner joins.
nrow(prs_survival)

In [None]:
library(survival)
library(survminer)

# Pool READ with COAD under the single code "COAD".
prs_survival$type[prs_survival$type == "READ"] <- "COAD"

# Keep rows with a known tumour status and restrict to the cancer types
# analysed. "READ" is not listed because it was recoded to "COAD" above.
prs_survival <- subset(prs_survival, tumor_status %in% c("TUMOR FREE", "WITH TUMOR"))
prs_survival <- subset(
  prs_survival,
  type %in% c("BLCA", "BRCA", "COAD", "UCEC", "ESCA", "STAD", "HNSC", "KIRC",
              "KIRP", "LIHC", "LUSC", "LUAD", "SKCM", "OV", "PAAD")
)

# Fit one Cox model per cancer type for disease-specific survival (DSS) with
# the standardised PGS as exposure, adjusted for age at diagnosis, gender
# (only when exactly two distinct gender values are present in that cancer
# type), PC1-PC4 and tumour status.
#
# Each result row holds the PGS hazard ratio (`hr`), its 95% CI (`lower`,
# `upper`), the p-value, and `se`, the standard error of the log hazard ratio
# (the scale needed for the meta-analysis).
#
# A cancer type whose model fails is skipped with a warning instead of being
# dropped silently. The model data are held in a local variable so the
# global `use` table from the GTEx section is not overwritten.
cancer_types <- unique(prs_survival$type)

cox_rows <- lapply(cancer_types, function(cancer) {
  tryCatch({
    cancer_df <- prs_survival[prs_survival$type == cancer, ]

    covariates <- c(
      "score_scale",
      "age_at_initial_pathologic_diagnosis",
      if (length(unique(cancer_df$gender)) == 2) "gender",
      "PC1", "PC2", "PC3", "PC4",
      "tumor_status"
    )
    cox_formula <- as.formula(
      paste("Surv(DSS.time, DSS) ~", paste(covariates, collapse = " + "))
    )

    res.cox <- coxph(cox_formula, data = cancer_df)
    test <- summary(res.cox)

    data.frame(
      name = cancer,
      p_value = as.numeric(test$coefficients["score_scale", "Pr(>|z|)"]),
      hr = test$coefficients["score_scale", "exp(coef)"],
      se = test$coefficients["score_scale", "se(coef)"],
      lower = test$conf.int["score_scale", "lower .95"],
      upper = test$conf.int["score_scale", "upper .95"]
    )
  }, error = function(e) {
    warning("Cox model failed for ", cancer, ": ", conditionMessage(e),
            call. = FALSE)
    NULL
  })
})

# Failed fits are NULL and are ignored when the rows are combined.
framex <- as.data.frame(bind_rows(cox_rows))


In [None]:
# Show the per-cancer Cox results ordered by ascending hazard ratio.
framex %>% arrange(hr)

In [None]:
library(meta)

# Meta-analysis of the per-cancer PGS hazard ratios. Effects are supplied on
# the log scale (`log(hr)`) together with `se`, and reported as hazard ratios
# (sm = "HR").
meta <- metagen(log(framex$hr), framex$se, sm = "HR", studlab = framex$name)

# Write the forest plot to PDF. `finally` guarantees the device is closed
# even if plotting fails, so later plots are not redirected into a dangling
# PDF device.
pdf(file = "/mnt/grid/janowitz/home/skleeman/cystatinc/figure_prep/figure4cy.pdf", width = 8, height = 6)
invisible(tryCatch(
  forest.meta(meta, leftlabs = c("Cancer code"), leftcols = c("studlab"),
              rightcols = c("effect", "ci"), sortvar = TE),
  finally = dev.off()
))

In [None]:
# Horizontal plot of the per-cancer PGS hazard ratios with 95% CIs and a
# dashed reference line at HR = 1.
#
# The earlier theme(...) customisation and xlab('Group') were removed because
# they had no effect: theme_bw() is a complete theme that replaced the
# preceding theme() call, and xlab("") overrode the earlier x label. The
# `fill` aesthetic on geom_hline() was also ignored. The rendered figure is
# unchanged.
framex %>%
  ggplot(aes(x = name, y = hr, ymin = lower, ymax = upper)) +
  geom_pointrange(aes(col = name)) +
  geom_hline(yintercept = 1, linetype = 2) +
  geom_errorbar(aes(ymin = lower, ymax = upper, col = name), width = 0.5, size = 1) +
  xlab("") +
  ylab("Hazard Ratio (95% Confidence Interval)") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")