# Differential Expression with limma-voom pipeline - Transcripts

In [1]:
suppressMessages({library(SummarizedExperiment)
                  library(data.table)
                  library(limma)
                  library(edgeR)
                  library(dplyr)
                  library(sva)})

## Prepare Data

In [2]:
# Function from jaffelab github
merge_rse_metrics <- function(rse) {
    stopifnot(is(rse, 'RangedSummarizedExperiment'))
    stopifnot(
        c('concordMapRate', 'overallMapRate', 'mitoRate', 'rRNA_rate',
          'totalAssignedGene', 'numMapped', 'numReads', 'numUnmapped',
          'mitoMapped', 'totalMapped') %in%
            colnames(SummarizedExperiment::colData(rse))
    )

    stopifnot(all(sapply(c(
        'concordMapRate', 'overallMapRate', 'mitoRate', 'rRNA_rate',
        'totalAssignedGene', 'numMapped', 'numReads', 'numUnmapped',
        'mitoMapped', 'totalMapped'), function(var) {
            is(colData(rse)[, var], 'List')
        })
    ))

    rse$concordMapRate = mapply(function(r, n) {
        sum(r*n)/sum(n)
    }, rse$concordMapRate, rse$numReads)
    rse$overallMapRate = mapply(function(r, n) {
        sum(r*n)/sum(n)
    }, rse$overallMapRate, rse$numReads)
    rse$mitoRate = mapply(function(r, n) {
        sum(r*n)/sum(n)
    }, rse$mitoRate, rse$numMapped)
    rse$rRNA_rate = mapply(function(r, n) {
        sum(r*n)/sum(n)
    }, rse$rRNA_rate, rse$numMapped)
    rse$totalAssignedGene = mapply(function(r, n) {
        sum(r*n)/sum(n)
    }, rse$totalAssignedGene, rse$numMapped)

    rse$numMapped = sapply(rse$numMapped, sum)
    rse$numReads = sapply(rse$numReads, sum)
    rse$numUnmapped = sapply(rse$numUnmapped, sum)
    rse$mitoMapped = sapply(rse$mitoMapped, sum)
    rse$totalMapped = sapply(rse$totalMapped, sum)
    return(rse)
}

### Load counts

In [3]:
counts.files = '/ceph/projects/brainseq/rnaseq/phase2_Hippo_RiboZero/salmon/preprocess/annotate/_m/annotated_counts.txt'
df.raw0 <- fread(counts.files, header=TRUE, data.table = FALSE)

rownames(df.raw0) = df.raw0[, 1]
annot <- df.raw0[, 1:9]
df.raw <- df.raw0[, c(-1:-9)]

print(dim(df.raw))
print(dim(annot))

counts = as.matrix(df.raw)
counts[1:2, 1:5]

### Load R variable

In [6]:
load("/ceph/projects/v3_phase3_paper/inputs/phase2/_m/count_data/hippo_brainseq_phase2_hg38_rseTx_merged_n447.rda")
rse_tx

class: RangedSummarizedExperiment 
dim: 198093 447 
metadata(0):
assays(1): tpm
rownames(198093): ENST00000456328.2 ENST00000450305.2 ...
  ENST00000387460.2 ENST00000387461.2
rowData names(22): source type ... protein_id ccdsid
colnames(447): R11135 R11137 ... R5766 R5768
colData names(58): SAMPLE_ID RNum ... totalAssignedGene rRNA_rate

In [7]:
if(!all(colnames(assays(rse_tx)$tpm) == colnames(counts)))
{
    counts = counts[, colnames(rse_tx)]
}
dim(counts)

"longer object length is not a multiple of shorter object length"


In [8]:
rse_tx <- SummarizedExperiment(assays=SimpleList(counts=counts),
                               rowData=annot, colData=colData(rse_tx))
rse_tx <- as(rse_tx, "RangedSummarizedExperiment")
rse_tx

class: RangedSummarizedExperiment 
dim: 197311 447 
metadata(0):
assays(1): counts
rownames(197311): ENST00000456328.2 ENST00000450305.2 ...
  ENST00000387460.2 ENST00000387461.2
rowData names(9): transcript_id gene_id ... strand length
colnames(447): R11135 R11137 ... R5766 R5768
colData names(58): SAMPLE_ID RNum ... totalAssignedGene rRNA_rate

### Load genotype data

In [9]:
mds_file = paste0("/ceph/projects/v3_phase3_paper/inputs/genotypes/to_brnum/", 
                  "merge/to_plink/mds/_m/merged.mds")
mds = fread(mds_file) %>% 
    rename("snpPC1"="C1", "snpPC2"="C2", "snpPC3"="C3", 
           "snpPC4"="C4", "snpPC5"="C5") %>%
    mutate_if(is.character, as.factor)
mds[1:2, 1:5]

class: RangedSummarizedExperiment 
dim: 197311 447 
metadata(0):
assays(1): counts
rownames(197311): ENST00000456328.2 ENST00000450305.2 ...
  ENST00000387460.2 ENST00000387461.2
rowData names(9): transcript_id gene_id ... strand length
colnames(447): R11135 R11137 ... R5766 R5768
colData names(68): SAMPLE_ID RNum ... snpPC9 snpPC10

### Subset and recode

In [11]:
keepIndex = which((rse_tx$Dx %in% c("Control", "Schizo")) &  
                  rse_tx$Age > 17 & 
                  rse_tx$Race %in% c("AA", "CAUC"))
rse_tx = rse_tx[, keepIndex]

rse_tx$Dx = factor(rse_tx$Dx, levels = c("Control", "Schizo"))
rse_tx$Sex <- factor(rse_tx$Sex)

rse_tx <- merge_rse_metrics(rse_tx)

rse_tx$ERCCsumLogErr <- mapply(function(r, n) {
        sum(r * n)/sum(n)
    }, rse_tx$ERCCsumLogErr, rse_tx$numReads)
colData(rse_tx)$RIN = sapply(colData(rse_tx)$RIN,"[",1)

pheno = colData(rse_tx) %>% as.data.frame %>% 
    inner_join(mds, by=c("BrNum"="FID"))

### Generate DGEList

In [15]:
x <- DGEList(counts=assays(rse_tx)$counts, 
             genes=rowData(rse_tx), samples=pheno)

### Filtering low counts

In [16]:
design0 <- model.matrix(~Sex, data=x$samples)
keep.x <- filterByExpr(x, design=design0)
x <- x[keep.x, , keep.lib.sizes=FALSE]
print(paste('There are:', sum(keep.x), 'features left!', sep=' '))

[1] "There are: 81495 features left!"


### Normalize counts

In [None]:
x <- calcNormFactors(x, method="TMM")

## Differential Expression Analysis

### Design matrix

In [None]:
mod = model.matrix(~Sex + Dx + Age + mitoRate +
        rRNA_rate + totalAssignedGene + RIN +
        ERCCsumLogErr + overallMapRate +
        snpPC1 + snpPC2 + snpPC3,
        data = x$samples)

colnames(mod) <- gsub("Dx", "", colnames(mod))
colnames(mod) <- gsub("SexM", "Male", colnames(mod))
colnames(mod) <- gsub("\\(Intercept\\)", "Intercept",
                         colnames(mod))

head(mod, 2)

### Calculate SVAs

In [17]:
print(paste('Determining number of surragate variables ...', Sys.time(), sep=' '))
n.sv <- num.sv(x$counts, mod, method="be")
null.model = as.matrix(as.data.frame(mod)[, !(names(as.data.frame(mod)) %in% c("Male"))])

print(paste('Fitting SV model ...', Sys.time(), sep=' '))
svobj <- svaseq(x$counts, mod, null.model, n.sv=n.sv)

[1] "Determining number of surragate variables ... 2020-09-28 12:58:46"
[1] "Fitting SV model ... 2020-09-28 12:59:40"
Number of significant surrogate variables is:  4 
Iteration (out of 5 ):1  2  3  4  5  

### Merge models

In [42]:
print(paste('Adding SV to design matrix ...', Sys.time(), sep=' '))
modQsva <- cbind(mod, svobj$sv)
len.d <- length(colnames(modQsva))
colnames(modQsva)[((len.d - n.sv)+1):len.d] <- make.names(paste0("sv",1:n.sv))

[1] "Adding SV to design matrix ... 2020-09-28 13:10:33"


### Preform voom

In [None]:
v <- voom(x, modQsva, plot=TRUE)

In [None]:
feature = 'transcripts'
dir.create(feature)

save(v, file=paste0(feature,'/voomSVA.RData'))
#load('../_m/dlpfc//voomSVA.RData')

### Calculate residuals

In [None]:
null_model = v$design %>% as.data.frame %>% select(-c("Male")) %>% as.matrix
fit_res <- lmFit(v, design=null_model)
res = v$E - ( fit_res$coefficients %*% t(null_model) )
res_sd = apply(res, 1, sd)
res_mean = apply(res, 1, mean)

### Normalize residuals and write to file

In [None]:
res_norm = (res - res_mean) / res_sd
write.table(res_norm, file=paste0(feature, '/residualized_expression.tsv'),
            sep="\t", quote=FALSE)

### Fit model with limma

In [None]:
fit0 <- lmFit(v, modQsva)
contr.matrix <- makeContrasts(MvsF = Male, 
                              levels=colnames(modQsva))
fit <- contrasts.fit(fit0, contrasts=contr.matrix)

### Calculate differential expression with eBayes

In [None]:
esv <- eBayes(fit)

options(width=200)
top0 <- topTable(esv, coef=1, number=Inf, sort.by="P")
sigTest <- decideTests(esv)

top <- merge(top0, sigTest, by=0)
rownames(top) <- top$Row.names
top <- top[,-1]
top <- top[order(top$P.Value), ]
#top <- subset(top, select=-gencodeTx)
dim(top)

In [None]:
write.table(top, file=paste0(feature, "/diffExpr_maleVfemale_full.txt"),
            sep='\t', row.names=TRUE, quote=FALSE)

### Subset via FDR

In [None]:
top.fdr <- top[top$adj.P.Val<=0.05,]
print(paste('There are:', dim(top.fdr)[1], 'DE features!'))

In [None]:
subset(top.fdr, Symbol == 'XIST')

In [None]:
write.table(top.fdr, file=paste0(feature, "/diffExpr_maleVfemale_FDR05.txt"),
            sep='\t', row.names=TRUE, quote=FALSE)

## Volcano and MA plots

In [None]:
with(top, plot(logFC, -log10(P.Value), pch=20, cex=0.6))
with(subset(top, adj.P.Val<=0.05), points(logFC, -log10(P.Value),
                                          pch=20, col='red', cex=0.6))
with(subset(top, abs(logFC)>0.50), points(logFC, -log10(P.Value),
                                          pch=20, col='orange', cex=0.6))
with(subset(top, adj.P.Val<=0.05 & abs(logFC)>0.50),
     points(logFC, -log10(P.Value), pch=20, col='green', cex=0.6))

In [None]:
pdf(file=paste0(feature, "/volcanoPlot.pdf"), 8, 6)
with(top, plot(logFC, -log10(P.Value), pch=20, cex=0.6))
with(subset(top, adj.P.Val<=0.05), points(logFC, -log10(P.Value),
                                          pch=20, col='red', cex=0.6))
with(subset(top, abs(logFC)>0.50), points(logFC, -log10(P.Value),
                                          pch=20, col='orange', cex=0.6))
with(subset(top, adj.P.Val<=0.05 & abs(logFC)>0.50),
     points(logFC, -log10(P.Value), pch=20, col='green', cex=0.6))
dev.off()

In [None]:
with(top, plot(AveExpr, logFC, pch=20, cex=0.5))
with(subset(top, adj.P.Val<0.05),
     points(AveExpr, logFC, col="red", pch=20, cex=0.5))

In [None]:
pdf(file=paste0(feature, "/MAplot.pdf"), 8, 6)
with(top, plot(AveExpr, logFC, pch=20, cex=0.5))
with(subset(top, adj.P.Val<0.05),
     points(AveExpr, logFC, col="red", pch=20, cex=0.5))
dev.off()

## Repreducibility Information

In [None]:
Sys.time()
proc.time()
options(width = 120)
sessioninfo::session_info()