Program Dependencies:

In [None]:
# Command-line tools used by the pipeline; the version each line is meant to
# provide is noted in its trailing comment.
# `--yes` pre-answers conda's "Proceed ([y]/n)?" confirmation: a notebook cell
# has no interactive stdin, so without it the install stalls at the prompt.
!conda install --yes -c r r-base # (R version 3.6.1)
!conda install --yes -c bioconda hisat2 # Hisat2 version 2.2.1
!conda install --yes -c bioconda samtools # samtools version 1.7
!conda install --yes -c bioconda subread # subread version 2.0.1 including featureCounts

Additional installations in R:

In [None]:
# Bootstrap Bioconductor: BiocManager is installed only when it is not already
# available, then Bioconductor release 3.10 is selected.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install(version = "3.10")

# Bioconductor packages loaded by the differential-expression script,
# installed one at a time in this order.
for (pkg in c("limma", "edgeR")) {
    BiocManager::install(pkg)
}

Import Python packages

In [2]:
import sys,os
from numpy import *
from matplotlib import *
from pylab import *
#from scipy import *

Run hisat2 spliced alignment
forward reads: LNCaP_red_rep6_1.fastq
reverse reads: LNCaP_red_rep6_2.fastq
hisat2-indexed reference genome: Human_hg20/genome_tran  

In [None]:
# Align the rep6 read pair (-1 forward, -2 reverse) against the
# Human_hg20/genome_tran index and write the alignments to LNCaP_rep6.sam.
# Everything hisat2 prints on stderr is redirected into summary.txt.
!hisat2 --dta -p 1 -x Human_hg20/genome_tran -1 LNCaP_red_rep6_1.fastq -2 LNCaP_red_rep6_2.fastq -S LNCaP_rep6.sam 2> summary.txt

Convert from .sam to .bam format using samtools

In [None]:
# SAM -> BAM: same flags as before (-b, -S); the output file is named with -o
# instead of redirecting samtools' stdout.
!samtools view -b -S -o LNCaP_rep6.bam LNCaP_rep6.sam

Sort .bam file using samtools

In [None]:
# Sort the BAM produced in the previous step; the sorted copy is written to
# LNCaP_rep6_sorted.bam and the unsorted file is left in place.
!samtools sort -o LNCaP_rep6_sorted.bam LNCaP_rep6.bam

Make count-table (one sample) using featureCounts
Reference transcriptome annotation: Homo_sapiens.GRCh38.84.gtf

In [None]:
# Count reads from the sorted BAM against the Homo_sapiens.GRCh38.84.gtf
# annotation and write the table to count-1smp.txt.
# NOTE(review): the reads were aligned as pairs (hisat2 -1/-2) but no
# paired-end option is passed to featureCounts here - confirm that this is
# the intended counting mode.
!featureCounts -a Homo_sapiens.GRCh38.84.gtf -o count-1smp.txt -t exon -g gene_id -O -T 1 LNCaP_rep6_sorted.bam

Clean count-table for differential expression

In [None]:
# Clean the single-sample featureCounts table (count-1smp.txt) for
# differential expression.
# fcnts2dseq.py is a local helper script that is not shown in this notebook;
# where it writes its result cannot be told from here.
!python fcnts2dseq.py count-1smp.txt

For differential expression we will use a prepared count-table with 12 samples, 6 LNCaP and 6 RWPE

R-script to perform differential expression in Limma-Voom
Input table: count-table.tsv

In [None]:
##############################################################
### Script for differential expression analysis using Voom ###
##############################################################
#
# Reads count-table.tsv, fits a limma-voom model for the contrast
# LNCaP - RWPE and writes the complete result table to Voom_diffexp.txt.

### Load limma-package (which includes Voom) ###
library(limma)
library(edgeR)

### load data table ###
countTable = read.table("count-table.tsv",header=TRUE,row.names=1)

###########################################################

### create condition ###
# One label per sample column. This assumes the first six columns of the
# table are LNCaP and the last six are RWPE - confirm against the table header.
condition = factor( c(rep("LNCaP", 6), rep("RWPE", 6)) )

# Fail early, with a clear message, if the table does not have one column per label
stopifnot(ncol(countTable) == length(condition))

### create design matrix ###
# No intercept: one column per group, so the contrast below can name the groups
des = model.matrix(~-1+condition)
colnames(des) = levels(condition)

### define contrasts (which groups to compare) ###
# Positive log-fold-change = higher in LNCaP than in RWPE
cmat <- makeContrasts(LNCaP - RWPE, levels=des)

### Normalise count-table ###
dge <- DGEList(counts=countTable)
dge <- calcNormFactors(dge)

## make MDS plot ###
snames = colnames(countTable)
plotMDS(dge, col = as.numeric(condition)) # Group colors

## Cutoff cpm on dge
# Keep genes whose largest CPM over all samples reaches the cutoff.
# Subsetting with a logical mask (rather than dge[-drop,]) stays correct when
# no gene falls below the cutoff: negative indexing with an empty index vector
# would remove every row.
cutoff <- 1
keep <- apply(cpm(dge), 1, max) >= cutoff
drop <- which(!keep)
dge <- dge[keep,]
dim(dge) # number of genes left (21802)

### Fit voom model and draw the mean-variance trend plot ###
# A single call: the plot is a by-product of the same fit, so fitting once
# without and once with plot=TRUE only repeated the work.
v <- voom(dge,design=des,plot=TRUE)

### fit model
fit <- lmFit(v,design=des)
fit <- contrasts.fit(fit, cmat)
fit <- eBayes(fit)

### find differentially expressed transcripts ###
a <- decideTests(fit,adjust.method="fdr", p.value=0.05, lfc=0)

### summary of result-table ###
sma = summary(a)
print(sma)

# n is the row count of the unfiltered table, so every gene that is left
# after filtering is returned
dmm <- dim(countTable)
res <- topTable(fit,n=dmm[1],coef=1)

### write table with results from differential expression ###
write.table(res,file="Voom_diffexp.txt",sep="\t",row.names=TRUE,col.names=TRUE,quote=FALSE)


Add gene names alongside the Ensembl IDs in the differential expression table.
Use input conversion table: ENS_to_gene-names.txt

In [None]:
# Run the gene-symbol helper on Voom_diffexp.txt and capture its stdout in
# Voom_diffexp_gnms.txt.
# The notebook text names ENS_to_gene-names.txt as the conversion table; it is
# not passed on the command line, so presumably the script looks it up itself
# (verify in replace_ENS_with_gene_symbol_py3.py).
!python replace_ENS_with_gene_symbol_py3.py Voom_diffexp.txt > Voom_diffexp_gnms.txt

Create an MA plot and a volcano plot in Python

In [None]:
# Load differential expression table
# Each non-empty line is split on tabs into one row; the rows are stacked into
# an array of strings (2-D as long as every row has the same number of fields).
# The `with` block closes the file handle once the lines have been read.
with open('Voom_diffexp_gnms.txt','r') as infile:
    lines = infile.readlines()
tbl = []
for line in lines:
    # A blank line (e.g. at the end of the file) would add a one-field row and
    # make the array ragged, so it is skipped.
    if not line.strip():
        continue
    cols = line.split('\t')
    cols[-1] = cols[-1].strip()
    tbl.append(cols)
tbl = array(tbl)

In [None]:
# Separate the header row from the data rows, then pull out the columns used
# by the plots.
# NOTE: `tbl` is overwritten with its own tail, so running this cell a second
# time drops one more row - re-run the loading cell first.
hdr, tbl = tbl[0], tbl[1:]

# Text columns: gene names and Ensembl ids
gnms, ensid = tbl[:,0], tbl[:,1]

# Numeric columns, picked by position (presumably logFC, AveExpr, P.Value and
# adj.P.Val from topTable - confirm against `hdr`)
fc, avgx, pval, qval = (tbl[:,k].astype(double) for k in (2, 3, 5, 6))

# -log10 transforms of the p- and q-values
mlogp, mlogq = -log10(pval), -log10(qval)

In [None]:
# Find genes with q-value <= cutoff (default 0.05), split by sign of the
# log-fold-change - for MAplot
ctf = 0.05
id0 = where(qval <= ctf)[0]      # significant at the cutoff (uses ctf, not a second literal)
id01 = where(fc > 0)[0]
id02 = where(fc < 0)[0]
id1 = intersect1d(id0,id01)      # significant and logFC > 0
id2 = intersect1d(id0,id02)      # significant and logFC < 0
# Legend labels: both groups are below the cutoff and differ only in the
# direction of change (the old labels 'q > ...' / 'q < ...' described neither).
str1 = 'q <= '+str(ctf)+', logFC > 0'
str2 = 'q <= '+str(ctf)+', logFC < 0'
idd = union1d(id1,id2)

In [None]:
# MA plot: average log-expression on x, log-fold-change on y.
# All genes are drawn as small black dots; the two selections prepared in the
# previous cell (id1/str1 and id2/str2) are overlaid in red and blue.
ff = figure()
ff.set_size_inches(12, 10)
plot(avgx, fc, 'k.', ms=2)
for _sel, _fmt, _lab in ((id1, 'r.', str1), (id2, 'b.', str2)):
    plot(avgx[_sel], fc[_sel], _fmt, ms=10, label=_lab)
ylabel('log-fold-change', size=20)
xlabel('Average log-expression', size=20)
fsz = matplotlib.font_manager.FontProperties(size=20)
legend(prop=fsz)
#plt.savefig('MA_plot.pdf',format='pdf')

In [None]:
# Select the first NN rows of the table (default 100) and split them by the
# sign of the log-fold-change - for Volcano-plot.
# Taking the first rows as the "top" genes relies on the table already being
# ranked (presumably topTable's ordering - confirm that the gene-name script
# keeps it).
NN = 100
id0 = arange(NN)
id01, id02 = where(fc > 0)[0], where(fc < 0)[0]
id1, id2 = intersect1d(id0, id01), intersect1d(id0, id02)
str1 = f'top {NN} up'
str2 = f'top {NN} down'
idd = union1d(id1, id2)

In [None]:
# Volcano plot: log2 fold change on x, -log10(p-value) on y.
# All genes are small black dots; the selections from the previous cell
# (id1/str1 and id2/str2) are overlaid in red and blue.
ff = figure()
ff.set_size_inches(12, 10)
plot(fc, mlogp, 'k.', ms=2)
for _sel, _fmt, _lab in ((id1, 'r.', str1), (id2, 'b.', str2)):
    plot(fc[_sel], mlogp[_sel], _fmt, ms=10, label=_lab)
ylabel('-log10(P-Value)', size=20)
xlabel('Log2 Fold Change', size=20)
fsz = matplotlib.font_manager.FontProperties(size=20)
legend(prop=fsz)
## add gene-names next to every selected point
for _x, _y, _name in zip(fc[idd], mlogp[idd], gnms[idd]):
    tt = text(_x, _y, _name, fontsize=8)
#plt.savefig('Volcano_plot.pdf',format='pdf')