# ... RNA-Seq experiments

I'm mapping the `fastq` reads to the `hg38/gencode.v34` transcriptome index using salmon.

In [27]:
cat fastq/download.sh | grep "96"

rsync -av rsync://ftp.sra.ebi.ac.uk/vol1/fastq/SRR321/006/SRR3214296/SRR3214296.fastq.gz .


In [13]:
cat scripts/salmon.sh

# Quantify every *fastq.gz file of a directory with `salmon quant`
# (reads are passed with -r, i.e. as unmated reads).
#
# Usage: bash salmon.sh PDIR FASTQDIR quantsDir INDEX JOBS
#   PDIR       working directory; FASTQDIR and quantsDir are resolved after cd-ing here
#   FASTQDIR   directory that holds the *fastq.gz files
#   quantsDir  output directory; one sub-directory per sample is written
#   INDEX      salmon index directory
#   JOBS       number of threads handed to salmon (-p)
PDIR=$1
FASTQDIR=$2
quantsDir=$3
INDEX=$4
JOBS=$5

# Stop instead of quantifying in the wrong directory when PDIR is invalid.
cd "$PDIR" || exit 1
mkdir -p "$quantsDir"

for f in "$FASTQDIR"/*fastq.gz; do
	# When nothing matches, the unexpanded pattern itself is iterated: skip it.
	[ -e "$f" ] || continue
	samp=$(basename "$f")
	# Strip the extension as a suffix only (not the first occurrence anywhere).
	samp=${samp%.fastq.gz}
	echo "Processing sample ${samp}"
	salmon quant -i "$INDEX" \
		-l A -r "$f" -p "$JOBS" --validateMappings -o "$quantsDir/$samp"
done


In [None]:
%%bash 
# Create the same output directory that is passed to salmon.sh below
# (exp/quants); `-p` also creates the parent `exp`.
mkdir -p exp/quants

# Run in the background, detached from the notebook; stdout and stderr
# are collected in salmon.out.
nohup bash scripts/salmon.sh \
. \
fastq \
exp/quants \
~/genomes/hg38/gencode.v34/salmon_index/ \
40 &> salmon.out&

In [33]:
cat SRR_Acc_List.txt | wc -l

96


In [22]:
!mkdir fastq_missing

In [23]:
cp fastq/SRR3214296.fastq.gz fastq_missing/

In [25]:
!bash scripts/salmon.sh \
. \
fastq_missing/ \
exp/quants \
~/genomes/hg38/gencode.v34/salmon_index/ \
40

Processing sample SRR3214296
Version Info: Could not resolve upgrade information in the alotted time.
Check for upgrades manually at https://combine-lab.github.io/salmon
### salmon (mapping-based) v1.2.1
### [ program ] => salmon 
### [ command ] => quant 
### [ index ] => { /data_gilbert/home/aarab/genomes/hg38/gencode.v34/salmon_index/ }
### [ libType ] => { A }
### [ unmatedReads ] => { fastq_missing//SRR3214296.fastq.gz }
### [ threads ] => { 40 }
### [ validateMappings ] => { }
### [ output ] => { exp/quants/SRR3214296 }
Logs will be written to exp/quants/SRR3214296/logs
[00m[2024-03-04 18:41:31.225] [jointLog] [info] setting maxHashResizeThreads to 40
[00m[00m[2024-03-04 18:41:31.225] [jointLog] [info] Fragment incompatibility prior below threshold.  Incompatible fragments will be ignored.
[00m[00m[2024-03-04 18:41:31.225] [jointLog] [info] Usage of --validateMappings implies use of minScoreFraction. Since not explicitly specified, it is being set to 0.65
[00m[00m[2024-03-

In [16]:
ls -l exp/quants/*/quant.sf

-rw-rw-r--. 1 aarab aarab 10442600 Mar  4 15:49 exp/quants/SRR3214253/quant.sf
-rw-rw-r--. 1 aarab aarab 10411548 Mar  4 15:49 exp/quants/SRR3214254/quant.sf
-rw-rw-r--. 1 aarab aarab 10440634 Mar  4 15:50 exp/quants/SRR3214255/quant.sf
-rw-rw-r--. 1 aarab aarab 10448874 Mar  4 15:50 exp/quants/SRR3214256/quant.sf
-rw-rw-r--. 1 aarab aarab 10457737 Mar  4 15:52 exp/quants/SRR3214257/quant.sf
-rw-rw-r--. 1 aarab aarab 10441382 Mar  4 15:52 exp/quants/SRR3214258/quant.sf
-rw-rw-r--. 1 aarab aarab 10454963 Mar  4 15:53 exp/quants/SRR3214259/quant.sf
-rw-rw-r--. 1 aarab aarab 10436567 Mar  4 15:54 exp/quants/SRR3214260/quant.sf
-rw-rw-r--. 1 aarab aarab 10450740 Mar  4 15:55 exp/quants/SRR3214261/quant.sf
-rw-rw-r--. 1 aarab aarab 10459171 Mar  4 15:56 exp/quants/SRR3214262/quant.sf
-rw-rw-r--. 1 aarab aarab 10432308 Mar  4 15:56 exp/quants/SRR3214263/quant.sf
-rw-rw-r--. 1 aarab aarab 10441773 Mar  4 15:57 exp/quants/SRR3214264/quant.sf
-rw-rw-r--. 1 aarab aarab 10451287 Mar  4 15:58 exp/

In [None]:
cat salmon.out | grep "Mapping rate" | wc -l

___

## Load packages and functions

In [1]:
# %load_ext rpy2.ipython

In [2]:
# %%R 
suppressMessages(suppressWarnings(library (GenomicFeatures)))
suppressMessages(suppressWarnings(library (tximport)))
suppressMessages(suppressWarnings(library (tidyverse)))
suppressMessages(suppressWarnings(library (ggplot2)))
suppressMessages(suppressWarnings(library (ggrepel)))
suppressMessages(suppressWarnings(library (DESeq2)))
suppressMessages(suppressWarnings(library (patchwork)))
suppressMessages(suppressWarnings(library (BiocParallel)))
suppressMessages(suppressWarnings(library(gridExtra)))
library(grid)
library(ggthemes)

register(MulticoreParam(18))

In [3]:
source("../scripts/util.R")

- https://medium.com/analytics-vidhya/ggplot2-themes-for-publication-ready-plots-including-dark-themes-9cd65cc5a7e3

- https://rpubs.com/Koundy/71792

In [4]:
# Filter non-informative genes.
#
# dds_in:    object supporting `counts()`, `ncol()`, `nrow()` and row subsetting
# min_count: a gene is kept only when its count is strictly greater than
#            this value in every column of `dds_in`
# Returns the row subset of `dds_in` and prints how many genes were kept.
filter_low_counts <- function (dds_in, min_count=10){
    n_samples <- ncol(dds_in)
    # number of samples in which each gene exceeds the threshold
    n_passing <- rowSums(counts(dds_in) > min_count)
    # alternative rule: at least 3 samples with a count of 10 or higher
    # keep <- rowSums(counts(dds) >= 10) >= 3
    dds_out <- dds_in[n_passing == n_samples, ]
    print(paste(nrow(dds_in), 'genes filtered to ->', nrow(dds_out), 'genes!'))
    return(dds_out)
}


# Draw a PC1 vs PC2 scatter plot from `plotPCA(..., returnData=TRUE)`.
#
# vsd:     object passed to `plotPCA()`
# colData: accepted but not used by this function
# labels:  text drawn next to each point with `geom_text_repel()`
# gr:      passed to `plotPCA()` as `intgroup`; points are coloured by the
#          resulting `group` column
# title:   plot title
# leg:     accepted but not used; the legend position is fixed to 'top'
# Returns the ggplot object.
plot_PCA <- function(vsd, colData, labels='', gr=c('cond'), title='', leg="none"){
    pca_df <- plotPCA(vsd, intgroup=gr, returnData=TRUE)
    pct <- round(100 * attr(pca_df, "percentVar"))
    x_lab <- paste0("PC1: ", pct[1], "% variance")
    y_lab <- paste0("PC2: ", pct[2], "% variance")

    pca <- ggplot(pca_df, aes(PC1, PC2)) +
        # translucent filled disc plus a black outline for every sample
        geom_point(aes(colour=group), alpha=4/10, size=12) +
        geom_point(shape=1, size=12, colour="black") +
        geom_text_repel(aes(label=labels), box.padding=1.5, max.overlaps=Inf) +
        xlab(x_lab) +
        ylab(y_lab) +
        ggtitle(title) +
        scale_colour_Publication() +
        theme_Publication(legend.position='top') +
        guides(size='none') +
        theme(legend.title=element_blank())
    return(pca)
}

In [5]:
# Write `res` to the file `name_it` as an unquoted, tab-separated table.
# `col` / `row` are forwarded to write.table() as col.names / row.names
# (both default to FALSE, i.e. no header and no row names).
write_Result <- function(res, name_it, col=FALSE, row=FALSE){
    write.table(
        x = res, file = name_it,
        sep = "\t", quote = FALSE,
        row.names = row, col.names = col
    )
}


# Variance-stabilise `dds`, remove the effect of `vsd$reps` with
# limma::removeBatchEffect(), and return what `out` asks for.
#
# dds:    input for varianceStabilizingTransformation(); must carry a `reps` column
# gr:     forwarded to plot_PCA() (see review note below)
# out:    one of
#           'plot'  - before/after PCA plots combined with `+`
#           'plot1' - PCA plot after correction only
#           'vsd'   - the transformed object with the corrected assay
#           'cbc'   - the corrected assay matrix
# labels: point labels forwarded to plot_PCA()
# title:  prefix for the plot titles
# Stops with an error when `out` is not one of the values above.
correct_batch <- function (dds, gr, out, labels='', title=''){
    # Validate first so a typo in `out` fails before the expensive
    # transformation instead of silently returning NULL afterwards.
    valid_out <- c('plot', 'plot1', 'vsd', 'cbc')
    if (!is.character(out) || length(out) != 1 || !(out %in% valid_out)) {
        stop("`out` must be one of: ", paste(valid_out, collapse=', '), call.=FALSE)
    }

    vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
    # NOTE(review): `gr` is passed positionally, so it binds to plot_PCA's
    # second parameter (`colData`) and not to its `gr` (intgroup) parameter.
    # Confirm whether `gr = gr` was intended.
    p0 <- plot_PCA(vsd, gr, labels=labels, title=paste0(title, ' Before removeBatchEffect'))

    # batch is same as time
    assay(vsd) <- limma::removeBatchEffect(assay(vsd), vsd$reps)
    p1 <- plot_PCA(vsd, gr, labels=labels, title=paste0(title, ' After removeBatchEffect'))

    switch(
        out,
        plot  = p0 + p1,
        plot1 = p1,
        vsd   = vsd,
        cbc   = assay(vsd)
    )
}


# Annotate a results table with gene identifiers.
#
# res: table whose row names are gene ids (looked up in the global
#      `gene2name` data frame, which maps gene_id row names to `gene_name`)
# Returns a data.frame with `gene_id` and `gene_name` as the first two
# columns, followed by all original columns.
ann_Result <- function(res){
    ids <- rownames(res)
    res %>%
        data.frame %>%
        add_column(gene_name = gene2name[ids, 'gene_name']) %>%
        add_column(gene_id = ids) %>%
        # namespace-qualified: AnnotationDbi also exports `select`, and which
        # one a bare `select` resolves to depends on package attach order
        dplyr::select(gene_id, gene_name, everything())
}

# Plot the counts of one gene per `cond` level on a log10 y axis.
#
# dds:       object passed to plotCounts(); grouped by its `cond` column
# gene_id:   gene passed to plotCounts()
# gene_name: used as the plot title
# Returns the ggplot object with theme_Publication() applied last.
plot_gene_counts <- function (dds, gene_id, gene_name){
    counts_df <- plotCounts(dds, gene_id, intgroup = c("cond"), returnData = TRUE)

    base_plot <- ggplot(counts_df, aes(x = cond, y = count, color = cond))
    p <- base_plot +
        geom_point(size=5, alpha=8/10) +
        stat_summary(fun=mean, geom="line") +
        scale_y_log10() +
        theme_bw() +
        ggtitle(gene_name)

    return (p + theme_Publication())
}

## Load annotations

In [6]:
# %%R
# Annotation file used for every lookup table below
GTF <- '~/genomes/hg38/gencode.v34/gencode.v34.annotation.gtf'
gtf <- rtracklayer::import(GTF)

# gene_id (row names) -> gene_name, built from the "gene" records
gene2name <- gtf[gtf$type == "gene"] %>%
    data.frame %>%
    column_to_rownames('gene_id') %>%
    dplyr::select('gene_name')

# transcript -> gene table (TXNAME, GENEID) taken from a TxDb of the same file
txdb <- makeTxDbFromGFF(file = GTF, organism = 'Homo sapiens')
k <- keys(txdb, keytype = "TXNAME")
tx2gene <- AnnotationDbi::select(
    txdb,
    keys = k, columns = "GENEID", keytype = "TXNAME"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
“The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored.”
OK

'select()' returned 1:1 mapping between keys and columns



In [7]:
# %%R 
# transcript_id (row names) -> transcript_name, from the "transcript" records
tx2name <- gtf[gtf$type == "transcript"] %>%
    data.frame %>%
    column_to_rownames('transcript_id') %>%
    dplyr::select('transcript_name')

## Load salmon quant files

List every salmon quant file

In [8]:
# %%R
# Collect every file named exactly `quant.sf` under ./exp/quants.
# `pattern` is a regular expression, so it is anchored and the dot escaped;
# otherwise names such as `quant.sf.bak` would match as well.
files <- list.files(
    path = './exp/quants', pattern = '^quant\\.sf$',
    full.names = TRUE, recursive = TRUE
)
# Name each path by the part between ./exp/quants/ and /quant.sf
names(files) <- sub('^\\./exp/quants/(\\S+)/quant\\.sf$', '\\1', files)

In [125]:
# txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut=TRUE)

# txi.gene <- summarizeToGene(txi, tx2gene, ignoreAfterBar= TRUE)

### Define sample sheet

In [172]:
# Sample sheet: one row per run (row names = `Run`), restricted to the
# columns used downstream.
meta <- read.table('SraRunTable.txt', sep = ',', header = TRUE) %>%
    column_to_rownames('Run') %>%
    # namespace-qualified: AnnotationDbi also exports `select`
    dplyr::select(c('source_name','induction','Time'))

# Indexing by a run that is absent from the table yields an all-NA row,
# which later filters drop without notice; make that visible.
if (!all(names(files) %in% rownames(meta))) {
    warning(
        'Runs with quant files but no row in SraRunTable.txt: ',
        paste(setdiff(names(files), rownames(meta)), collapse = ', '),
        call. = FALSE
    )
}

# Keep the rows in the same order as the quant files
meta <- meta[names(files),]

meta <- meta %>%
    # mutate(condition=paste(source_name,induction,Time, sep = '_')) %>%
    mutate(condition=source_name) %>%
    dplyr::select(-c(source_name)) %>%
    mutate(Time=str_replace(Time,' hr','h')) %>%
    # str_replace() only rewrites the first match in each value
    mutate(condition=str_replace(condition,' ','_')) %>%
    mutate(condition=str_replace(condition,'-',''))

#### Make `DESeq` object and run test

In [244]:
# Import salmon quantifications for the samples in `me`, fit a DESeq2 model
# with design `~Time`, and write one pair of result tables per coefficient.
#
# me:        sample sheet; row names select entries of the global `files`
#            vector and it must contain a `Time` column
# cond_name: prefix for the result names and output file names
#            (default '' = no prefix)
#
# Uses the globals `files` and `tx2gene` and the helpers ann_Result() and
# write_Result(). For every coefficient except the first entry of
# resultsNames(dds) it writes into the existing `exp/` directory:
#   exp/res_<cond_name><coef>_delta_exp.txt        gene_name, log2FoldChange (NA -> 0)
#   exp/res_<cond_name><coef>_delta_exp_table.txt  full annotated table
# NOTE(review): prefix and coefficient are joined without a separator
# (e.g. 'MonocyteTime_12h_vs_0h') - confirm this naming is intended.
#
# Returns the fitted DESeqDataSet invisibly.
run_deseq <- function(me, cond_name=''){
    txi <- tximport(
        files[rownames(me)],
        type = "salmon", tx2gene = tx2gene, txOut=TRUE
    )

    txi.gene <- summarizeToGene(txi, tx2gene, ignoreAfterBar= TRUE)

    dds <- DESeqDataSetFromTximport(
        txi.gene,
        me,
        design=~Time
    )

    dds <- estimateSizeFactors(dds)

    dds <- DESeq(dds)

    RES <- list()

    # All coefficients but the first one, however many time points `me`
    # contains (a fixed 2:8 range yields NA names when there are fewer).
    for (res_name in resultsNames(dds)[-1]){
        print(res_name)

        res <- results(dds, name = res_name)

        summary(res)

        RES[[paste0(cond_name, res_name)]] <- res
    }

    # temp file for running iPAGE
    for (name in names(RES)){
        print (name)
        RES[[name]] %>% ann_Result %>%
            mutate(log2FoldChange = replace_na(log2FoldChange, 0)) %>% remove_rownames %>%
            dplyr::select('gene_name','log2FoldChange') %>%
            write_Result(paste0('exp/res_', name, '_delta_exp.txt'), col=TRUE)
    }

    # full annotated result tables, sorted by gene name
    for (name in names(RES)){
        print (name)
        RES[[name]] %>% ann_Result %>%
            arrange(gene_name) %>%
            dplyr::select('gene_id','gene_name','log2FoldChange','pvalue',everything()) %>%
            write_Result(paste0('exp/res_', name, '_delta_exp_table.txt'), col=TRUE)
    }

    invisible(dds)
}

### Monocyte

In [249]:
# HL60 and Monocyte samples with the listed inductions; outputs are
# prefixed with 'Monocyte'
run_deseq(
    cond_name = 'Monocyte',
    me = meta %>%
        filter(
            condition %in% c('HL60','Monocyte'),
            induction %in% c('None','PMA','DMSO/ATRA','Vitamin D3')
        )
)

reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 
9 
10 
11 
12 
13 
14 
15 
16 
17 
18 
19 
20 
21 
22 
23 


summarizing abundance

summarizing counts

summarizing length

“some variables in design formula are characters, converting to factors”
using counts and average transcript lengths from tximport

using 'avgTxLength' from assays(dds), correcting for library size

using pre-existing normalization factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



[1] "Time_120h_vs_0h"

out of 37715 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 925, 2.5%
LFC < 0 (down)     : 472, 1.3%
outliers [1]       : 629, 1.7%
low counts [2]     : 19116, 51%
(mean count < 5)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

[1] "Time_12h_vs_0h"

out of 37715 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 117, 0.31%
LFC < 0 (down)     : 36, 0.095%
outliers [1]       : 629, 1.7%
low counts [2]     : 21942, 58%
(mean count < 11)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

[1] "Time_24h_vs_0h"

out of 37715 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up)       : 529, 1.4%
LFC < 0 (down)     : 381, 1%
outliers [1]       : 629, 1.7%
low counts [2]     : 18410, 49%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

[1] "Time_3h_vs_0h"


### Neutrophil

In [None]:
# HL60 and Neutrophil samples with the listed inductions.
# `cond_name` is supplied explicitly: run_deseq() uses it to build the
# result names and output file names.
dds_Neutrophil <- run_deseq(
    meta %>%
        filter(condition %in% c('HL60','Neutrophil')) %>%
        filter(induction %in% c('None','PMA','DMSO/ATRA','Vitamin D3')),

    'Neutrophil'
)

### 

In [235]:
# iPAGE input: for every entry of RES write a two-column table
# (gene_name, log2FoldChange) with missing fold changes set to 0
for (name in names(RES)){
    print(name)
    write_Result(
        RES[[name]] %>%
            ann_Result %>%
            mutate(log2FoldChange = replace_na(log2FoldChange, 0)) %>%
            remove_rownames %>%
            dplyr::select('gene_name','log2FoldChange'),
        paste0('exp/res_', name, '_delta_exp.txt'),
        col = TRUE
    )
}

[1] "Neutrophil_Time_120h_vs_0h"
[1] "Neutrophil_Time_12h_vs_0h"
[1] "Neutrophil_Time_24h_vs_0h"
[1] "Neutrophil_Time_3h_vs_0h"
[1] "Neutrophil_Time_48h_vs_0h"
[1] "Neutrophil_Time_6h_vs_0h"
[1] "Neutrophil_Time_96h_vs_0h"


In [236]:
# Full annotated table for every entry of RES, sorted by gene name, with
# gene_id, gene_name, log2FoldChange and pvalue as the leading columns
for (name in names(RES)){
    print(name)
    write_Result(
        RES[[name]] %>%
            ann_Result %>%
            arrange(gene_name) %>%
            dplyr::select('gene_id','gene_name','log2FoldChange','pvalue',everything()),
        paste0('exp/res_', name, '_delta_exp_table.txt'),
        col = TRUE
    )
}

[1] "Neutrophil_Time_120h_vs_0h"
[1] "Neutrophil_Time_12h_vs_0h"
[1] "Neutrophil_Time_24h_vs_0h"
[1] "Neutrophil_Time_3h_vs_0h"
[1] "Neutrophil_Time_48h_vs_0h"
[1] "Neutrophil_Time_6h_vs_0h"
[1] "Neutrophil_Time_96h_vs_0h"


In [230]:
# RES = list(
#     res_Macrophage_vs_HL_60, 
#     res_Monocyte_vs_HL_60,
#     res_Neutrophil_vs_HL_60
# )

# names (RES) <- c('Macrophage_vs_HL_60', 'Monocyte_vs_HL_60','Neutrophil_vs_HL_60')

#### Save normalized counts

In [203]:
# dds1 <- estimateSizeFactors(dds)

# ncu <- counts(dds1, normalized=TRUE) %>% 
#     data.frame %>% rownames_to_column('gene_id') %>% 
#     add_column(gene_name=gene2name[rownames(dds1),], .after='gene_id')

# write.table(
#     ncu,'exp/deseq2_norm.txt', sep="\t", quote=FALSE, col.names=TRUE
# )

# ___

# counts.raw <- counts(dds1, normalized=FALSE) %>% 
#     data.frame %>% rownames_to_column('gene_id') %>% 
#     add_column(gene_name=gene2name[rownames(dds1),], .after='gene_id')

# write.table(
#     counts.raw,'exp/deseq2_raw_counts.txt',
#     sep="\t", quote=FALSE, col.names=TRUE
# )

# 

In [103]:
# Collect the three contrasts against HL-60 in one named list
RES <- list(
    Macrophage_vs_HL_60 = res_Macrophage_vs_HL_60,
    Monocyte_vs_HL_60   = res_Monocyte_vs_HL_60,
    Neutrophil_vs_HL_60 = res_Neutrophil_vs_HL_60
)

In [104]:
# iPAGE input: for every entry of RES write a two-column table
# (gene_name, log2FoldChange) with missing fold changes set to 0
for (name in names(RES)){
    print(name)
    write_Result(
        RES[[name]] %>%
            ann_Result %>%
            mutate(log2FoldChange = replace_na(log2FoldChange, 0)) %>%
            remove_rownames %>%
            dplyr::select('gene_name','log2FoldChange'),
        paste0('exp/res_', name, '_delta_exp.txt'),
        col = TRUE
    )
}

[1] "Macrophage_vs_HL_60"
[1] "Monocyte_vs_HL_60"
[1] "Neutrophil_vs_HL_60"


In [105]:
# Full annotated table for every entry of RES, sorted by gene name, with
# gene_id, gene_name, log2FoldChange and pvalue as the leading columns
for (name in names(RES)){
    print(name)
    write_Result(
        RES[[name]] %>%
            ann_Result %>%
            arrange(gene_name) %>%
            dplyr::select('gene_id','gene_name','log2FoldChange','pvalue',everything()),
        paste0('exp/res_', name, '_delta_exp_table.txt'),
        col = TRUE
    )
}

[1] "Macrophage_vs_HL_60"
[1] "Monocyte_vs_HL_60"
[1] "Neutrophil_vs_HL_60"


### 

In [106]:
# res_Macrophage_vs_HL_60 %>% data.frame %>% 
#     filter(log2FoldChange>4 & pvalue < 0.001) %>% ann_Result

# Session Info

In [107]:
# Print the R version, platform and attached packages of this session
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /data_gilbert/home/aarab/anaconda3/envs/deseq2/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] grid      stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] ggthemes_4.2.4              gridExtra_2.3              
 [3] BiocParallel_1.28.0         patchwork_1.1.1            
 [5] DESeq2_1.34.0               SummarizedExperiment_1.24.0
 [7] MatrixGenerics_1.6.0        matrixStats_0.61.0         
 [9] ggrepel_0.9.1    

In [108]:
# Print the current date and time
date()