In [1]:
# Print the current working directory.
getwd()

## Creates binarized Seurat objects, integrates conditions and annotates genes by nearby peaks

In [2]:
# ---- Input / output locations ----
# Output directory of an earlier run, kept for reference:
# outdir <- "/mnt/md0/isshamie/Projects/Mito_Trace/output/pipeline/DUPI_april08_2021/MTblacklist_mtasnucl_Bonly/data/annotation/gff_hg38_1_2/mergedSamples/"
outdir <- "/mnt/md0/isshamie/Projects/Mito_Trace/output/CHIP_aggr/data/annotation/gff_A2/mergedSamples"

# The annotation file is read from <annotations_indir>/<gff_id>.annotationGranges.rds
gff_id <- "hg38_1_2"
annotations_indir <- "/mnt/md0/isshamie/Projects/Mito_Trace/data/processed/annotation_granges/"

# Disabled custom gene-annotation settings:
# gene_type <- "custom" # "UCSC" or custom
# gene_file <- "/data/Mito_Trace/data/processed/genomes/mtMasked/GRCh38_MT_blacklist/genes/genes.gtf"

# ---- Analysis parameters ----
nTop <- 25000
lsi_start_comp <- 2       # first LSI component used for UMAP / anchor finding
to.filt.cells <- "TRUE"   # string flag; converted to a logical further down
to.qc <- FALSE

# ---- QC thresholds handed to filtCells() ----
min_peak_region_fragments <- 10
max_peak_region_fragments <- 15000
min_pct_reads_in_peaks <- 15
max_blacklist_ratio <- 0.05
max_nucleosome_signal <- 4
min_TSS_enrichment <- 0.2

# Number of workers for the future plan
cores <- 24

input_dirs <- ""


In [3]:
# Load the annotation object stored as "<gff_id>.annotationGranges.rds" in annotations_indir.
annotations <- readRDS(file.path(annotations_indir, paste0(gff_id, ".annotationGranges.rds")))

In [4]:
# Convert the filtering flag to a logical.
# "T", "TRUE" (and a logical TRUE, which matches as "TRUE") enable filtering;
# every other value, including NA, disables it instead of aborting the `if`.
to.filt.cells <- to.filt.cells %in% c("T", "TRUE")


## Add each integrate

In [5]:
library(repr)
options(repr.plot.width=12, repr.plot.height=12)

library(GenomicRanges)
library(Seurat)
library(Signac)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
set.seed(1234)
library(data.table)
library(magrittr)
library(cowplot)
library(metap)
library(dplyr)
library("patchwork")
library(future)
library(Rsamtools)


# Configure parallel execution for the future framework.
# The "multiprocess" strategy is deprecated in the future package; it resolved to
# forked workers where forking is supported and to background R sessions
# otherwise, so make that choice explicitly.
if (future::supportsMulticore()) {
    plan("multicore", workers = cores)
} else {
    plan("multisession", workers = cores)
}
# Allow up to ~16 GB of globals to be exported to workers.
options(future.globals.maxSize = 16000 * 1024^2)
#options(future.globals.maxSize = 50000 * 1024^2) # for 50 Gb RAM

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomeInfoDb

Attaching SeuratObject

Loading required package: ensembldb

Loading required package: GenomicFeatures

Loading required pa

In [6]:
# Load the per-sample objects saved as allSamples.rds in outdir.
# Later cells index this as a list and use names(allSE) as sample labels.
allSE <- readRDS(file.path(outdir, paste0("allSamples.rds")))

## Filter cells if filtering is enabled (`to.filt.cells`)

# Merge:
- get umap and clusters

In [7]:
# Prefix every cell barcode with "<list name>_<orig.ident>" and store the same
# label in the `expID` metadata column, so cells stay unique after merging.
# seq_along() keeps the loop a no-op for an empty list (1:length() would not).
for (i in seq_along(allSE)) {
    print(i)
    # Build the sample label once and reuse it for both the prefix and expID.
    exp_id <- paste(names(allSE)[[i]], allSE[[i]]$orig.ident[[1]], sep = "_")
    allSE[[i]] <- RenameCells(allSE[[i]], add.cell.id = exp_id)
    allSE[[i]]$expID <- exp_id
}

[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8


In [8]:
# Number of samples loaded.
length(allSE)

In [10]:
### Plot QC first

# Overrides the to.qc value set in the parameter cell.
# NOTE(review): to.qc is not read anywhere in the visible code — confirm whether it is still needed.
to.qc = TRUE

# Draw violin plots of the five per-cell QC metrics for one sample.
#
# se: Seurat object carrying the QC metadata columns listed below and `expID`.
# Returns the plot grid, titled with the sample's first `expID` value.
vPlot <- function(se) {
    qc_features <- c("pct_reads_in_peaks", "peak_region_fragments",
                     "TSS.enrichment", "blacklist_ratio", "nucleosome_signal")
    violins <- VlnPlot(object = se, features = qc_features, pt.size = 0.1, ncol = 5)

    # Attach the overall title first, then apply the title theme to the grid.
    titled <- violins + plot_annotation(title = se$expID[[1]])
    titled & theme(plot.title = element_text(hjust = 0.5, size = 15))
}

# Compute per-cell QC metrics for one sample and save two diagnostic plots.
#
# se:                Seurat object with `peak_region_fragments`, `passed_filters`,
#                    `blacklist_region_fragments` and `expID` metadata.
# tss_cutoff:        TSS enrichment value separating the "High"/"Low" groups.
# nucleosome_cutoff: nucleosome signal value separating the two "NS" groups.
#
# Reads the globals `annotations` and `outdir`. Writes
# "QC.TSS<expID>.png" and "QC.FragHist<expID>.png" into `outdir`.
# Returns `se` with the added metadata columns.
qc <- function(se, tss_cutoff = 2, nucleosome_cutoff = 4) {
    # Attach the gene annotation to the object.
    # (The previous version also ran GeneActivity() here and discarded the
    # result; gene activities are computed later on the integrated object.)
    Annotation(se) <- annotations

    # Nucleosome signal score per cell.
    se <- NucleosomeSignal(object = se)

    # TSS enrichment score per cell; fast = FALSE as in the original call.
    se <- TSSEnrichment(object = se, fast = FALSE)

    # Percentage of fragments in peaks and blacklist ratio.
    se$pct_reads_in_peaks <- se$peak_region_fragments / se$passed_filters * 100
    se$blacklist_ratio <- se$blacklist_region_fragments / se$peak_region_fragments

    # Group labels used by the two plots below.
    se$high.tss <- ifelse(se$TSS.enrichment > tss_cutoff, "High", "Low")
    se$nucleosome_group <- ifelse(se$nucleosome_signal > nucleosome_cutoff,
                                  paste0("NS > ", nucleosome_cutoff),
                                  paste0("NS < ", nucleosome_cutoff))

    sample_id <- se$expID[[1]]

    # Pass each plot to ggsave() explicitly instead of relying on last_plot().
    tss_plot <- TSSPlot(se, group.by = "high.tss") + NoLegend()
    ggsave(filename = file.path(outdir, paste0("QC.TSS", sample_id, ".png")),
           plot = tss_plot)

    frag_plot <- FragmentHistogram(object = se, group.by = "nucleosome_group")
    ggsave(filename = file.path(outdir, paste0("QC.FragHist", sample_id, ".png")),
           plot = frag_plot)

    se
}

# Keep only the cells of `se` that pass every QC threshold.
#
# se: Seurat object with the metadata columns peak_region_fragments,
#     pct_reads_in_peaks, blacklist_ratio, nucleosome_signal and TSS.enrichment.
# The remaining arguments are strict lower (min_*) / upper (max_*) bounds.
# Prints the object before and after filtering and returns the filtered object.
filtCells <- function(se,
                      min_peak_region_fragments = 10,
                      max_peak_region_fragments = 15000,
                      min_pct_reads_in_peaks = 15,
                      max_blacklist_ratio = 0.05,
                      max_nucleosome_signal = 4,
                      min_TSS_enrichment = 2) {
    print("se before filt")
    print(se)

    kept <- subset(x = se,
                   subset = peak_region_fragments > min_peak_region_fragments &
                       peak_region_fragments < max_peak_region_fragments &
                       pct_reads_in_peaks > min_pct_reads_in_peaks &
                       blacklist_ratio < max_blacklist_ratio &
                       nucleosome_signal < max_nucleosome_signal &
                       TSS.enrichment > min_TSS_enrichment)

    print("se after filt")
    print(kept)
    kept
}


# When filtering is enabled: compute QC metrics for every sample, then drop
# the cells that fail the thresholds defined in the parameter cell.
if (to.filt.cells) {
    # qc() adds the metadata columns that filtCells() filters on.
    allSE <- lapply(allSE, qc)

    allSE <- lapply(allSE, function(sample_se) {
        filtCells(sample_se,
                  min_peak_region_fragments = min_peak_region_fragments,
                  max_peak_region_fragments = max_peak_region_fragments,
                  min_pct_reads_in_peaks = min_pct_reads_in_peaks,
                  max_blacklist_ratio = max_blacklist_ratio,
                  max_nucleosome_signal = max_nucleosome_signal,
                  min_TSS_enrichment = min_TSS_enrichment)
    })
}

Extracting gene coordinates

Extracting reads overlapping genomic regions

Extracting TSS positions

Finding + strand cut sites

Finding - strand cut sites

Computing mean insertion frequency in flanking regions

Normalizing TSS score

Saving 6.67 x 6.67 in image

Saving 6.67 x 6.67 in image

"Removed 14 rows containing non-finite values (stat_bin)."
"Removed 4 rows containing missing values (geom_bar)."
Extracting gene coordinates

Extracting reads overlapping genomic regions

Extracting TSS positions

Finding + strand cut sites

Finding - strand cut sites

Computing mean insertion frequency in flanking regions

Normalizing TSS score

Saving 6.67 x 6.67 in image

Saving 6.67 x 6.67 in image

"Removed 45 rows containing non-finite values (stat_bin)."
"Removed 4 rows containing missing values (geom_bar)."
Extracting gene coordinates

Extracting reads overlapping genomic regions

Extracting TSS positions

Finding + strand cut sites

Finding - strand cut sites

Computing mean insertion fr

[1] "se before filt"
An object of class Seurat 
195596 features across 5308 samples within 1 assay 
Active assay: ATAC (195596 features, 0 variable features)
[1] "se after filt"
An object of class Seurat 
195596 features across 5102 samples within 1 assay 
Active assay: ATAC (195596 features, 0 variable features)
[1] "se before filt"
An object of class Seurat 
187005 features across 7540 samples within 1 assay 
Active assay: ATAC (187005 features, 0 variable features)
[1] "se after filt"
An object of class Seurat 
187005 features across 7218 samples within 1 assay 
Active assay: ATAC (187005 features, 0 variable features)
[1] "se before filt"
An object of class Seurat 
211492 features across 6848 samples within 1 assay 
Active assay: ATAC (211492 features, 0 variable features)
[1] "se after filt"
An object of class Seurat 
211492 features across 6381 samples within 1 assay 
Active assay: ATAC (211492 features, 0 variable features)
[1] "se before filt"
An object of class Seurat 
158263 

In [None]:
# Merge all samples into one object, one pairwise merge at a time.
# Cell names were already made unique by the sample prefix added earlier.
# With a single sample the loop body never runs and `combined` is that sample.
combined <- allSE[[1]]
for (next_sample in allSE[-1]) {
    combined <- merge(x = combined,
                      y = next_sample)
}

In [None]:
# Select top features on the merged object (min.cutoff = 20), then display the object.
combined <- FindTopFeatures(combined, min.cutoff = 20)
combined

In [None]:
# Show the cell-level metadata table of the merged object.
combined[[]]

In [None]:
# Treat the sample label as a factor; its levels are used later to split the object.
combined$expID <- factor(combined$expID)

# Per-sample violin plots of depth-related metadata on the merged object.
merged_qc_features <- c("nCount_ATAC", "peak_region_fragments", "passed_filters",
                        "duplicate", "unmapped")
VlnPlot(object = combined, features = merged_qc_features, split.by = "expID",
        pt.size = 0.1, ncol = 3)


## Filter

In [None]:
# if (to.filt.cells){
#     combined <- lapply(combined, filtCells,
#                     min_peak_region_fragments=min_peak_region_fragments,
#                     max_peak_region_fragments=max_peak_region_fragments,
#                     min_pct_reads_in_peaks=min_pct_reads_in_peaks,
#                     max_blacklist_ratio=max_blacklist_ratio,
#                     max_nucleosome_signal=max_nucleosome_signal,
#                     min_TSS_enrichment=min_TSS_enrichment)
# }

In [None]:
# Binarize and run LSI
# Binarize the counts, then TF-IDF and SVD; the result is read below as the 'lsi' reduction.
combined <- BinarizeCounts(combined)
combined <- RunTFIDF(combined)
combined <- RunSVD(combined)
# UMAP on LSI components lsi_start_comp through 50.
combined <- RunUMAP(combined, dims = lsi_start_comp:50, reduction = 'lsi')
# NOTE(review): "proj" is not created anywhere in the visible code; presumably it
# already exists in the loaded objects' metadata — confirm.
DimPlot(combined, group.by = "proj", pt.size = 0.1)

In [None]:
# Same embedding, coloured by sample label.
DimPlot(combined, group.by = "expID", pt.size = 0.1)

In [None]:
# Depth-correlation plot for the merged object (default reduction), shown inline.
pDepthCorr <- DepthCor(combined)
pDepthCorr

In [None]:
# Save the merged object as combined.rds in outdir.
saveRDS(combined, file.path(outdir, paste0("combined.rds")))

In [None]:
# Split the merged object back into one object per sample.
# factor() makes this work even if expID is still a character column and
# drops levels that have no cells (subsetting on those would fail).
sample_ids <- levels(factor(combined$expID))
allSE <- lapply(sample_ids, function(x) subset(combined, subset = expID == x))
# Keep the sample labels as list names so the saved list stays self-describing.
names(allSE) <- sample_ids
allSE

In [None]:
# Save the per-sample list as allSamples.filt.rds in outdir.
saveRDS(allSE, file.path(outdir, paste0("allSamples.filt.rds")))

# Integrate

In [None]:
# Integrate the samples on their LSI embeddings.
if (length(allSE) == 1) {
    # Single sample: nothing to integrate.
    # NOTE(review): this object has no "integrated_lsi" reduction, which the
    # later UMAP / clustering cells request — confirm the single-sample path.
    integrated <- allSE[[1]]
} else {
    # Find integration anchors.
    # anchor.features must be feature names, not a Seurat object, so pass the
    # row names of the first sample.
    integration.anchors <- FindIntegrationAnchors(
      object.list = allSE,
      anchor.features = rownames(allSE[[1]]),
      reduction = "rlsi",
      dims = lsi_start_comp:30
    )

    # Integrate the LSI embeddings of the merged object.
    integrated <- IntegrateEmbeddings(
      anchorset = integration.anchors,
      reductions = combined[["lsi"]],
      new.reduction.name = "integrated_lsi",
      dims.to.integrate = 1:30
    )
}


## Run UMAP and plot

In [None]:
# Embedding of the merged (non-integrated) object coloured by sample; compared with p2 below.
p1 <- DimPlot(combined, group.by = "expID")

In [None]:
# Create a new UMAP from the integrated embeddings.
# NOTE(review): dims starts at a literal 2 here while anchor finding uses
# lsi_start_comp — confirm whether these should follow the same parameter.
integrated <- RunUMAP(integrated, reduction = "integrated_lsi", dims = 2:30)
p2 <- DimPlot(integrated, group.by = "expID")
# Save p2 explicitly instead of relying on ggplot2's implicit last_plot().
ggsave(file.path(outdir, "integrated.batch.png"), plot = p2, dpi = 300)
p2

### Compare merged and integrated

In [None]:
# Side-by-side comparison of the merged and integrated embeddings.
pCompare <- (p1 + ggtitle("Merged")) | (p2 + ggtitle("Integrated"))
# Save the composed figure explicitly instead of relying on last_plot().
ggsave(file.path(outdir, "integrated.merged.compare.png"), plot = pCompare, dpi = 300)
pCompare

## Correlation of LSI components and Depth

In [None]:
# Depth-correlation plot for the integrated LSI reduction; saved to outdir and shown inline.
pDepthCorr <- DepthCor(integrated, reduction='integrated_lsi')
ggsave(file.path(outdir,"integrated.depthCor.png"), plot=pDepthCorr, dpi=300)

pDepthCorr

In [None]:
# Checkpoint the integrated object before clustering.
saveRDS(integrated, file.path(outdir, paste0("allSamples.integrated.beforeClust.rds")))

## Cluster and plot

In [None]:
# The UMAP was already computed on "integrated_lsi" in an earlier cell:
# integrated <- RunUMAP(object = integrated, reduction = "integrated_lsi", dims = 2:30)

# Build the neighbour graph on integrated LSI components 2-30, then cluster.
integrated <- FindNeighbors(object = integrated, reduction = "integrated_lsi", dims = 2:30)
integrated <- FindClusters(object = integrated, algorithm = 3, verbose = FALSE)

# Labelled cluster plot, saved to outdir and shown inline.
pclust <- DimPlot(object = integrated, label = TRUE) + NoLegend()
ggsave(filename = file.path(outdir, "integrated.lsi.clusters.png"), plot = pclust)
pclust

## Get gene activity results 

In [None]:
# add the gene information to the object
# Work on the ATAC assay, attach the loaded annotation, and compute the gene
# activity matrix that later cells filter and store as the 'RNA' assay.
DefaultAssay(integrated) <- "ATAC"
Annotation(integrated) <- annotations
gene.activities <- GeneActivity(integrated)


## Remove MT genes

In [None]:
# Annotation entries located on the mitochondrial chromosome ("chrM").
mt_genes <- annotations[seqnames(annotations) == "chrM"]
# Report how many entries were found; dim() returns NULL for this kind of
# subset, so use length().
length(mt_genes)


In [None]:
# Matrix dimensions before the chrM genes are removed.
dim(gene.activities)

In [None]:
# Drop rows whose gene name appears among the chrM annotation entries.
gene.activities <- gene.activities[!(rownames(gene.activities) %in% mt_genes$gene_name),]

In [None]:
# Matrix dimensions after the chrM genes are removed.
dim(gene.activities)

In [None]:
# add the gene activity matrix to the Seurat object as a new assay and normalize it
integrated[['RNA']] <- CreateAssayObject(counts = gene.activities)
# Log-normalize, using the median of nCount_RNA as the scale factor.
integrated <- NormalizeData(
  object = integrated,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(integrated$nCount_RNA)
)

In [None]:
# Make the gene-activity ('RNA') assay the default assay.
DefaultAssay(integrated) <- 'RNA'

In [None]:
# Save the final integrated object as allSamples.integrated.rds in outdir.
saveRDS(integrated, file.path(outdir, paste0("allSamples.integrated.rds")))

In [None]:
# Per-sample violin plots of depth-related metadata on the integrated object.
Idents(integrated) <- "expID"
pQC <- VlnPlot(
  object = integrated,
  features = c('nCount_ATAC', 'peak_region_fragments', 'passed_filters',
               'duplicate', 'unmapped'),
  split.by = "expID",
  pt.size = 0.1,
  ncol = 3
)

# Save the full multi-panel figure explicitly; relying on last_plot() only
# works when the plot happens to be the most recently printed one.
ggsave(file.path(outdir, paste0("QC_02.png")), plot = pQC)
pQC

In [None]:
# Record the R session and package versions.
sessionInfo()

In [15]:
# Record the R session and package versions (output captured below).
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/isaac/miniconda3/envs/mttrace/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] Rsamtools_2.10.0          Biostrings_2.62.0        
 [3] XVector_0.34.0            future_1.23.0            
 [5] dplyr_1.0.8               metap_1.1                
 [7] cowplot_1.1.1             magrittr_2.0.2           
 [9] data.table_1.14.2         patchwork_1.1.1          
[11] 