## Creates binarized Seurat objects, integrates conditions and annotates genes by nearby peaks

In [7]:
# ---- Inputs -----------------------------------------------------------------
# Root directory holding one sub-directory per reaction (see `samples`).
cellr_in <- "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/DUPI_april08_2021_Croker/MTblacklist_mtasnucl/"
# Directory that receives the saved objects and QC figures.
outdir <- "/mnt/md0/isshamie/Projects/Mito_Trace/output/pipeline/DUPI_april08_2021/MTblacklist_mtasnucl_Bonly/data/annotation/gff_hg38_1_2/mergedSamples/"

# Comma-separated lists; entry i of `sample_names` labels entry i of `samples`.
samples <- "rxn2,rxn4"
sample_names <- "preB,postB"

# ---- Parameters -------------------------------------------------------------
nTop <- 25000
# Number of parallel workers requested from the future plan.
cores <- 24

# Identifier of the annotation set; selects "<gff_id>.annotationGranges.rds".
gff_id <- "hg38_1_2"

# Directory containing the pre-built annotation GRanges files.
annotations_indir <- "/mnt/md0/isshamie/Projects/Mito_Trace/data/processed/annotation_granges/"

In [8]:
# Set the repr plot width and height options (both 12) for the plots rendered
# by later cells. NOTE(review): units are presumably inches -- confirm in repr docs.
library(repr)
options(repr.plot.width=12, repr.plot.height=12)

In [9]:
# Split the comma-separated parameter strings into one entry per sample.
# unlist(strsplit(...)) is used for BOTH vectors (no `[[1]]`) so that re-running
# this cell on the already-split vectors is a no-op instead of silently
# truncating `samples` to its first element. `fixed = TRUE` treats "," literally
# and trimws() tolerates "a, b" style input.
samples <- trimws(unlist(strsplit(samples, ",", fixed = TRUE)))
sample_names <- trimws(unlist(strsplit(sample_names, ",", fixed = TRUE)))

# Every reaction must pair with exactly one display name; the later
# cbind(sample_names, samples) would otherwise recycle the shorter vector.
stopifnot(
    length(samples) > 0,
    length(samples) == length(sample_names)
)

samples

In [10]:
# Load the annotation object for `gff_id` from
# "<annotations_indir>/<gff_id>.annotationGranges.rds".
annotations <- readRDS(
    file.path(annotations_indir, sprintf("%s.annotationGranges.rds", gff_id))
)

In [11]:
# Display the loaded annotation object as a sanity check.
annotations

GRanges object with 1684537 ranges and 21 metadata columns:
            seqnames      ranges strand |   source       type     score
               <Rle>   <IRanges>  <Rle> | <factor>   <factor> <numeric>
        [1]     chr1 11869-14409      + |   HAVANA gene              NA
        [2]     chr1 11869-14409      + |   HAVANA transcript        NA
        [3]     chr1 11869-12227      + |   HAVANA exon              NA
        [4]     chr1 12613-12721      + |   HAVANA exon              NA
        [5]     chr1 13221-14409      + |   HAVANA exon              NA
        ...      ...         ...    ... .      ...        ...       ...
  [1684533]     chrM 15888-15953      + |  ENSEMBL transcript        NA
  [1684534]     chrM 15888-15953      + |  ENSEMBL exon              NA
  [1684535]     chrM 15956-16023      - |  ENSEMBL gene              NA
  [1684536]     chrM 15956-16023      - |  ENSEMBL transcript        NA
  [1684537]     chrM 15956-16023      - |  ENSEMBL exon              NA
    

In [12]:
library(GenomicRanges)
library(Seurat)
library(Signac)
library(GenomeInfoDb)
#library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
set.seed(1234)
library(data.table)
library(magrittr)
library(cowplot)
library(metap)
library(dplyr)
library(future)

# Register `cores` parallel workers for future-aware Seurat/Signac steps.
# The "multiprocess" strategy is deprecated (and later defunct) in the future
# package; "multicore" (forked workers) is what it resolved to on Linux outside
# RStudio. Use "multisession" instead on Windows or inside RStudio, where
# forking is unavailable.
plan("multicore", workers = cores)
# Raise the cap on the total size of globals exported to a future
# (8000 * 1024^2 bytes = 8000 MiB).
options(future.globals.maxSize = 8000 * 1024^2)
#options(future.globals.maxSize = 50000 * 1024^2) # for 50 Gb RAM

## Merge all peaks

In [13]:
# Read one sample's peak calls and return them as a GRanges object.
#
# exp      : name of the sample sub-directory under `cellr_in`.
# cellr_in : root directory containing the per-sample output directories.
#
# The file read is <cellr_in>/<exp>/outs/filtered_peak_bc_matrix/peaks.bed,
# parsed as three columns named chr, start and end. The resolved path is
# printed so progress can be followed in the notebook output.
read.peaks <- function(exp, cellr_in){
    print('here')
    bed_path <- file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed")
    print(bed_path)
    bed_df <- read.table(file = bed_path, col.names = c("chr", "start", "end"))
    # Convert the data frame of intervals to genomic ranges.
    makeGRangesFromDataFrame(bed_df)
}


# Load each sample's peaks: one GRanges per entry of `samples`.
# lapply() always returns a list, whereas sapply() may try to simplify its
# result depending on the input.
gr.full <- lapply(samples, read.peaks, cellr_in = cellr_in)
stopifnot(length(gr.full) > 0)

# Concatenate the per-sample ranges into a single GRanges, then merge
# overlapping/adjacent intervals into one shared peak set.
gr.full.c <- Reduce(c, gr.full)
combined.peaks <- reduce(x = gr.full.c)

# Filter out bad peaks based on length: keep widths strictly between 20 and 10000.
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths < 10000 & peakwidths > 20]
head(combined.peaks)

[1] "here"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/DUPI_april08_2021_Croker/MTblacklist_mtasnucl//rxn2/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "here"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/DUPI_april08_2021_Croker/MTblacklist_mtasnucl//rxn4/outs/filtered_peak_bc_matrix/peaks.bed"


GRanges object with 6 ranges and 0 metadata columns:
      seqnames        ranges strand
         <Rle>     <IRanges>  <Rle>
  [1]     chr1   10169-10267      *
  [2]     chr1 191337-191519      *
  [3]     chr1 268046-268142      *
  [4]     chr1 778355-779233      *
  [5]     chr1 827080-827940      *
  [6]     chr1 844530-844785      *
  -------
  seqinfo: 25 sequences from an unspecified genome; no seqlengths

In [24]:
# Total number of merged peaks remaining after the width filter.
length(combined.peaks)

## Are any peaks on chrM?

In [25]:
# Number of peaks that are NOT on chrM; if this equals the total peak count,
# no peak lies on chrM.
length(combined.peaks[seqnames(combined.peaks) != "chrM"])


In [26]:
# Peaks located on chrM (an empty GRanges means there are none).
combined.peaks[seqnames(combined.peaks) == "chrM"]

# Peaks on chr1, shown for comparison to confirm that filtering by seqname works.
combined.peaks[seqnames(combined.peaks) == "chr1"]


GRanges object with 0 ranges and 0 metadata columns:
   seqnames    ranges strand
      <Rle> <IRanges>  <Rle>
  -------
  seqinfo: 25 sequences from an unspecified genome; no seqlengths

GRanges object with 4387 ranges and 0 metadata columns:
         seqnames              ranges strand
            <Rle>           <IRanges>  <Rle>
     [1]     chr1         10169-10267      *
     [2]     chr1       191337-191519      *
     [3]     chr1       268046-268142      *
     [4]     chr1       778355-779233      *
     [5]     chr1       827080-827940      *
     ...      ...                 ...    ...
  [4383]     chr1 248858094-248859625      *
  [4384]     chr1 248862858-248863683      *
  [4385]     chr1 248872949-248874682      *
  [4386]     chr1 248905881-248907165      *
  [4387]     chr1 248924667-248925484      *
  -------
  seqinfo: 25 sequences from an unspecified genome; no seqlengths

## Remove any peaks on chrM

In [27]:
# Keep only the peaks whose sequence name is not "chrM", then report how many
# peaks are left.
combined.peaks <- combined.peaks[seqnames(combined.peaks) != "chrM"]
length(combined.peaks)

## Create fragment objects

In [None]:
# Build one Seurat object per sample, each quantified on the shared
# `combined.peaks` set so that the resulting feature spaces are identical.
samples_df <- cbind(sample_names, samples)
print('samples_df')
print(samples_df)

# Pre-allocate the result list (one slot per sample) rather than growing it
# with c() inside the loop; this also guarantees `allSE` is a plain list.
allSE <- vector("list", nrow(samples_df))

# seq_len() yields an empty sequence when there are no rows, unlike 1:nrow().
for (row in seq_len(nrow(samples_df))){
    exp <- (samples_df[[row, "samples"]])
    name <- (samples_df[[row, "sample_names"]])

    print('exp')
    print(exp)
    print('name')
    print(name)

    # Barcodes listed in filtered_peak_bc_matrix/barcodes.tsv become the row
    # names; `proj` records the sample label.
    barcode_path <- file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "barcodes.tsv")
    barcodes <- readr::read_tsv(barcode_path, col_names = FALSE)
    barcodes <- as.data.frame(barcodes) %>%  tibble::column_to_rownames(var="X1") %>% tibble::add_column(proj=name)
    frag_file <- file.path(cellr_in, exp, "outs", "fragments.tsv.gz")

    # Per-barcode metadata from singlecell.csv, restricted to the barcodes above.
    cells.meta.f <- file.path(cellr_in, exp, "outs", "singlecell.csv")
    cells.meta <- as.data.frame(readr::read_csv(cells.meta.f)) %>% tibble::column_to_rownames(var="barcode") %>% tibble::add_column(proj=name)
    cells.meta <- cells.meta[rownames(cells.meta) %in% rownames(barcodes), ]

    print("Creating fragments object")
    frags.curr <- CreateFragmentObject(path = frag_file, cells= rownames(barcodes))

    print("Quantifying peaks")
    # Count fragments per (combined peak, barcode). `process_n` is deliberately
    # left at its default: it is the number of regions processed per chunk, not
    # a worker count, so passing `cores` there shrank every chunk to a handful
    # of regions. Parallelism comes from the future plan configured earlier.
    curr.counts <- FeatureMatrix(
      fragments = frags.curr,
      features = combined.peaks,
      cells = rownames(barcodes)
    )

    print("Creating chromatin assay")
    ## Create the objects and use simple filters:
    ## min.cells = 10 and min.features = 200 are applied by CreateChromatinAssay.
    curr_assay <- CreateChromatinAssay(curr.counts, fragments = frags.curr, min.cells = 10, min.features = 200)
    curr <- CreateSeuratObject(curr_assay, assay = "ATAC", project=name, meta.data=cells.meta)
    print('curr_assay')
    print(head(curr_assay))
    print('curr')
    print(head(curr[[]]))
    allSE[[row]] <- curr
}

allSE

#allSE <- sapply(samples, create_frag, cellr_in=cellr_in)

[1] "samples_df"
     sample_names samples
[1,] "preB"       "rxn2" 
[2,] "postB"      "rxn4" 
[1] "exp"
[1] "rxn2"
[1] "name"
[1] "preB"


Registered S3 method overwritten by 'cli':
  method     from         
  print.boxx spatstat.geom
[1mRows: [22m[34m1546[39m [1mColumns: [22m[34m1[39m

[36m──[39m [1mColumn specification[22m [36m─────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1mRows: [22m[34m385146[39m [1mColumns: [22m[34m18[39m

[36m──[39m [1mColumn specification[22m [36m─────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, lowmapq, mitochondrial, pass...


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



In [None]:
# Persist the list of per-sample objects to "<outdir>/allSamples.rds".
saveRDS(allSE, file = file.path(outdir, "allSamples.rds"))

In [None]:
# Display the per-sample objects that were just saved.
allSE

## QC Metrics

In [None]:
library(Rsamtools)

# Attach gene annotations to a Seurat ATAC object and compute per-cell QC metrics.
#
# se               : Seurat object whose cell metadata contains the columns
#                    `peak_region_fragments`, `passed_filters` and
#                    `blacklist_region_fragments` (read below).
# gene_annotations : annotation object assigned via Annotation(se); defaults to
#                    the global `annotations`, which is what the original
#                    one-argument form relied on implicitly.
# tss_cutoff       : TSS.enrichment values above this are labelled 'High'.
# ns_cutoff        : nucleosome_signal values above this are labelled
#                    'NS > <cutoff>', the rest 'NS < <cutoff>'.
#
# Returns `se` with the metadata columns pct_reads_in_peaks, blacklist_ratio,
# high.tss and nucleosome_group added, alongside whatever NucleosomeSignal()
# and TSSEnrichment() store on the object.
qc <- function(se, gene_annotations = annotations, tss_cutoff = 2, ns_cutoff = 4){
    # add the gene information to the object
    Annotation(se) <- gene_annotations

    # compute nucleosome signal score per cell
    se <- NucleosomeSignal(object = se)

    # compute TSS enrichment score per cell
    # NOTE(review): fast = FALSE presumably keeps the positional matrix needed
    # for TSS profile plots -- confirm against the Signac documentation.
    se <- TSSEnrichment(object = se, fast = FALSE)

    # add blacklist ratio and fraction of reads in peaks (as a percentage)
    se$pct_reads_in_peaks <- se$peak_region_fragments / se$passed_filters * 100
    se$blacklist_ratio <- se$blacklist_region_fragments / se$peak_region_fragments

    # categorical groupings of the two scores; with the default cutoffs the
    # labels are exactly 'High'/'Low' and 'NS > 4'/'NS < 4'
    se$high.tss <- ifelse(se$TSS.enrichment > tss_cutoff, 'High', 'Low')
    se$nucleosome_group <- ifelse(
        se$nucleosome_signal > ns_cutoff,
        paste0('NS > ', ns_cutoff),
        paste0('NS < ', ns_cutoff)
    )

    se
}
# Draw the five QC metrics of one Seurat object as a single row of violin plots.
#
# se : Seurat object that already carries the plotted metadata columns
#      (pct_reads_in_peaks, peak_region_fragments, TSS.enrichment,
#      blacklist_ratio, nucleosome_signal).
#
# Returns the plot grid, titled with the first value of se$orig.ident and with
# plot titles centred at size 15.
vPlot <- function(se){
    qc_features <- c('pct_reads_in_peaks', 'peak_region_fragments',
                     'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal')
    violins <- VlnPlot(object = se, features = qc_features, pt.size = 0.1, ncol = 5)
    # Add the overall title first, then apply the title theme with `&`
    titled <- violins + plot_annotation(title = se$orig.ident[[1]])
    titled & theme(plot.title = element_text(hjust = 0.5, size=15))
}


In [None]:
# Compute QC metrics for every sample and overwrite the saved object list so
# that it includes them.
allSE <- lapply(allSE, qc)
saveRDS(allSE, file.path(outdir, "allSamples.rds"))

# One violin-plot row per sample; shown inline in the notebook.
qc_plots <- lapply(allSE, vPlot)
qc_plots

# ggsave() without `plot =` writes only last_plot(), i.e. at most one sample's
# figure. Stack all samples into one figure and pass it explicitly so that
# QC_01.* contains every sample.
qc_grid <- cowplot::plot_grid(plotlist = qc_plots, ncol = 1)
qc_height <- 5 * length(qc_plots)
ggsave(file.path(outdir, "QC_01.png"), plot = qc_grid,
       width = 20, height = qc_height, limitsize = FALSE)
ggsave(file.path(outdir, "QC_01.pdf"), plot = qc_grid,
       width = 20, height = qc_height, limitsize = FALSE)

In [None]:
# Record the R version and attached package versions for reproducibility.
sessionInfo()