## Creates binarized Seurat objects, and merges them and finds Top Features (min.cutoff=20)

In [1]:
# Input info
cellr_in <- "/data2/isshamie/mito_lineage/data/processed/mtscATAC/jan21_2021/MTblacklist/"
samples <- "J2,P2"
sample_names <- "Flt3,Ctrl"

# External
external_dat_dir <- "/data2/mito_lineage/Analysis/annotation/output/data/granja_cd34"
external_frag_file <- "granja_cd34.fragments.tsv"
external_prefix <- "GSE129785_scATAC-Hematopoiesis-CD34"
external_dat_file <- "granja_10X_CD34.rds"

# Saving
outdir <- "/data2/mito_lineage/Analysis/output/data/processed/mttrace/jan21_2021/MTblacklist/" #"/data2/mito_lineage/Analysis/annotation/output/data/"

# Parameters
nTop = 25000

In [2]:
samples <- unlist(strsplit(samples, ",")[[1]])
sample_names <- strsplit(sample_names, ",")

samples

In [3]:
external_frag_full <- file.path(external_dat_dir, external_frag_file)
ext_frag_file <- gsub('.fragments.tsv', '.fragments.sort.tsv.gz', external_frag_full)

external_frag_full

In [4]:
library(GenomicRanges)
library(Seurat)
library(Signac)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
set.seed(1234)
library(data.table)
library(magrittr)

library(future)
plan()
#plan("multiprocess", workers = workers)
options(future.globals.maxSize = 8000 * 1024^2)

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The follow

## Merge all peaks

In [5]:
read.peaks <- function(exp, cellr_in){
    print('here')
    print(file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed"))
    peaks <- read.table(
      file = file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed"),
      col.names = c("chr", "start", "end")
    )
    # convert to genomic ranges
    gr <- makeGRangesFromDataFrame(peaks)
    return(gr)
}


gr.full <- c(sapply(samples, read.peaks, cellr_in=cellr_in, USE.NAMES=F))

# peaks.curr <- read.table(
#   file = file.path(singlecell_sumstats_dir, exp, "filtered_peak_bc_matrix", "peaks.bed"),
#   col.names = c("chr", "start", "end")
# )


#Create a unified set of peaks to quantify in each dataset
#combined.peaks <- reduce(x = c(gr.full[[1]], gr.full[[2]], gr.full[[1]]))
gr.full.c <- gr.full[[1]]
if (length(gr.full)>1){
    for (i in 2:length(gr.full)){
      gr.full.c <- c(gr.full.c, gr.full[[i]])
    }
}
combined.peaks <- reduce(x = c(gr.full.c))

# Filter out bad peaks based on length
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]
combined.peaks

[1] "here"
[1] "/data2/isshamie/mito_lineage/data/processed/mtscATAC/jan21_2021/MTblacklist//J2/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "here"
[1] "/data2/isshamie/mito_lineage/data/processed/mtscATAC/jan21_2021/MTblacklist//P2/outs/filtered_peak_bc_matrix/peaks.bed"


GRanges object with 149551 ranges and 0 metadata columns:
           seqnames            ranges strand
              <Rle>         <IRanges>  <Rle>
       [1]     chr1        9942-10364      *
       [2]     chr1     191685-191736      *
       [3]     chr1     267780-268257      *
       [4]     chr1     271072-271548      *
       [5]     chr1     585995-586411      *
       ...      ...               ...    ...
  [149547]     chrY 56844769-56845155      *
  [149548]     chrY 56846033-56848664      *
  [149549]     chrY 56849234-56851581      *
  [149550]     chrY 56857506-56857613      *
  [149551]     chrY 56873729-56874140      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

## Create fragment objects

In [6]:
create_frag <- function(exp, cellr_in){
    barcode_path <- file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "barcodes.tsv")    
    barcodes <- readr::read_tsv(barcode_path, col_names = F) # %>% tidyr::unite(barcode)
    frag_file <- file.path(cellr_in, exp, "outs", "fragments.tsv.gz")
    # quantify multiome peaks in the scATAC-seq dataset
    frags.curr <- CreateFragmentObject(path = frag_file,cells= barcodes[["X1"]])
    barcodes <- as.data.frame(barcodes) %>%  tibble::column_to_rownames(var="X1") %>% tibble::add_column(proj=exp)
    ## Quantify peaks
    curr.counts <- FeatureMatrix(
      fragments = frags.curr,
      features = combined.peaks,
      cells = rownames(barcodes)
    )
    
    ## Create the objects
    curr_assay <- CreateChromatinAssay(curr.counts, fragments = frags.curr)
    curr <- CreateSeuratObject(curr_assay, assay = "ATAC", project=exp, meta.data=barcodes)
    curr <- BinarizeCounts(curr)
    return(curr)
}



In [7]:
allSE <- sapply(samples, create_frag, cellr_in=cellr_in)

Registered S3 method overwritten by 'cli':
  method     from         
  print.boxx spatstat.geom

[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
  X1 = [31mcol_character()[39m
)


Computing hash

Extracting reads overlapping genomic regions


[36m──[39m [1m[1mColumn specification[1m[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
  X1 = [31mcol_character()[39m
)


Computing hash

Extracting reads overlapping genomic regions



## Merge

In [12]:
# merge all datasets, adding a cell ID to make sure cell names are unique
combined <- merge(
  x = allSE[1],
  y = allSE[2:length(allSE)],
  add.cell.ids = sample_names
)
combined[["ATAC"]]

In [14]:
combined <- FindTopFeatures(combined, min.cutoff = 20)
combined

ERROR: Error in FindTopFeatures(combined, min.cutoff = 20): object 'combined' not found


In [13]:
saveRDS(combined, file.path(outdir, paste0("allSamples.merged.rds")))

ERROR: Error in saveRDS(combined, file.path(outdir, paste0("allSamples.merged.rds"))): object 'combined' not found


In [None]:
combined <- RunTFIDF(combined)
combined <- RunSVD(combined)
combined <- RunUMAP(combined, dims = 2:50, reduction = 'lsi')
DimPlot(combined, group.by = "proj", pt.size = 0.1)

In [None]:
DimPlot(combined, group.by = "Group", pt.size = 0.1)

In [None]:
create_se <- function(countsA, countsB, frags.countsA, frags.countsB){
    countsA_assay <- CreateChromatinAssay(countsA, fragments = frags.countsA)
    countsA <- CreateSeuratObject(countsA_assay, assay = "ATAC")
    
    countsB_assay <- CreateChromatinAssay(countsB, fragments = frags.countsB)
    countsB <- CreateSeuratObject(countsB_assay, assay = "ATAC")
    countsB$dataset <- 'granja_cd34'
    countsA$dataset <- exp
    
    # merge all datasets, adding a cell ID to make sure cell names are unique
    combined <- merge(
      x = countsB,
      y = countsA,
      add.cell.ids = c("countsB", exp)
    )
    combined[["ATAC"]]
    combined <- RunTFIDF(combined)
    combined <- FindTopFeatures(combined, min.cutoff = 20)
    combined <- RunSVD(combined)
    combined <- RunUMAP(combined, dims = 2:50, reduction = 'lsi')
    DimPlot(combined, group.by = 'dataset', pt.size = 0.1)
    return(combined)
}

### Make sure the peaks overlap

In [None]:
onlyOverlap <- FALSE
if (onlyOverlap){
    peaks_in_curr <- names(which(rowSums(curr.counts)>0))
    peaks_in_ext <- names(which(rowSums(ext.counts)>0))
    peaks_both <- intersect(peaks_in_curr, peaks_in_ext)
    overlap.curr.counts <- curr.counts[peaks_both,]
    overlap.ext.counts <- ext.counts[peaks_both,]

    combined <- create_se(overlap.curr.counts, overlap.ext.counts, frags.curr, frags.ext)
    DimPlot(combined, group.by = 'dataset', pt.size = 0.1)
    saveRDS(combined, file.path(outdir, paste0(exp, ".merged.over0Peaks.rds")))
}

In [None]:
print