## Creates binarized Seurat objects, integrates conditions and annotates genes by nearby peaks

In [1]:
# Input info
cellr_in = "/data2/isshamie/mito_lineage/data/processed/mtscATAC/DUPI_april08_2021_Croker/MTblacklist_mtasnucl" 
outdir =  "/data2/isshamie/mito_lineage/output/annotation/DUPI_april08_2021/MTblacklist/mergedSamples" 
samples = "rxn1,rxn2,rxn3,rxn4" 
sample_names = "preA,preB,postA,postB"

# Parameters
nTop = 25000
cores = 36

In [2]:
library(repr)
options(repr.plot.width=12, repr.plot.height=12)

In [3]:
# # Input info
# cellr_in <- "/data2/isshamie/mito_lineage/data/processed/mtscATAC/jan21_2021/MTblacklist"
# samples <- "J2,P2"
# sample_names <- "Flt3l,Control"

# # Saving
# outdir <- "/data/isshamie/mito_lineage/output/annotation/cd34norm/MTblacklist/mergedSamples" #"/data2/mito_lineage/Analysis/annotation/output/data/"

# # Parameters
# nTop = 25000
# assay="RNA"

# cores = 36

In [4]:
samples <- unlist(strsplit(samples, ",")[[1]])
sample_names <- unlist(strsplit(sample_names, ","))

samples

In [5]:
library(GenomicRanges)
library(Seurat)
library(Signac)
library(GenomeInfoDb)
library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
set.seed(1234)
library(data.table)
library(magrittr)
library(cowplot)
library(metap)
library(dplyr)
library(future)
plan()

plan("multiprocess", workers = cores)
options(future.globals.maxSize = 8000 * 1024^2)
#options(future.globals.maxSize = 50000 * 1024^2) # for 50 Gb RAM
#plan("multiprocess", workers = workers)

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: parallel


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:parallel’:

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The follow

"Strategy 'multiprocess' is deprecated in future (>= 1.20.0). Instead, explicitly specify either 'multisession' or 'multicore'. In the current R session, 'multiprocess' equals 'multicore'."


## Merge all peaks

In [6]:
read.peaks <- function(exp, cellr_in){
    print('here')
    print(file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed"))
    peaks <- read.table(
      file = file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed"),
      col.names = c("chr", "start", "end")
    )
    # convert to genomic ranges
    gr <- makeGRangesFromDataFrame(peaks)
    return(gr)
}


gr.full <- c(sapply(samples, read.peaks, cellr_in=cellr_in, USE.NAMES=F))

gr.full.c <- gr.full[[1]]
if (length(gr.full)>1){
    for (i in 2:length(gr.full)){
      gr.full.c <- c(gr.full.c, gr.full[[i]])
    }
}
combined.peaks <- reduce(x = c(gr.full.c))

# Filter out bad peaks based on length
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]
combined.peaks

[1] "here"
[1] "/data2/isshamie/mito_lineage/data/processed/mtscATAC/DUPI_april08_2021_Croker/MTblacklist_mtasnucl/rxn1/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "here"
[1] "/data2/isshamie/mito_lineage/data/processed/mtscATAC/DUPI_april08_2021_Croker/MTblacklist_mtasnucl/rxn2/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "here"
[1] "/data2/isshamie/mito_lineage/data/processed/mtscATAC/DUPI_april08_2021_Croker/MTblacklist_mtasnucl/rxn3/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "here"
[1] "/data2/isshamie/mito_lineage/data/processed/mtscATAC/DUPI_april08_2021_Croker/MTblacklist_mtasnucl/rxn4/outs/filtered_peak_bc_matrix/peaks.bed"


GRanges object with 49634 ranges and 0 metadata columns:
          seqnames            ranges strand
             <Rle>         <IRanges>  <Rle>
      [1]     chr1       10169-10267      *
      [2]     chr1     191337-191519      *
      [3]     chr1     268046-268142      *
      [4]     chr1     778355-779233      *
      [5]     chr1     827080-827940      *
      ...      ...               ...    ...
  [49630]     chrY 56863163-56863474      *
  [49631]     chrY 56865579-56866679      *
  [49632]     chrY 56868819-56871669      *
  [49633]     chrY 56872060-56874599      *
  [49634]     chrY 56879752-56880373      *
  -------
  seqinfo: 25 sequences from an unspecified genome; no seqlengths

## Create fragment objects

In [7]:
# create_frag <- function(exp, cellr_in){
#     barcode_path <- file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "barcodes.tsv")    
#     barcodes <- readr::read_tsv(barcode_path, col_names = F) # %>% tidyr::unite(barcode)
#     frag_file <- file.path(cellr_in, exp, "outs", "fragments.tsv.gz")
#     # quantify multiome peaks in the scATAC-seq dataset
#     frags.curr <- CreateFragmentObject(path = frag_file,cells= barcodes[["X1"]])
#     barcodes <- as.data.frame(barcodes) %>%  tibble::column_to_rownames(var="X1") %>% tibble::add_column(proj=exp)
#     ## Quantify peaks
#     curr.counts <- FeatureMatrix(
#       fragments = frags.curr,
#       features = combined.peaks,
#       cells = rownames(barcodes),
#       process_n = 8
#     )
    
#     ## Create the objects
#     curr_assay <- CreateChromatinAssay(curr.counts, fragments = frags.curr)
#     curr <- CreateSeuratObject(curr_assay, assay = "ATAC", project=exp, meta.data=barcodes)
#     curr <- BinarizeCounts(curr)
#     return(curr)
# }
allSE = c()#[]

samples_df <- cbind(sample_names, samples)
for (row in 1:nrow(samples_df)){
    exp <- (samples_df[[row, "samples"]])
    name <- (samples_df[[row, "sample_names"]]) 
    
#for (exp in samples) {
    print('exp')
    print(exp)
    print('name')
    print(name)
    barcode_path <- file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "barcodes.tsv")    
    barcodes <- readr::read_tsv(barcode_path, col_names = F) # %>% tidyr::unite(barcode)
    barcodes <- as.data.frame(barcodes) %>%  tibble::column_to_rownames(var="X1") %>% tibble::add_column(proj=name)
    frag_file <- file.path(cellr_in, exp, "outs", "fragments.tsv.gz")
    
    cells.meta.f <- file.path(cellr_in, exp, "outs", "singlecell.csv") 
    cells.meta <- as.data.frame(readr::read_csv(cells.meta.f)) %>% tibble::column_to_rownames(var="barcode") %>% tibble::add_column(proj=name)
    cells.meta <- cells.meta[rownames(cells.meta) %in% rownames(barcodes), ]

    # quantify multiome peaks in the scATAC-seq dataset
    
    
    print("Creating fragments object")
    frags.curr <- CreateFragmentObject(path = frag_file, cells= rownames(barcodes))
    #print(frags.curr)
    print("Quantifying peaks")
    ## Quantify peaks
    curr.counts <- FeatureMatrix(
      fragments = frags.curr,
      features = combined.peaks,
      cells = rownames(barcodes),
      process_n = cores
    )
    
    print("Creating chromatin assay")
    ## Create the objects and use simple filters
    curr_assay <- CreateChromatinAssay(curr.counts, fragments = frags.curr, min.cells = 10, min.features = 200)
    curr <- CreateSeuratObject(curr_assay, assay = "ATAC", project=name, meta.data=cells.meta)
    
    print(head(curr[[]]))
    allSE = c(allSE, curr)
    #return(curr)
}

allSE

#allSE <- sapply(samples, create_frag, cellr_in=cellr_in)

[1] "exp"
[1] "rxn1"
[1] "name"
[1] "preA"


Registered S3 method overwritten by 'cli':
  method     from         
  print.boxx spatstat.geom
[1m[1mRows: [1m[22m[34m[34m450[34m[39m [1m[1mColumns: [1m[22m[34m[34m1[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1m[1mRows: [1m[22m[34m[34m191669[34m[39m [1m[1mColumns: [1m[22m[34m[34m18[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


"Some cells in meta.data not present in provided counts matrix."


                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACTCGTCCGCCTAT-1       preA         480           438 11194      5530
AAAGATGCAAAGCTGG-1       preA         439           410  6506      3516
AAAGGATAGAAACGCC-1       preA        1173           981 12844      6696
AAATGCCTCGCTATAG-1       preA         404           360  7438      4381
AACGAGGCACTCGCAG-1       preA         577           502 13910      8695
AACGAGGGTTTAGGAA-1       preA         512           476  8319      3811
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACTCGTCCGCCTAT-1       80      116     608             0           4860
AAAGATGCAAAGCTGG-1       71       40     239             0           2640
AAAGGATAGAAACGCC-1      131       84     477             0           5456
AAATGCCTCGCTATAG-1       67       52     284             0           2654
AACGAGGCACTCGCAG-1      128       90     553             0           4444
AACGAGGGTTTAGGAA-1       57       89     525        

[1m[1mRows: [1m[22m[34m[34m1546[34m[39m [1m[1mColumns: [1m[22m[34m[34m1[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1m[1mRows: [1m[22m[34m[34m385146[34m[39m [1m[1mColumns: [1m[22m[34m[34m18[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


"Some cells in meta.data not present in provided counts matrix."


                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAGTCACTCTC-1       preB        4767          3793 38954     29912
AAACGAAGTCCCTAAA-1       preB        5290          4188 24309     15056
AAACGAAGTCTGTGTA-1       preB         727           694 17674     15525
AAACTCGCAACTCGTA-1       preB        7619          5315 39016     26386
AAACTCGCAGAACGAC-1       preB        7564          5683 70990     51473
AAACTGCCACACATGT-1       preB        1084           979 23591     20903
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAGTCACTCTC-1      335      145    1057             0           7505
AAACGAAGTCCCTAAA-1      258      113     968             0           7914
AAACGAAGTCTGTGTA-1      189       72     200             0           1688
AAACTCGCAACTCGTA-1      437      223    1941             0          10029
AAACTCGCAGAACGAC-1      759      331    2599             0          15828
AAACTGCCACACATGT-1      197       96     328        

[1m[1mRows: [1m[22m[34m[34m2589[34m[39m [1m[1mColumns: [1m[22m[34m[34m1[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1m[1mRows: [1m[22m[34m[34m375350[34m[39m [1m[1mColumns: [1m[22m[34m[34m18[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


"Some cells in meta.data not present in provided counts matrix."


                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAACAGGGTACA-1      postA        4597          3794 44884     32449
AAACGAAGTCAACGGA-1      postA         731           655 26552     23667
AAACGAAGTCAGACGA-1      postA        3089          2657 51323     39592
AAACGAAGTGATAGAT-1      postA         822           769 15254     11179
AAACGAATCGAAGCCC-1      postA        1563          1370 19254     14572
AAACTCGAGAGAGTTT-1      postA         713           680 17279     14122
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAACAGGGTACA-1      362      195    1813             0          10065
AAACGAAGTCAACGGA-1      311       83     349             0           2142
AAACGAAGTCAGACGA-1      490      229    1487             0           9525
AAACGAAGTGATAGAT-1      151       93     481             0           3350
AAACGAATCGAAGCCC-1      257      106     737             0           3582
AAACTCGAGAGAGTTT-1      169       44     491        

[1m[1mRows: [1m[22m[34m[34m2890[34m[39m [1m[1mColumns: [1m[22m[34m[34m1[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1


[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set [30m[47m[30m[47m`show_col_types = FALSE`[47m[30m[49m[39m to quiet this message.

[1m[1mRows: [1m[22m[34m[34m466091[34m[39m [1m[1mColumns: [1m[22m[34m[34m18[34m[39m

[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGAGCAGCT-1      postB        1111          1025 11183      7609
AAACGAAAGAGCTACG-1      postB        2765          2384 14727      9299
AAACGAAAGTGAACCG-1      postB        1405          1288 10181      5818
AAACGAATCATGTTCT-1      postB        1767          1491 57814     21474
AAACGAATCGCTTACC-1      postB        1437          1312 11063      6789
AAACGAATCGGAGTTT-1      postB        5059          4002 28834     16252
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGAGCAGCT-1      132       40     527             0           2875
AAACGAAAGAGCTACG-1      126       85     600             0           4617
AAACGAAAGTGAACCG-1       81       47     634             0           3601
AAACGAATCATGTTCT-1      859      206    7868             0          27407
AAACGAATCGCTTACC-1      109       50     525             0           3590
AAACGAATCGGAGTTT-1   

[[1]]
An object of class Seurat 
4759 features across 198 samples within 1 assay 
Active assay: ATAC (4759 features, 0 variable features)

[[2]]
An object of class Seurat 
42425 features across 1354 samples within 1 assay 
Active assay: ATAC (42425 features, 0 variable features)

[[3]]
An object of class Seurat 
43702 features across 2585 samples within 1 assay 
Active assay: ATAC (43702 features, 0 variable features)

[[4]]
An object of class Seurat 
45516 features across 2890 samples within 1 assay 
Active assay: ATAC (45516 features, 0 variable features)


In [8]:
curr.counts <- FeatureMatrix(
  fragments = frags.curr,
  features = combined.peaks,
  cells = rownames(barcodes),
  process_n = cores
)

Extracting reads overlapping genomic regions



## Merge

In [9]:
# merge all datasets, adding a cell ID to make sure cell names are unique
combined <- merge(
  x = allSE[[1]],
  y = unlist(allSE[2:length(allSE)],use.names=FALSE), #allSE[2:length(allSE)],
  add.cell.ids = sample_names
)
combined[["ATAC"]]



ERROR: Error in all.counts[[2:length(x = all.counts)]]: recursive indexing failed at level 2



In [None]:
combined <- FindTopFeatures(combined, min.cutoff = 20)
combined

### Plot metadata passed_filters, nCount_ATAC, and duplicates

In [None]:
combined$orig.ident <- factor(combined$orig.ident, levels = sample_names)

VlnPlot(
  object = combined,
  features = c('nCount_ATAC', 'peak_region_fragments', 'passed_filters',
               'duplicate', 'unmapped'),
  split.by = "orig.ident",
  pt.size = 0.1,
  ncol = 3
)

In [None]:
# Binarize and run LSI
combined <- BinarizeCounts(combined)
combined <- RunTFIDF(combined)
combined <- RunSVD(combined)
combined <- RunUMAP(combined, dims = 2:50, reduction = 'lsi')
DimPlot(combined, group.by = "proj", pt.size = 0.1)

In [None]:
pDepthCorr <- DepthCor(combined)
pDepthCorr

In [None]:
saveRDS(combined, file.path(outdir, paste0("allSamples.merged.rds")))

## Integrate datasets
### Uses https://satijalab.org/signac/articles/integrate_atac.html

In [None]:
p1 <- DimPlot(combined, group.by = "proj")

## First break them up again by subsetting, then integrating

In [None]:
# ext <- subset(x = combined, subset = orig.ident == samples[1])
# curr <- subset(x = combined, subset = orig.ident == samples[2])

allSE <- lapply(sample_names,  function(x) subset(combined, subset = orig.ident == x))
allSE

In [None]:
# find integration anchors
integration.anchors <- FindIntegrationAnchors(
  object.list = allSE, #c(ext,curr),
  anchor.features = allSE[[1]], #rownames(ext),
  reduction = "rlsi",
  dims = 2:30
)

# integrate LSI embeddings
integrated <- IntegrateEmbeddings(
  anchorset = integration.anchors,
  reductions = combined[["lsi"]],
  new.reduction.name = "integrated_lsi",
  dims.to.integrate = 1:30
)



In [None]:
# create a new UMAP using the integrated embeddings
integrated <- RunUMAP(integrated, reduction = "integrated_lsi", dims = 2:30)
p2 <- DimPlot(integrated, group.by = "proj")

In [None]:
pclust <- DimPlot(object = integrated, label = TRUE) + NoLegend()

(p1 + ggtitle("Merged")) | (p2 + ggtitle("Integrated"))
ggsave(file.path(outdir,"integrated.merged.compare.png"))

In [None]:
p2
ggsave(file.path(outdir,"integrated.batch.png"))

In [None]:
pDepthCorr <- DepthCor(integrated, reduction='integrated_lsi')
pDepthCorr

## Plot new cluster results

In [None]:
#integrated <- RunUMAP(object = integrated, reduction = 'integrated_lsi', dims = 2:30)
integrated <- FindNeighbors(object = integrated, reduction = 'integrated_lsi', dims = 2:30)
integrated <- FindClusters(object = integrated, verbose = FALSE, algorithm = 3)


In [None]:
pclust <- DimPlot(object = integrated, label = TRUE) + NoLegend()
ggsave(file.path(outdir, "integrated.lsi.clusters.png"), pclust)
pclust

In [None]:
## ATAC DE peaks

# # change back to working with peaks instead of gene activities
# DefaultAssay(integrated) <- 'ATAC'

# da_peaks <- FindMarkers(
#   object = integrated,
#   ident.1 = 9, #"CD4 Naive",
#   min.pct = 0.05,
#   test.use = 'LR',
#   #latent.vars = 'peak_region_fragments'
# )


# plot1 <- VlnPlot(
#   object = integrated,
#   features = rownames(da_peaks)[1],
#   pt.size = 0.1,
#   idents = c(1,9)
# )
# plot2 <- FeaturePlot(
#   object = integrated,
#   features = rownames(da_peaks)[1],
#   pt.size = 0.1
# )

# plot1 | plot2


## Get gene activity results and run DE results for RNA

In [None]:
# extract gene annotations from EnsDb
annotations <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)

# change to UCSC style since the data was mapped to hg19
seqlevelsStyle(annotations) <- 'UCSC'
genome(annotations) <- "hg38"

# add the gene information to the object
Annotation(integrated) <- annotations


gene.activities <- GeneActivity(integrated)


In [None]:
# add the gene activity matrix to the Seurat object as a new assay and normalize it
integrated[['RNA']] <- CreateAssayObject(counts = gene.activities)
integrated <- NormalizeData(
  object = integrated,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(integrated$nCount_RNA)
)

In [None]:
DefaultAssay(integrated) <- 'RNA'

In [None]:
saveRDS(integrated, file.path(outdir, paste0("allSamples.integrated.rds")))

In [None]:
sessionInfo()