## Creates binarized Seurat objects, integrates conditions and annotates genes by nearby peaks

In [1]:
# Input info
cellr_in = "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2/"

outdir = "/mnt/md0/isshamie/Projects/Mito_Trace/output/pipeline/v04/CHIP_b1/MTBlacklist_A2/data/annotation/gff_A2_black/external_granja/"
sample_names = "Input,Flt3l"
samples = "Input,Flt3l"

# Parameters
nTop = 25000
cores = 24
gff_id = "hg38_1_2"

annotations_indir = "/mnt/md0/isshamie/Projects/Mito_Trace/data/processed/annotation_granges/"

# External
external_dat_dir <- "/data/Mito_Trace/output/annotation/data/granja_cd34/"
#external_prefix <- "GSE129785_scATAC-Hematopoiesis-CD34"


ext_name <- "hg38.granja_cd34"
external_frag_file <- "/data/Mito_Trace/output/annotation/data/granja_cd34/hg38.granja_cd34.fragments_sorted.tsv.gz"
external_prefix <- "hg38.GSE129785_scATAC-Hematopoiesis-CD34"
#external_frag_file <- "/data/Mito_Trace/output/annotation/data/granja_cd34/granja_cd34.fragments.sort.tsv.gz"
use.sort.frag = FALSE

# Parameters
nTop = 25000
lsi_start_comp = 2
#to.filt.cells = "TRUE"
to.filt.cells = "FALSE"


to.qc = FALSE

## QC parameters
min_peak_region_fragments=10
max_peak_region_fragments=15000
min_pct_reads_in_peaks=15
max_blacklist_ratio=0.05
max_nucleosome_signal=4
min_TSS_enrichment=0.2


In [2]:
if ((to.filt.cells == "T") | (to.filt.cells == "TRUE")){
    to.filt.cells = TRUE
}else{to.filt.cells = FALSE}


In [3]:
library(repr)
options(repr.plot.width=12, repr.plot.height=12)

“package ‘repr’ was built under R version 4.1.2”


In [4]:
samples <- unlist(strsplit(samples, ",")[[1]])
sample_names <- unlist(strsplit(sample_names, ","))

samples

In [27]:
# Change to the sorted fragment file!
if(use.sort.frag){
    ext_frag_file <- gsub('.fragments.tsv', '.fragments.sort.tsv.gz', external_frag_file)
}else{ext_frag_file = external_frag_file}

external_dat_dir <- dirname(external_frag_file)
ext_frag_file

In [6]:
annotations <- readRDS(file.path(annotations_indir, paste0(gff_id, ".annotationGranges.rds")))

In [7]:
annotations

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

“package ‘S4Vectors’ was built under R version 4.1.2”

Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

“package ‘IRanges’ was built under R version 4.1.2”
Loading required

GRanges object with 1684537 ranges and 21 metadata columns:
            seqnames      ranges strand |   source       type     score
               <Rle>   <IRanges>  <Rle> | <factor>   <factor> <numeric>
        [1]     chr1 11869-14409      + |   HAVANA gene              NA
        [2]     chr1 11869-14409      + |   HAVANA transcript        NA
        [3]     chr1 11869-12227      + |   HAVANA exon              NA
        [4]     chr1 12613-12721      + |   HAVANA exon              NA
        [5]     chr1 13221-14409      + |   HAVANA exon              NA
        ...      ...         ...    ... .      ...        ...       ...
  [1684533]     chrM 15888-15953      + |  ENSEMBL transcript        NA
  [1684534]     chrM 15888-15953      + |  ENSEMBL exon              NA
  [1684535]     chrM 15956-16023      - |  ENSEMBL gene              NA
  [1684536]     chrM 15956-16023      - |  ENSEMBL transcript        NA
  [1684537]     chrM 15956-16023      - |  ENSEMBL exon              NA
    

In [8]:
library(GenomicRanges)
library(Seurat)
library(Signac)
library(GenomeInfoDb)
#library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
set.seed(1234)
library(data.table)
library(magrittr)
library(cowplot)
library(metap)
library(dplyr)
library(future)

plan("multiprocess", workers = cores)
options(future.globals.maxSize = 8000 * 1024^2)
#options(future.globals.maxSize = 50000 * 1024^2) # for 50 Gb RAM
#plan("multiprocess", workers = workers)

Registered S3 method overwritten by 'spatstat.core':
  method          from
  formula.glmmPQL MASS

Attaching SeuratObject

Attaching sp

“package ‘ggplot2’ was built under R version 4.1.3”

Attaching package: ‘data.table’


The following object is masked from ‘package:GenomicRanges’:

    shift


The following object is masked from ‘package:IRanges’:

    shift


The following objects are masked from ‘package:S4Vectors’:

    first, second


“package ‘magrittr’ was built under R version 4.1.3”

Attaching package: ‘cowplot’


The following object is masked from ‘package:patchwork’:

    align_plots


“package ‘dplyr’ was built under R version 4.1.3”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:GenomicRanges’:

    intersect, setdiff, union


The following object is masked from ‘package:GenomeInfoDb’:

    intersect


The following objects are masked from ‘package:IRan

In [9]:


# # Create a unified set of peaks to quantify in each dataset
# #combined.peaks <- reduce(x = c(gr.500, gr.1k, gr.5k, gr.10k))
# combined.peaks <- reduce(x = c(gr.ext, gr.curr))

# # Filter out bad peaks based on length
# peakwidths <- width(combined.peaks)
# combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]
# combined.peaks

## External data processing

In [10]:
barcode_path <- file.path(external_dat_dir, paste0(external_prefix, ".cell_barcodes.txt"))

metadata <- read.csv(
  file = file.path(external_dat_dir, paste0(external_prefix, ".cell_barcodes.txt")),
  header = TRUE,
  sep='\t',
)
#CD34boo <- colData$Group %in% c("CD34_Progenitors_Rep1","CD34_Progenitors_Rep2")
c1boo <-  metadata$Group %in% c("BM_pDC", "CLP", "CMP", "GMP", "HSC", "LMPP", "MEP", "Monocytes", "MPP")
metadata <- metadata[c1boo,]

rownames(metadata) <- metadata$Group_Barcode
metadata$proj <- "granja_cd34"
ext_cell_names <- rownames(metadata)
metadata <- metadata[,c("Group", "proj")]
ext_cell_names



## Merge all peaks

In [11]:
read.peaks <- function(exp, cellr_in){
    print('here')
    print(file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed"))
    peaks <- read.table(
      file = file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "peaks.bed"),
      col.names = c("chr", "start", "end")
    )
    # convert to genomic ranges
    gr <- makeGRangesFromDataFrame(peaks)
    return(gr)
}


gr.full <- c(sapply(samples, read.peaks, cellr_in=cellr_in, USE.NAMES=F))

gr.full.c <- gr.full[[1]]
if (length(gr.full)>1){
    for (i in 2:length(gr.full)){
      gr.full.c <- c(gr.full.c, gr.full[[i]])
    }
}
combined.peaks <- reduce(x = c(gr.full.c))

# read in peak sets
peaks.ext <- read.table(
  file = file.path(external_dat_dir, paste0(external_prefix, ".peaks.bed")),
  col.names = c("chr", "start", "end")
)

# convert to genomic ranges
gr.ext <- makeGRangesFromDataFrame(peaks.ext)

combined.peaks <- reduce(x = c(gr.ext, combined.peaks))


# Filter out bad peaks based on length
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]
head(combined.peaks)

[1] "here"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2//Input/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "here"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2//Flt3l/outs/filtered_peak_bc_matrix/peaks.bed"


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': chr1_KI270706v1_random, chr4_GL000008v2_random, chr7_KI270803v1_alt, chr8_KI270821v1_alt, chr14_KI270846v1_alt, chr15_KI270850v1_alt, chr17_KI270909v1_alt, chr19_KI270938v1_alt, chr22_KI270879v1_alt, chrUn_KI270742v1
  - in 'y': chrY
  Make sure to always combine/compare objects based on the same reference


GRanges object with 6 ranges and 0 metadata columns:
      seqnames        ranges strand
         <Rle>     <IRanges>  <Rle>
  [1]     chr1    9866-10738      *
  [2]     chr1   16106-16366      *
  [3]     chr1   96375-96864      *
  [4]     chr1 115476-115976      *
  [5]     chr1 180724-181403      *
  [6]     chr1 184126-184527      *
  -------
  seqinfo: 34 sequences from an unspecified genome; no seqlengths

In [12]:
gr.full[[1]]

GRanges object with 168267 ranges and 0 metadata columns:
           seqnames            ranges strand
              <Rle>         <IRanges>  <Rle>
       [1]     chr1        9975-10498      *
       [2]     chr1       16106-16366      *
       [3]     chr1     180917-181403      *
       [4]     chr1     184126-184527      *
       [5]     chr1     191059-192097      *
       ...      ...               ...    ...
  [168263]     chrY 56842202-56842607      *
  [168264]     chrY 56843431-56845155      *
  [168265]     chrY 56846249-56847133      *
  [168266]     chrY 56849352-56849907      *
  [168267]     chrY 56850419-56850755      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [13]:
peaks.ext

chr,start,end
<chr>,<int>,<int>
chr1,10238,10738
chr1,115476,115976
chr1,267105,267605
chr1,267760,268260
chr1,271050,271550
chr1,280333,280833
chr1,605330,605830
chr1,774999,775499
chr1,779481,779981
chr1,804688,805188


In [14]:
length(gr.ext)

In [15]:
length(combined.peaks)

## Are any peaks in the chrM

In [16]:
length(combined.peaks[seqnames(combined.peaks) != "chrM"])


In [17]:
combined.peaks[seqnames(combined.peaks) == "chrM"]

combined.peaks[seqnames(combined.peaks) == "chr1"]


GRanges object with 0 ranges and 0 metadata columns:
   seqnames    ranges strand
      <Rle> <IRanges>  <Rle>
  -------
  seqinfo: 34 sequences from an unspecified genome; no seqlengths

GRanges object with 45547 ranges and 0 metadata columns:
          seqnames              ranges strand
             <Rle>           <IRanges>  <Rle>
      [1]     chr1          9866-10738      *
      [2]     chr1         16106-16366      *
      [3]     chr1         96375-96864      *
      [4]     chr1       115476-115976      *
      [5]     chr1       180724-181403      *
      ...      ...                 ...    ...
  [45543]     chr1 248924034-248927583      *
  [45544]     chr1 248928068-248928598      *
  [45545]     chr1 248930124-248930152      *
  [45546]     chr1 248944711-248945211      *
  [45547]     chr1 248945377-248946299      *
  -------
  seqinfo: 34 sequences from an unspecified genome; no seqlengths

## Remove any peaks in the chrM

In [18]:
combined.peaks = combined.peaks[seqnames(combined.peaks) != "chrM"]
length(combined.peaks)

## Create fragment objects

### Create external first

In [19]:
ext_frag_file

In [28]:
# quantify multiome peaks in the scATAC-seq dataset
frags.ext <- CreateFragmentObject(path = ext_frag_file,
                                  cells= ext_cell_names)

ext.counts <- FeatureMatrix(
  fragments = frags.ext,
  features = combined.peaks,
  cells = rownames(metadata)
)

ext_assay <- CreateChromatinAssay(ext.counts, fragments = frags.ext, min.cells = 10, min.features = 200)
ext <- CreateSeuratObject(ext_assay, assay = "ATAC", project=ext_name, meta.data=metadata)


Computing hash

Extracting reads overlapping genomic regions

“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


### Create for each sample

In [29]:
allSE = c() 

samples_df <- cbind(sample_names, samples)
print('samples_df')
print(samples_df)
for (row in 1:nrow(samples_df)){
    exp <- (samples_df[[row, "samples"]])
    name <- (samples_df[[row, "sample_names"]]) 
    
#for (exp in samples) {
    print('exp')
    print(exp)
    print('name')
    print(name)
    barcode_path <- file.path(cellr_in, exp, "outs", "filtered_peak_bc_matrix", "barcodes.tsv")    
    barcodes <- readr::read_tsv(barcode_path, col_names = F) # %>% tidyr::unite(barcode)
    barcodes <- as.data.frame(barcodes) %>%  tibble::column_to_rownames(var="X1") %>% tibble::add_column(proj=name)
    frag_file <- file.path(cellr_in, exp, "outs", "fragments.tsv.gz")
    
    cells.meta.f <- file.path(cellr_in, exp, "outs", "singlecell.csv") 
    cells.meta <- as.data.frame(readr::read_csv(cells.meta.f)) %>% tibble::column_to_rownames(var="barcode") %>% tibble::add_column(proj=name)
    cells.meta <- cells.meta[rownames(cells.meta) %in% rownames(barcodes), ]

    # quantify multiome peaks in the scATAC-seq dataset
    
    
    print("Creating fragments object")
    frags.curr <- CreateFragmentObject(path = frag_file, cells= rownames(barcodes))
    #print(frags.curr)
    print("Quantifying peaks")
    ## Quantify peaks
    curr.counts <- FeatureMatrix(
      fragments = frags.curr,
      features = combined.peaks,
      cells = rownames(barcodes),
      process_n = cores
    )
    
    print("Creating chromatin assay")
    ## Create the objects and use simple filters
    curr_assay <- CreateChromatinAssay(curr.counts, fragments = frags.curr, min.cells = 10, min.features = 200)
    curr <- CreateSeuratObject(curr_assay, assay = "ATAC", project=name, meta.data=cells.meta)
    print('curr_assay')
    print(head(curr_assay))
    print('curr')
    print(head(curr[[]]))
    allSE = c(allSE, curr)
    #return(curr)
}

allSE

#allSE <- sapply(samples, create_frag, cellr_in=cellr_in)

[1] "samples_df"
     sample_names samples
[1,] "Input"      "Input"
[2,] "Flt3l"      "Flt3l"
[1] "exp"
[1] "Input"
[1] "name"
[1] "Input"


[1mRows: [22m[34m6848[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m490990[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, 

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


[1] "curr_assay"
data frame with 0 columns and 10 rows
[1] "curr"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGACGCCAA-1      Input       17372         14717 61053     21242
AAACGAAAGATATGAC-1      Input        3331          3034 29954      4853
AAACGAAAGTAATGTG-1      Input        7795          6933 32111      9018
AAACGAAAGTAGTCGG-1      Input        9617          8345 37923     12542
AAACGAACACATTGCA-1      Input        9695          7900 46206     12553
AAACGAACACGCTCAG-1      Input        7267          6289 38973      7584
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGACGCCAA-1      263      248    2186         11814          25300
AAACGAAAGATATGAC-1       67      103     626         19521           4784
AAACGAAAGTAATGTG-1       96      122    1319         11232          10324
AAACGAAAGTAGTCGG-1      147      162    1705         10189          13178
AAACGAACACATTGCA-1      148      184    1239         16880  

[1mRows: [22m[34m7651[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m314920[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, 

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


“Some cells in meta.data not present in provided counts matrix.”
“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


[1] "curr_assay"
data frame with 0 columns and 10 rows
[1] "curr"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGGAGACTC-1      Flt3l         593           585  2286       298
AAACGAAAGTCACGCC-1      Flt3l        1402          1373  4711       555
AAACGAACAAGACTAA-1      Flt3l        2605          2519  9178      1509
AAACGAACAAGCGAAC-1      Flt3l        2675          2602  8825      1544
AAACGAACACAGCCAC-1      Flt3l        3245          3118 10589      1294
AAACGAACACAGGTAG-1      Flt3l         701           693  2384       240
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGGAGACTC-1        7        7     124           749           1101
AAACGAAAGTCACGCC-1       16       12     289           853           2986
AAACGAACAAGACTAA-1       37       19     585          1339           5689
AAACGAACAAGCGAAC-1       24       24     535          1308           5390
AAACGAACACAGCCAC-1       41       31     456          2764  

[[1]]
An object of class Seurat 
387037 features across 6848 samples within 1 assay 
Active assay: ATAC (387037 features, 0 variable features)

[[2]]
An object of class Seurat 
274951 features across 7588 samples within 1 assay 
Active assay: ATAC (274951 features, 0 variable features)


### Add the external with the others

In [30]:
allSE <- c(ext, allSE )
allSE

[[1]]
An object of class Seurat 
205832 features across 2196 samples within 1 assay 
Active assay: ATAC (205832 features, 0 variable features)

[[2]]
An object of class Seurat 
387037 features across 6848 samples within 1 assay 
Active assay: ATAC (387037 features, 0 variable features)

[[3]]
An object of class Seurat 
274951 features across 7588 samples within 1 assay 
Active assay: ATAC (274951 features, 0 variable features)


In [31]:
sample_names <- c(ext_name, sample_names)

## QC Metrics

In [32]:
library(Rsamtools)

qc <- function(se){
    # add the gene information to the object
    Annotation(se) <- annotations
    
    # compute nucleosome signal score per cell
    se <- NucleosomeSignal(object = se)
    
    # compute TSS enrichment score per cell
    se <- TSSEnrichment(object = se, fast = FALSE)

    # add blacklist ratio and fraction of reads in peaks
    se$pct_reads_in_peaks <- se$peak_region_fragments / se$passed_filters * 100
    se$blacklist_ratio <- se$blacklist_region_fragments / se$peak_region_fragments
    se$high.tss <- ifelse(se$TSS.enrichment > 2, 'High', 'Low')
    se$nucleosome_group <- ifelse(se$nucleosome_signal > 4, 'NS > 4', 'NS < 4')

    return(se)
}
vPlot <- function(se){
      vPlot <- VlnPlot(
      object = se,
      features = c('pct_reads_in_peaks', 'peak_region_fragments',
                   'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'),
      pt.size = 0.1,
      ncol = 5
    )  
    vPlot <- vPlot +    # Create grid of plots with title
             plot_annotation(title = se$orig.ident[[1]]) & 
             theme(plot.title = element_text(hjust = 0.5, size=15))
    #print(vPlot)
    return(vPlot)
}


“package ‘Rsamtools’ was built under R version 4.1.2”
Loading required package: Biostrings

“package ‘Biostrings’ was built under R version 4.1.2”
Loading required package: XVector

“package ‘XVector’ was built under R version 4.1.2”

Attaching package: ‘Biostrings’


The following object is masked from ‘package:base’:

    strsplit




In [33]:
# allSE <- lapply(allSE, qc)
# saveRDS(allSE, file.path(outdir, paste0("allSamples.rds")))
# lapply(allSE,vPlot)
# ggsave(file.path(outdir, paste0("QC_01.png")))
# ggsave(file.path(outdir, paste0("QC_01.pdf")))

In [37]:
filtCells <- function(se, min_peak_region_fragments=10,
                      max_peak_region_fragments=15000,
                     min_pct_reads_in_peaks=15,
                     max_blacklist_ratio=0.05,
                     max_nucleosome_signal=4,
                     min_TSS_enrichment=2){
    print('se before filt')
    print(se)
    se <- subset(
      x = se,
      subset = peak_region_fragments > min_peak_region_fragments &
               peak_region_fragments < max_peak_region_fragments &
               pct_reads_in_peaks > min_pct_reads_in_peaks &
               blacklist_ratio < max_blacklist_ratio &
               nucleosome_signal < max_nucleosome_signal  &
               TSS.enrichment > min_TSS_enrichment
    )
    print('se after filt')
    print(se)
    return(se)
}

if (to.filt.cells){
    allSE <- lapply(allSE, filtCells,
                    min_peak_region_fragments=min_peak_region_fragments,
                    max_peak_region_fragments=max_peak_region_fragments,
                    min_pct_reads_in_peaks=min_pct_reads_in_peaks,
                    max_blacklist_ratio=max_blacklist_ratio,
                    max_nucleosome_signal=max_nucleosome_signal,
                    min_TSS_enrichment=min_TSS_enrichment)
}

# Merge

In [38]:
allSE

[[1]]
An object of class Seurat 
205832 features across 2196 samples within 1 assay 
Active assay: ATAC (205832 features, 0 variable features)

[[2]]
An object of class Seurat 
387037 features across 6848 samples within 1 assay 
Active assay: ATAC (387037 features, 0 variable features)

[[3]]
An object of class Seurat 
274951 features across 7588 samples within 1 assay 
Active assay: ATAC (274951 features, 0 variable features)


In [39]:
# Add sample names to cell prefix here.
for (i in 1:length(allSE)){
    print(i)
    curr.SE <- allSE[[i]]
    allSE[[i]] <- RenameCells(allSE[[i]], add.cell.id=curr.SE$orig.ident[[1]])
}

allSE[[1]][["ATAC"]]

[1] 1
[1] 2
[1] 3


ChromatinAssay data with 205832 features for 2196 cells
Variable features: 0 
Genome: 
Annotation present: FALSE 
Motifs present: FALSE 
Fragment files: 1 

In [42]:
allSE

[[1]]
An object of class Seurat 
205832 features across 2196 samples within 1 assay 
Active assay: ATAC (205832 features, 0 variable features)

[[2]]
An object of class Seurat 
387037 features across 6848 samples within 1 assay 
Active assay: ATAC (387037 features, 0 variable features)

[[3]]
An object of class Seurat 
274951 features across 7588 samples within 1 assay 
Active assay: ATAC (274951 features, 0 variable features)


In [47]:
GRanges(allSE[[1]])

ERROR: Error in as(seqnames, "GRanges"): no method or default for coercing “Seurat” to “GRanges”


In [55]:
granges(GetAssay(allSE[[1]]))

GRanges object with 205832 ranges and 0 metadata columns:
                       seqnames        ranges strand
                          <Rle>     <IRanges>  <Rle>
       [1]                 chr1    9866-10738      *
       [2]                 chr1 267760-268275      *
       [3]                 chr1 271046-271568      *
       [4]                 chr1 605242-605830      *
       [5]                 chr1 778171-779981      *
       ...                  ...           ...    ...
  [205828] chr22_KI270879v1_alt 266949-267449      *
  [205829] chr22_KI270879v1_alt 267717-268217      *
  [205830] chr22_KI270879v1_alt 277672-278172      *
  [205831] chr22_KI270879v1_alt 278274-278774      *
  [205832] chr22_KI270879v1_alt 278875-279375      *
  -------
  seqinfo: 33 sequences from an unspecified genome; no seqlengths

In [58]:
unique(seqnames(granges(GetAssay(allSE[[3]]))))

In [59]:
unique(seqnames(granges(GetAssay(allSE[[1]]))))

In [None]:
# intersecting.regions <- GetIntersectingFeatures(
#   object.1 = young_object,
#   object.2 = old_object,
#   sep.1 = c("-", "-"),
#   sep.2 = c("-", "-")
# )

In [60]:
# merge all datasets, adding a cell ID to make sure cell names are unique
# combined <- merge(
#   x = allSE[[1]],
#   y = unlist(allSE[2:length(allSE)]),
# )

if(length(sample_names) == 1){
    combined = allSE[[1]]
}else{
    combined <- merge(
      x = allSE[[1]],
      y = allSE[[2]])
}

“Expected 3 pieces. Additional pieces discarded in 101 rows [18904, 67753, 67754, 80218, 80219, 80220, 80221, 89658, 119569, 119570, 119571, 119572, 119573, 119574, 146210, 184443, 184444, 184445, 184446, 184447, ...].”


ERROR: Error in .get_data_frame_col_as_numeric(df, granges_cols[["start"]]): some values in the "start" column cannot be turned into numeric values


In [40]:
if(length(sample_names) > 2) {
    for (i in 3:length(sample_names)){
        combined <- merge(x=combined,
                          y = allSE[[i]])
        }
}

“Expected 3 pieces. Additional pieces discarded in 101 rows [18904, 67753, 67754, 80218, 80219, 80220, 80221, 89658, 119569, 119570, 119571, 119572, 119573, 119574, 146210, 184443, 184444, 184445, 184446, 184447, ...].”


ERROR: Error in .get_data_frame_col_as_numeric(df, granges_cols[["start"]]): some values in the "start" column cannot be turned into numeric values


In [None]:
combined <- FindTopFeatures(combined, min.cutoff = 20)
combined

In [None]:
combined

In [None]:
combined$orig.ident <- factor(combined$orig.ident, levels = sample_names)

VlnPlot(
  object = combined,
  features = c('nCount_ATAC', 'peak_region_fragments', 'passed_filters',
               'duplicate', 'unmapped'),
  split.by = "orig.ident",
  pt.size = 0.1,
  ncol = 3
)


In [None]:
# Binarize and run LSI
combined <- BinarizeCounts(combined)
combined <- RunTFIDF(combined)
combined <- RunSVD(combined)
combined <- RunUMAP(combined, dims = lsi_start_comp:50, reduction = 'lsi')
DimPlot(combined, group.by = "proj", pt.size = 0.1)

In [None]:
pDepthCorr <- DepthCor(combined)
pDepthCorr

### Split back into groups - add external sample_names

In [None]:
sample_names


In [None]:
allSE


In [None]:
sample_names

In [None]:
allSE[[1]]

In [None]:
allSE[[2]]

In [None]:
allSE <- lapply(sample_names,  function(x) subset(combined, subset = orig.ident == x))
allSE

# Integrate, using external data as first entry, which serves as the anchor

In [None]:
if( length(sample_names) == 1){
    integrated = allSE[[1]]
}else{
    # find integration anchors
    print("getting anchors ")
    integration.anchors <- FindIntegrationAnchors(
      object.list = allSE, #c(ext,curr),
      anchor.features = allSE[[1]], #rownames(ext),
      reduction = "rlsi",
      dims = lsi_start_comp:30
    )
    print("integrating embeddings")
    # integrate LSI embeddings
    integrated <- IntegrateEmbeddings(
      anchorset = integration.anchors,
      reductions = combined[["lsi"]],
      new.reduction.name = "integrated_lsi",
      dims.to.integrate = 2:30
    )
}


In [None]:
p1 <- DimPlot(combined, group.by = "proj")

# create a new UMAP using the integrated embeddings
integrated <- RunUMAP(integrated, reduction = "integrated_lsi", dims = 2:30)
p2 <- DimPlot(integrated, group.by = "proj")
ggsave(file.path(outdir,"integrated.batch.png"), dpi=300)
p2

In [None]:
(p1 + ggtitle("Merged")) | (p2 + ggtitle("Integrated"))
ggsave(file.path(outdir,"integrated.merged.compare.png"), dpi=300)

In [None]:
pDepthCorr <- DepthCor(integrated, reduction='integrated_lsi')
ggsave(file.path(outdir,"integrated.depthCor.png"), plot=pDepthCorr, dpi=300)

pDepthCorr

In [None]:
#integrated <- RunUMAP(object = integrated, reduction = 'integrated_lsi', dims = 2:30)
integrated <- FindNeighbors(object = integrated, reduction = 'integrated_lsi', dims = 2:30)
integrated <- FindClusters(object = integrated, verbose = FALSE, algorithm = 3)

pclust <- DimPlot(object = integrated, label = TRUE) + NoLegend()
ggsave(file.path(outdir, "integrated.lsi.clusters.png"), pclust)
pclust

In [None]:
# add the gene information to the object
DefaultAssay(integrated) <- "ATAC"
Annotation(integrated) <- annotations
gene.activities <- GeneActivity(integrated)


In [None]:
mt_genes <- annotations[seqnames(annotations) == "chrM"]
dim(mt_genes)


In [None]:
gene.activities <- gene.activities[!(rownames(gene.activities) %in% mt_genes$gene_name),]
# add the gene activity matrix to the Seurat object as a new assay and normalize it
integrated[['RNA']] <- CreateAssayObject(counts = gene.activities)
integrated <- NormalizeData(
  object = integrated,
  assay = 'RNA',
  normalization.method = 'LogNormalize',
  scale.factor = median(integrated$nCount_RNA)
)
DefaultAssay(integrated) <- 'RNA'

In [None]:
saveRDS(integrated, file.path(outdir, paste0("allSamples.integrated.rds")))

In [None]:
Idents(integrated) <- "orig.ident"
VlnPlot(
  object = integrated,
  features = c('nCount_ATAC', 'peak_region_fragments', 'passed_filters',
               'duplicate', 'unmapped'),
  split.by = "orig.ident",
  pt.size = 0.1,
  ncol = 3
)

ggsave(file.path(outdir, paste0("QC_02.png")))

In [None]:
sessionInfo()