## Creates binarized Seurat objects, integrates conditions and annotates genes by nearby peaks

In [1]:
# Input info
#cellr_in = "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/DUPI_april08_2021_Croker/MTblacklist_mtasnucl/" 
#outdir = "/mnt/md0/isshamie/Projects/Mito_Trace/output/pipeline/DUPI_april08_2021/MTblacklist_mtasnucl_Bonly/data/annotation/gff_hg38_1_2/mergedSamples/"

# Parameters
cores = 24

gff_id = "A2_black"

annotations_indir = "/mnt/md0/isshamie/Projects/Mito_Trace/data/processed/annotation_granges/"
config_f = "/data/Mito_Trace/parameters/pipeline/cosmo_server_v04/chip_paper_noControl/CHIP_aggr_samples.noChA1.yaml"
out_f = "/mnt/md0/isshamie/Projects/Mito_Trace/output/aggregate/CHIP_aggr/v04_no_ChA1//data/annotation/gff_A2/mergedSamples/merged_peaks.bed"
outdir =  "/mnt/md0/isshamie/Projects/Mito_Trace/output/aggregate/CHIP_aggr/v04_no_ChA1/data/annotation/gff_A2/mergedSamples"

In [2]:
library(repr)
options(repr.plot.width=12, repr.plot.height=12)

“package ‘repr’ was built under R version 4.1.2”


In [3]:
config_f

In [4]:
library(yaml)

samps <- read_yaml(config_f)$samples_csv
samps

“package ‘yaml’ was built under R version 4.1.2”


In [5]:
annotations <- readRDS(file.path(annotations_indir, paste0(gff_id, ".annotationGranges.rds")))

In [6]:
annotations

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

“package ‘S4Vectors’ was built under R version 4.1.2”

Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

“package ‘IRanges’ was built under R version 4.1.2”
Loading required

GRanges object with 2765969 ranges and 25 metadata columns:
              seqnames        ranges strand |   source       type     score
                 <Rle>     <IRanges>  <Rle> | <factor>   <factor> <numeric>
        [1]       chr1   29554-31109      + |   HAVANA gene              NA
        [2]       chr1   29554-31097      + |   HAVANA transcript        NA
        [3]       chr1   29554-30039      + |   HAVANA exon              NA
        [4]       chr1   30564-30667      + |   HAVANA exon              NA
        [5]       chr1   30976-31097      + |   HAVANA exon              NA
        ...        ...           ...    ... .      ...        ...       ...
  [2765965] KI270734.1 138483-138667      - |  ENSEMBL CDS               NA
  [2765966] KI270734.1 138480-138482      - |  ENSEMBL stop_codon        NA
  [2765967] KI270734.1 161689-161852      - |  ENSEMBL UTR               NA
  [2765968] KI270734.1 161587-161626      - |  ENSEMBL UTR               NA
  [2765969] KI270734.1 13808

In [7]:
library(GenomicRanges)
library(Seurat)
library(Signac)
library(GenomeInfoDb)
#library(EnsDb.Hsapiens.v75)
library(ggplot2)
library(patchwork)
set.seed(1234)
library(data.table)
library(magrittr)
library(cowplot)
library(metap)
library(dplyr)
library(future)

plan("multiprocess", workers = cores)
options(future.globals.maxSize = 8000 * 1024^2)
#options(future.globals.maxSize = 50000 * 1024^2) # for 50 Gb RAM
#plan("multiprocess", workers = workers)

Registered S3 method overwritten by 'spatstat.core':
  method          from
  formula.glmmPQL MASS

Attaching SeuratObject

Attaching sp

“package ‘ggplot2’ was built under R version 4.1.3”

Attaching package: ‘data.table’


The following object is masked from ‘package:GenomicRanges’:

    shift


The following object is masked from ‘package:IRanges’:

    shift


The following objects are masked from ‘package:S4Vectors’:

    first, second


“package ‘magrittr’ was built under R version 4.1.3”

Attaching package: ‘cowplot’


The following object is masked from ‘package:patchwork’:

    align_plots


“package ‘dplyr’ was built under R version 4.1.3”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:GenomicRanges’:

    intersect, setdiff, union


The following object is masked from ‘package:GenomeInfoDb’:

    intersect


The following objects are masked from ‘package:IRan

## Merge all peaks

In [8]:
read.peaks <- function(curr.samp.f){
    print('curr.samp')
    print(curr.samp.f)
    print(stringr::str_replace(curr.samp.f, "barcodes.tsv", "peaks.bed"))
    peaks <- read.table(
      file = stringr::str_replace(curr.samp.f, "barcodes.tsv", "peaks.bed"),
      col.names = c("chr", "start", "end")
    )
    # convert to genomic ranges
    gr <- makeGRangesFromDataFrame(peaks)
    return(gr)
}


run.experiments <- function(sample_csv){
    curr.csv <- readr::read_csv(sample_csv)
    print(curr.csv)
    curr.grs <- sapply(curr.csv$barcode_f, read.peaks)
    return(curr.grs)
}

gr.full <- sapply(samps, run.experiments)


[1mRows: [22m[34m2[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): sample_name, cellr_ID, fastq_ID, barcode_f, bam_f

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


[90m# A tibble: 2 × 5[39m
  sample_name cellr_ID fastq_ID             barcode_f                      bam_f
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m                [3m[90m<chr>[39m[23m                          [3m[90m<chr>[39m[23m
[90m1[39m Flt3l       Flt3l    BC_10xATAC_rxn1b_pos /home/isaac/lewis/mito_lineag… /hom…
[90m2[39m Input       Input    BC_10xATAC_rxn1      /home/isaac/lewis/mito_lineag… /hom…
[1] "curr.samp"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2/Flt3l/outs/filtered_peak_bc_matrix/barcodes.tsv"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2/Flt3l/outs/filtered_peak_bc_matrix/peaks.bed"
[1] "curr.samp"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2/Input/outs/filtered_peak_bc_matrix/barcodes.tsv"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_

[1mRows: [22m[34m1[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): sample_name, cellr_ID, fastq_ID, barcode_f, bam_f

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


[90m# A tibble: 1 × 5[39m
  sample_name cellr_ID fastq_ID        barcode_f                           bam_f
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m           [3m[90m<chr>[39m[23m                               [3m[90m<chr>[39m[23m
[90m1[39m Input       Input    BC_10xATAC_rxn2 /home/isaac/lewis/mito_lineage/out… /hom…
[1] "curr.samp"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b2/MTBlacklist_A2/Input/outs/filtered_peak_bc_matrix/barcodes.tsv"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b2/MTBlacklist_A2/Input/outs/filtered_peak_bc_matrix/peaks.bed"


[1mRows: [22m[34m1[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): sample_name, cellr_ID, fastq_ID, barcode_f, bam_f

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


[90m# A tibble: 1 × 5[39m
  sample_name cellr_ID fastq_ID      barcode_f                             bam_f
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m         [3m[90m<chr>[39m[23m                                 [3m[90m<chr>[39m[23m
[90m1[39m Flt3l       J2       BC_10xATAC_J2 /home/isaac/lewis/mito_lineage/outpu… /hom…
[1] "curr.samp"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/jan21_2021/MTblacklist/J2/outs/filtered_peak_bc_matrix/barcodes.tsv"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/jan21_2021/MTblacklist/J2/outs/filtered_peak_bc_matrix/peaks.bed"


In [9]:
gr.full <- unlist(gr.full, recursive=FALSE, use.names=FALSE)
gr.full

[[1]]
GRanges object with 166097 ranges and 0 metadata columns:
           seqnames            ranges strand
              <Rle>         <IRanges>  <Rle>
       [1]     chr1        9866-10638      *
       [2]     chr1     180724-181039      *
       [3]     chr1     191222-191970      *
       [4]     chr1     267781-268267      *
       [5]     chr1     271046-271568      *
       ...      ...               ...    ...
  [166093]     chrY 56763256-56763795      *
  [166094]     chrY 56826222-56826788      *
  [166095]     chrY 56829068-56840725      *
  [166096]     chrY 56841310-56842620      *
  [166097]     chrY 56843438-56851611      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

[[2]]
GRanges object with 168267 ranges and 0 metadata columns:
           seqnames            ranges strand
              <Rle>         <IRanges>  <Rle>
       [1]     chr1        9975-10498      *
       [2]     chr1       16106-16366      *
       [3]     chr1     180917

In [10]:
gr.full.c <- gr.full[[1]]
if (length(gr.full)>1){
    for (i in 2:length(gr.full)){
      gr.full.c <- c(gr.full.c, gr.full[[i]])
    }
}
combined.peaks <- reduce(x = c(gr.full.c))

# Filter out bad peaks based on length
peakwidths <- width(combined.peaks)
combined.peaks <- combined.peaks[peakwidths  < 10000 & peakwidths > 20]
head(combined.peaks)

GRanges object with 6 ranges and 0 metadata columns:
      seqnames        ranges strand
         <Rle>     <IRanges>  <Rle>
  [1]     chr1    9866-10638      *
  [2]     chr1   16106-16366      *
  [3]     chr1 180724-181403      *
  [4]     chr1 184126-184527      *
  [5]     chr1 191059-192097      *
  [6]     chr1 267765-268275      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

In [11]:
length(combined.peaks)

## Are any peaks in the chrM

In [12]:
length(combined.peaks[seqnames(combined.peaks) != "chrM"])


In [13]:
#combined.peaks[seqnames(combined.peaks) == "chrM"]

combined.peaks[seqnames(combined.peaks) == "chr1"]


GRanges object with 19141 ranges and 0 metadata columns:
          seqnames              ranges strand
             <Rle>           <IRanges>  <Rle>
      [1]     chr1          9866-10638      *
      [2]     chr1         16106-16366      *
      [3]     chr1       180724-181403      *
      [4]     chr1       184126-184527      *
      [5]     chr1       191059-192097      *
      ...      ...                 ...    ...
  [19137]     chr1 248913003-248913478      *
  [19138]     chr1 248920611-248920967      *
  [19139]     chr1 248924437-248928616      *
  [19140]     chr1 248930024-248930301      *
  [19141]     chr1 248945754-248946299      *
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

## Remove any peaks in the chrM

In [14]:
combined.peaks = combined.peaks[seqnames(combined.peaks) != "chrM"]
length(combined.peaks)

## Save merged peaks as bed

In [15]:
gr <- unlist(as(gr.full, "GRangesList"))

In [16]:
df <- data.frame(seqnames=seqnames(gr),
  starts=start(gr)-1,
  ends=end(gr),
  names=tibble::rowid_to_column(as.data.frame(gr), "names")$names,
  scores=c(rep(".", length(gr))),
  strands=strand(gr))

df = dplyr::mutate_if(df, is.numeric, as.integer)
df

seqnames,starts,ends,names,scores,strands
<fct>,<int>,<int>,<int>,<chr>,<fct>
chr1,9865,10638,1,.,*
chr1,180723,181039,2,.,*
chr1,191221,191970,3,.,*
chr1,267780,268267,4,.,*
chr1,271045,271568,5,.,*
chr1,274281,274375,6,.,*
chr1,585966,586417,7,.,*
chr1,605241,605783,8,.,*
chr1,778170,779954,9,.,*
chr1,812745,813016,10,.,*


In [19]:
write.table(df, file=file.path(outdir,"merged_peaks.bed"), quote=F, sep="\t", row.names=F, col.names=F)

In [20]:
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/isaac/miniconda3/envs/mttrace/lib/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] future_1.25.0        dplyr_1.0.9          metap_1.1           
 [4] cowplot_1.1.1        magrittr_2.0.3       data.table_1.14.2   
 [7] patchwork_1.1.1      ggplot2_3.3.6        Signac_1.4.0        
[10] sp_1.4-7             SeuratObject_4.1.0   Seurat_4.1.0        
[13] GenomicRanges

## Create expSignac

In [21]:
expr.load <- function(samples_df){ #sample_names, samples) {
#     sample_names <- 
#     samples <- curr.csv[]
#     print('sample_names')
#     print(sample_names)
#     print('samples')
#     print(samples)    
    allSE = c() 

    #samples_df <- cbind(sample_names, samples)
    print('samples_df')
    print(samples_df)
    for (row in 1:nrow(samples_df)){
        exp <- samples_df[row,][["cellr_ID"]]
        name <- samples_df[row,][["sample_name"]]
        
        curr_in <- dirname(dirname(samples_df[[row, "barcode_f"]]))
        print('exp')
        print(exp)
        print('name')
        print(name)
        barcode_path <- file.path(curr_in, "filtered_peak_bc_matrix", "barcodes.tsv")    
        print("barcode_path")
        print(barcode_path)
        barcodes <- readr::read_tsv(barcode_path, col_names = F) # %>% tidyr::unite(barcode)
        barcodes <- as.data.frame(barcodes) %>%  tibble::column_to_rownames(var="X1") %>% tibble::add_column(proj=name)
        frag_file <- file.path(curr_in, "fragments.tsv.gz")

        cells.meta.f <- file.path(curr_in, "singlecell.csv") 
        cells.meta <- as.data.frame(readr::read_csv(cells.meta.f)) %>% tibble::column_to_rownames(var="barcode") %>% tibble::add_column(proj=name)
        cells.meta <- cells.meta[rownames(cells.meta) %in% rownames(barcodes), ]

        # quantify multiome peaks in the scATAC-seq dataset


        print("Creating fragments object")
        frags.curr <- CreateFragmentObject(path = frag_file, cells= rownames(barcodes))
        #print(frags.curr)
        print("Quantifying peaks")
        ## Quantify peaks
        curr.counts <- FeatureMatrix(
          fragments = frags.curr,
          features = combined.peaks,
          cells = rownames(barcodes),
          process_n = cores
        )

        print("Creating chromatin assay")
        ## Create the objects and use simple filters
        curr_assay <- CreateChromatinAssay(curr.counts, fragments = frags.curr, min.cells = 10, min.features = 200)
        curr <- CreateSeuratObject(curr_assay, assay = "ATAC", project=name, meta.data=cells.meta)
        #curr <- subset(curr, nCount_peaks > 2000 & nCount_peaks < 30000)
    
        print('curr_assay')
        print(head(curr_assay))
        print('curr')
        print(head(curr[[]]))
        allSE = c(allSE, curr)
        #return(curr)
    }

    return(allSE)
}
    #allSE <- sapply(samples, create_frag, curr_in=curr_in)

In [22]:
load.se.experiments <- function(sample_csv){
    curr.csv <- readr::read_csv(sample_csv)
    #print(curr.csv)
    return(expr.load(curr.csv)) 
    #return(expr.load(curr.csv$samples, curr.csv$cellr.id)) #sapply(curr.csv, expr.load)
}



In [23]:
allSE <- sapply(samps, load.se.experiments)

[1mRows: [22m[34m2[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): sample_name, cellr_ID, fastq_ID, barcode_f, bam_f

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


[1] "samples_df"
[90m# A tibble: 2 × 5[39m
  sample_name cellr_ID fastq_ID             barcode_f                      bam_f
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m                [3m[90m<chr>[39m[23m                          [3m[90m<chr>[39m[23m
[90m1[39m Flt3l       Flt3l    BC_10xATAC_rxn1b_pos /home/isaac/lewis/mito_lineag… /hom…
[90m2[39m Input       Input    BC_10xATAC_rxn1      /home/isaac/lewis/mito_lineag… /hom…
[1] "exp"
[1] "Flt3l"
[1] "name"
[1] "Flt3l"
[1] "barcode_path"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b1/MTBlacklist_A2/Flt3l/outs/filtered_peak_bc_matrix/barcodes.tsv"


[1mRows: [22m[34m7651[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m314920[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, lowmapq, mitochondrial, pass...

[36mℹ[39m Use `spec()` to retrieve 

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


“Some cells in meta.data not present in provided counts matrix.”
“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


[1] "curr_assay"
data frame with 0 columns and 10 rows
[1] "curr"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGGAGACTC-1      Flt3l         542           534  2286       298
AAACGAAAGTCACGCC-1      Flt3l        1254          1227  4711       555
AAACGAACAAGACTAA-1      Flt3l        2324          2241  9178      1509
AAACGAACAAGCGAAC-1      Flt3l        2349          2283  8825      1544
AAACGAACACAGCCAC-1      Flt3l        2991          2865 10589      1294
AAACGAACACAGGTAG-1      Flt3l         627           619  2384       240
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGGAGACTC-1        7        7     124           749           1101
AAACGAAAGTCACGCC-1       16       12     289           853           2986
AAACGAACAAGACTAA-1       37       19     585          1339           5689
AAACGAACAAGCGAAC-1       24       24     535          1308           5390
AAACGAACACAGCCAC-1       41       31     456          2764  

[1mRows: [22m[34m6848[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m490990[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, lowmapq, mitochondrial, pass...

[36mℹ[39m Use `spec()` to retrieve 

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


[1] "curr_assay"
data frame with 0 columns and 10 rows
[1] "curr"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGACGCCAA-1      Input       15872         13272 61053     21242
AAACGAAAGATATGAC-1      Input        3153          2859 29954      4853
AAACGAAAGTAATGTG-1      Input        7347          6508 32111      9018
AAACGAAAGTAGTCGG-1      Input        8812          7561 37923     12542
AAACGAACACATTGCA-1      Input        8708          6971 46206     12553
AAACGAACACGCTCAG-1      Input        6834          5871 38973      7584
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGACGCCAA-1      263      248    2186         11814          25300
AAACGAAAGATATGAC-1       67      103     626         19521           4784
AAACGAAAGTAATGTG-1       96      122    1319         11232          10324
AAACGAAAGTAGTCGG-1      147      162    1705         10189          13178
AAACGAACACATTGCA-1      148      184    1239         16880  

[1mRows: [22m[34m1[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): sample_name, cellr_ID, fastq_ID, barcode_f, bam_f

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


[1] "samples_df"
[90m# A tibble: 1 × 5[39m
  sample_name cellr_ID fastq_ID        barcode_f                           bam_f
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m           [3m[90m<chr>[39m[23m                               [3m[90m<chr>[39m[23m
[90m1[39m Input       Input    BC_10xATAC_rxn2 /home/isaac/lewis/mito_lineage/out… /hom…
[1] "exp"
[1] "Input"
[1] "name"
[1] "Input"
[1] "barcode_path"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/CHIP_dec172021_b2/MTBlacklist_A2/Input/outs/filtered_peak_bc_matrix/barcodes.tsv"


[1mRows: [22m[34m4769[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m405884[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, lowmapq, mitochondrial, pass...

[36mℹ[39m Use `spec()` to retrieve 

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


“Some cells in meta.data not present in provided counts matrix.”
“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


[1] "curr_assay"
data frame with 0 columns and 10 rows
[1] "curr"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGCCGCAAA-1      Input        1276          1234 16081      1250
AAACGAACAGATGGCA-1      Input        8399          7544 44699     10292
AAACGAAGTATGTTCG-1      Input        4872          4476 21594      5059
AAACGAAGTCATTGGT-1      Input         617           606  9191       460
AAACGAATCAGAGTGG-1      Input        1942          1867 10961      1743
AAACGAATCCGGACTG-1      Input        2565          2445 15146      1793
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGCCGCAAA-1       25       60     379         11724           2643
AAACGAACAGATGGCA-1      190      169    1753         15193          17102
AAACGAAGTATGTTCG-1      133       91     784          8125           7402
AAACGAAGTCATTGGT-1       12       19     137          7471           1092
AAACGAATCAGAGTGG-1       56       43     289          4957  

[1mRows: [22m[34m1[39m [1mColumns: [22m[34m5[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): sample_name, cellr_ID, fastq_ID, barcode_f, bam_f

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


[1] "samples_df"
[90m# A tibble: 1 × 5[39m
  sample_name cellr_ID fastq_ID      barcode_f                             bam_f
  [3m[90m<chr>[39m[23m       [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m         [3m[90m<chr>[39m[23m                                 [3m[90m<chr>[39m[23m
[90m1[39m Flt3l       J2       BC_10xATAC_J2 /home/isaac/lewis/mito_lineage/outpu… /hom…
[1] "exp"
[1] "J2"
[1] "name"
[1] "Flt3l"
[1] "barcode_path"
[1] "/home/isaac/lewis/mito_lineage/output/mtscATAC/data/jan21_2021/MTblacklist/J2/outs/filtered_peak_bc_matrix/barcodes.tsv"


[1mRows: [22m[34m12009[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1mRows: [22m[34m430951[39m [1mColumns: [22m[34m18[39m
[36m──[39m [1mColumn specification[22m [36m───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): barcode, cell_id
[32mdbl[39m (16): total, duplicate, chimeric, unmapped, lowmapq, mitochondrial, pass...

[36mℹ[39m Use `spec()` to retrieve

[1] "Creating fragments object"


Computing hash



[1] "Quantifying peaks"


Extracting reads overlapping genomic regions



[1] "Creating chromatin assay"


“Some cells in meta.data not present in provided counts matrix.”
“Keys should be one or more alphanumeric characters followed by an underscore, setting key from atac to atac_”


[1] "curr_assay"
data frame with 0 columns and 10 rows
[1] "curr"
                   orig.ident nCount_ATAC nFeature_ATAC total duplicate
AAACGAAAGAGCTCCC-1      Flt3l        5455          5127 25798      3956
AAACGAAAGCGATACG-1      Flt3l        2392          2309 18340      2478
AAACGAAAGGCTTCGC-1      Flt3l        4439          4232 28417      4398
AAACGAAAGTACAACA-1      Flt3l        1551          1510 12018      1524
AAACGAACAACGTACT-1      Flt3l        3325          3199 37690      3034
AAACGAACAAGCGGTA-1      Flt3l        1501          1463  9381      1416
                   chimeric unmapped lowmapq mitochondrial passed_filters
AAACGAAAGAGCTCCC-1      111       95    1643          4740          15253
AAACGAAAGCGATACG-1       78       92     794          9325           5573
AAACGAAAGGCTTCGC-1      132      122    1771         10224          11770
AAACGAAAGTACAACA-1       60       45     585          5688           4116
AAACGAACAACGTACT-1       82      125    1680         24974  

In [24]:
allSE.full <- unlist(allSE, recursive=FALSE, use.names=TRUE)



In [25]:
allSE.full

$CHIP_b11
An object of class Seurat 
177009 features across 7533 samples within 1 assay 
Active assay: ATAC (177009 features, 0 variable features)

$CHIP_b12
An object of class Seurat 
195627 features across 6848 samples within 1 assay 
Active assay: ATAC (195627 features, 0 variable features)

$CHIP_b2
An object of class Seurat 
154178 features across 4767 samples within 1 assay 
Active assay: ATAC (154178 features, 0 variable features)

$cd34norm
An object of class Seurat 
202351 features across 11978 samples within 1 assay 
Active assay: ATAC (202351 features, 0 variable features)


In [26]:
allSE

$CHIP_b1
$CHIP_b1[[1]]
An object of class Seurat 
177009 features across 7533 samples within 1 assay 
Active assay: ATAC (177009 features, 0 variable features)

$CHIP_b1[[2]]
An object of class Seurat 
195627 features across 6848 samples within 1 assay 
Active assay: ATAC (195627 features, 0 variable features)


$CHIP_b2
$CHIP_b2[[1]]
An object of class Seurat 
154178 features across 4767 samples within 1 assay 
Active assay: ATAC (154178 features, 0 variable features)


$cd34norm
$cd34norm[[1]]
An object of class Seurat 
202351 features across 11978 samples within 1 assay 
Active assay: ATAC (202351 features, 0 variable features)



In [27]:
saveRDS(allSE.full, file.path(outdir, paste0("allSamples.rds")))

In [28]:
file.path(outdir, paste0("allSamples.rds"))

In [29]:
# library(Rsamtools)

# qc <- function(se){
#     # add the gene information to the object
#     Annotation(se) <- annotations
    
#     # compute nucleosome signal score per cell
#     se <- NucleosomeSignal(object = se)
    
#     # compute TSS enrichment score per cell
#     se <- TSSEnrichment(object = se, fast = FALSE)

#     # add blacklist ratio and fraction of reads in peaks
#     se$pct_reads_in_peaks <- se$peak_region_fragments / se$passed_filters * 100
#     se$blacklist_ratio <- se$blacklist_region_fragments / se$peak_region_fragments
#     se$high.tss <- ifelse(se$TSS.enrichment > 2, 'High', 'Low')
#     se$nucleosome_group <- ifelse(se$nucleosome_signal > 4, 'NS > 4', 'NS < 4')

#     return(se)
# }
# vPlot <- function(se){
#       vPlot <- VlnPlot(
#       object = se,
#       features = c('pct_reads_in_peaks', 'peak_region_fragments',
#                    'TSS.enrichment', 'blacklist_ratio', 'nucleosome_signal'),
#       pt.size = 0.1,
#       ncol = 5
#     )  
#     vPlot <- vPlot +    # Create grid of plots with title
#              plot_annotation(title = se$orig.ident[[1]]) & 
#              theme(plot.title = element_text(hjust = 0.5, size=15))
#     #print(vPlot)
#     return(vPlot)
# }



In [30]:
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/isaac/miniconda3/envs/mttrace/lib/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] future_1.25.0        dplyr_1.0.9          metap_1.1           
 [4] cowplot_1.1.1        magrittr_2.0.3       data.table_1.14.2   
 [7] patchwork_1.1.1      ggplot2_3.3.6        Signac_1.4.0        
[10] sp_1.4-7             SeuratObject_4.1.0   Seurat_4.1.0        
[13] GenomicRanges

In [31]:
allSE

$CHIP_b1
$CHIP_b1[[1]]
An object of class Seurat 
177009 features across 7533 samples within 1 assay 
Active assay: ATAC (177009 features, 0 variable features)

$CHIP_b1[[2]]
An object of class Seurat 
195627 features across 6848 samples within 1 assay 
Active assay: ATAC (195627 features, 0 variable features)


$CHIP_b2
$CHIP_b2[[1]]
An object of class Seurat 
154178 features across 4767 samples within 1 assay 
Active assay: ATAC (154178 features, 0 variable features)


$cd34norm
$cd34norm[[1]]
An object of class Seurat 
202351 features across 11978 samples within 1 assay 
Active assay: ATAC (202351 features, 0 variable features)



In [32]:
print

In [33]:
allSE.full


$CHIP_b11
An object of class Seurat 
177009 features across 7533 samples within 1 assay 
Active assay: ATAC (177009 features, 0 variable features)

$CHIP_b12
An object of class Seurat 
195627 features across 6848 samples within 1 assay 
Active assay: ATAC (195627 features, 0 variable features)

$CHIP_b2
An object of class Seurat 
154178 features across 4767 samples within 1 assay 
Active assay: ATAC (154178 features, 0 variable features)

$cd34norm
An object of class Seurat 
202351 features across 11978 samples within 1 assay 
Active assay: ATAC (202351 features, 0 variable features)
