In [1]:
library("here")
library(devtools)
library(Seurat)
library(harmony)
library(RcppML)
library(reshape2)
library(tidyverse)
load_all('/home/jpm73279/Socrates')


# arguments
# Six positional command-line arguments may be supplied, in this order:
#   1: tn5_bed   - BED file of Tn5 integration sites
#   2: peak_bed  - BED file of peaks (ACRs)
#   3: metadata  - cell metadata table
#   4: ann       - GFF3 annotation
#   5: chr       - chromosome sizes file
#   6: prefix    - output prefix
# The values are only taken from the command line when all six are present;
# otherwise (e.g. when run interactively in a notebook, where commandArgs()
# does not carry these six values) the defaults below are used.
args <- commandArgs(TRUE)

# default inputs
tn5_bed <- "/scratch/jpm73279/comparative_single_cell/01.alignments_annotations/sorghum_bicolor/Sorghum_leaf.sciATAC_rep2.unique.tn5.merged.V2.sorted.bed"
peak_bed <- "/scratch/jpm73279/comparative_single_cell/07.call.ACRs/Sb_peak_calls/Sb.v4.final/Sb.peaks.annot_v4.500bp_peaks.bed"
metadata <- "/home/jpm73279/Mendieta_et_al_comparative_single_cell/metrics/annotations/sb_annot_v4/Sb.leaf_annot.V4.meta.final.2022-11-14.txt"
ann <- "/home/jpm73279/genome_downloads/Sbicolor.v5.1.prelim.annot/Sbicolorv5.1.primaryTrs.gff3"
chr <- "/home/jpm73279/genome_downloads/Sbicolor.v5.1.prelim.annot/Sorghum_bicolor_var_BTx623.chrom.size"
prefix <- "Sb"

# command-line override (each variable gets its own argument index)
if (length(args) >= 6) {
    tn5_bed <- as.character(args[1])
    peak_bed <- as.character(args[2])
    metadata <- as.character(args[3])
    ann <- as.character(args[4])
    chr <- as.character(args[5])
    prefix <- as.character(args[6])
}

here() starts at /home/jpm73279/Mendieta_et_al_comparative_single_cell

Loading required package: usethis

Attaching SeuratObject

Loading required package: Rcpp

── [1mAttaching packages[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
“package ‘ggplot2’ was built under R version 4.2.1”
“package ‘tibble’ was built under R version 4.2.1”
“package ‘stringr’ was built under R version 4.2.1”
── [1mConflicts[22m ───────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m 

In [2]:
# Load Object
# Reads the Tn5 BED file, the GFF annotation and the chromosome sizes file into
# the list object that every later cell works on.
# NOTE(review): the variable is called `Zm.subcluster` although the input paths
# point at sorghum (Sb) data - presumably a name carried over from another
# analysis; confirm before renaming, since later cells reference it.
Zm.subcluster <- loadBEDandGenomeData(tn5_bed, ann, chr)

Running pre-check on input files and executable paths ...

BED file path = /scratch/jpm73279/comparative_single_cell/01.alignments_annotations/sorghum_bicolor/Sorghum_leaf.sciATAC_rep2.unique.tn5.merged.V2.sorted.bed ... ok

GFF file path = /home/jpm73279/genome_downloads/Sbicolor.v5.1.prelim.annot/Sbicolorv5.1.primaryTrs.gff3 ... ok

Chromosome sizes file path = /home/jpm73279/genome_downloads/Sbicolor.v5.1.prelim.annot/Sorghum_bicolor_var_BTx623.chrom.size ... ok

Macs2 is installed .... ok

 - loading data (this may take obj while for big BED files) ...

 - finished loading data



In [3]:
# Attach the cell metadata table and the peak (ACR) BED table to the object.
# NOTE(review): both files are parsed with read.table() defaults (no explicit
# header/sep) - confirm the metadata layout yields barcodes as row names, since
# generateMatrix_sparse() matches cells on rownames of this table.
Zm.subcluster[["meta"]] <- read.table(file = metadata)
Zm.subcluster[["acr"]] <- read.table(file = peak_bed)

In [4]:
###################################################################################################
###################################################################################################
###################################################################################################
#' generateMatrix_sparse
#'
#' Builds a binary region-by-barcode table (triplet format) from Tn5 integration sites, using
#' either equally sized genomic bins or ACRs as regions, and records the number of accessible
#' regions per barcode in the metadata.
#'
#' @importFrom GenomicRanges GRanges
#' @importFrom GenomicRanges tileGenome
#' @importFrom IRanges IRanges findOverlaps
#' @importFrom S4Vectors queryHits
#'
#' @param obj List with a `bed` data.frame (columns V1-V5: chromosome, start, end, barcode,
#' strand), a `chr` data.frame (V1 name, V2 length; used for bins), an `acr` data.frame
#' (V1-V3; used when `peaks=TRUE`) and a metadata data.frame (`meta` or `meta.v3`) whose row
#' names are barcodes. Required.
#' @param filtered Logical. If TRUE, align against `obj$meta.v3`; otherwise against `obj$meta`.
#' Defaults to TRUE.
#' @param windows Integer. Window size to build bins. If the 'peaks' parameter is set to TRUE,
#' this argument is over-ridden.
#' @param peaks Logical. If TRUE, use ACRs to build sparse matrix instead of genomic tiles.
#' Default is set to FALSE.
#' @param blacklist Path to a BED file. If given, peaks or generated bins overlapping any
#' black list region are removed. Default is set to NULL.
#' @param organelle_scaffolds Character vector of sequence names (e.g. organelle scaffolds)
#' whose Tn5 sites are excluded before counting. Default is set to NULL (keep everything).
#' @param verbose Logical. Whether or not to print progress.
#'
#' @return `obj` with two elements set: `counts`, a data.frame with columns V1 (region, named
#' `seqname_start_end`), V2 (barcode) and V3 (always 1) holding one row per unique
#' region/barcode pair; and `meta`, the selected metadata restricted to barcodes present in
#' `counts`, with `nSites` and `log10nSites` added. Note that `meta` is overwritten even when
#' `filtered=TRUE` (it then holds the `meta.v3` subset).
#'
#' @rdname generateMatrix_sparse
#' @export
#'
generateMatrix_sparse <- function(obj,
                           filtered=TRUE,
                           windows=1000,
                           peaks=FALSE,
                           blacklist=NULL,
                           organelle_scaffolds = NULL,
                           verbose=TRUE){


    # use filtered barcodes? fail early with a clear message if the slot is absent
    if(filtered){
        use <- obj$meta.v3
    }else{
        use <- obj$meta
    }
    if(is.null(use)){
        stop("metadata slot '", if(filtered) "meta.v3" else "meta",
             "' is missing from 'obj'", call.=FALSE)
    }


    # Tn5 sites as plain vectors; the barcode vector is kept alongside the coordinates so
    # that any filtering below is applied to both and they stay index-aligned
    tn5.chr <- as.character(obj$bed$V1)
    tn5.start <- as.numeric(obj$bed$V2)
    tn5.end <- as.numeric(obj$bed$V3)
    tn5.barcode <- as.character(obj$bed$V4)
    tn5.strand <- as.character(obj$bed$V5)


    # Remove organelle scaffolds if given. Filtering happens before the GRanges is built, so
    # the query indices returned by findOverlaps() index tn5.barcode correctly.
    if(!is.null(organelle_scaffolds)){
        keep <- !(tn5.chr %in% organelle_scaffolds)
        if(verbose){message(" - removing ", sum(!keep), " Tn5 sites on excluded scaffolds ...")}
        tn5.chr <- tn5.chr[keep]
        tn5.start <- tn5.start[keep]
        tn5.end <- tn5.end[keep]
        tn5.barcode <- tn5.barcode[keep]
        tn5.strand <- tn5.strand[keep]
    }


    # convert tn5 bed to Granges
    tn5.gr <- GRanges(seqnames=tn5.chr,
                      ranges=IRanges(start=tn5.start,
                                     end=tn5.end),
                      strand=tn5.strand,
                      names=tn5.barcode)


    # Read in blacklist if given. Only the coordinates are needed for the overlap test, so a
    # plain 3-column BED file is sufficient.
    blacklist.gr <- NULL
    if(!is.null(blacklist)){
        blacklist_r <- read.table(as.character(blacklist))
        blacklist.gr <- GRanges(seqnames=as.character(blacklist_r$V1),
                                ranges=IRanges(start=as.numeric(blacklist_r$V2),
                                               end=as.numeric(blacklist_r$V3)))
    }


    # generate intervals
    if(!peaks){

        # build bins from specified tile length
        if(verbose){message(" - building ", windows, " bp genomic bins ...")}
        chr.seq.lengths <- as.numeric(obj$chr$V2)
        names(chr.seq.lengths) <- obj$chr$V1
        intervals <- tileGenome(chr.seq.lengths, tilewidth=windows, cut.last.tile.in.chrom=TRUE)

    }else{

        # generate intervals from ACRs
        if(is.null(obj$acr)){
            stop("'peaks=TRUE' requires ACR coordinates in obj$acr", call.=FALSE)
        }
        if(verbose){message(" - using ACRs as regions ...")}
        intervals <- GRanges(seqnames=as.character(obj$acr$V1),
                             ranges=IRanges(start=as.numeric(obj$acr$V2),
                                            end=as.numeric(obj$acr$V3)))
    }


    # Remove intervals overlapping the black list, if one was given.
    # The length check matters: indexing with a negative empty vector selects nothing, which
    # would discard every interval when no interval touches the black list.
    if(!is.null(blacklist.gr)){
        blacklisted <- unique(queryHits(findOverlaps(intervals, blacklist.gr, type="any")))
        if(length(blacklisted) > 0){
            intervals <- intervals[-blacklisted]
        }
        if(verbose){message(" - removed ", length(blacklisted), " black-listed regions ...")}
    }


    # region identifiers: seqname_start_end
    regions <- as.data.frame(intervals)
    regions <- paste(regions$seqnames, regions$start, regions$end, sep="_")


    # get intervals overlapping Tn5 sites by barcode
    hits <- as.data.frame(findOverlaps(tn5.gr, intervals))
    df <- data.frame(regions=regions[hits$subjectHits], barcodes=tn5.barcode[hits$queryHits])
    df <- df[!duplicated(df),]
    if(nrow(df) == 0){
        stop("no Tn5 integration sites overlap the selected regions", call.=FALSE)
    }
    df$binary <- 1
    colnames(df) <- c("V1","V2","V3")


    #9/26/2022 include for sake of calculation of isCell
    # factor codes serve as row (region) and column (barcode) indices
    region.f <- factor(df$V1)
    barcode.f <- factor(df$V2)


    #Generate sparse matrix
    a <- Matrix::sparseMatrix(i=as.numeric(region.f),
                              j=as.numeric(barcode.f),
                              x=as.numeric(df$V3),
                              dimnames=list(levels(region.f),levels(barcode.f)))


    # align barcodes; drop=FALSE keeps matrix/data.frame shape when a single barcode or a
    # single metadata column remains
    both <- intersect(rownames(use), colnames(a))
    if(length(both) == 0){
        warning("no barcodes are shared between the metadata and the Tn5 data", call.=FALSE)
    }
    a <- a[,both,drop=FALSE]
    b <- use[both,,drop=FALSE]


    # make sure nSites is calculated
    b$nSites   <- Matrix::colSums(a)
    b$log10nSites <- log10(b$nSites)
    if(verbose){message(" - finished: ", nrow(df), " region/barcode pairs, ",
                        length(both), " barcodes matched to metadata")}


    # return
    obj$counts <- df
    obj$meta <- b


    return(obj)

    }



In [5]:
# Build the peak-by-barcode table from the ACR set, aligning cells against the
# unfiltered metadata (obj$meta) and keeping the run quiet.
Zm.subcluster <- generateMatrix_sparse(
    obj = Zm.subcluster,
    filtered = FALSE,
    peaks = TRUE,
    verbose = FALSE
)

#Zm.subcluster.vasculature <- convertSparseData(Zm.subcluster.vasculature, verbose = TRUE)

In [6]:
# Print the structure of the object to check which elements it now holds
str(Zm.subcluster)

List of 9
 $ bed    :'data.frame':	162461909 obs. of  5 variables:
  ..$ V1: chr [1:162461909] "Chr01" "Chr01" "Chr01" "Chr01" ...
  ..$ V2: int [1:162461909] 149 168 168 168 168 168 168 175 175 176 ...
  ..$ V3: int [1:162461909] 150 169 169 169 169 169 169 176 176 177 ...
  ..$ V4: chr [1:162461909] "BC:Z:CAGATCAGTCAATGTCGGTAGAGACTA-Sorghum_leaf.sciATAC_rep2" "BC:Z:CTTGTAAGTTCCTGTTTGTCCGCTCGG-Sorghum_leaf.sciATAC_rep2" "BC:Z:CTTGTAATGTCAACTCGGGTAGCAGCT-Sorghum_leaf.sciATAC_rep2" "BC:Z:GATCAGCTTGTATGTCGATGAGCTCGG-Sorghum_leaf.sciATAC_rep2" ...
  ..$ V5: chr [1:162461909] "+" "+" "+" "+" ...
 $ gff    :Reference class 'TxDb' [package "GenomicFeatures"] with 6 fields
  ..$ conn           :Formal class 'SQLiteConnection' [package "RSQLite"] with 8 slots
  .. .. ..@ ptr                :<externalptr> 
  .. .. ..@ dbname             : chr ""
  .. .. ..@ loadable.extensions: logi TRUE
  .. .. ..@ flags              : int 70
  .. .. ..@ vfs                : chr ""
  .. .. ..@ ref             

In [8]:
    # Rebuild the sparse matrix from the triplet table stored in `counts`
    # (V1 -> rows, V2 -> columns, V3 -> values)
    sparse_count_matrix <- local({
        triplets <- Zm.subcluster$counts

        # factor codes double as the row/column indices, factor levels as dimnames
        row_f <- factor(triplets$V1)
        col_f <- factor(triplets$V2)

        Matrix::sparseMatrix(i = as.numeric(row_f),
                             j = as.numeric(col_f),
                             x = as.numeric(triplets$V3),
                             dimnames = list(levels(row_f), levels(col_f)))
    })





In [11]:

# Serialise the sparse matrix to an RDS file in the current working directory.
# NOTE(review): the file stem is hard-coded rather than derived from `prefix` -
# confirm this is intended.
saveRDS(object = sparse_count_matrix,
        file = paste0("Sb.test2", ".peaks_by_intersections.rds"))