LTLA · LTLA · Mar 29, 2019 · Mar 29, 2019 · Mar 31, 2019 · Mar 31, 2019
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,17 +8,17 @@ Authors@R: c(person("Aaron", "Lun", role=c("aut", "cre"), email =
 Depends: GenomicRanges, SummarizedExperiment
 Imports: Rcpp, BiocGenerics, Rsamtools, edgeR, limma, GenomicFeatures, 
         AnnotationDbi, methods, S4Vectors, IRanges, GenomeInfoDb, stats,
-        BiocParallel, utils
+        BiocParallel, utils, GenomicAlignments
 Suggests: org.Mm.eg.db, TxDb.Mmusculus.UCSC.mm10.knownGene, testthat,
-        GenomicAlignments, knitr, BiocStyle, rmarkdown, BiocManager
+        knitr, BiocStyle, rmarkdown, BiocManager
 biocViews: MultipleComparison, ChIPSeq, Normalization, Sequencing,
         Coverage, Genetics, Annotation, DifferentialPeakCalling
 Description: Detection of differentially bound regions in ChIP-seq data
         with sliding windows, with methods for normalization and proper
         FDR control.
 License: GPL-3
 NeedsCompilation: yes
-LinkingTo: Rhtslib, zlibbioc, Rcpp
+LinkingTo: Rcpp
 SystemRequirements: C++11
 VignetteBuilder: knitr
 RoxygenNote: 6.1.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -49,7 +49,6 @@ exportMethods(show)
 import(GenomicRanges)
 import(SummarizedExperiment)
 import(methods)
-importClassesFrom(BiocParallel,BiocParallelParam)
 importClassesFrom(GenomicFeatures,TxDb)
 importClassesFrom(GenomicRanges,GRanges)
 importClassesFrom(GenomicRanges,GenomicRanges)
@@ -62,11 +61,11 @@ importFrom(BiocGenerics,end)
 importFrom(BiocGenerics,order)
 importFrom(BiocGenerics,start)
 importFrom(BiocGenerics,strand)
+importFrom(BiocGenerics,table)
 importFrom(BiocGenerics,width)
 importFrom(BiocParallel,SerialParam)
 importFrom(BiocParallel,bpisup)
 importFrom(BiocParallel,bpmapply)
-importFrom(BiocParallel,bpnworkers)
 importFrom(BiocParallel,bpstart)
 importFrom(BiocParallel,bpstop)
 importFrom(GenomeInfoDb,"seqlengths<-")
@@ -76,6 +75,10 @@ importFrom(GenomeInfoDb,seqinfo)
 importFrom(GenomeInfoDb,seqlengths)
 importFrom(GenomeInfoDb,seqlevels)
 importFrom(GenomeInfoDb,seqnames)
+importFrom(GenomicAlignments,first)
+importFrom(GenomicAlignments,last)
+importFrom(GenomicAlignments,readGAlignmentPairs)
+importFrom(GenomicAlignments,readGAlignments)
 importFrom(GenomicFeatures,exonsBy)
 importFrom(GenomicFeatures,genes)
 importFrom(GenomicRanges,GRanges)
@@ -89,6 +92,9 @@ importFrom(IRanges,promoters)
 importFrom(IRanges,ranges)
 importFrom(IRanges,trim)
 importFrom(Rcpp,sourceCpp)
+importFrom(Rsamtools,BamFile)
+importFrom(Rsamtools,ScanBamParam)
+importFrom(Rsamtools,scanBamFlag)
 importFrom(Rsamtools,scanBamHeader)
 importFrom(S4Vectors,DataFrame)
 importFrom(S4Vectors,Hits)
@@ -125,4 +131,5 @@ importFrom(stats,quantile)
 importFrom(stats,spline)
 importFrom(stats,weighted.mean)
 importFrom(utils,browseURL)
+importFrom(utils,head)
 useDynLib(csaw, .registration=TRUE, .fixes="cxx_")
diff --git a/R/getPESizes.R b/R/getPESizes.R
@@ -1,63 +1,36 @@
 #' @export
 #' @importFrom GenomicRanges GRanges
 #' @importFrom IRanges IRanges
+#' @importFrom utils head
 getPESizes <- function(bam.file, param=readParam(pe="both")) 
 # Reads a BAM file to determine the size of the PE fragments. 
 # Returns a vector of sizes which can be plotted for diagnostics. 
 # The length of the vector will also tell you how many read pairs were considered valid. 
-# The total number of reads, the number of singletons and the number of interchromosomal pairs are also reported.
+# The number of interchromosomal pairs and unoriented reads are also reported.
 # 
 # written by Aaron Lun
 # a long long time ago
 {
     if (param$pe!="both") { stop("paired-end inputs required") }
+    param <- reform(param, max.frag=Inf) # to get all fragment sizes.
 
     extracted.chrs <- .activeChrs(bam.file, param$restrict)
-    nchrs <- length(extracted.chrs)
-    totals <- singles <- one.unmapped <- mapped <- unoriented <- 0L
-    norm.list <- loose.names.1 <- loose.names.2 <- vector("list", nchrs)
+    norm.list <- vector("list", length(extracted.chrs))
+    diagnostics <- list(inter.chr=0L, unoriented=0L, discarded=0L)
 
-    for (i in seq_len(nchrs)) { 
+    for (i in seq_along(extracted.chrs)) {
         cur.chr <- names(extracted.chrs)[i]
-        output <- .extractPE(bam.file, GRanges(cur.chr, IRanges(1L, extracted.chrs[i])), param=param, diagnostics=TRUE)
-        totals <- totals + output$total
-        singles <- singles + output$single
-        unoriented <- unoriented + output$unoriented
-        one.unmapped <- one.unmapped + output$one.unmapped 
-
-        # Valid read pairs get their sizes stored.
-        all.sizes <- pmin(output$reverse[[1]] + output$reverse[[2]], extracted.chrs[i] + 1L) - output$forward[[1]] 
-        norm.list[[i]] <- all.sizes
-
-        # For inter-chromosomals; either mapped (and then store names), or implicitly unmapped.
-        loose.names.1[[i]] <- output$inter.chr[[1]]
-        loose.names.2[[i]] <- output$inter.chr[[2]]
+        cur.gr <- GRanges(cur.chr, IRanges(1L, extracted.chrs[i]))
+        out <- .extractPE(bam.file, cur.gr, param, diagnostics=TRUE)
+        norm.list[[i]] <- pmin(out$size + out$pos, extracted.chrs[i] + 1L) - out$pos # capping insert by chromosome boundaries 
+
+        so.far <- head(names(extracted.chrs), i-1L)
+        inter.set <- out$diagnostics$inter.chr
+        diagnostics$inter.chr <- diagnostics$inter.chr + sum(inter.set[!names(inter.set) %in% so.far])
+        diagnostics$unoriented <- diagnostics$unoriented + out$diagnostics$unoriented
+        diagnostics$discarded <- diagnostics$discarded + out$diagnostics$discarded
     }
 
-    # Checking whether a read is positively matched to a mapped counterpart on another chromosome.
-    # If not, then it's just a read in an unmapped pair.
-    loose.names.1 <- unlist(loose.names.1)
-    loose.names.2 <- unlist(loose.names.2)
-    inter.chr <- sum(loose.names.1 %in% loose.names.2)
-    one.unmapped <- one.unmapped + length(loose.names.2) + length(loose.names.1) - inter.chr*2L
-
-    bam.file <- path.expand(bam.file)
-    bam.index <- paste0(bam.file, ".bai")
-    out <- .Call(cxx_get_leftovers, bam.file, bam.index, names(extracted.chrs))
-    totals <- totals + out
-
-    norm.list <- unlist(norm.list)
-    mapped <- singles + one.unmapped + 2L * (unoriented + inter.chr + length(norm.list))
-
     # Returning sizes and some diagnostic data.
-    list(sizes=norm.list, 
-        diagnostics=c(
-            total.reads=totals, 
-            mapped.reads=mapped, 
-            single=singles, 
-            mate.unmapped=one.unmapped, 
-            unoriented=unoriented, 
-            inter.chr=inter.chr
-        )
-    )
+    list(sizes=unlist(norm.list), diagnostics=unlist(diagnostics))
 }
diff --git a/R/internal_extract.R b/R/internal_extract.R
@@ -1,5 +1,7 @@
+#' @importFrom GenomicAlignments readGAlignments
+#' @importFrom BiocGenerics start width 
+#' @importFrom IRanges overlapsAny
 #' @importFrom GenomeInfoDb seqnames
-#' @importFrom BiocGenerics start end
 .extractSE <- function(bam.file, where, param) 
 # Extracts single-end read data from a BAM file with removal of unmapped,
 # duplicate and poorly mapped/non-unique reads. We also discard reads in the
@@ -8,87 +10,124 @@
 # written by Aaron Lun
 # created 8 December 2013
 {
-    cur.chr <- as.character(seqnames(where)) 
-    bam.file <- path.expand(bam.file)
-    bam.index <- paste0(bam.file, ".bai")
+    sbp <- .generate_sbp(where, param)
+    reads <- readGAlignments(bam.file, param=sbp)
 
-    if (length(param$forward)==0L) { 
-        stop("read strand extraction must be specified") 
-    }
+    is.forward <- strand(reads)=="+"
+    forwards <- reads[is.forward]
+    reverses <- reads[!is.forward]
 
-    if (param$pe=="first") {
-        use.first <- TRUE
-    } else if (param$pe=="second") { 
-        use.first <- FALSE
-    } else {
-        use.first <- NA
+    if (any(seqnames(param$discard) == as.character(seqnames(where)))) {
+        forwards <- forwards[!overlapsAny(forwards, param$discard, type="within")]
+        reverses <- reverses[!overlapsAny(reverses, param$discard, type="within")]
     }
 
-    cur.discard <- .getDiscard(param, cur.chr)
+    list(
+        forward=list(pos=start(forwards), qwidth=width(forwards)),
+        reverse=list(pos=start(reverses), qwidth=width(reverses))
+    )
+}
 
-    out <- .Call(cxx_extract_single_data, bam.file, bam.index, 
-        cur.chr, start(where), end(where), 
-        param$minq, param$dedup, param$forward, use.first,
-        cur.discard$pos, cur.discard$id)
+#' @importFrom Rsamtools ScanBamParam scanBamFlag
+.generate_sbp <- function(where, param) {
+    flags <- list(isUnmappedQuery=FALSE,
+        isSecondaryAlignment=FALSE,
+        isSupplementaryAlignment=FALSE)
 
-    names(out) <- c("forward", "reverse")
-    names(out$forward) <- names(out$reverse) <- c("pos", "qwidth")
-    return(out)
-}
+    if (param$pe=="first" || param$pe=="second") {
+        flags$isFirstMateRead <- param$pe=="first"
+        flags$isSecondMateRead <- param$pe=="second"
+    } else if (param$pe=="second") {
+        flags$isPaired <- TRUE
+        flags$hasUnmappedMate <- FALSE
+    }
+
+    if (length(param$forward)==0L) { 
+        stop("read strand extraction must be specified") 
+    } else if (!is.na(param$forward)) {
+        flags$isMinusStrand <- !param$forward
+    }
 
-.getDiscard <- function(param, chr) {    
-    cur.discard <- param$processed.discard[[chr]]
-    if (is.null(cur.discard)) {
-        cur.discard <- list(pos=integer(0), id=integer(0))
+    if (param$dedup) {
+        flags$isDuplicate <- FALSE
     }
-    cur.discard
+
+    ScanBamParam(flag=do.call(scanBamFlag, flags), mapqFilter=param$minq, which=where)
 }
 
+#' @importFrom GenomicRanges GRanges
+#' @importFrom IRanges overlapsAny IRanges
+#' @importFrom BiocGenerics start end strand table
 #' @importFrom GenomeInfoDb seqnames
-#' @importFrom BiocGenerics start end
+#' @importFrom GenomicAlignments readGAlignmentPairs first last
 .extractPE <- function(bam.file, where, param, with.reads=FALSE, diagnostics=FALSE)
-# A function to extract PE data for a particular chromosome. Synchronisation
-# is expected.  We avoid sorting by name  as it'd mean we have to process the
-# entire genome at once (can't go chromosome-by-chromosome).  This probably
-# results in increased memory usage across the board, and it doesn't fit in
-# well with the rest of the pipelines which assume coordinate sorting.
+# A function to extract PE data for a particular chromosome. 
 # 
 # written by Aaron Lun
 # created 8 December 2013
 {
-    cur.chr <- as.character(seqnames(where)) 
-    bam.file <- path.expand(bam.file)
-    bam.index <- paste0(bam.file, ".bai")
+    sbp <- .generate_sbp(where, param)
+    reads <- readGAlignmentPairs(bam.file, param=sbp)
+    first.read <- first(reads)
+    second.read <- last(reads)
+
+    # Checking that reads in a pair are on the same chromosome and different strands.
+    cur.chr <- as.character(seqnames(where))
+    same.chr1 <- seqnames(first.read) == cur.chr
+    same.chr2 <- seqnames(second.read) == cur.chr
+    same.chr <- same.chr1 & same.chr2
 
-    if (!identical(param$forward, NA)) { 
-        stop("cannot specify read strand when 'pe=\"both\"'") 
+    if (diagnostics) {
+        inter.set <- table(seqnames(first.read)[!same.chr1]) + table(seqnames(second.read)[!same.chr2])
     }
 
-    cur.discard <- .getDiscard(param, cur.chr)
+    oriented <- strand(first.read) != strand(second.read)
+    keep <- oriented & same.chr
+    first.read <- first.read[keep]
+    second.read <- second.read[keep]
 
-    out <- .Call(cxx_extract_pair_data, bam.file, bam.index, 
-        cur.chr, start(where), end(where), 
-        param$minq, param$dedup, 
-        cur.discard$pos, cur.discard$id, 
-        diagnostics)
+    # Reorganizing the objects in terms of forward and reverse reads.
+    first.forward <- strand(first.read)=="+"
+    second.forward <- strand(second.read)=="+"
+    forward.read <- c(first.read[first.forward], second.read[second.forward])
+    reverse.read <- c(second.read[first.forward], first.read[second.forward])
 
-    if (diagnostics) {
-        names(out) <- c("forward", "reverse", "total", "single", "unoriented", "one.unmapped", "inter.chr")
-        return(out)
+    # Insert size from forward start to reverse end.
+    frag.starts <- start(forward.read)
+    frag.sizes <- end(reverse.read) - frag.starts + 1L
+    inward <- frag.sizes > 0L
+
+    keep <- inward & frag.sizes <= param$max.frag 
+    frag.starts <- frag.starts[keep]
+    frag.sizes <- frag.sizes[keep]
+
+    # Discarding *fragments* in blacklist regions.
+    if (any(seqnames(param$discard) == as.character(seqnames(where))) && length(frag.sizes)) {
+        cur.ranges <- GRanges(cur.chr, IRanges(frag.starts, width=frag.sizes))
+        blacklisted <- overlapsAny(cur.ranges, param$discard, type="within")
+        frag.starts <- frag.starts[!blacklisted]
+        frag.sizes <- frag.sizes[!blacklisted]
+        keep[keep] <- !blacklisted
+    } else {
+        blacklisted <- NULL
     }
 
-    left.pos <- out[[1]][[1]]
-    left.len <- out[[1]][[2]]
-    right.pos <- out[[2]][[1]]
-    right.len <- out[[2]][[2]]
+    output <- list(pos=frag.starts, size=frag.sizes)
 
-    # Computing fragment sizes.
-    all.sizes <- right.pos + right.len - left.pos
-    keep <- all.sizes <= param$max.frag 
-    output <- list(pos=left.pos[keep], size=all.sizes[keep])
     if (with.reads) {
-        output$forward <- list(pos=left.pos[keep], qwidth=left.len[keep])
-        output$reverse <- list(pos=right.pos[keep], qwidth=right.len[keep])
+        forward.read <- forward.read[keep]
+        reverse.read <- reverse.read[keep]
+        output$forward <- list(pos=start(forward.read), qwidth=width(forward.read))
+        output$reverse <- list(pos=start(reverse.read), qwidth=width(reverse.read))
     }
-    return(output)
+
+    if (diagnostics) {
+        output$diagnostics <- list(
+            inter.chr=inter.set,
+            unoriented=sum(same.chr & !oriented) + sum(!inward), 
+            discarded=sum(blacklisted)
+        )
+    }
+
+    output
 }