Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Htsfree #4

Open
wants to merge 14 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ Authors@R: c(person("Aaron", "Lun", role=c("aut", "cre"), email =
Depends: GenomicRanges, SummarizedExperiment
Imports: Rcpp, BiocGenerics, Rsamtools, edgeR, limma, GenomicFeatures,
AnnotationDbi, methods, S4Vectors, IRanges, GenomeInfoDb, stats,
BiocParallel, utils
BiocParallel, utils, GenomicAlignments
Suggests: org.Mm.eg.db, TxDb.Mmusculus.UCSC.mm10.knownGene, testthat,
GenomicAlignments, knitr, BiocStyle, rmarkdown, BiocManager
knitr, BiocStyle, rmarkdown, BiocManager
biocViews: MultipleComparison, ChIPSeq, Normalization, Sequencing,
Coverage, Genetics, Annotation, DifferentialPeakCalling
Description: Detection of differentially bound regions in ChIP-seq data
with sliding windows, with methods for normalization and proper
FDR control.
License: GPL-3
NeedsCompilation: yes
LinkingTo: Rhtslib, zlibbioc, Rcpp
LinkingTo: Rcpp
SystemRequirements: C++11
VignetteBuilder: knitr
RoxygenNote: 6.1.1
11 changes: 9 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ exportMethods(show)
import(GenomicRanges)
import(SummarizedExperiment)
import(methods)
importClassesFrom(BiocParallel,BiocParallelParam)
importClassesFrom(GenomicFeatures,TxDb)
importClassesFrom(GenomicRanges,GRanges)
importClassesFrom(GenomicRanges,GenomicRanges)
Expand All @@ -62,11 +61,11 @@ importFrom(BiocGenerics,end)
importFrom(BiocGenerics,order)
importFrom(BiocGenerics,start)
importFrom(BiocGenerics,strand)
importFrom(BiocGenerics,table)
importFrom(BiocGenerics,width)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpisup)
importFrom(BiocParallel,bpmapply)
importFrom(BiocParallel,bpnworkers)
importFrom(BiocParallel,bpstart)
importFrom(BiocParallel,bpstop)
importFrom(GenomeInfoDb,"seqlengths<-")
Expand All @@ -76,6 +75,10 @@ importFrom(GenomeInfoDb,seqinfo)
importFrom(GenomeInfoDb,seqlengths)
importFrom(GenomeInfoDb,seqlevels)
importFrom(GenomeInfoDb,seqnames)
importFrom(GenomicAlignments,first)
importFrom(GenomicAlignments,last)
importFrom(GenomicAlignments,readGAlignmentPairs)
importFrom(GenomicAlignments,readGAlignments)
importFrom(GenomicFeatures,exonsBy)
importFrom(GenomicFeatures,genes)
importFrom(GenomicRanges,GRanges)
Expand All @@ -89,6 +92,9 @@ importFrom(IRanges,promoters)
importFrom(IRanges,ranges)
importFrom(IRanges,trim)
importFrom(Rcpp,sourceCpp)
importFrom(Rsamtools,BamFile)
importFrom(Rsamtools,ScanBamParam)
importFrom(Rsamtools,scanBamFlag)
importFrom(Rsamtools,scanBamHeader)
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,Hits)
Expand Down Expand Up @@ -125,4 +131,5 @@ importFrom(stats,quantile)
importFrom(stats,spline)
importFrom(stats,weighted.mean)
importFrom(utils,browseURL)
importFrom(utils,head)
useDynLib(csaw, .registration=TRUE, .fixes="cxx_")
59 changes: 16 additions & 43 deletions R/getPESizes.R
Original file line number Diff line number Diff line change
@@ -1,63 +1,36 @@
#' @export
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @importFrom utils head
getPESizes <- function(bam.file, param=readParam(pe="both"))
# Reads a BAM file to determine the size of the PE fragments.
# Returns a vector of sizes which can be plotted for diagnostics.
# The length of the vector will also tell you how many read pairs were considered valid.
# The total number of reads, the number of singletons and the number of interchromosomal pairs are also reported.
# The number of interchromosomal pairs and unoriented reads are also reported.
#
# written by Aaron Lun
# a long long time ago
{
if (param$pe!="both") { stop("paired-end inputs required") }
param <- reform(param, max.frag=Inf) # to get all fragment sizes.

extracted.chrs <- .activeChrs(bam.file, param$restrict)
nchrs <- length(extracted.chrs)
totals <- singles <- one.unmapped <- mapped <- unoriented <- 0L
norm.list <- loose.names.1 <- loose.names.2 <- vector("list", nchrs)
norm.list <- vector("list", length(extracted.chrs))
diagnostics <- list(inter.chr=0L, unoriented=0L, discarded=0L)

for (i in seq_len(nchrs)) {
for (i in seq_along(extracted.chrs)) {
cur.chr <- names(extracted.chrs)[i]
output <- .extractPE(bam.file, GRanges(cur.chr, IRanges(1L, extracted.chrs[i])), param=param, diagnostics=TRUE)
totals <- totals + output$total
singles <- singles + output$single
unoriented <- unoriented + output$unoriented
one.unmapped <- one.unmapped + output$one.unmapped

# Valid read pairs get their sizes stored.
all.sizes <- pmin(output$reverse[[1]] + output$reverse[[2]], extracted.chrs[i] + 1L) - output$forward[[1]]
norm.list[[i]] <- all.sizes

# For inter-chromosomals; either mapped (and then store names), or implicitly unmapped.
loose.names.1[[i]] <- output$inter.chr[[1]]
loose.names.2[[i]] <- output$inter.chr[[2]]
cur.gr <- GRanges(cur.chr, IRanges(1L, extracted.chrs[i]))
out <- .extractPE(bam.file, cur.gr, param, diagnostics=TRUE)
norm.list[[i]] <- pmin(out$size + out$pos, extracted.chrs[i] + 1L) - out$pos # capping insert by chromosome boundaries

so.far <- head(names(extracted.chrs), i-1L)
inter.set <- out$diagnostics$inter.chr
diagnostics$inter.chr <- diagnostics$inter.chr + sum(inter.set[!names(inter.set) %in% so.far])
diagnostics$unoriented <- diagnostics$unoriented + out$diagnostics$unoriented
diagnostics$discarded <- diagnostics$discarded + out$diagnostics$discarded
}

# Checking whether a read is positively matched to a mapped counterpart on another chromosome.
# If not, then it's just a read in an unmapped pair.
loose.names.1 <- unlist(loose.names.1)
loose.names.2 <- unlist(loose.names.2)
inter.chr <- sum(loose.names.1 %in% loose.names.2)
one.unmapped <- one.unmapped + length(loose.names.2) + length(loose.names.1) - inter.chr*2L

bam.file <- path.expand(bam.file)
bam.index <- paste0(bam.file, ".bai")
out <- .Call(cxx_get_leftovers, bam.file, bam.index, names(extracted.chrs))
totals <- totals + out

norm.list <- unlist(norm.list)
mapped <- singles + one.unmapped + 2L * (unoriented + inter.chr + length(norm.list))

# Returning sizes and some diagnostic data.
list(sizes=norm.list,
diagnostics=c(
total.reads=totals,
mapped.reads=mapped,
single=singles,
mate.unmapped=one.unmapped,
unoriented=unoriented,
inter.chr=inter.chr
)
)
list(sizes=unlist(norm.list), diagnostics=unlist(diagnostics))
}
155 changes: 97 additions & 58 deletions R/internal_extract.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#' @importFrom GenomicAlignments readGAlignments
#' @importFrom BiocGenerics start width
#' @importFrom IRanges overlapsAny
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start end
.extractSE <- function(bam.file, where, param)
# Extracts single-end read data from a BAM file with removal of unmapped,
# duplicate and poorly mapped/non-unique reads. We also discard reads in the
Expand All @@ -8,87 +10,124 @@
# written by Aaron Lun
# created 8 December 2013
{
cur.chr <- as.character(seqnames(where))
bam.file <- path.expand(bam.file)
bam.index <- paste0(bam.file, ".bai")
sbp <- .generate_sbp(where, param)
reads <- readGAlignments(bam.file, param=sbp)

if (length(param$forward)==0L) {
stop("read strand extraction must be specified")
}
is.forward <- strand(reads)=="+"
forwards <- reads[is.forward]
reverses <- reads[!is.forward]

if (param$pe=="first") {
use.first <- TRUE
} else if (param$pe=="second") {
use.first <- FALSE
} else {
use.first <- NA
if (any(seqnames(param$discard) == as.character(seqnames(where)))) {
forwards <- forwards[!overlapsAny(forwards, param$discard, type="within")]
reverses <- reverses[!overlapsAny(reverses, param$discard, type="within")]
}

cur.discard <- .getDiscard(param, cur.chr)
list(
forward=list(pos=start(forwards), qwidth=width(forwards)),
reverse=list(pos=start(reverses), qwidth=width(reverses))
)
}

out <- .Call(cxx_extract_single_data, bam.file, bam.index,
cur.chr, start(where), end(where),
param$minq, param$dedup, param$forward, use.first,
cur.discard$pos, cur.discard$id)
#' @importFrom Rsamtools ScanBamParam scanBamFlag
.generate_sbp <- function(where, param) {
flags <- list(isUnmappedQuery=FALSE,
isSecondaryAlignment=FALSE,
isSupplementaryAlignment=FALSE)

names(out) <- c("forward", "reverse")
names(out$forward) <- names(out$reverse) <- c("pos", "qwidth")
return(out)
}
if (param$pe=="first" || param$pe=="second") {
flags$isFirstMateRead <- param$pe=="first"
flags$isSecondMateRead <- param$pe=="second"
} else if (param$pe=="second") {
flags$isPaired <- TRUE
flags$hasUnmappedMate <- FALSE
}

if (length(param$forward)==0L) {
stop("read strand extraction must be specified")
} else if (!is.na(param$forward)) {
flags$isMinusStrand <- !param$forward
}

.getDiscard <- function(param, chr) {
cur.discard <- param$processed.discard[[chr]]
if (is.null(cur.discard)) {
cur.discard <- list(pos=integer(0), id=integer(0))
if (param$dedup) {
flags$isDuplicate <- FALSE
}
cur.discard

ScanBamParam(flag=do.call(scanBamFlag, flags), mapqFilter=param$minq, which=where)
}

#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges overlapsAny IRanges
#' @importFrom BiocGenerics start end strand table
#' @importFrom GenomeInfoDb seqnames
#' @importFrom BiocGenerics start end
#' @importFrom GenomicAlignments readGAlignmentPairs first last
.extractPE <- function(bam.file, where, param, with.reads=FALSE, diagnostics=FALSE)
# A function to extract PE data for a particular chromosome. Synchronisation
# is expected. We avoid sorting by name as it'd mean we have to process the
# entire genome at once (can't go chromosome-by-chromosome). This probably
# results in increased memory usage across the board, and it doesn't fit in
# well with the rest of the pipelines which assume coordinate sorting.
# A function to extract PE data for a particular chromosome.
#
# written by Aaron Lun
# created 8 December 2013
{
cur.chr <- as.character(seqnames(where))
bam.file <- path.expand(bam.file)
bam.index <- paste0(bam.file, ".bai")
sbp <- .generate_sbp(where, param)
reads <- readGAlignmentPairs(bam.file, param=sbp)
first.read <- first(reads)
second.read <- last(reads)

# Checking that reads in a pair are on the same chromosome and different strands.
cur.chr <- as.character(seqnames(where))
same.chr1 <- seqnames(first.read) == cur.chr
same.chr2 <- seqnames(second.read) == cur.chr
same.chr <- same.chr1 & same.chr2

if (!identical(param$forward, NA)) {
stop("cannot specify read strand when 'pe=\"both\"'")
if (diagnostics) {
inter.set <- table(seqnames(first.read)[!same.chr1]) + table(seqnames(second.read)[!same.chr2])
}

cur.discard <- .getDiscard(param, cur.chr)
oriented <- strand(first.read) != strand(second.read)
keep <- oriented & same.chr
first.read <- first.read[keep]
second.read <- second.read[keep]

out <- .Call(cxx_extract_pair_data, bam.file, bam.index,
cur.chr, start(where), end(where),
param$minq, param$dedup,
cur.discard$pos, cur.discard$id,
diagnostics)
# Reorganizing the objects in terms of forward and reverse reads.
first.forward <- strand(first.read)=="+"
second.forward <- strand(second.read)=="+"
forward.read <- c(first.read[first.forward], second.read[second.forward])
reverse.read <- c(second.read[first.forward], first.read[second.forward])

if (diagnostics) {
names(out) <- c("forward", "reverse", "total", "single", "unoriented", "one.unmapped", "inter.chr")
return(out)
# Insert size from forward start to reverse end.
frag.starts <- start(forward.read)
frag.sizes <- end(reverse.read) - frag.starts + 1L
inward <- frag.sizes > 0L

keep <- inward & frag.sizes <= param$max.frag
frag.starts <- frag.starts[keep]
frag.sizes <- frag.sizes[keep]

# Discarding *fragments* in blacklist regions.
if (any(seqnames(param$discard) == as.character(seqnames(where))) && length(frag.sizes)) {
cur.ranges <- GRanges(cur.chr, IRanges(frag.starts, width=frag.sizes))
blacklisted <- overlapsAny(cur.ranges, param$discard, type="within")
frag.starts <- frag.starts[!blacklisted]
frag.sizes <- frag.sizes[!blacklisted]
keep[keep] <- !blacklisted
} else {
blacklisted <- NULL
}

left.pos <- out[[1]][[1]]
left.len <- out[[1]][[2]]
right.pos <- out[[2]][[1]]
right.len <- out[[2]][[2]]
output <- list(pos=frag.starts, size=frag.sizes)

# Computing fragment sizes.
all.sizes <- right.pos + right.len - left.pos
keep <- all.sizes <= param$max.frag
output <- list(pos=left.pos[keep], size=all.sizes[keep])
if (with.reads) {
output$forward <- list(pos=left.pos[keep], qwidth=left.len[keep])
output$reverse <- list(pos=right.pos[keep], qwidth=right.len[keep])
forward.read <- forward.read[keep]
reverse.read <- reverse.read[keep]
output$forward <- list(pos=start(forward.read), qwidth=width(forward.read))
output$reverse <- list(pos=start(reverse.read), qwidth=width(reverse.read))
}
return(output)

if (diagnostics) {
output$diagnostics <- list(
inter.chr=inter.set,
unoriented=sum(same.chr & !oriented) + sum(!inward),
discarded=sum(blacklisted)
)
}

output
}
Loading