#!/usr/bin/env python
#
# Based on 'FilterUniqueBAM' by
# Martin Kircher
# Martin.Kircher@eva.mpg.de
#
import collections
import sys

from argparse import ArgumentParser

import pysam

try:
    # Newer pysam releases expose the internal modules with a 'lib' prefix
    from pysam.libcalignedsegment import CIGAR2CODE
except ImportError:
    from pysam.calignedsegment import CIGAR2CODE


_USAGE = """paleomix rmdup_collapsed [options] < sorted.bam > out.bam

The rmdup_collapsed command filters a BAM file for PCR duplicates among
unpaired reads, under the assumption that any unpaired read has been generated
by the merging of overlapping paired-end reads, and thereby represents the
complete template sequence. PCR duplicates are therefore detected based on
both the 5' and the 3' alignment coordinates.

Paired reads (0x1), unmapped reads (0x4), secondary alignments (0x100), reads
that failed QC (0x200), and chimeric alignments (0x800), as identified using
the BAM record flags, are not filtered, but are simply written to the output.

By default, filtered reads are flagged using the "duplicate" flag (0x400) and
written to the output. Use the --remove-duplicates command-line option to
instead remove these records from the output.
"""

_FILTERED_FLAGS = 0x1     # PE reads
_FILTERED_FLAGS |= 0x4    # Unmapped
_FILTERED_FLAGS |= 0x100  # Secondary alignment
_FILTERED_FLAGS |= 0x200  # Failed QC
_FILTERED_FLAGS |= 0x800  # Chimeric alignment

_HARDCLIP = CIGAR2CODE[ord('H')]
_SOFTCLIP = CIGAR2CODE[ord('S')]
_INSERTION = CIGAR2CODE[ord('I')]


def calc_consensus(reads):
    """Returns the best read from a group of reads sharing the same alignment,
    and records the total number of collapsed reads in its 'XP' tag."""
    count = len(reads)
    outread = None
    maxsumqual = -1
    for read in reads:
        # Handle reads without qualities, but favor reads with qualities
        qual = read.query_qualities
        if qual is None:
            # Score reads by number of bases, but prefer reads with qualities
            nsum = -1.0 / read.query_alignment_length
        else:
            nsum = sum(qual)

        if nsum > maxsumqual:
            outread = read
            maxsumqual = nsum

        try:
            # Carry over PCR duplicate counts from previous runs
            count += read.get_tag('XP')
        except KeyError:
            pass

    outread.set_tag('XP', count)

    return outread


def record_quality(record):
    """Returns the summed base qualities of the aligned part of a record, or
    the aligned length for records without quality scores."""
    qualities = record.query_alignment_qualities
    if qualities is None:
        return record.query_alignment_length

    return sum(qualities)


def mark_duplicate_reads(records):
    """Marks all but the best read in a group of reads with identical
    alignments as PCR duplicates."""
    # Group reads by CIGAR; the most frequent CIGAR (ties broken by summed
    # base qualities) is used to select the consensus read.
    by_cigar = collections.defaultdict(list)
    for record in records:
        key = tuple(record.cigartuples)
        by_cigar[key].append(record)

    def _score(records):
        return (len(records), sum(map(record_quality, records)))

    candidates = sorted(by_cigar.values(), key=_score)
    consensus = calc_consensus(candidates[-1])

    for read in records:
        read.is_duplicate = (read is not consensus)


def write_read(args, out, record, alignments):
    if isinstance(record, list):
        # Current record is a group of one or more identical alignments,
        # stored as [alignment key, read, read, ...].
        if len(record) > 2:
            # Select the best read and mark the others as duplicates.
            mark_duplicate_reads(record[1:])

        alignments.pop(record[0])
        record = record[1]

    if not (record.is_duplicate and args.remove_duplicates):
        out.write(record)


def flush_cache(args, out, cache, alignments):
    """Writes any cached reads located more than the window size before the
    most recently cached read, or located on a different contig."""
    last = cache[-1]
    if isinstance(last, list):
        last = last[1]

    last_ref_id = last.reference_id
    last_ref_start = last.reference_start - args.window_size

    while cache:
        first = cache[0]
        if isinstance(first, list):
            first = first[1]

        if first.is_unmapped \
                or first.reference_id != last_ref_id \
                or first.reference_start < last_ref_start:
            write_read(args, out, cache.popleft(), alignments)
        else:
            break


def read_alignment(read):
    """Returns a key describing the alignment of a read, consisting of the
    reference ID, the strand, and the unclipped 5' and 3' coordinates."""
    start = read.reference_start
    end = start
    at_front = True
    for (operation, length) in read.cigartuples:
        if at_front and operation in (_HARDCLIP, _SOFTCLIP):
            start -= length
            continue

        at_front = False
        if operation != _INSERTION:
            end += length

    return (read.reference_id, read.is_reverse, start, end)


def process(args, infile, outfile):
    cache = collections.deque()
    alignments = {}

    last_pos = ()
    for read_num, read in enumerate(infile, start=1):
        # Sorted order is defined as ascending reference_ids and positions
        current_pos = (read.reference_id, read.reference_start)
        if last_pos > current_pos and not read.is_unmapped:
            sys.stderr.write("ERROR: Input file does not appear to be sorted "
                             "by coordinates at record %i. Aborting run!\n"
                             % (read_num,))
            return 1
        elif read.flag & _FILTERED_FLAGS:
            cache.append(read)
        else:
            read.is_duplicate = False
            key = read_alignment(read)

            try:
                group = alignments[key]
                group.append(read)
                cache.append(read)
            except KeyError:
                # No previous records with matching alignment; this read will
                # serve to group any other reads with the same alignment.
                group = [key, read]
                cache.append(group)
                alignments[key] = group

            last_pos = current_pos

        if len(cache) > args.window_size:
            flush_cache(args, outfile, cache, alignments)

    while cache:
        write_read(args, outfile, cache.popleft(), alignments)

    return 0


def parse_args(argv):
    parser = ArgumentParser(usage=_USAGE)
    parser.add_argument("input", default="-", nargs="?",
                        help="BAM file; if not set, input is read from STDIN.")
    parser.add_argument("--remove-duplicates",
                        help="Remove duplicates from output; by default "
                             "duplicates are only flagged (flag = 0x400).",
                        default=False, action="store_true")
    parser.add_argument("--window-size", default=65536, type=int,
                        help="Length of sliding window in which PCR "
                             "duplicates are detected [default: %(default)s]")

    return parser.parse_args(argv)


def main(argv):
    args = parse_args(argv)
    if args.input == "-" and sys.stdin.isatty():
        sys.stderr.write("STDIN is a terminal, terminating!\n")
        return 1
    elif sys.stdout.isatty():
        sys.stderr.write("STDOUT is a terminal, terminating!\n")
        return 1

    with pysam.Samfile(args.input, "rb") as infile:
        with pysam.Samfile("-", "wb", template=infile) as outfile:
            # Propagate the exit code from 'process' (e.g. for unsorted input)
            return process(args, infile, outfile)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))