--- /home/grg/src/paleomix/paleomix/tools/rmdup_collapsed.py 2016-06-21 12:53:29.095556207 +0930 +++ rmdup_collapsed.py 2016-12-22 16:15:44.356192323 +1030 @@ -21,10 +21,13 @@ import sys import pysam +from pysam.calignedsegment import CIGAR2CODE import random from argparse import ArgumentParser +C_HARDCLIP = CIGAR2CODE[ord('H')] +C_SOFTCLIP = CIGAR2CODE[ord('S')] def calc_consensus(reads, rng=random.random): count = len(reads) @@ -126,6 +129,10 @@ help="Seed used for randomly selecting representative " "reads when no reads have quality scores assigned" "[default: initialized using system time].") + parser.add_argument("--blocksize", default=65536, type=int, + help="Length of genomic blocks in which to find reads " + "with the same starting position. " + "[default: 65536]") return parser.parse_args(argv) @@ -145,35 +152,57 @@ with pysam.Samfile(args.input, "rb") as infile: with pysam.Samfile("-", "wb", template=infile) as outfile: - curpos = None - curvariants = {} + curpos = startpos = None + var1 = {} + var2 = {} for (read_num, read) in enumerate(infile): - if curpos and ((read.tid, read.pos) != curpos): + + rpos = read.pos + rlen = len(read.seq) + c = read.cigar + + if c is not None: + if c[0][0] == C_HARDCLIP: + rpos -= c[0][1] + rlen += c[0][1] + c = c[1:] + if len(c) > 0 and c[0][0] == C_SOFTCLIP: + rpos -= c[0][1] + + if curpos and curpos > (read.tid, startpos[1]+2*args.blocksize): # Sort order is defined as ascending 'tid's and positions - if curpos > (read.tid, read.pos) and not read.is_unmapped: + if curpos > (read.tid, rpos+2*args.blocksize) and not read.is_unmapped: sys.stderr.write("ERROR: Input file does not appear " "to be sorted by coordinates at " "record %i, aborting ...\n" % (read_num,)) return 1 - _flush_buffer(outfile, curvariants, - args.remove_duplicates) - curpos = None + _flush_buffer(outfile, var1, args.remove_duplicates) + var1 = var2 + var2.clear() + curpos = startpos = None if read.flag & _FILTERED_FLAGS: outfile.write(read) continue - curpos = (read.tid, read.pos) - nkey = (read.is_reverse, read.pos, read.alen) + nkey = (read.is_reverse, rpos, rlen) + if startpos and startpos > (read.tid, rpos-args.blocksize): + curvaraints = var2 + else: + curvariants = var1 if nkey in curvariants: curvariants[nkey][0].append(read) curvariants[nkey][1] += 1 else: curvariants[nkey] = [[read], 1] + curpos = (read.tid, rpos) + if startpos is None: + startpos = curpos - _flush_buffer(outfile, curvariants, args.remove_duplicates) + _flush_buffer(outfile, var1, args.remove_duplicates) + _flush_buffer(outfile, var2, args.remove_duplicates) return 0