--- /home/grg/src/paleomix/paleomix/tools/rmdup_collapsed.py	2016-06-21 12:53:29.095556207 +0930
+++ rmdup_collapsed.py	2016-12-22 16:15:44.356192323 +1030
@@ -21,10 +21,13 @@
 
 import sys
 import pysam
+from pysam.calignedsegment import CIGAR2CODE
 import random
 
 from argparse import ArgumentParser
 
+C_HARDCLIP = CIGAR2CODE[ord('H')]
+C_SOFTCLIP = CIGAR2CODE[ord('S')]
 
 def calc_consensus(reads, rng=random.random):
     count = len(reads)
@@ -126,6 +129,10 @@
                         help="Seed used for randomly selecting representative "
                              "reads when no reads have quality scores assigned"
                              "[default: initialized using system time].")
+    parser.add_argument("--blocksize", default=65536, type=int,
+                        help="Length of genomic blocks in which to find reads "
+                             "with the same starting position. "
+                             "[default: 65536]")
 
     return parser.parse_args(argv)
 
@@ -145,35 +152,57 @@
 
     with pysam.Samfile(args.input, "rb") as infile:
         with pysam.Samfile("-", "wb", template=infile) as outfile:
-            curpos = None
-            curvariants = {}
+            curpos = startpos = None
+            var1 = {}
+            var2 = {}
             for (read_num, read) in enumerate(infile):
-                if curpos and ((read.tid, read.pos) != curpos):
+
+                rpos = read.pos
+                rlen = len(read.seq)
+                c = read.cigar
+
+                if c is not None:
+                    if c[0][0] == C_HARDCLIP:
+                        rpos -= c[0][1]
+                        rlen += c[0][1]
+                        c = c[1:]
+                    if len(c) > 0 and c[0][0] == C_SOFTCLIP:
+                        rpos -= c[0][1]
+
+                if curpos and curpos > (read.tid, startpos[1]+2*args.blocksize):
                     # Sort order is defined as ascending 'tid's and positions
-                    if curpos > (read.tid, read.pos) and not read.is_unmapped:
+                    if curpos > (read.tid, rpos+2*args.blocksize) and not read.is_unmapped:
                         sys.stderr.write("ERROR: Input file does not appear "
                                          "to be sorted by coordinates at "
                                          "record %i, aborting ...\n"
                                          % (read_num,))
                         return 1
 
-                    _flush_buffer(outfile, curvariants,
-                                  args.remove_duplicates)
-                    curpos = None
+                    _flush_buffer(outfile, var1, args.remove_duplicates)
+                    var1 = var2
+                    var2.clear()
+                    curpos = startpos = None
 
                 if read.flag & _FILTERED_FLAGS:
                     outfile.write(read)
                     continue
 
-                curpos = (read.tid, read.pos)
-                nkey = (read.is_reverse, read.pos, read.alen)
+                nkey = (read.is_reverse, rpos, rlen)
+                if startpos and startpos > (read.tid, rpos-args.blocksize):
+                    curvaraints = var2
+                else:
+                    curvariants = var1
                 if nkey in curvariants:
                     curvariants[nkey][0].append(read)
                     curvariants[nkey][1] += 1
                 else:
                     curvariants[nkey] = [[read], 1]
+                curpos = (read.tid, rpos)
+                if startpos is None:
+                    startpos = curpos
 
-            _flush_buffer(outfile, curvariants, args.remove_duplicates)
+            _flush_buffer(outfile, var1, args.remove_duplicates)
+            _flush_buffer(outfile, var2, args.remove_duplicates)
 
     return 0