#!/usr/bin/env python
#
# Based on 'FilterUniqueBAM' by
# Martin Kircher
# Martin.Kircher@eva.mpg.de
#
import collections
import sys

from argparse import ArgumentParser

import pysam

try:
    # Newer pysam releases expose the internal modules with a 'lib' prefix
    from pysam.libcalignedsegment import CIGAR2CODE
except ImportError:
    from pysam.calignedsegment import CIGAR2CODE


_USAGE = """paleomix rmdup_collapsed [options] < sorted.bam > out.bam

The rmdup_collapsed command filters a BAM file for PCR duplicates among
unpaired reads, under the assumption that any unpaired read has been generated
by the merging of overlapping paired-end reads, and thereby represents the
complete template sequence. PCR duplicates are therefore detected based on
both the 5' and the 3' alignment coordinates.

Paired reads (0x1), unmapped reads (0x4), secondary alignments (0x100), reads
that failed QC (0x200), and chimeric alignments (0x800), as identified using
the BAM record flags, are not filtered, but are simply written to the output.

By default, filtered reads are flagged using the "duplicate" flag (0x400) and
written to the output. Use the --remove-duplicates command-line option to
instead remove these records from the output.
"""

_FILTERED_FLAGS = 0x1     # PE reads
_FILTERED_FLAGS |= 0x4    # Unmapped
_FILTERED_FLAGS |= 0x100  # Secondary alignment
_FILTERED_FLAGS |= 0x200  # Failed QC
_FILTERED_FLAGS |= 0x800  # Chimeric alignment

_HARDCLIP = CIGAR2CODE[ord('H')]
_SOFTCLIP = CIGAR2CODE[ord('S')]
_INSERTION = CIGAR2CODE[ord('I')]


def calc_consensus(reads):
    """Returns the best read from a group of reads sharing the same alignment,
    and records the total number of collapsed reads in its 'XP' tag."""
    count = len(reads)
    outread = None
    maxsumqual = -1
    for read in reads:
        # Handle reads without qualities, but favor reads with qualities
        qual = read.query_qualities
        if qual is None:
            # Score reads by number of bases, but prefer reads with qualities
            nsum = -1.0 / read.query_alignment_length
        else:
            nsum = sum(qual)

        if nsum > maxsumqual:
            outread = read
            maxsumqual = nsum

        try:
            # Carry over PCR duplicate counts from previous runs
            count += read.get_tag('XP')
        except KeyError:
            pass

    outread.set_tag('XP', count)

    return outread


def record_quality(record):
    """Returns the summed base qualities of the aligned part of a record, or
    the aligned length for records without quality scores."""
    qualities = record.query_alignment_qualities
    if qualities is None:
        return record.query_alignment_length

    return sum(qualities)


def mark_duplicate_reads(records):
    """Marks all but the best read in a group of reads with identical
    alignments as PCR duplicates."""
    # Group reads by CIGAR; the most frequent CIGAR (ties broken by summed
    # base qualities) is used to select the consensus read.
    by_cigar = collections.defaultdict(list)
    for record in records:
        key = tuple(record.cigartuples)
        by_cigar[key].append(record)

    def _score(records):
        return (len(records), sum(map(record_quality, records)))

    candidates = sorted(by_cigar.values(), key=_score)
    consensus = calc_consensus(candidates[-1])

    for read in records:
        read.is_duplicate = (read is not consensus)


def write_read(args, out, record, alignments):
    if isinstance(record, list):
        # Current record is a group of one or more identical alignments,
        # stored as [alignment key, read, read, ...].
        if len(record) > 2:
            # Select the best read and mark the others as duplicates.
            mark_duplicate_reads(record[1:])

        alignments.pop(record[0])
        record = record[1]

    if not (record.is_duplicate and args.remove_duplicates):
        out.write(record)


def flush_cache(args, out, cache, alignments):
    """Writes any cached reads located more than the window size before the
    most recently cached read, or located on a different contig."""
    last = cache[-1]
    if isinstance(last, list):
        last = last[1]

    last_ref_id = last.reference_id
    last_ref_start = last.reference_start - args.window_size

    while cache:
        first = cache[0]
        if isinstance(first, list):
            first = first[1]

        if first.is_unmapped \
                or first.reference_id != last_ref_id \
                or first.reference_start < last_ref_start:
            write_read(args, out, cache.popleft(), alignments)
        else:
            break


def read_alignment(read):
    """Returns a key describing the alignment of a read, consisting of the
    reference ID, the strand, and the unclipped 5' and 3' coordinates."""
    start = read.reference_start
    end = start
    at_front = True
    for (operation, length) in read.cigartuples:
        if at_front and operation in (_HARDCLIP, _SOFTCLIP):
            start -= length
            continue

        at_front = False
        if operation != _INSERTION:
            end += length

    return (read.reference_id, read.is_reverse, start, end)


def process(args, infile, outfile):
    cache = collections.deque()
    alignments = {}

    last_pos = ()
    for read_num, read in enumerate(infile, start=1):
        # Sorted order is defined as ascending reference_ids and positions
        current_pos = (read.reference_id, read.reference_start)
        if last_pos > current_pos and not read.is_unmapped:
            sys.stderr.write("ERROR: Input file does not appear to be sorted "
                             "by coordinates at record %i. Aborting run!\n"
                             % (read_num,))
            return 1
        elif read.flag & _FILTERED_FLAGS:
            cache.append(read)
        else:
            read.is_duplicate = False
            key = read_alignment(read)

            try:
                group = alignments[key]
                group.append(read)
                cache.append(read)
            except KeyError:
                # No previous records with matching alignment; this read will
                # serve to group any other reads with the same alignment.
                group = [key, read]
                cache.append(group)
                alignments[key] = group

            last_pos = current_pos

        if len(cache) > args.window_size:
            flush_cache(args, outfile, cache, alignments)

    while cache:
        write_read(args, outfile, cache.popleft(), alignments)

    return 0


def parse_args(argv):
    parser = ArgumentParser(usage=_USAGE)
    parser.add_argument("input", default="-", nargs="?",
                        help="BAM file; if not set, input is read from STDIN.")
    parser.add_argument("--remove-duplicates",
                        help="Remove duplicates from output; by default "
                             "duplicates are only flagged (flag = 0x400).",
                        default=False, action="store_true")
    parser.add_argument("--window-size", default=65536, type=int,
                        help="Length of sliding window in which PCR "
                             "duplicates are detected [default: %(default)s]")

    return parser.parse_args(argv)


def main(argv):
    args = parse_args(argv)
    if args.input == "-" and sys.stdin.isatty():
        sys.stderr.write("STDIN is a terminal, terminating!\n")
        return 1
    elif sys.stdout.isatty():
        sys.stderr.write("STDOUT is a terminal, terminating!\n")
        return 1

    with pysam.Samfile(args.input, "rb") as infile:
        with pysam.Samfile("-", "wb", template=infile) as outfile:
            # Propagate the exit code from 'process' (e.g. for unsorted input)
            return process(args, infile, outfile)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))