In [1]:
"""
Bonito utils
"""

import os
import re
import random
from glob import glob
from collections import defaultdict, OrderedDict

import toml
import torch
import parasail
import numpy as np

# One cigar element: a run length (`len`) followed by its operation code (`op`).
split_cigar = re.compile(r"(?P<len>\d+)(?P<op>\D+)")

def parasail_to_sam(result, seq, sey):
    """
    Extract reference start and sam compatible cigar string.

    :param result: parasail alignment result.
    :param seq: query sequence.
    :param sey: when falsy, the decoded parasail cigar is returned unchanged;
        when truthy, it is rewritten with soft clips (`S`) for the query ends.

    :returns: reference start coordinate, cigar string.
    """
    # Read the cigar object once instead of going through `result.cigar` repeatedly.
    cigar = result.cigar
    # `cigar.decode` is treated as bytes here, hence the second `.decode()` to get a str.
    cigstr = cigar.decode.decode()
    rstart = cigar.beg_ref

    if not sey:
        # Raw mode. This used to return the bare name `_`, which only resolves inside
        # an interactive session and raises NameError anywhere else.
        return rstart, cigstr

    first = split_cigar.search(cigstr)
    first_count, first_op = first.groups()
    prefix = first.group()
    cliplen = cigar.beg_query

    clip = '' if cliplen == 0 else '{}S'.format(cliplen)
    if first_op == 'I':
        # A leading insertion is folded into the leading soft clip.
        pre = '{}S'.format(int(first_count) + cliplen)
    elif first_op == 'D':
        # A leading deletion is dropped from the cigar and moves the start instead.
        # NOTE(review): this replaces `beg_ref` rather than adding to it; confirm intended.
        pre = clip
        rstart = int(first_count)
    else:
        pre = '{}{}'.format(clip, prefix)

    mid = cigstr[len(prefix):]
    # Query bases after `end_query` are reported as a trailing soft clip.
    end_clip = len(seq) - result.end_query - 1
    suf = '{}S'.format(end_clip) if end_clip > 0 else ''
    new_cigstr = ''.join((pre, mid, suf))
    return rstart, new_cigstr


def accuracy(ref, seq, sey, balanced=False):
    """
    Calculate the accuracy between `ref` and `seq`

    The two sequences are aligned with `parasail.sw_trace_striped_32` and the
    run lengths of every cigar operation are summed per operation code.

    :param ref: reference sequence.
    :param seq: query sequence.
    :param sey: forwarded to `parasail_to_sam` to select which cigar is scored.
    :param balanced: if true use `(= - I) / (= + X + D)`,
        otherwise `= / (= + I + X + D)`.

    :returns: accuracy as a percentage.
    """
    aligned = parasail.sw_trace_striped_32(ref, seq, 8, 4, parasail.dnafull)
    cigar = parasail_to_sam(aligned, seq, sey)[1]

    # Total run length per operation; operations that never occur count as 0.
    totals = defaultdict(int)
    for element in split_cigar.finditer(cigar):
        totals[element.group('op')] += int(element.group('len'))

    matches = totals['=']
    insertions = totals['I']
    mismatches = totals['X']
    deletions = totals['D']

    if balanced:
        ratio = (matches - insertions) / (matches + mismatches + deletions)
    else:
        ratio = matches / (matches + insertions + mismatches + deletions)
    return ratio * 100


def alignment(ref, seq, output=False, global_=True):
    """
    Print the alignment between `ref` and `seq`

    :param ref: reference sequence.
    :param seq: query sequence.
    :param output: if true, print the three traceback rows and the score.
    :param global_: if true align with `parasail.nw_trace_striped_32`,
        otherwise with `parasail.sw_trace_striped_32`.

    :returns: traceback query row, comparison row, reference row, and the
        parasail result itself.
    """
    if global_:
        aligner = parasail.nw_trace_striped_32
    else:
        aligner = parasail.sw_trace_striped_32
    result = aligner(ref, seq, 8, 4, parasail.dnafull)

    trace = result.traceback
    query_row, comp_row, ref_row = trace.query, trace.comp, trace.ref

    if output:
        for row in (query_row, comp_row, ref_row):
            print(row)
        print("  Score=%s" % result.score)
    return query_row, comp_row, ref_row, result



In [7]:
# Sequences to score against `reference`. The list is bound to `predictions`, not to
# `reference`: the reporting cell iterates over `predictions` and passes `reference`
# to the aligner as a single sequence, so rebinding `reference` to a list would break
# it and make this cell non-idempotent.
predictions = [
    reference,
    "ACGT",
    "ACGTAC",
    reference + "GGG",
    "GTGGT",
    reference[::-1],
    "AAAA" + reference[::-1],
    "AAAA",
    "ACACA",
    "TGACGTTAT",
    "CATCTCAG",
]

In [6]:
# Width, in characters, of each score column in the comparison table.
score_length = 10


def pad_text(text, max_len):
    """
    Left-justify `str(text)` in a field of `max_len` characters.

    Text that is already `max_len` characters or longer is returned unchanged
    (it is never truncated).
    """
    return str(text).ljust(max_len)


def trim(a):
    """
    Format the number `a` as `{:6.2f}` and pad it to `score_length` characters.
    """
    return pad_text("{:6.2f}".format(a), score_length)
# Width of each alignment column: the longest sequence plus a small margin.
max_pred_leng = max(max(map(len, predictions)), len(reference)) + 5


# First header row: labels the score columns as a Balanced and an Unbalanced group.
group_labels = ['', '', '', 'Balanced', '', 'Unbalanced']
group_widths = [5, max_pred_leng, max_pred_leng, score_length, score_length, score_length]
print(*map(pad_text, group_labels, group_widths))

# Second header row: one label per column.
column_labels = ['No', "Alignment global", "Alignment local", "No alter", "Seymour", "No alter", "Seymour"]
column_widths = [5, max_pred_leng, max_pred_leng] + [score_length] * 4
print(*map(pad_text, column_labels, column_widths))
print('-' * 90)

for index, pred in enumerate(predictions):
    # accuracy(ref, seq, sey, balanced): "No alter" is sey=False, "Seymour" is sey=True.
    no_alter = accuracy(reference, pred, False, False)
    seymour = accuracy(reference, pred, True, False)
    no_alter_b = accuracy(reference, pred, False, True)
    seymour_b = accuracy(reference, pred, True, True)

    # Traceback rows (query, comparison, reference) for the global and local alignment.
    q, c, r, _ = alignment(reference, pred)
    ql, cl, rl, _ = alignment(reference, pred, global_=False)

    margin = pad_text('', 5)
    scores = [trim(no_alter_b), trim(seymour_b), trim(no_alter), trim(seymour)]

    # Three lines per prediction; the row number and scores sit on the middle line.
    print(margin, pad_text(q, max_pred_leng), ql)
    print(pad_text(index + 1, 5), pad_text(c, max_pred_leng), pad_text(cl, max_pred_leng), *scores)
    print(margin, pad_text(r, max_pred_leng), rl)

    print()

                                                                                                                                                                                                                                                                      Balanced              Unbalanced
No    Alignment global                                                                                                                Alignment local                                                                                                                 No alter   Seymour    No alter   Seymour   
------------------------------------------------------------------------------------------
      ACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACA          ACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGTACACGTGGT
1     ||||||||||||||||||||||||||||||||||||||||||||||||||||