In [287]:
%matplotlib inline

import pysam
import itertools
import numpy as np
import scipy.sparse as sp
import matplotlib.pylab as plt
import pandas as pd
import natsort
import os
from collections import namedtuple
from collections import Counter
import distance

from typing import List, Set, Tuple

In [150]:
def alignments_grouped_by_read_name_generator(bam_file: "pysam.AlignmentFile"):
    """Iterates through a read-name-sorted BAM file and groups all alignments of a read.

    The file is rewound with ``bam_file.reset()`` when iteration starts (this is a
    generator, so nothing happens until the first item is requested).

    Grouping is done with ``itertools.groupby``, which only merges *consecutive*
    records sharing the same ``query_name``. If the records of a read are not
    adjacent in the file, that read name is yielded more than once.

    Args:
        bam_file: an open alignment file; it must support ``reset()`` and
            iteration over records exposing ``query_name``.

    Yields:
        a tuple of read-name and a list of its alignments
    """
    bam_file.reset()
    for read_name, group in itertools.groupby(bam_file, key=lambda record: record.query_name):
        # materialize the group: groupby invalidates it once the outer loop advances
        yield read_name, list(group)

## Assert equality of alignments

In [219]:
# Input BAMs, one per pipeline. Both handles are later handed to
# alignments_grouped_by_read_name_generator, whose docstring requires
# read-name-sorted input.
op_bam_file, cr_bam_file = (
    './optimus/merged.ns.bam',
    './cell_ranger/cellranger.ns.bam',
)

op_bam, cr_bam = (
    pysam.AlignmentFile(op_bam_file),
    pysam.AlignmentFile(cr_bam_file),
)

In [230]:
# Per-record alignment description used to compare the two pipelines.
bam_record_attributes = namedtuple(
    'bam_record_attributes',
    ['contig', 'start', 'end', 'cigar', 'flag'])

# Per-record barcode tags. This notebook treats CR as the raw and CB as the
# corrected cell barcode; UR is presumably the raw UMI (10x tag convention — confirm).
bam_tags = namedtuple('bam_tags', ['CR', 'UR', 'CB'])

def get_barcode_from_record(rec, barcode_tag_key='CR'):
    """Return a barcode tag of a BAM record with any ``-x`` suffix removed.

    Args:
        rec: a record exposing ``get_tag(key)`` (e.g. ``pysam.AlignedSegment``).
        barcode_tag_key: name of the tag to read.

    Returns:
        The tag value truncated at the first ``'-'``, or ``None`` when the
        record does not carry the tag.
    """
    try:
        orig_barcode = rec.get_tag(barcode_tag_key)
    except KeyError:
        # a missing tag is reported as KeyError; any other failure is a real
        # error and is allowed to propagate instead of being read as "no tag"
        return None

    # drop the "-x" suffix (for CellRanger) so barcodes are comparable across pipelines
    return orig_barcode.split('-')[0]

def get_record_attributes(recs: List[pysam.AlignedSegment]) -> List[bam_record_attributes]:
    """Extract contig, start, end, CIGAR string and flag from each record.

    Args:
        recs: alignment records of a single read.

    Returns:
        One ``bam_record_attributes`` per record, in input order.
    """
    attributes = []
    for rec in recs:
        attributes.append(
            bam_record_attributes(
                contig=rec.reference_name,
                start=rec.reference_start,
                end=rec.reference_end,
                cigar=rec.cigarstring,
                flag=rec.flag))
    return attributes

def get_record_tags(recs: List[pysam.AlignedSegment]) -> List[bam_tags]:
    """Extract the CR, UR and CB tags from each record.

    Each tag is read through ``get_barcode_from_record``.

    Args:
        recs: alignment records of a single read.

    Returns:
        One ``bam_tags`` per record, in input order.
    """
    tags_per_record = []
    for rec in recs:
        cr_value = get_barcode_from_record(rec, 'CR')
        ur_value = get_barcode_from_record(rec, 'UR')
        cb_value = get_barcode_from_record(rec, 'CB')
        tags_per_record.append(bam_tags(CR=cr_value, UR=ur_value, CB=cb_value))
    return tags_per_record

def get_sorted_attributes_tags_list(attr_list, tags_list):
    """Sort a read's alignments by position and reorder its tags to match.

    Records are ordered by ``(contig, start, end, cigar)``. Any of these may be
    ``None``; such values sort after all real values instead of raising
    ``TypeError`` when compared against a string or integer. The sort is stable.

    Args:
        attr_list: per-record attributes exposing ``contig``, ``start``,
            ``end`` and ``cigar``.
        tags_list: per-record tags, parallel to ``attr_list``.

    Returns:
        A tuple ``(sorted_attr_list, sorted_tags_list)`` in which both lists
        follow the same alignment order.
    """
    def none_last(value):
        # (False, v) < (True, None), so None is never compared with a real value
        return (value is None, value)

    def alignment_key(entry):
        attr = entry[1]
        return (none_last(attr.contig), none_last(attr.start),
                none_last(attr.end), none_last(attr.cigar))

    # sort by alignment, keeping the original index to permute the tags identically
    sorted_index_attr_list = sorted(enumerate(attr_list), key=alignment_key)
    sort_index = [index for index, _ in sorted_index_attr_list]
    sorted_attr_list = [attr for _, attr in sorted_index_attr_list]
    sorted_tags_list = [tags_list[j] for j in sort_index]
    return sorted_attr_list, sorted_tags_list

In [234]:
# instantiate generators for alignments grouped by read-name
op_grouped_records_gen = alignments_grouped_by_read_name_generator(op_bam)
cr_grouped_records_gen = alignments_grouped_by_read_name_generator(cr_bam)

# master list of parsed records (for further analysis);
# entry i holds the position-sorted records of the i-th read
op_master_rec_attr_list = list()
cr_master_rec_attr_list = list()
op_master_rec_tags_list = list()
cr_master_rec_tags_list = list()

# indices of discordant records (positions into the master lists)
discordant_alignment_indices = list()
discordant_flag_indices = list()
discordant_CR_indices = list()
discordant_UR_indices = list()
discordant_CB_indices = list()

read_index = 0
# zip_longest rather than zip: if one file holds extra trailing reads, the
# shorter side yields None and the assertion below fails instead of the
# surplus reads being silently ignored
for op_group, cr_group in itertools.zip_longest(op_grouped_records_gen, cr_grouped_records_gen):
    assert op_group is not None and cr_group is not None, "Different number of reads"
    op_read_name, op_recs = op_group
    cr_read_name, cr_recs = cr_group
    assert op_read_name == cr_read_name, "Different read names"
    assert len(op_recs) == len(cr_recs), "Different number of alignment positions"

    op_rec_attr_list = get_record_attributes(op_recs)
    cr_rec_attr_list = get_record_attributes(cr_recs)
    op_tags_list = get_record_tags(op_recs)
    cr_tags_list = get_record_tags(cr_recs)

    # sort by alignment position so records are compared pairwise regardless
    # of the order in which each pipeline emitted them
    sorted_op_rec_attr_list, sorted_op_tags_list = get_sorted_attributes_tags_list(
        op_rec_attr_list, op_tags_list)
    sorted_cr_rec_attr_list, sorted_cr_tags_list = get_sorted_attributes_tags_list(
        cr_rec_attr_list, cr_tags_list)

    # add to the master list
    op_master_rec_attr_list.append(sorted_op_rec_attr_list)
    cr_master_rec_attr_list.append(sorted_cr_rec_attr_list)
    op_master_rec_tags_list.append(sorted_op_tags_list)
    cr_master_rec_tags_list.append(sorted_cr_tags_list)

    # discordant records: alignment position and CIGAR (flag is compared separately)
    op_alignments = [(attr.contig, attr.start, attr.end, attr.cigar) for attr in sorted_op_rec_attr_list]
    cr_alignments = [(attr.contig, attr.start, attr.end, attr.cigar) for attr in sorted_cr_rec_attr_list]
    if op_alignments != cr_alignments:
        discordant_alignment_indices.append(read_index)

    op_flags = [attr.flag for attr in sorted_op_rec_attr_list]
    cr_flags = [attr.flag for attr in sorted_cr_rec_attr_list]
    if op_flags != cr_flags:
        discordant_flag_indices.append(read_index)

    op_CR = [tags.CR for tags in sorted_op_tags_list]
    cr_CR = [tags.CR for tags in sorted_cr_tags_list]
    if op_CR != cr_CR:
        discordant_CR_indices.append(read_index)

    op_UR = [tags.UR for tags in sorted_op_tags_list]
    cr_UR = [tags.UR for tags in sorted_cr_tags_list]
    if op_UR != cr_UR:
        discordant_UR_indices.append(read_index)

    op_CB = [tags.CB for tags in sorted_op_tags_list]
    cr_CB = [tags.CB for tags in sorted_cr_tags_list]
    if op_CB != cr_CB:
        discordant_CB_indices.append(read_index)

    # increment index
    read_index += 1

In [243]:
# one line per compared property: how many reads differ between the two BAMs
print(
    f'number of reads with discordant alignments: {len(discordant_alignment_indices)}',
    f'number of reads with discordant flags: {len(discordant_flag_indices)}',
    f'number of reads with discordant CR: {len(discordant_CR_indices)}',
    f'number of reads with discordant UR: {len(discordant_UR_indices)}',
    f'number of reads with discordant CB: {len(discordant_CB_indices)}',
    sep='\n')

number of reads with discordant alignments: 0
number of reads with discordant flags: 543424
number of reads with discordant CR: 0
number of reads with discordant UR: 0
number of reads with discordant CB: 2764


## Reads with discordant CB

In [280]:
# number of discordant reads for which a pipeline emitted no CB at all
op_missing_CB = cr_missing_CB = 0

# list of tuples of d(op_CB, op_CR), d(cr_CB, cr_CR), d(op_CB, cr_CB)
hamming_distance_tuples = []
# distinct CR values seen among reads where both pipelines have a CB but disagree
implicated_CR_set = set()

for idx in discordant_CB_indices:
    # all alignments of a read must carry one and the same CB
    op_CB_list = list({tags.CB for tags in op_master_rec_tags_list[idx]})
    cr_CB_list = list({tags.CB for tags in cr_master_rec_tags_list[idx]})
    assert len(op_CB_list) == 1
    assert len(cr_CB_list) == 1
    op_CB, cr_CB = op_CB_list[0], cr_CB_list[0]

    # a missing CB is tallied (Optimus takes precedence) and the read is
    # left out of the distance statistics
    if op_CB is None or cr_CB is None:
        if op_CB is None:
            op_missing_CB += 1
        else:
            cr_missing_CB += 1
        continue

    op_CR = op_master_rec_tags_list[idx][0].CR
    cr_CR = cr_master_rec_tags_list[idx][0].CR

    assert op_CR == cr_CR

    implicated_CR_set.add(op_CR)

    op_CB_CR_hamming_dist = distance.hamming(op_CB, op_CR)
    cr_CB_CR_hamming_dist = distance.hamming(cr_CB, cr_CR)
    op_cr_CB_CB_hamming_dist = distance.hamming(op_CB, cr_CB)

    hamming_distance_tuples.append(
        (op_CB_CR_hamming_dist, cr_CB_CR_hamming_dist, op_cr_CB_CB_hamming_dist))

# histogram of each of the three distances over the retained reads
op_CB_CR_hamming_dist_hist = Counter(entry[0] for entry in hamming_distance_tuples)
cr_CB_CR_hamming_dist_hist = Counter(entry[1] for entry in hamming_distance_tuples)
op_cr_CB_CB_hamming_dist_hist = Counter(entry[2] for entry in hamming_distance_tuples)

In [286]:
# report the missing-CB counts, then the three Hamming-distance histograms
print(
    f'Optimus reads missing CB: {op_missing_CB}',
    f'CellRanger reads missing CB: {cr_missing_CB}',
    f'Discordant CB between CellRanger and Optimus: {len(hamming_distance_tuples)}',
    f'Distribution of (CB, CR) Hamming distance for Optimus reads: '
    f'{op_CB_CR_hamming_dist_hist}',
    f'Distribution of (CB, CR) Hamming distance for CellRanger reads: '
    f'{cr_CB_CR_hamming_dist_hist}',
    f'Distribution of (CB, CB) Hamming distance between Optimus and CellRanger reads: '
    f'{op_cr_CB_CB_hamming_dist_hist}',
    f'Number of unique implicated CRs: {len(implicated_CR_set)}',
    sep='\n')

Optimus reads missing CB: 0
CellRanger reads missing CB: 1271
Discordant CB between CellRanger and Optimus: 1493
Distribution of (CB, CR) Hamming distance for Optimus reads: Counter({1: 1493})
Distribution of (CB, CR) Hamming distance for CellRanger reads: Counter({1: 1493})
Distribution of (CB, CB) Hamming distance between Optimus and CellRanger reads: Counter({2: 1493})
Number of unique implicated CRs: 1107


**Summary**

- All Optimus reads have CB tags
- Some CellRanger reads do not have CB tags
- Both Optimus and CellRanger only correct barcodes within Hamming distance 1 of the whitelist
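- Optimus and CellRanger correct barcode errors in _different ways_: in all 1493 reads where both assign a CB but disagree, each CB is at Hamming distance 1 from the raw CR, while the two CBs are at Hamming distance 2 from each other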