# Load Testing Data

In [1]:
import pandas as pd
import numpy as np
import pysam
import os

In [2]:
def parse_record(record):
    """Flatten one alignment record into a ``pd.Series`` (one future dataframe row).

    The fixed fields are read from attributes of ``record``. Every
    ``(name, value)`` pair returned by ``record.get_tags()`` is then stored
    under its tag name, so records carrying different tag sets simply yield
    Series with different indexes. A tag whose name equals one of the fixed
    keys replaces that entry.

    Note that the ``'mapq'`` entry holds ``record.query_alignment_qualities``.
    """
    fields = dict(
        qname=record.query_name,
        flag=record.flag,
        reference=record.reference_id,
        position=record.pos,
        mapq=record.query_alignment_qualities,
        cigar=record.cigarstring,
        rnext=record.rnext,
        pnext=record.pnext,
        tlen=record.tlen,
        sequence=record.seq,
        quality=record.qual,
    )
    # Optional tags are appended after the fixed fields, in the order reported.
    fields.update(record.get_tags())
    return pd.Series(fields)

# Path of the BAM file the expectations are built from; the output CSV names
# written later in the notebook are derived from this string.
input_sam_file = 'data/small-cell-sorted.bam'
# Open in binary-read mode and turn every record into a pd.Series.
with pysam.AlignmentFile(input_sam_file, 'rb') as f:
    records = []
    for record in f:
        records.append(parse_record(record))

# Concatenating along axis=1 gives one column per record; the transpose turns
# that into one row per record, with the union of all fields/tags as columns.
data = pd.concat(records, axis=1).T

results_scalar = {}  # will hold the calculations we make

# add a strand field: 16 when bit 0x10 of the flag is set, otherwise 0
# (0x10 is the reverse-strand bit in the SAM flag definition)
data['strand'] = [f & 16 for f in data['flag']]



# Build Expectations for Testing Data

## Number of Reads

In [3]:
# Number of rows in `data`, i.e. one per record read from the input file.
results_scalar['n_reads'] = len(data)
print(results_scalar['n_reads'])

656


## Number of Genes

In [4]:
# Number of groups formed by the 'GE' column.
# NOTE(review): with pandas' default groupby settings, rows whose GE is missing
# do not form a group — confirm that is the intended definition.
results_scalar['n_genes'] = len(data.groupby(['GE']))
print(results_scalar['n_genes'])

11


In [5]:
# For every CB group, count the distinct values in its 'GE' column (via a
# Python set), then average those counts over all groups. The result is only
# printed; it is not stored in results_scalar.
# NOTE(review): a missing GE value is also a set element, so it may be counted
# as a gene here — confirm against the metric being tested.
mean_n_genes = data.groupby(['CB']).apply(lambda x: len(set(x['GE']))).mean()
print(mean_n_genes)

1.9827586206896552


Gene table should have 8 entries plus a header for a total of 9 lines

## Number of Molecules

Molecules are defined as a unique triplet of CB, UB, and GE

In [6]:
# Number of distinct (CB, UB, GE) combinations present in the data.
results_scalar['n_molecules'] = len(data.groupby(['CB', 'UB', 'GE']))
print(results_scalar['n_molecules'])

249


## Number of Fragments

Fragments are defined like molecules (a unique triplet of CB, UB, and GE), but must additionally have a unique position.

In [7]:
# Number of distinct (CB, UB, GE, position) combinations present in the data.
results_scalar['n_fragments'] = len(data.groupby(['CB', 'UB', 'GE', 'position']))
print(results_scalar['n_fragments'])

499


## Most Abundant Gene

Based on the above, at least one of the genes has to be observed more than once. Which is it? 

In [8]:
# Tally the rows per gene once, then read off both the winning gene name and
# how many rows it accounts for.
gene_counts = data.groupby(['GE']).size()
results_scalar['most_abundant'] = gene_counts.idxmax()
results_scalar['most_abundant_gene_n_observations'] = gene_counts.max()
print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])

MTATP6P1 300


## Cell with most reads

In [9]:
# Largest number of rows that share a single CB value.
data.groupby(['CB']).size().max()

94

## Perfect Molecule Barcodes

In [10]:
# Count the rows whose UB value is identical to their UR value.
results_scalar['perfect_molecule_barcodes'] = sum(
    1 for c, r in zip(data['UB'], data['UR']) if c == r
)

In [11]:
# Display the count computed in the previous cell.
results_scalar['perfect_molecule_barcodes']

655

Calculate the alignment metrics

In [12]:
# Number of rows whose XF tag equals 'CODING'.
results_scalar['reads_mapped_exonic'] = sum(data['XF'] == 'CODING')

In [13]:
# Number of rows whose XF tag equals 'INTRONIC'.
results_scalar['reads_mapped_intronic'] = sum(data['XF'] == 'INTRONIC')

In [14]:
# Number of rows whose XF tag equals 'UTR'.
results_scalar['reads_mapped_utr'] = sum(data['XF'] == 'UTR')

In [15]:
# Number of rows whose NH tag equals 1.
results_scalar['reads_mapped_uniquely'] = sum(data['NH'] == 1)

In [16]:
# Number of rows with flag bit 1024 (0x400) set; that bit marks a duplicate in
# the SAM flag definition.
results_scalar['duplicate_reads'] = sum((data['flag'] & 1024).astype(bool))

In [17]:
# Number of rows whose CIGAR string contains an 'N' operation.
# Rows without a CIGAR string (None / NaN) are skipped instead of raising a
# TypeError on the membership test.
results_scalar['spliced_reads'] = sum(1 for v in data['cigar'] if isinstance(v, str) and 'N' in v)

In [18]:
# Display every scalar expectation collected so far.
results_scalar

{'duplicate_reads': 107,
 'most_abundant': 'MTATP6P1',
 'most_abundant_gene_n_observations': 300,
 'n_fragments': 499,
 'n_genes': 11,
 'n_molecules': 249,
 'n_reads': 656,
 'perfect_molecule_barcodes': 655,
 'reads_mapped_exonic': 609,
 'reads_mapped_intronic': 28,
 'reads_mapped_uniquely': 656,
 'reads_mapped_utr': 19,
 'spliced_reads': 2}

Calculate the higher-order metrics

In [19]:
def calc_func_fraction_from_acii(x):
    """Fraction of the characters in ``x`` whose code point is greater than 63."""
    n_above = sum(1 for c in x if ord(c) > 63)
    return n_above / len(x)


def calc_func_fraction(x):
    """Fraction of the values in ``x`` that are greater than 30."""
    n_above = sum(1 for c in x if c > 30)
    return n_above / len(x)


def calc_func_mean(x):
    """Arithmetic mean of the values in ``x``."""
    return np.mean(list(x))


# Per-row quality summaries, stored as new columns on `data`.
data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction_from_acii)

# The 'mapq' column holds the per-base qualities collected by parse_record.
data['num_base_qual_fraction'] = data['mapq'].apply(calc_func_fraction)
data['num_base_qual_mean'] = data['mapq'].apply(calc_func_mean)

# Reused below for the per-cell aggregates.
grouped_by_cell = data.groupby(['CB'])

In [20]:
# Will hold the per-cell (vector) expectations, keyed by metric name.
results_series = {}

In [21]:
# vector values
# Each result keeps its CB index, which makes merging into a dataframe easier
# and guarantees the same order across metrics.
# The column is selected *before* aggregating: this avoids recomputing the
# aggregate for every column of `data` on each line, and does not depend on
# pandas silently dropping the non-numeric columns.
results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_cell['num_UY_qual_fraction'].mean()
results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_cell['num_UY_qual_fraction'].var()

results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_cell['num_base_qual_fraction'].mean()
results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_cell['num_base_qual_fraction'].var()
results_series['genomic_read_quality_mean'] = grouped_by_cell['num_base_qual_mean'].mean()
results_series['genomic_read_quality_variance'] = grouped_by_cell['num_base_qual_mean'].var()

# Number of rows per CB value; used as the numerator of the per-cell ratios.
reads_per_cell = data.groupby(['CB']).size()

In [22]:
# Inspect every field of the row at integer position 3.
data.iloc[3, :]

AS                                                                       96
CB                                                         AAACCTGAGAAACCTA
CR                                                         AAACCTGAGAAACCTA
CY                                                         AAFFFJJJJJJJJJJJ
GE                                                                      NaN
GS                                                                      NaN
HI                                                                        1
MD                                                                       98
NH                                                                        1
NM                                                                        0
RG                                                                        A
SR                                                                 GTAATTGC
SY                                                                 AAAFFJ<J
UB          

In [23]:
# Integer positions of the rows whose CB equals this barcode.
np.where(data['CB'] == 'AAACCTGAGAAACCTA')

(array([3]),)

In [24]:
# Per-cell counts of molecules and fragments, then the ratios between reads,
# fragments and molecules. Each count is obtained by sizing the fine-grained
# groups first and then counting how many of those groups fall under each CB.
# The commented-out lines are an earlier apply-based formulation.
# molecules_per_cell = grouped_by_cell.apply(lambda x: len(x.groupby(['UB', 'GE']).size()))
molecules_per_cell = data.groupby(['CB', 'GE', 'UB']).size().groupby(['CB']).size()  # does not work well for NaN genes
# fragments_per_cell = grouped_by_cell.apply(lambda x: len(x.groupby(['UB', 'GE', 'position', 'reference', 'strand']).size()))
# NOTE(review): unlike the commented-out variant and the n_fragments scalar,
# this grouping does not include 'GE' — confirm which definition is intended.
fragments_per_cell = data.groupby(['CB', 'UB', 'position', 'reference', 'strand']).size().groupby(['CB']).size()
# Element-wise division aligns on the CB index; a CB missing from either
# operand yields NaN.
reads_per_molecule = reads_per_cell / molecules_per_cell
reads_per_fragment = reads_per_cell / fragments_per_cell
fragments_per_molecule = fragments_per_cell / molecules_per_cell
results_series['reads_per_molecule'] = reads_per_molecule
results_series['reads_per_fragment'] = reads_per_fragment
results_series['fragments_per_molecule'] = fragments_per_molecule

# scalar values
# Number of (CB, UB, GE, position) / (CB, UB, GE) groups that contain exactly one row.
results_scalar['fragments_with_single_read_evidence'] = np.sum(data.groupby(['CB', 'UB', 'GE', 'position']).size() == 1)
results_scalar['molecules_with_single_read_evidence'] = np.sum(data.groupby(['CB', 'UB', 'GE']).size() == 1)

In [25]:
# Sanity check: print the length of every per-cell metric next to its name.
for k, v in results_series.items():
    print(k, len(v))

molecule_barcode_fraction_bases_above_30_mean 58
molecule_barcode_fraction_bases_above_30_variance 58
genomic_reads_fraction_bases_quality_above_30_mean 58
genomic_reads_fraction_bases_quality_above_30_variance 58
genomic_read_quality_mean 58
genomic_read_quality_variance 58
reads_per_molecule 58
reads_per_fragment 58
fragments_per_molecule 58


In [26]:
# write out the array information for the testing file
for k, vals in results_series.items():
    print(k, ('np.array([' + ', '.join('{:.4f}'.format(i) for i in np.array(vals)) + '])').replace('nan', 'np.nan'))

molecule_barcode_fraction_bases_above_30_mean np.array([1.0000, 0.9500, 1.0000, 1.0000, 0.9778, 1.0000, 1.0000, 1.0000, 0.9833, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 0.9759, 1.0000, 1.0000, 0.9830, 1.0000, 1.0000, 1.0000, 0.9778, 0.9783, 1.0000, 0.9800, 1.0000, 1.0000, 1.0000, 1.0000, 0.9500, 1.0000, 0.9895, 1.0000, 0.9760, 1.0000, 1.0000, 1.0000, 0.9889, 1.0000, 0.9600, 1.0000, 0.9909, 1.0000, 1.0000, 0.9556, 0.9800, 1.0000, 0.9000, 1.0000, 0.9588, 1.0000, 1.0000, 0.9889, 0.8000, 0.9538, 0.9909, 0.9929, 0.9571])
molecule_barcode_fraction_bases_above_30_variance np.array([np.nan, 0.0050, np.nan, np.nan, 0.0019, 0.0000, 0.0000, np.nan, 0.0015, np.nan, 0.0000, 0.0000, np.nan, 0.0000, 0.0048, 0.0000, 0.0000, 0.0029, 0.0000, np.nan, 0.0000, 0.0044, 0.0109, 0.0000, 0.0020, 0.0000, 0.0000, np.nan, 0.0000, 0.0100, np.nan, 0.0010, 0.0000, 0.0052, 0.0000, 0.0000, 0.0000, 0.0011, 0.0000, 0.0162, 0.0000, 0.0016, 0.0000, np.nan, 0.0178, 0.0020, np.nan, np.nan, 0.0000, 0.0163, np.nan, np.nan, 0.0

# Write Results to File for Automated Testing

In [27]:
# Persist the expectations next to the input file (same path, '.bam' removed).
# The scalar file is read back with header=None, so it must be written without
# a header row; current pandas versions write one for a Series unless told not to.
pd.Series(results_scalar).to_csv('%s_testing_knowledge_scalar.csv' % input_sam_file.replace('.bam', ''), header=False)
# The series file keeps its header (metric names) and the CB index column.
pd.DataFrame(results_series).to_csv('%s_testing_knowledge_series.csv' % input_sam_file.replace('.bam', ''))

In [28]:
# do a comparison of the whole 2d dataframe at once
# The in-memory metrics are compared with the CSV written above; the first CSV
# column is used as the index and the first row as the header.
np.allclose(
    pd.DataFrame(results_series).fillna(0).values,  # fill nans with zero, call values to get the numpy array the dataframe is based on
    pd.read_csv('data/small-cell-sorted_testing_knowledge_series.csv', index_col=0, header=0).fillna(0).values
)

True

In [29]:
# to get most_abundant alone: 

In [30]:
# Read the two-column scalar file (metric name, value) back as a Series.
# `read_csv(..., squeeze=True)` is no longer available in current pandas, so
# the single remaining column is squeezed explicitly instead.
test_read_scalar = pd.read_csv(
    'data/small-cell-sorted_testing_knowledge_scalar.csv', index_col=0, header=None
).squeeze('columns')

# extract this, we're going to drop it from the array to do some conversion to numeric
most_abundant = test_read_scalar['most_abundant']

# drop most abundant, convert to float, fill any NaN values with 0, and call .values to get the numpy array pandas objects are based on.
for_comparison = test_read_scalar.drop('most_abundant').astype(float).fillna(0).values


# note, have to drop the string value and convert to float before this works.
# The in-memory Series is object-typed because it mixes a string with numbers,
# so it gets the same explicit float conversion as the values read from disk.
np.allclose(
    pd.Series(results_scalar).drop('most_abundant').astype(float).fillna(0).values,
    for_comparison
)

True

In [31]:
# get a metric from a dataframe:
# Combine all per-cell metrics into one frame (one column per metric),
# then select a single metric column.
df = pd.DataFrame(results_series)
df['genomic_read_quality_mean']

CB
AAAACCTGGTAGAAGG    25.377551
AAACACGTGTCTGGAG    32.505102
AAACCTGAGAAACCTA    27.775510
AAACCTGAGATGTGTA    39.918367
AAACCTGAGCACATCT    34.363946
AAACCTGAGCATCATC    34.596867
AAACCTGAGCTAACTC    37.459184
AAACCTGAGCTAGCCC    35.948980
AAACCTGAGCTAGTGG    31.634485
AAACCTGAGGCCCGTT    26.586957
AAACCTGCACATTAGC    36.750000
AAACCTGCACTGTTAG    39.537415
AAACCTGCAGACGCCT    28.089552
AAACCTGCAGGCTCAC    33.704082
AAACCTGCATAGTAAG    33.607900
AAACCTGCATGAACCT    36.278732
AAACCTGGTAAGAGGA    30.847209
AAACCTGGTAGAAGGA    34.840207
AAACCTGGTCCAGTGC    35.932717
AAACCTGGTCGTTGTA    24.775510
AAACCTGGTGTCTGAT    34.360316
AAACCTGGTTTGCATG    31.093386
AAACCTGGTTTGTTTC    33.288016
AAACCTGTCACGATGT    36.709184
AAACCTGTCCGTTGCT    31.964741
AAACCTGTCCTGTAGA    30.215751
AAACCTGTCGCCAAAT    35.395582
AAACCTGTCGCGATCG    27.683673
AAACCTGTCGTGGACC    35.867376
AAACCTGTCTACCAGA    27.452688
AAACCTGTCTACGAGT    34.391753
AAACCTGTCTCAAGTG    33.732258
AAACCTGTCTCGCTTG    33.642465
AAACCTG

In [32]:
# get a numpy array from the dataframe
# `.values` drops the index and returns the underlying numpy array.
compare_me = df['genomic_read_quality_mean'].values

In [33]:
# compare two numpy arrays that are slightly different
eps = np.random.rand(11) * 1e-8
np.allclose(compare_me, compare_me + eps)


ValueError: operands could not be broadcast together with shapes (58,) (11,) 

In [None]:
# it is actually discriminative, though
# The reference array is sized to match compare_me so the comparison can be
# evaluated at all; its values (0, 1, 2, ...) are deliberately different.
np.allclose(compare_me, np.arange(len(compare_me)))

# Look at the metrics output

In [None]:
# Load the metrics output file, using its first column as the index.
cell_metrics = pd.read_csv('data/cell_metrics.csv', index_col=0)

In [None]:
# Display the 'n_genes' column of the loaded metrics.
cell_metrics['n_genes']

In [None]:
# IPython shell escape: print the raw contents of the metrics CSV loaded above.
!cat data/cell_metrics.csv