# Load Testing Data

In [1]:
import pandas as pd
import numpy as np
import pysam
import os

In [2]:
def parse_record(record):
    """Flatten one alignment record into a ``pd.Series`` (one future dataframe row).

    The fixed alignment fields are read from attributes of ``record``. Every
    ``(name, value)`` pair returned by ``record.get_tags()`` is then stored
    under the tag's name, so records carrying different tag sets produce
    Series with different indexes (missing tags are simply absent).

    Note that the ``'mapq'`` entry holds ``record.query_alignment_qualities``.
    """
    row = dict(
        qname=record.query_name,
        flag=record.flag,
        reference=record.reference_id,
        position=record.pos,
        mapq=record.query_alignment_qualities,
        cigar=record.cigarstring,
        rnext=record.rnext,
        pnext=record.pnext,
        tlen=record.tlen,
        sequence=record.seq,
        quality=record.qual,
    )
    # get_tags() yields (name, value) pairs, which dict.update consumes directly
    row.update(record.get_tags())
    return pd.Series(row)

# Path of the BAM file used as testing data.
input_sam_file = 'data/small-gene-sorted.bam'

# Parse every alignment in the file into its own Series.
with pysam.AlignmentFile(input_sam_file, 'rb') as bam_handle:
    records = [parse_record(alignment) for alignment in bam_handle]

# With axis=1 each Series becomes a column; transposing turns them into rows.
data = pd.concat(records, axis=1).T

results_scalar = {}  # will hold the calculations we make



# Build Expectations for Testing Data

## Number of Reads

In [3]:
# `data` holds one row per parsed record, so its length is the number of reads
results_scalar['n_reads'] = len(data)
print(results_scalar['n_reads'])

300


## Number of Genes

In [4]:
# number of groups obtained when the reads are grouped by their GE tag
results_scalar['n_genes'] = len(data.groupby(['GE']))
print(results_scalar['n_genes'])

8


Gene table should have 8 entries plus a header for a total of 9 lines

## Number of Molecules

Molecules are defined as a unique triplet of CB, UB, and GE

In [5]:
# number of distinct (CB, UB, GE) combinations among the reads
results_scalar['n_molecules'] = len(data.groupby(['CB', 'UB', 'GE']))
print(results_scalar['n_molecules'])

88


## Number of Fragments

Fragments are defined like molecules (a unique triplet of CB, UB, and GE) but must additionally have a unique position

In [6]:
# number of distinct (CB, UB, GE, position) combinations among the reads
results_scalar['n_fragments'] = len(data.groupby(['CB', 'UB', 'GE', 'position']))
print(results_scalar['n_fragments'])

217


## Most Abundant Gene

Based on the above, at least one of the genes has to be observed more than once. Which is it? 

In [7]:
# Tally the reads per GE value once, then read off the top gene and its read count.
gene_read_counts = data.groupby(['GE']).size()
results_scalar['most_abundant'] = gene_read_counts.idxmax()
results_scalar['most_abundant_gene_n_observations'] = gene_read_counts.max()
print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])

AL627309.7 245


In [8]:
# Count the reads whose UB tag is identical to their UR tag.
results_scalar['perfect_molecule_barcodes'] = sum(
    1 for ub, ur in zip(data['UB'], data['UR']) if ub == ur
)

Calculate the alignment metrics

In [32]:
# NOTE(review): this cell has execution count In [32] and its saved output lists keys
# (e.g. 'duplicate_reads', 'spliced_reads') that are only assigned in the cells below,
# so it was run out of order; a fresh top-to-bottom run would display fewer keys here.
results_scalar

{'duplicate_reads': 90,
 'fragments_with_single_read_evidence': 155,
 'molecules_with_single_read_evidence': 42,
 'most_abundant': 'AL627309.7',
 'most_abundant_gene_n_observations': 245,
 'n_fragments': 217,
 'n_genes': 8,
 'n_molecules': 88,
 'n_reads': 300,
 'perfect_molecule_barcodes': 300,
 'reads_mapped_exonic': 300,
 'reads_mapped_intronic': 0,
 'reads_mapped_uniquely': 300,
 'reads_mapped_utr': 0,
 'spliced_reads': 29}

In [9]:
# number of reads whose XF tag equals 'CODING'
results_scalar['reads_mapped_exonic'] = sum(data['XF'] == 'CODING')

In [10]:
# number of reads whose XF tag equals 'INTRONIC'
results_scalar['reads_mapped_intronic'] = sum(data['XF'] == 'INTRONIC')

In [11]:
# number of reads whose XF tag equals 'UTR'
results_scalar['reads_mapped_utr'] = sum(data['XF'] == 'UTR')

In [12]:
# number of reads whose NH tag (SAM: number of reported alignments) equals 1
results_scalar['reads_mapped_uniquely'] = sum(data['NH'] == 1)

In [13]:
# 1024 (0x400) is the SAM FLAG bit marking a PCR or optical duplicate
results_scalar['duplicate_reads'] = sum((data['flag'] & 1024).astype(bool))

In [14]:
# a read is counted as spliced when its CIGAR string contains an 'N' (skipped reference region) operation
results_scalar['spliced_reads'] = sum(1 for v in data['cigar'] if 'N' in v)

Calculate the higher-order metrics

In [15]:
def calc_func_fraction_from_acii(x):
    """Return the fraction of characters in ``x`` whose code point exceeds 63.

    Presumably 63 corresponds to quality 30 under Phred+33 encoding - TODO confirm.
    """
    return sum(1 for char in x if ord(char) > 63) / len(x)


def calc_func_fraction(x):
    """Return the fraction of numeric values in ``x`` that are greater than 30."""
    return sum(1 for value in x if value > 30) / len(x)


def calc_func_mean(x):
    """Return the arithmetic mean of the values in ``x``."""
    return np.mean(list(x))


# Per-read quality summaries, stored as new columns on `data`.
data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction_from_acii)

# The 'mapq' column holds the per-base qualities captured by parse_record.
data['num_base_qual_fraction'] = data['mapq'].apply(calc_func_fraction)
data['num_base_qual_mean'] = data['mapq'].apply(calc_func_mean)

# Grouping is created after the columns above exist so they can be aggregated per gene.
grouped_by_gene = data.groupby(['GE'])

In [16]:
# will hold the per-gene (GE-indexed) results, keyed by metric name
results_series = {}

In [17]:
# vector values
# Each result is a Series that retains the GE index, which makes merging into a dataframe
# easier and guarantees the same order.
# The column is selected *before* aggregating: aggregating the whole frame first would
# recompute every column for each metric, and raises a TypeError on the string columns in
# pandas versions where GroupBy.mean()/var() no longer drop non-numeric columns silently.
results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_gene['num_UY_qual_fraction'].mean()
results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_gene['num_UY_qual_fraction'].var()

results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_gene['num_base_qual_fraction'].mean()
results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_gene['num_base_qual_fraction'].var()
results_series['genomic_read_quality_mean'] = grouped_by_gene['num_base_qual_mean'].mean()
results_series['genomic_read_quality_variance'] = grouped_by_gene['num_base_qual_mean'].var()

# number of reads observed for each GE value
reads_per_gene = data.groupby(['GE']).size()

In [18]:
# Within each gene, count the distinct (UB, CB) pairs (molecules) and the distinct
# (UB, CB, position) triples (fragments).
molecules_per_gene = grouped_by_gene.apply(
    lambda gene_reads: len(gene_reads.groupby(['UB', 'CB']).size())
)
fragments_per_gene = grouped_by_gene.apply(
    lambda gene_reads: len(gene_reads.groupby(['UB', 'CB', 'position']).size())
)

# Per-gene ratios between the three count Series.
reads_per_molecule = reads_per_gene / molecules_per_gene
reads_per_fragment = reads_per_gene / fragments_per_gene
fragments_per_molecule = fragments_per_gene / molecules_per_gene
results_series.update({
    'reads_per_molecule': reads_per_molecule,
    'reads_per_fragment': reads_per_fragment,
    'fragments_per_molecule': fragments_per_molecule,
})

# scalar values
# How many fragments / molecules are backed by exactly one read.
reads_in_each_fragment = data.groupby(['CB', 'UB', 'GE', 'position']).size()
reads_in_each_molecule = data.groupby(['CB', 'UB', 'GE']).size()
results_scalar['fragments_with_single_read_evidence'] = np.sum(reads_in_each_fragment == 1)
results_scalar['molecules_with_single_read_evidence'] = np.sum(reads_in_each_molecule == 1)

In [39]:
# write out the array information for the testing file
# DataFrame.items() yields (column name, column Series) pairs; it replaces iteritems(),
# which was removed in pandas 2.0.
for k, vals in pd.DataFrame(results_series).items():
    print(k, 'np.array([' + ', '.join('{:.4f}'.format(i) for i in vals.values) + '])')

fragments_per_molecule np.array([1.0000, 1.0000, 1.0000, 1.8750, 2.9831, 1.2500, 1.0000, 1.3077])
genomic_read_quality_mean np.array([36.2143, 24.8469, 25.4792, 35.3664, 34.0956, 33.0364, 20.7423, 27.3078])
genomic_read_quality_variance np.array([nan, nan, nan, 18.4553, 21.6745, 33.6572, nan, 53.5457])
genomic_reads_fraction_bases_quality_above_30_mean np.array([0.8878, 0.3980, 0.4271, 0.8148, 0.7681, 0.7216, 0.1546, 0.5089])
genomic_reads_fraction_bases_quality_above_30_variance np.array([nan, nan, nan, 0.0282, 0.0346, 0.0537, nan, 0.0849])
molecule_barcode_fraction_bases_above_30_mean np.array([1.0000, 1.0000, 0.8000, 0.9885, 0.9833, 0.9857, 0.7000, 0.9444])
molecule_barcode_fraction_bases_above_30_variance np.array([nan, nan, nan, 0.0011, 0.0051, 0.0014, nan, 0.0120])
reads_per_fragment np.array([1.0000, 1.0000, 1.0000, 1.7333, 1.3920, 1.4000, 1.0000, 1.0588])
reads_per_molecule np.array([1.0000, 1.0000, 1.0000, 3.2500, 4.1525, 1.7500, 1.0000, 1.3846])


# Write Results to File for Automated Testing

In [20]:
# Build the output prefix by stripping only the file extension; str.replace('.bam', '')
# would also remove '.bam' occurring anywhere else in the path.
output_prefix = os.path.splitext(input_sam_file)[0]

# header=False is passed explicitly: the scalar file is read back with header=None, and the
# default of Series.to_csv differs between pandas versions.
pd.Series(results_scalar).to_csv('%s_testing_knowledge_scalar.csv' % output_prefix, header=False)
pd.DataFrame(results_series).to_csv('%s_testing_knowledge_series.csv' % output_prefix)

In [21]:
# do a comparison of the whole 2d dataframe at once
# NaNs are replaced by zero on both sides so they compare equal; .values exposes the numpy
# array underlying each dataframe.
series_in_memory = pd.DataFrame(results_series).fillna(0).values
series_on_disk = pd.read_csv(
    'data/small-gene-sorted_testing_knowledge_series.csv', index_col=0, header=0
).fillna(0).values
np.allclose(series_in_memory, series_on_disk)

True

In [22]:
# to get most_abundant alone: 

In [23]:
# The scalar file has no header row (header=None); index_col=0 makes the metric names the
# index. squeeze('columns') collapses the single remaining column into a Series; it replaces
# the `squeeze=True` keyword of read_csv, which was removed in pandas 2.0.
test_read_scalar = pd.read_csv(
    'data/small-gene-sorted_testing_knowledge_scalar.csv', index_col=0, header=None
).squeeze('columns')

# extract this, we're going to drop it from the array to do some conversion to numeric
most_abundant = test_read_scalar['most_abundant']

# drop most abundant, convert to float, fill any NaN values with 0, and call .values to get the numpy array pandas objects are based on.
for_comparison = test_read_scalar.drop('most_abundant').astype(float).fillna(0).values


# note, have to drop the string value and convert to float before this works.
# The in-memory Series is object dtype (it mixes a string with numbers), so it gets the same
# explicit float conversion as the values read from file.
np.allclose(
    pd.Series(results_scalar).drop('most_abundant').astype(float).fillna(0).values,
    for_comparison
)

True

In [24]:
# get a metric from a dataframe:
# every entry of results_series becomes one column; selecting a column returns that metric as a Series
df = pd.DataFrame(results_series)
df['genomic_read_quality_mean']

GE
ACAP3         36.214286
AGRN          24.846939
AL627309.1    25.479167
AL627309.5    35.366414
AL627309.7    34.095625
AL645608.2    33.036443
AL645608.3    20.742268
AL645608.4    27.307758
Name: genomic_read_quality_mean, dtype: float64

In [25]:
# get a numpy array from the dataframe
# (.values drops the index and returns the column's underlying numpy array)
compare_me = df['genomic_read_quality_mean'].values

In [26]:
# compare two numpy arrays that are slightly different
# The perturbation is sized from compare_me instead of a hard-coded 8 so the check still
# runs if the number of genes changes. Each offset is below 1e-8, well inside np.allclose's
# default tolerances, so the result does not depend on the random draw.
eps = np.random.rand(len(compare_me)) * 1e-8
np.allclose(compare_me, compare_me + eps)


True

In [27]:
# it is actually discriminative, though
# (compared against 0, 1, 2, ... of matching length rather than a hard-coded 8 elements)
np.allclose(compare_me, np.arange(len(compare_me)))

False

# Look at the metrics output

In [28]:
# load the metrics output file; index_col=0 uses the first (unnamed) column as the row index
gene_metrics = pd.read_csv('data/gene_metrics.csv', index_col=0)

In [29]:
# display the loaded metrics table
gene_metrics

Unnamed: 0,n_reads,noise_reads,perfect_molecule_barcodes,reads_mapped_exonic,reads_mapped_intronic,reads_mapped_utr,reads_mapped_uniquely,reads_mapped_multiple,duplicate_reads,spliced_reads,...,genomic_read_quality_variance,n_molecules,n_fragments,reads_per_molecule,reads_per_fragment,fragments_per_molecule,fragments_with_single_read_evidence,molecules_with_single_read_evidence,number_cells_detected_multiple,number_cells_expressing
ACAP3,1,0,1,1,0,0,1,0,0,1,...,,1,1,1.0,1.0,1.0,1,1,0,1
AGRN,1,0,1,1,0,0,1,0,0,1,...,,1,1,1.0,1.0,1.0,1,1,0,1
AL627309.1,1,0,1,1,0,0,1,0,0,1,...,,1,1,1.0,1.0,1.0,1,1,0,1
AL627309.5,26,0,26,26,0,0,26,0,11,26,...,18.455293,8,15,3.25,1.733333,1.875,7,2,6,8
AL627309.7,245,0,245,245,0,0,245,0,76,0,...,21.6745,59,176,4.152542,1.392045,2.983051,124,22,38,57
AL645608.2,7,0,7,7,0,0,7,0,2,0,...,33.657186,4,5,1.75,1.4,1.25,4,2,2,4
AL645608.3,1,0,1,1,0,0,1,0,0,0,...,,1,1,1.0,1.0,1.0,1,1,0,1
AL645608.4,18,0,18,18,0,0,18,0,1,0,...,53.54574,13,17,1.384615,1.058824,1.307692,16,12,1,13


In [30]:
# dump the raw CSV text via an IPython shell escape (relies on a `cat` command being available)
!cat data/gene_metrics.csv

,n_reads,noise_reads,perfect_molecule_barcodes,reads_mapped_exonic,reads_mapped_intronic,reads_mapped_utr,reads_mapped_uniquely,reads_mapped_multiple,duplicate_reads,spliced_reads,antisense_reads,molecule_barcode_fraction_bases_above_30_mean,molecule_barcode_fraction_bases_above_30_variance,genomic_reads_fraction_bases_quality_above_30_mean,genomic_reads_fraction_bases_quality_above_30_variance,genomic_read_quality_mean,genomic_read_quality_variance,n_molecules,n_fragments,reads_per_molecule,reads_per_fragment,fragments_per_molecule,fragments_with_single_read_evidence,molecules_with_single_read_evidence,number_cells_detected_multiple,number_cells_expressing
ACAP3,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.8877551020408163,nan,36.214285714285715,nan,1,1,1.0,1.0,1.0,1,1,0,1
AGRN,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.3979591836734694,nan,24.846938775510203,nan,1,1,1.0,1.0,1.0,1,1,0,1
AL627309.1,1,0,1,1,0,0,1,0,0,1,0,0.8,nan,0.4270833333333333,nan,25.479166666666668,nan,1,1,1.0,1.0,1.0,1,1,0,1
AL627309