# Load Testing Data

In [1]:
import csv
import os

import numpy as np
import pandas as pd

In [6]:
# Column labels applied positionally to the tab-separated SAM records: the
# eleven leading alignment columns followed by the optional tags.
# NOTE(review): this assumes every record lists its optional tags in exactly
# this order -- confirm for any new input file.
field_names = [
    'qname', 'flag', 'reference', 'position', 'mapq', 'cigar', 'rnext', 'pnext', 'tlen', 'sequence', 'quality',
    'CB', 'UB', 'MD', 'GE', 'XF', 'RG', 'NH', 'HI', 'jI', 'NM', 'jM', 'nM', 'CR', 'SR', 'UR', 'AS', 'GS', 'CY', 'SY', 'UY'
]

input_sam_file = 'data/small-gene-sorted.sam'  # note needs a sam file
# QUOTE_NONE: '"' is a legal Phred+33 quality character, so the parser must not
# interpret it as a quote delimiter and merge or mangle fields.
data = pd.read_table(input_sam_file, header=None, quoting=csv.QUOTE_NONE)
# Assigning (rather than passing names=) fails loudly if the column count differs.
data.columns = field_names

results_scalar = {}  # will hold the calculations we make

# Build Expectations for Testing Data

## Number of Reads

In [7]:
# One row of `data` per record in the input file, so the row count is the read count.
results_scalar['n_reads'] = data.shape[0]
print(results_scalar['n_reads'])

300


## Number of Genes

In [8]:
# Number of distinct GE values, counted as the number of groups they form.
results_scalar.update(n_genes=len(data.groupby(by=['GE'])))
print(results_scalar['n_genes'])

8


Gene table should have 8 entries plus a header for a total of 9 lines

## Number of Molecules

Molecules are defined as a unique triplet of CB, UB, and GE

In [9]:
# One group per distinct (CB, UB, GE) combination.
results_scalar.update(n_molecules=len(data.groupby(by=['CB', 'UB', 'GE'])))
print(results_scalar['n_molecules'])

88


## Number of Fragments

Fragments are defined like molecules (a unique triplet of CB, UB, and GE) but must additionally have a unique position

In [10]:
# Same grouping as for molecules, further split by alignment position.
results_scalar.update(n_fragments=len(data.groupby(by=['CB', 'UB', 'GE', 'position'])))
print(results_scalar['n_fragments'])

217


## Most Abundant Gene

Based on the above, at least one of the genes has to be observed more than once. Which is it? 

In [11]:
# Tally the reads per gene once, then read off the top gene and its count.
gene_read_counts = data.groupby(['GE']).size()
# Keep only the text after the final ':' of the winning GE tag.
results_scalar['most_abundant'] = gene_read_counts.idxmax().rsplit(':', 1)[-1]
results_scalar['most_abundant_gene_n_observations'] = gene_read_counts.max()
print(results_scalar['most_abundant'], results_scalar['most_abundant_gene_n_observations'])

AL627309.7 245


In [12]:
# Count the reads whose UB value equals their UR value; only the text after
# the final ':' of each tag is compared.
results_scalar['perfect_molecule_barcodes'] = sum(
    ub.split(':')[-1] == ur.split(':')[-1]
    for ub, ur in zip(data['UB'], data['UR'])
)

Calculate the alignment metrics

In [13]:
# Reads whose XF tag is exactly 'XF:Z:CODING'.
results_scalar['reads_mapped_exonic'] = int(data['XF'].eq('XF:Z:CODING').sum())

In [14]:
# Reads whose XF tag is exactly 'XF:Z:INTRONIC'.
results_scalar['reads_mapped_intronic'] = int(data['XF'].eq('XF:Z:INTRONIC').sum())

In [15]:
# Reads whose XF tag is exactly 'XF:Z:UTR'.
results_scalar['reads_mapped_utr'] = int(data['XF'].eq('XF:Z:UTR').sum())

In [16]:
# Reads whose NH tag is exactly 'NH:i:1'.
results_scalar['reads_mapped_uniquely'] = int(data['NH'].eq('NH:i:1').sum())

In [17]:
# Reads with bit 1024 (0x400, the SAM duplicate flag) set in the flag column.
results_scalar['duplicate_reads'] = int((data['flag'] & 1024).ne(0).sum())

In [18]:
# Reads whose CIGAR string contains an 'N' operation.
results_scalar['spliced_reads'] = len([cigar for cigar in data['cigar'] if 'N' in cigar])

Calculate the higher-order metrics

In [42]:
def _quality_chars(field):
    """Return the quality characters carried by ``field``.

    ``field`` is either a bare quality string (the ``quality`` column) or an
    optional field of the form ``TAG:Z:value`` (e.g. the ``UY`` column), in
    which case the five-character ``TAG:Z:`` prefix is removed.
    """
    # Do not use split(':')[-1] here: ':' is itself a legal Phred+33 quality
    # character (Q25), so splitting on it silently drops every base that
    # precedes the last ':' and skews both the fraction and the mean.
    # A bare quality string is not expected to look like a tag prefix, since
    # 'Z' would be Q57 -- NOTE(review): confirm for unusual quality encodings.
    return field[5:] if field[2:5] == ':Z:' else field


def calc_func_fraction(x):
    """Fraction of the quality characters in ``x`` whose Phred+33 score exceeds 30 (ord > 63)."""
    quals = _quality_chars(x)
    return sum(1 for c in quals if ord(c) > 63) / len(quals)


def calc_func_mean(x):
    """Mean score of the quality characters in ``x``, each decoded as ``ord(c) - 33``."""
    return np.mean([ord(c) - 33 for c in _quality_chars(x)])


# Per-read quality summaries, stored as new columns so they can be aggregated per gene.
data['num_UY_qual_fraction'] = data['UY'].apply(calc_func_fraction)

data['num_base_qual_fraction'] = data['quality'].apply(calc_func_fraction)
data['num_base_qual_mean'] = data['quality'].apply(calc_func_mean)

grouped_by_gene = data.groupby(['GE'])

In [43]:
# Per-gene metric name -> series of values, filled in by the cells that follow.
results_series = dict()

In [44]:
# vector values
# Every metric keeps its GE index, which makes merging into a dataframe easier
# and guarantees the same row order.
# The metric column is selected *before* aggregating: aggregating the whole
# frame would also try to average the string columns (a TypeError on
# pandas >= 2.0) and would recompute every column for each metric.
results_series['molecule_barcode_fraction_bases_above_30_mean'] = grouped_by_gene['num_UY_qual_fraction'].mean()
results_series['molecule_barcode_fraction_bases_above_30_variance'] = grouped_by_gene['num_UY_qual_fraction'].var()

results_series['genomic_reads_fraction_bases_quality_above_30_mean'] = grouped_by_gene['num_base_qual_fraction'].mean()
results_series['genomic_reads_fraction_bases_quality_above_30_variance'] = grouped_by_gene['num_base_qual_fraction'].var()
results_series['genomic_read_quality_mean'] = grouped_by_gene['num_base_qual_mean'].mean()
results_series['genomic_read_quality_variance'] = grouped_by_gene['num_base_qual_mean'].var()

# Number of reads observed for each gene.
reads_per_gene = data.groupby(['GE']).size()

In [46]:
# Within each gene, count the distinct (UB, CB) pairs and the distinct
# (UB, CB, position) triples.
molecules_per_gene = grouped_by_gene.apply(
    lambda reads: reads.groupby(['UB', 'CB']).ngroups
)
fragments_per_gene = grouped_by_gene.apply(
    lambda reads: reads.groupby(['UB', 'CB', 'position']).ngroups
)

# Ratios of the per-gene counts; the division aligns on the GE index.
reads_per_molecule = reads_per_gene / molecules_per_gene
reads_per_fragment = reads_per_gene / fragments_per_gene
fragments_per_molecule = fragments_per_gene / molecules_per_gene
results_series.update({
    'reads_per_molecule': reads_per_molecule,
    'reads_per_fragment': reads_per_fragment,
    'fragments_per_molecule': fragments_per_molecule,
})

# scalar values
# Groups that contain exactly one read.
results_scalar['fragments_with_single_read_evidence'] = (
    data.groupby(['CB', 'UB', 'GE', 'position']).size().eq(1).sum()
)
results_scalar['molecules_with_single_read_evidence'] = (
    data.groupby(['CB', 'UB', 'GE']).size().eq(1).sum()
)

In [48]:
# Assemble the per-gene series into one table (one column per metric) and display it.
pd.DataFrame.from_dict(results_series)

Unnamed: 0_level_0,fragments_per_molecule,genomic_read_quality_mean,genomic_read_quality_variance,genomic_reads_fraction_bases_quality_above_30_mean,genomic_reads_fraction_bases_quality_above_30_variance,molecule_barcode_fraction_bases_above_30_mean,molecule_barcode_fraction_bases_above_30_variance,reads_per_fragment,reads_per_molecule
GE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GE:Z:ACAP3,1.0,36.214286,,0.887755,,1.0,,1.0,1.0
GE:Z:AGRN,1.0,24.846939,,0.397959,,1.0,,1.0,1.0
GE:Z:AL627309.1,1.0,25.306122,,0.418367,,0.8,,1.0,1.0
GE:Z:AL627309.5,1.875,35.382261,18.336345,0.815542,0.027979,0.988462,0.001062,1.733333,3.25
GE:Z:AL627309.7,2.983051,34.08813,21.54384,0.768055,0.034475,0.983265,0.005088,1.392045,4.152542
GE:Z:AL645608.2,1.25,33.036443,33.657186,0.721574,0.053718,0.985714,0.001429,1.4,1.75
GE:Z:AL645608.3,1.0,20.857143,,0.163265,,0.7,,1.0,1.0
GE:Z:AL645608.4,1.307692,27.269274,54.069164,0.507937,0.08529,0.944444,0.012026,1.058824,1.384615


# Write Results to File for Automated Testing

In [51]:
# Strip only the trailing extension; str.replace('.sam', '') would also remove
# a '.sam' occurring anywhere else in the path.
output_prefix = os.path.splitext(input_sam_file)[0]

# header=False is explicit because the default for Series.to_csv differs
# between pandas versions, and the scalar file is read back with header=None.
pd.Series(results_scalar).to_csv('%s_testing_knowledge_scalar.csv' % output_prefix, header=False)
pd.DataFrame(results_series).to_csv('%s_testing_knowledge_series.csv' % output_prefix)

In [54]:
# do a comparison of the whole 2d dataframe at once
# equal_nan=True makes NaN match only NaN (the variance is undefined for genes
# with a single read); filling NaN with 0 would also let a missing value pass
# as a genuine 0.0. `.values` is the numpy array the dataframe is based on.
np.allclose(
    pd.DataFrame(results_series).values,
    pd.read_csv('data/small-gene-sorted_testing_knowledge_series.csv', index_col=0, header=0).values,
    equal_nan=True,
)

True

In [None]:
# to get most_abundant alone: 

In [64]:
# read_csv no longer accepts squeeze=True (removed in pandas 2.0); taking the
# single value column with .iloc yields the same Series on every version.
test_read_scalar = pd.read_csv(
    'data/small-gene-sorted_testing_knowledge_scalar.csv', index_col=0, header=None
).iloc[:, 0]

# extract this, we're going to drop it from the array to do some conversion to numeric
most_abundant = test_read_scalar['most_abundant']

# drop most abundant, convert to float, fill any NaN values with 0, and call .values to get the numpy array pandas objects are based on.
for_comparison = test_read_scalar.drop('most_abundant').astype(float).fillna(0).values


# note, have to drop the string value and convert to float before this works.
# The in-memory series mixes a string with numbers (object dtype), so it gets
# the same explicit float conversion instead of relying on fillna to downcast.
np.allclose(
    pd.Series(results_scalar).drop('most_abundant').astype(float).fillna(0).values,
    for_comparison
)

True

In [65]:
# Build the per-gene table, then select a single metric column by label.
df = pd.DataFrame.from_dict(results_series)
df.loc[:, 'genomic_read_quality_mean']

GE
GE:Z:ACAP3         36.214286
GE:Z:AGRN          24.846939
GE:Z:AL627309.1    25.306122
GE:Z:AL627309.5    35.382261
GE:Z:AL627309.7    34.088130
GE:Z:AL645608.2    33.036443
GE:Z:AL645608.3    20.857143
GE:Z:AL645608.4    27.269274
Name: genomic_read_quality_mean, dtype: float64

In [73]:
# Pull the metric column out of the dataframe as a plain numpy array.
compare_me = np.asarray(df['genomic_read_quality_mean'])

In [74]:
# compare two numpy arrays that are slightly different
# A seeded generator keeps the perturbation identical on every run, and the
# noise vector is sized from the data instead of a hard-coded 8.
eps = np.random.RandomState(0).rand(len(compare_me)) * 1e-8
np.allclose(compare_me, compare_me + eps)


True

In [75]:
# it is actually discriminative, though
# The reference vector is sized from the data so the check does not break
# with a broadcasting error when the number of genes is not 8.
np.allclose(compare_me, np.arange(len(compare_me)))

False

# Look at the metrics output

In [432]:
# Load the metrics CSV, using its first column as the row index.
gene_metrics = pd.read_csv(filepath_or_buffer='data/gene_metrics.csv', index_col=0)

In [433]:
# Display the loaded metrics table.
gene_metrics

Unnamed: 0,n_reads,noise_reads,perfect_molecule_barcodes,reads_mapped_exonic,reads_mapped_intronic,reads_mapped_utr,reads_mapped_uniquely,reads_mapped_multiple,duplicate_reads,spliced_reads,...,genomic_read_quality_variance,n_molecules,n_fragments,reads_per_molecule,reads_per_fragment,fragments_per_molecule,fragments_with_single_read_evidence,molecules_with_single_read_evidence,number_cells_detected_multiple,number_cells_expressing
ACAP3,1,0,1,1,0,0,1,0,0,1,...,,1,1,1.0,1.0,1.0,1,1,0,1
AGRN,1,0,1,1,0,0,1,0,0,1,...,,1,1,1.0,1.0,1.0,1,1,0,1
AL627309.1,1,0,1,1,0,0,1,0,0,1,...,,1,1,1.0,1.0,1.0,1,1,0,1
AL627309.5,26,0,26,26,0,0,26,0,6,26,...,18.455293,8,15,3.25,1.733333,1.875,7,2,6,8
AL627309.7,245,0,245,245,0,0,245,0,43,0,...,21.6745,59,176,4.152542,1.392045,2.983051,124,22,38,57
AL645608.2,7,0,7,7,0,0,7,0,2,0,...,33.657186,4,5,1.75,1.4,1.25,4,2,2,4
AL645608.3,1,0,1,1,0,0,1,0,0,0,...,,1,1,1.0,1.0,1.0,1,1,0,1
AL645608.4,18,0,18,18,0,0,18,0,0,0,...,53.54574,13,17,1.384615,1.058824,1.307692,16,12,1,13


In [348]:
# Print the raw CSV file as stored on disk (shell escape, runs `cat`).
!cat data/gene_metrics.csv

,n_reads,noise_reads,perfect_molecule_barcodes,reads_mapped_exonic,reads_mapped_intronic,reads_mapped_utr,reads_mapped_uniquely,reads_mapped_multiple,duplicate_reads,spliced_reads,antisense_reads,molecule_barcode_fraction_bases_above_30_mean,molecule_barcode_fraction_bases_above_30_variance,genomic_reads_fraction_bases_quality_above_30_mean,genomic_reads_fraction_bases_quality_above_30_variance,genomic_read_quality_mean,genomic_read_quality_variance,n_molecules,n_fragments,reads_per_molecule,reads_per_fragment,fragments_per_molecule,fragments_with_single_read_evidence,molecules_with_single_read_evidence,number_cells_detected_multiple,number_cells_expressing
ACAP3,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.8877551020408163,nan,36.214285714285715,nan,1,1,1.0,1.0,1.0,1,1,0,1
AGRN,1,0,1,1,0,0,1,0,0,1,0,1.0,nan,0.3979591836734694,nan,24.846938775510203,nan,1,1,1.0,1.0,1.0,1,1,0,1
AL627309.1,1,0,1,1,0,0,1,0,0,1,0,0.8,nan,0.4270833333333333,nan,25.306122448979593,nan,1,1,1.0,1.0,1.0,1,1,0,1
AL627309

In [349]:
# Relative path instead of a user-specific absolute one, so the cell runs on any checkout.
# NOTE(review): assumes the notebook is run from the directory that contains
# `data/`, as the other cells reading 'data/...' already do -- confirm.
test = pd.read_csv('data/gene_metrics.csv', index_col=0)

In [350]:
# Show the n_reads column of the loaded table.
test.loc[:, 'n_reads']

ACAP3           1
AGRN            1
AL627309.1      1
AL627309.5     26
AL627309.7    245
AL645608.2      7
AL645608.3      1
AL645608.4     18
Name: n_reads, dtype: int64

In [351]:
# NOTE(review): leftover scratch cell -- an empty path always raises
# FileNotFoundError (see the recorded output); supply a real path or delete it.
pd.read_csv('')

FileNotFoundError: File b'' does not exist