# Infer EC50 values for the `171201_Eva` chip

## Import `Python` modules

In [1]:
import os
import sys
import re
import glob

import xml.etree.ElementTree as ET
sys.path.append("/home/jupyter/tacc-work/jupyter_packages/lib/python2.7/site-packages")
from FlowCytometryTools import *

import numpy as np
import pandas
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.5)

# Set up the results directories, creating any that do not exist yet
resultsdir = "results/171201_Eva_10861"
resultsdir_parse_metadata = os.path.join(resultsdir, 'parse_metadata')
dirs = [resultsdir, resultsdir_parse_metadata]
for dir_i in dirs:
    if os.path.isdir(dir_i):
        continue
    os.makedirs(dir_i)

## Make the input experimental-summary file

This library has multiple replicates. Below is a list of the replicates, each identified by a unique `plan_id` from the BIOFAB, along with the matched naive controls, which lack `plan_id`s. Devin said that the naive sample for the replicate with the lower `plan_id` also has the lower `aq_item_id` of the two naive samples.

* `plan_id`: 10861; `aq_item_id` for matched naive control: 128412
* `plan_id`: 11284; `aq_item_id` for matched naive control: 128440

### Specify all inputs

In [2]:
# Specify the location of FASTQ files
fastq_dir = '/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/ngs_data/run_8/EVA_Nov17_new_prep/'
fastq_summary_file = '/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/ngs_data/run_8/manifest.csv'

# Specify the locations of directories with FCS files with part of the FACS data.
# Each job ID names a subdirectory of `facs_dir`. Multi-valued inputs are packed
# into comma-delimited strings here and unpacked again further down, mimicking
# how they would be passed to a stand-alone script.
facs_dir = "/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/facs_data/production_data/"
job_ids = ['Job_60359', 'Job_60063', 'Job_59787']
plan_id = 10861
facs_dirs = ','.join([os.path.join(facs_dir, job_id) for job_id in job_ids])
job_ids = ','.join(job_ids)

# Specify the FACS channel name to investigate
facs_channel_name = u'FITC-A'

# Make a list of input XML files with the other part of the FACS data
# NOTE(review): if the glob matches nothing, the join gives '' and the later
# split gives [''] (a list with one empty path) rather than an empty list.
xml_files = ','.join(glob.glob(
    os.path.join(facs_dir, 'sort_reports', '*/*.xml')
))

# Specify column-specific values to analyze in the metadata files.
library_name = "EVA_NOV17"
ignore_aq_item_ids = '128440,128442' # ignore the sequencing data for the naive sample for the other replicate and the (trypsin, 0) sample for this replicate

# Specify the output metadata CSV file
experimental_metadata_output_file = os.path.join(resultsdir_parse_metadata, 'experimental_metadata_from_script.csv')

# Read in input arguments, as if it were a new script. After this point,
# `job_ids`, `facs_dirs`, `xml_files`, and `ignore_aq_item_ids` are lists of strings.
job_ids = job_ids.split(',')
facs_dirs = facs_dirs.split(',')
xml_files = xml_files.split(',')
ignore_aq_item_ids = ignore_aq_item_ids.split(',')

# Read in data on location of deep-sequencing data
fastq_df = pandas.read_csv(fastq_summary_file)

# Remove entries to ignore, as specified above. The IDs are compared as strings
# because `ignore_aq_item_ids` was parsed from a comma-delimited string.
rows_to_ignore = fastq_df['aq_item_id'].astype(str).isin(ignore_aq_item_ids)
fastq_df = fastq_df[~rows_to_ignore].copy()

# Piece together paths to FASTQ files and tweak some of the entries to match
# expected patterns/cases
fastq_df['fastq_id'] = fastq_df['aq_item_id'].map(
    lambda aq_item_id: os.path.join(str(aq_item_id), 'Files/')
)
fastq_df['strain'] = fastq_df['strain'].str.replace(' ', '_')

# Lower-case the protease names. Then, to conform with previous naming schemes,
# relabel the naive sample (naive, 0) to be the naive trypsin sample (trypsin, 0)
fastq_df['protease'] = fastq_df['protease'].str.lower().str.replace('naive', 'trypsin')

# Index by protease name and concentration
fastq_df = fastq_df.set_index(['protease', 'concentration'])

# Downsample to include only samples within library of interest using the
# `plan_id` to select the replicate of interest. Rows with a missing `plan_id`
# (the naive controls) are kept as well, which is why `np.nan` is in the list.
in_library = (fastq_df['strain'] == library_name)
in_replicate = fastq_df['plan_id'].isin([plan_id, np.nan])
fastq_df = fastq_df[in_library & in_replicate].sort_index()
fastq_df

Unnamed: 0_level_0,Unnamed: 1_level_0,aq_item_id,plan_id,strain,fastq_id
protease,concentration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
chymotrypsin,9.5,128444,10861.0,EVA_NOV17,128444/Files/
chymotrypsin,28.0,128446,10861.0,EVA_NOV17,128446/Files/
chymotrypsin,83.0,128456,10861.0,EVA_NOV17,128456/Files/
chymotrypsin,250.0,128452,10861.0,EVA_NOV17,128452/Files/
chymotrypsin,750.0,128460,10861.0,EVA_NOV17,128460/Files/
chymotrypsin,2250.0,128462,10861.0,EVA_NOV17,128462/Files/
trypsin,0.0,128412,,EVA_NOV17,128412/Files/
trypsin,9.5,128448,10861.0,EVA_NOV17,128448/Files/
trypsin,28.0,128450,10861.0,EVA_NOV17,128450/Files/
trypsin,83.0,128454,10861.0,EVA_NOV17,128454/Files/


### For each sample in the experiment, quantify the total number of events and the number of events that passed the sorting threshold

In [3]:
def s_log(sample, channel_names):
    """
    Return a copy of `sample` with the given channels on a log10 scale.

    Args:
        `sample`: an object that provides a `copy()` method and a `data`
            dataframe attribute (e.g., an `FCMeasurement`)
        `channel_names`: a list of names of columns in `sample.data` to
            transform

    Returns:
        The copy of `sample`. In its data, entries equal to `-inf` (e.g., the
        log10 of zero) are set to -1 in every column, and rows with a missing
        value in any column (e.g., the log10 of a negative number) are dropped.
    """
    log_sample = sample.copy()
    events = log_sample.data

    # Overwrite each requested channel with its log10-transformed values
    for name in channel_names:
        events[name] = np.log10(events[name])

    # Swap -inf for a finite placeholder, then discard rows with missing values
    events.replace(to_replace=-np.inf, value=-1, inplace=True)
    log_sample.data = events.dropna()

    return log_sample

In [4]:
# Read in data from the `manifest.txt` file in each FACS job directory. The
# per-job dataframes are collected in a list and concatenated once at the end.
# The loop variables have their own names so that they do not overwrite the
# `facs_dir` input defined above.
manifest_dfs = []
for (job_id_i, facs_dir_i) in zip(job_ids, facs_dirs):
    job_manifest_df = pandas.read_csv(os.path.join(facs_dir_i, 'manifest.txt'))
    job_manifest_df['job_id'] = job_id_i
    job_manifest_df['directory'] = facs_dir_i
    # The specimen and tube labels are taken from fixed character positions in
    # the FCS file name (presumably of the form `Specimen_001_Tube_010...`;
    # verify against the manifest)
    job_manifest_df['specimen'] = job_manifest_df['filename'].apply(lambda x : x[:12])
    job_manifest_df['tube'] = job_manifest_df['filename'].apply(lambda x : x[13:21])
    job_manifest_df['strain'] = job_manifest_df['strain'].apply(lambda x: x.replace(' ', '_'))
    manifest_dfs.append(job_manifest_df)
manifest_df = pandas.concat(manifest_dfs)
manifest_df.set_index(['job_id', 'specimen', 'tube'], inplace=True)

In [5]:
# Threshold, on a log10 scale, that an event's signal in the FACS channel must
# exceed to count as passing the gate
fitc_gate_log10_threshold = 3.0

# Only analyze samples from the library of interest, so that FCS files from
# other libraries are not read and transformed just to be discarded
manifest_df = manifest_df[
    manifest_df['strain'] == library_name
].copy()

# Quantify the total number of events and events passing the gate
total_number_of_recorded_events = []
number_of_events_passing_fitc_gate = []
for (i, row) in manifest_df.iterrows():

    # Read in the data for the appropriate channel and transform it to a log10 scale
    fcs_file = os.path.join(row['directory'], row['filename'])
    sample = FCMeasurement(ID='Test Sample', datafile=fcs_file)
    sample = s_log(sample, [facs_channel_name])
    df = sample.data

    # Count the number of total events and events passing the gate
    total_number_of_recorded_events.append(len(df.index))
    number_of_events_passing_fitc_gate.append(
        int((df[facs_channel_name] > fitc_gate_log10_threshold).sum())
    )

# Append the FACS data to the manifest dataframe
manifest_df['total_number_of_recorded_events'] = total_number_of_recorded_events
manifest_df['number_of_events_passing_fitc_gate'] = number_of_events_passing_fitc_gate
manifest_df['fraction_collected'] = \
    manifest_df['number_of_events_passing_fitc_gate'] / manifest_df['total_number_of_recorded_events']
columns_to_show = [
    'strain', 'protease', 'concentration', 'total_number_of_recorded_events',
    'number_of_events_passing_fitc_gate', 'fraction_collected'
]
manifest_df[columns_to_show]



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,strain,protease,concentration,total_number_of_recorded_events,number_of_events_passing_fitc_gate,fraction_collected
job_id,specimen,tube,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Job_60359,Specimen_001,Tube_010,EVA_NOV17,chymotrypsin,0.0,99993,77560,0.775654
Job_60359,Specimen_001,Tube_006,EVA_NOV17,chymotrypsin,750.0,99996,21064,0.210648
Job_60359,Specimen_001,Tube_007,EVA_NOV17,chymotrypsin,2250.0,99995,6140,0.061403
Job_60359,Specimen_001,Tube_011,EVA_NOV17,trypsin,0.0,99997,76259,0.762613
Job_60359,Specimen_001,Tube_008,EVA_NOV17,trypsin,750.0,99997,19489,0.194896
Job_60359,Specimen_001,Tube_009,EVA_NOV17,trypsin,2250.0,99997,2217,0.022171
Job_60063,Specimen_001,Tube_010,EVA_NOV17,chymotrypsin,0.0,99995,67805,0.678084
Job_60063,Specimen_001,Tube_006,EVA_NOV17,chymotrypsin,83.0,98303,64106,0.652127
Job_60063,Specimen_001,Tube_007,EVA_NOV17,chymotrypsin,250.0,99998,52457,0.52458
Job_60063,Specimen_001,Tube_011,EVA_NOV17,trypsin,0.0,99995,70467,0.704705


Go through the dataframe, making a list of samples to drop. Specifically, I will drop naive samples that are not from the first day of sorting. But, before doing so, I will record data from them to include in the `parent_expression` column of the output file.

In [6]:
# Sort round of each FACS job
map_job_id_to_sort_round = {
    'Job_59787' : 1, 'Job_60063' : 2, 'Job_60359' : 3
}

# For each protease, the fraction of events collected from the sample with a
# protease concentration of zero, keyed by sort round
naive_sample_expression = {'trypsin':{}, 'chymotrypsin':{}}
list_of_indices_to_drop = []

for (i, row) in manifest_df.iterrows():
    (job_id, specimen, tube) = i
    sort_round = map_job_id_to_sort_round[job_id]
    if not (row['concentration'] == 0.0):
        continue

    # Record the value before deciding whether to drop the sample: only the
    # zero-concentration samples from the first sort round stay in the dataframe
    naive_sample_expression[row['protease']][sort_round] = row['fraction_collected']
    if sort_round != 1:
        list_of_indices_to_drop.append(i)

manifest_df = manifest_df.drop(list_of_indices_to_drop, axis='index')
manifest_df[columns_to_show]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,strain,protease,concentration,total_number_of_recorded_events,number_of_events_passing_fitc_gate,fraction_collected
job_id,specimen,tube,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Job_60359,Specimen_001,Tube_006,EVA_NOV17,chymotrypsin,750.0,99996,21064,0.210648
Job_60359,Specimen_001,Tube_007,EVA_NOV17,chymotrypsin,2250.0,99995,6140,0.061403
Job_60359,Specimen_001,Tube_008,EVA_NOV17,trypsin,750.0,99997,19489,0.194896
Job_60359,Specimen_001,Tube_009,EVA_NOV17,trypsin,2250.0,99997,2217,0.022171
Job_60063,Specimen_001,Tube_006,EVA_NOV17,chymotrypsin,83.0,98303,64106,0.652127
Job_60063,Specimen_001,Tube_007,EVA_NOV17,chymotrypsin,250.0,99998,52457,0.52458
Job_60063,Specimen_001,Tube_008,EVA_NOV17,trypsin,83.0,99993,64109,0.641135
Job_60063,Specimen_001,Tube_009,EVA_NOV17,trypsin,250.0,99997,53023,0.530246
Job_59787,Specimen_001,Tube_006,EVA_NOV17,trypsin,0.0,99997,37520,0.375211
Job_59787,Specimen_001,Tube_007,EVA_NOV17,chymotrypsin,9.5,99999,32048,0.320483


Read in data from the XML files on the total number of cells collected. I will do this for all XML files the BIOFAB has uploaded, even ones that aren't relevant to this experiment. Then, I will identify the relevant ones using the `job_id`, `specimen`, and `tube` columns.

In [7]:
# Compile data in each XML file. Every key maps to a list with one entry per
# XML file, so that each row of the resulting dataframe records the file it
# came from in `xmlfile`.
xml_data_dict = {
    key : []
    for key in [
        'job_id', 'specimen', 'tube', 'xmlfile',
        'cells_collected', 'intended_number_of_cells_collected'
    ]
}
# Matches text of the form `<label> : <collected> / <intended>`
cells_collected_pattern = re.compile(r'\w+ : (?P<cells_collected>\d+) / (?P<intended_number_of_cells_collected>\d+)')
for filename in xml_files:
    tree = ET.parse(filename)
    root = tree.getroot()

    # Get experiment metadata
    xml_data_dict['job_id'].append( root.findall(".//*[@name='Experiment']")[0].get('value') )
    xml_data_dict['specimen'].append( root.findall(".//*[@name='Specimen']")[0].get('value') )
    xml_data_dict['tube'].append( root.findall(".//*[@name='Tube']")[0].get('value') )
    xml_data_dict['xmlfile'].append(filename)

    # Get experiment counts
    # NOTE(review): this assumes the counts are in the first `col='1'` element
    # under the fifth child of the root -- confirm against the sort-report format
    cells_collected_data = root[4].findall(".//*[@col='1']")[0].text
    match = cells_collected_pattern.match(cells_collected_data)
    if match:
        xml_data_dict['cells_collected'].append(
            int(match.group('cells_collected'))
        )
        xml_data_dict['intended_number_of_cells_collected'].append(
            int(match.group('intended_number_of_cells_collected'))
        )
    else:
        # Keep the lists aligned when the counts cannot be parsed
        xml_data_dict['cells_collected'].append(None)
        xml_data_dict['intended_number_of_cells_collected'].append(None)

# Convert data to a dataframe
xml_data_df = pandas.DataFrame.from_dict(xml_data_dict)
xml_data_df.set_index(['job_id', 'specimen', 'tube'], inplace=True)
#xml_data_df

Merge the FACS data from the two input sources

In [8]:
# Attach the XML-derived counts to each sample in the manifest, matching on the
# shared (job_id, specimen, tube) index; samples without an XML entry are kept
facs_df = pandas.merge(
    manifest_df, xml_data_df,
    left_index=True, right_index=True, how='left'
)
is_library_of_interest = (facs_df['strain'] == library_name)
facs_df = facs_df[is_library_of_interest]

# Re-index by sample identity, keeping the two columns available as data too
facs_df = facs_df.set_index(['protease', 'concentration'], drop=False)
columns_to_show = ['cells_collected', 'fraction_collected']
facs_df[columns_to_show].sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,cells_collected,fraction_collected
protease,concentration,Unnamed: 2_level_1,Unnamed: 3_level_1
chymotrypsin,9.5,,0.320483
chymotrypsin,28.0,,0.29746
chymotrypsin,83.0,7704000.0,0.652127
chymotrypsin,250.0,6312000.0,0.52458
chymotrypsin,750.0,2107594.0,0.210648
chymotrypsin,750.0,3954.0,0.210648
chymotrypsin,2250.0,648405.0,0.061403
trypsin,0.0,,0.375211
trypsin,9.5,,0.309303
trypsin,28.0,,0.279043


Merge the FACS and deep-sequencing data

In [9]:
# Join the sequencing and FACS data on their shared (protease, concentration)
# index, keeping samples that appear in only one of the two sources
columns_to_write = ['fastq_id', 'cells_collected', 'fraction_collected']
df = pandas.merge(
    fastq_df, facs_df,
    left_index=True, right_index=True, how="outer"
)
df = df[columns_to_write].reset_index()
df

Unnamed: 0,protease,concentration,fastq_id,cells_collected,fraction_collected
0,chymotrypsin,9.5,128444/Files/,,0.320483
1,chymotrypsin,28.0,128446/Files/,,0.29746
2,chymotrypsin,83.0,128456/Files/,7704000.0,0.652127
3,chymotrypsin,250.0,128452/Files/,6312000.0,0.52458
4,chymotrypsin,750.0,128460/Files/,2107594.0,0.210648
5,chymotrypsin,750.0,128460/Files/,3954.0,0.210648
6,chymotrypsin,2250.0,128462/Files/,648405.0,0.061403
7,trypsin,0.0,128412/Files/,,0.375211
8,trypsin,9.5,128448/Files/,,0.309303
9,trypsin,28.0,128450/Files/,,0.279043


There are two entries for (chymotrypsin, 750.0): one with a large number of counts and one with a small number. I will remove the row with the smaller number.

In [10]:
# Among the rows for (chymotrypsin, 750.0), keep only the one with the largest
# number of collected cells. The rows are selected by value rather than by a
# hard-coded row label, so the cell stays correct if the row order changes and
# does nothing if it is run a second time.
is_duplicated_sample = (df['protease'] == 'chymotrypsin') & (df['concentration'] == 750.0)
duplicated_rows = df[is_duplicated_sample].sort_values(by='cells_collected', ascending=False)
df.drop(duplicated_rows.index[1:], inplace=True)

Add a row for a naive sample for chymotrypsin, using the same deep-sequencing data as for trypsin 

In [11]:
# Look up the naive (trypsin, 0) sample by value rather than by row position
is_trypsin_naive = (df['protease'] == 'trypsin') & (df['concentration'] == 0.0)
naive_row = df[is_trypsin_naive].iloc[0]

# Add a matching (chymotrypsin, 0) row, unless one is already present (e.g.,
# because this cell has already been run)
is_chymotrypsin_naive = (df['protease'] == 'chymotrypsin') & (df['concentration'] == 0.0)
if not is_chymotrypsin_naive.any():
    df_with_extra_row = pandas.DataFrame.from_dict(
        {
            'protease' : ['chymotrypsin'],
            'concentration' : [0.0],
            'fastq_id' : [naive_row['fastq_id']],
            'cells_collected' : [naive_row['cells_collected']],
            'fraction_collected' : [naive_row['fraction_collected']]
        }
    )
    df = pandas.concat([df, df_with_extra_row])

Show concatenated dataframe, sorted by protease and protease concentration.

In [12]:
# Order the samples by protease, then by increasing protease concentration
sort_columns = ['protease', 'concentration']
df = df.sort_values(by=sort_columns)
df

Unnamed: 0,cells_collected,concentration,fastq_id,fraction_collected,protease
0,,0.0,128412/Files/,0.375211,chymotrypsin
0,,9.5,128444/Files/,0.320483,chymotrypsin
1,,28.0,128446/Files/,0.29746,chymotrypsin
2,7704000.0,83.0,128456/Files/,0.652127,chymotrypsin
3,6312000.0,250.0,128452/Files/,0.52458,chymotrypsin
4,2107594.0,750.0,128460/Files/,0.210648,chymotrypsin
6,648405.0,2250.0,128462/Files/,0.061403,chymotrypsin
7,,0.0,128412/Files/,0.375211,trypsin
8,,9.5,128448/Files/,0.309303,trypsin
9,,28.0,128450/Files/,0.279043,trypsin


Manually add missing columns

In [13]:
# Add columns. The lists below are written out for one protease (seven samples,
# in order of increasing concentration) and repeated for the second protease,
# so they rely on the dataframe being sorted by protease and then concentration.
df['experiment_id'] = library_name
df['selection_strength'] = list(range(0, 7)) * 2
df['conc_factor'] = ['', '3', '3', '3', '3', '3', '3'] * 2
# NOTE(review): presumably the `selection_strength` of the sample each one
# was derived from -- confirm
df['parent'] = ['', '0', '0', '2', '2', '4', '4'] * 2

Add the column called `parent_expression`, which reports the fraction of collected cells (i.e., cells that passed the selection threshold) in a naive sample that was ***not*** treated with protease and which was sorted on the same day as the protease-treated samples.

In [14]:
# Make a dictionary to map protease concentration to sort round
map_protease_conc_to_sort_round = {
    0.0 : 1, 9.5 : 1, 28.0 : 1,
    83.0 : 2, 250.0 : 2,
    750.0 : 3, 2250.0 : 3
}

# Add a `parent_expression` column to the dataframe. Naive samples
# (concentration of zero) get a blank entry. Every other sample gets the value
# recorded for the naive sample of its own protease and sort round, except that
# chymotrypsin samples from the first sort round use the trypsin naive sample,
# so that all first-round samples share the value derived from (trypsin, 0).
parent_expression_list = []
for (i, row) in df.iterrows():
    protease = row['protease']
    concentration = row['concentration']
    sort_round = map_protease_conc_to_sort_round[concentration]
    if concentration == 0.0:
        parent_expression = ''
    else:
        if (protease, sort_round) == ('chymotrypsin', 1):
            naive_protease = 'trypsin'
        else:
            naive_protease = protease
        parent_expression = naive_sample_expression[naive_protease][sort_round]
    parent_expression_list.append(parent_expression)
df['parent_expression'] = parent_expression_list
df

Unnamed: 0,cells_collected,concentration,fastq_id,fraction_collected,protease,experiment_id,selection_strength,conc_factor,parent,parent_expression
0,,0.0,128412/Files/,0.375211,chymotrypsin,EVA_NOV17,0,,,
0,,9.5,128444/Files/,0.320483,chymotrypsin,EVA_NOV17,1,3.0,0.0,0.375211
1,,28.0,128446/Files/,0.29746,chymotrypsin,EVA_NOV17,2,3.0,0.0,0.375211
2,7704000.0,83.0,128456/Files/,0.652127,chymotrypsin,EVA_NOV17,3,3.0,2.0,0.678084
3,6312000.0,250.0,128452/Files/,0.52458,chymotrypsin,EVA_NOV17,4,3.0,2.0,0.678084
4,2107594.0,750.0,128460/Files/,0.210648,chymotrypsin,EVA_NOV17,5,3.0,4.0,0.775654
6,648405.0,2250.0,128462/Files/,0.061403,chymotrypsin,EVA_NOV17,6,3.0,4.0,0.775654
7,,0.0,128412/Files/,0.375211,trypsin,EVA_NOV17,0,,,
8,,9.5,128448/Files/,0.309303,trypsin,EVA_NOV17,1,3.0,0.0,0.375211
9,,28.0,128450/Files/,0.279043,trypsin,EVA_NOV17,2,3.0,0.0,0.375211


Rename columns and show the final dataframe.

In [15]:
# Rename the protease column
df = df.rename(columns={'protease':'protease_type'})

# Order in which the columns are shown
column_order = [
    'experiment_id', 'protease_type', 'concentration', 'selection_strength',
    'parent', 'conc_factor', 'fastq_id', 'parent_expression',
    'fraction_collected', 'cells_collected'
]
df[column_order]

Unnamed: 0,experiment_id,protease_type,concentration,selection_strength,parent,conc_factor,fastq_id,parent_expression,fraction_collected,cells_collected
0,EVA_NOV17,chymotrypsin,0.0,0,,,128412/Files/,,0.375211,
0,EVA_NOV17,chymotrypsin,9.5,1,0.0,3.0,128444/Files/,0.375211,0.320483,
1,EVA_NOV17,chymotrypsin,28.0,2,0.0,3.0,128446/Files/,0.375211,0.29746,
2,EVA_NOV17,chymotrypsin,83.0,3,2.0,3.0,128456/Files/,0.678084,0.652127,7704000.0
3,EVA_NOV17,chymotrypsin,250.0,4,2.0,3.0,128452/Files/,0.678084,0.52458,6312000.0
4,EVA_NOV17,chymotrypsin,750.0,5,4.0,3.0,128460/Files/,0.775654,0.210648,2107594.0
6,EVA_NOV17,chymotrypsin,2250.0,6,4.0,3.0,128462/Files/,0.775654,0.061403,648405.0
7,EVA_NOV17,trypsin,0.0,0,,,128412/Files/,,0.375211,
8,EVA_NOV17,trypsin,9.5,1,0.0,3.0,128448/Files/,0.375211,0.309303,
9,EVA_NOV17,trypsin,28.0,2,0.0,3.0,128450/Files/,0.375211,0.279043,


Write the above dataframe to an output file

In [16]:
# Save the selected columns, without the row index, as a CSV file
metadata_to_write = df[column_order]
print("Writing metadata to the experimental summary file: {0}".format(experimental_metadata_output_file))
metadata_to_write.to_csv(experimental_metadata_output_file, index=False)

Writing metadata to the experimental summary file: results/171201_Eva_10861/parse_metadata/experimental_metadata_from_script.csv


## Compute EC50 values from the input deep-sequencing and FACS data

In [2]:
# Define input variables
# NOTE: this cell reuses `experimental_metadata_output_file`, `fastq_dir`, and
# `resultsdir` from the cells above. The recorded output of this cell shows the
# NameError raised when those names are not yet defined.
data_dir = 'data/171201_Eva/'
designed_sequences_file = os.path.join(data_dir, 'designed_sequences.csv')
experimental_summary_file = experimental_metadata_output_file
fastq_dir = fastq_dir  # self-assignment: has no effect when `fastq_dir` is already defined
pear_path = '/home/05402/haddox/software/pear/bin/pear'
output_dir = resultsdir

NameError: name 'experimental_metadata_output_file' is not defined

In [18]:
# Assemble the command as a single space-delimited string: the interpreter and
# script, followed by each flag with its value
cmd_arguments = [
    ('--designed_sequences_file', designed_sequences_file),
    ('--experimental_summary_file', experimental_summary_file),
    ('--fastq_dir', fastq_dir),
    ('--pear_path', pear_path),
    ('--output_dir', output_dir),
]
cmd_parts = ['python', 'scripts/compute_ec50_values_from_deep_sequencing_data.py']
for (cmd_flag, cmd_value) in cmd_arguments:
    cmd_parts.append('{0} {1}'.format(cmd_flag, cmd_value))
cmd = ' '.join(cmd_parts)

# The command is only printed here; the line that would run it is disabled
#! {cmd}
print(cmd)

python scripts/compute_ec50_values_from_deep_sequencing_data.py --designed_sequences_file data/171201_Eva/designed_sequences.csv --experimental_summary_file results/171201_Eva_10861/parse_metadata/experimental_metadata_from_script.csv --fastq_dir /work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/ngs_data/run_8/EVA_Nov17_new_prep/ --pear_path /home/05402/haddox/software/pear/bin/pear --output_dir results/171201_Eva_10861


## Analyze the stability scores

## To do:

* Quantify sequencing depth
* Look at the distribution of counts over time

Quantify the number of assembled reads per sample

In [None]:
def ParsePAREOutfile(outfile):
    """
    This function parses the read counts reported in a paired-end read assembly log

    The function name says "PARE", but elsewhere in this notebook the
    assembly program is referred to as `PEAR`.

    Args:
        `outfile`: the path to a file with the output data generated by the
            assembly program

    Returns:
        A tuple with the following three variables in the order they appear in the below list:
            `n_assembled_reads` : the total number of assembled reads
            `n_discarded_reads` : the total number of discarded reads
            `n_non_assembled_reads` : the total number of non_assembled_reads
        A variable is `None` if the file has no line reporting it.

    Raises:
        ValueError: if the same count is reported on more than one line, or if
            a line that should report a count cannot be parsed
    """

    # Pattern used to extract data
    n_reads_pattern = re.compile(r'\: (?P<n_reads>[\d,]+) /')

    # Text that identifies each line of interest, paired with the name of the
    # count that line reports. A line is assigned to the first label it contains.
    line_labels = [
        ('Assembled reads .', 'n_assembled_reads'),
        ('Discarded reads .', 'n_discarded_reads'),
        ('Not assembled reads .', 'n_non_assembled_reads'),
    ]
    counts = dict((name, None) for (_, name) in line_labels)

    # Use regular expressions to extract the relevant info from the file
    with open(outfile) as f:
        for line in f:
            for (label, name) in line_labels:
                if label not in line:
                    continue
                # Compare against `None` so that a repeated line is detected
                # even when the first count was zero
                if counts[name] is not None:
                    raise ValueError("Already found data for `{0}`".format(name))
                match = n_reads_pattern.search(line)
                if match is None:
                    raise ValueError(
                        "Could not parse a read count from the line: {0}".format(line.strip())
                    )
                counts[name] = int(match.group('n_reads').replace(',', ''))
                break

    return (
        counts['n_assembled_reads'],
        counts['n_discarded_reads'],
        counts['n_non_assembled_reads']
    )


In [None]:
# Find the log files written during paired-end read assembly
pare_log_dir = os.path.join(resultsdir, 'paired_FASTQ_files')

selection_indices = list(range(0,7))
experiment_names = ['trypsin_{0}'.format(i) for i in selection_indices] + \
    ['chymotrypsin_{0}'.format(i) for i in selection_indices]
log_files = glob.glob(os.path.join(pare_log_dir, '*.log'))

# Map each experiment name to the list of its log files
log_files_dict = {
    experiment_name : []
    for experiment_name in experiment_names
}

# Assign each log file to the experiment whose name, followed by a hyphen,
# begins the base name of the log file. Each log file must match exactly one
# experiment.
for log_file_name in log_files:
    log_file_basename = os.path.basename(log_file_name)
    matching_experiments = []
    for experiment_name in log_files_dict:
        if not log_file_basename.startswith('{0}-'.format(experiment_name)):
            continue
        log_files_dict[experiment_name].append(log_file_name)
        matching_experiments.append(experiment_name)
    if not matching_experiments:
        raise ValueError("Could not find a matching experiment for the log file: {0}".format(log_file_name))
    if len(matching_experiments) > 1:
        print(matching_experiments)
        raise ValueError("Found multiple matching experiments for the log file: {0}".format(log_file_name))

In [None]:
# Read in the read counts from the assembly log files and store them in a
# pandas dataframe, summing over all log files of each experiment
assembly_d = {
    key : []
    for key in ['experiment_name', 'n_assembled_reads', 'n_discarded_reads', 'n_non_assembled_reads']
}
for experiment_name in log_files_dict:
    assembly_d['experiment_name'].append(experiment_name)
    n_assembled_reads = n_discarded_reads = n_non_assembled_reads = 0
    for log_file_name in log_files_dict[experiment_name]:
        (n_assembled_reads_i, n_discarded_reads_i, n_non_assembled_reads_i) = \
            ParsePAREOutfile(log_file_name)
        n_assembled_reads += n_assembled_reads_i
        n_discarded_reads += n_discarded_reads_i
        n_non_assembled_reads += n_non_assembled_reads_i
    assembly_d['n_assembled_reads'].append(n_assembled_reads)
    assembly_d['n_discarded_reads'].append(n_discarded_reads)
    assembly_d['n_non_assembled_reads'].append(n_non_assembled_reads)

assembly_df = pandas.DataFrame.from_dict(assembly_d)

# Plot the data for each experiment as stacked bar charts
fig = plt.figure(figsize=(10,7))

# Get data for each bar
assembly_df.set_index('experiment_name', inplace=True)
labels = sorted(assembly_df.index.values)
first_bar = assembly_df.loc[labels, 'n_assembled_reads']
second_bar = assembly_df.loc[labels, 'n_non_assembled_reads']
third_bar = assembly_df.loc[labels, 'n_discarded_reads']
assert(len(first_bar) == len(second_bar))

# Make plot. Each segment starts where the previous segments end, so the
# third segment is offset by the sum of the first two.
plot_indices = np.arange(len(first_bar))
plt.barh(plot_indices, first_bar, label='assembled', align='center')
plt.barh(plot_indices, second_bar, left=first_bar, label='non-assembled', color='red', align='center')
plt.barh(plot_indices, third_bar, left=first_bar + second_bar, label='discarded', color='purple', align='center')
plt.yticks(plot_indices, labels)
plt.xlabel('Number of reads')
plt.legend()
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
def adjust_ylim(l):
    """Return the y-axis limits `l`, a (min, max) pair, with min raised by 1.5 and max lowered by 0.5."""
    # NOTE(review): these offsets presumably trim the padding that the plotting
    # library added around the bars when this was written -- confirm that no
    # bars are cut off with the version in use
    mn, mx = l
    mn += 1.5
    mx -= .5
    return (mn, mx)
plt.ylim(adjust_ylim(plt.ylim()))
plt.show()

Look at the distribution of stability scores

In [None]:
# Read in the stability scores for each protease
# NOTE(review): these paths point to `results/Inna_April_2016`, not to the
# results directory used in the rest of this notebook -- confirm this is intended
trypsin_df = pandas.read_csv(
    'results/Inna_April_2016/stability_scores/trypsin_stability_scores.txt',
    sep='\t'
)
chymotrypsin_df = pandas.read_csv(
    'results/Inna_April_2016/stability_scores/chymotrypsin_stability_scores.txt',
    sep='\t'
)

# Overlay the two distributions on the same axes
for stability_scores_df in (trypsin_df, chymotrypsin_df):
    sns.distplot(stability_scores_df['stabilityscore'])
plt.show()

In [None]:
# Show only the first few rows rather than the entire dataframe
trypsin_df.head()

In [None]:
import gzip

# Print the first lines of a gzipped FASTQ file, each preceded by the position
# at which `CATATG` starts in that line (-1 if the line does not contain it)
filename = '/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/ngs_data/run_6/EVA_Nov17/125459/Files/125459-naive_S1_L001_R1_001.fastq.gz'
with gzip.open(filename, 'rb') as f:
    for (i, line) in enumerate(f):
        # The file is opened in binary mode, so search with a bytes pattern;
        # this works on both Python 2 and Python 3
        print(line.find(b'CATATG'))
        print(line)
        if i > 100:
            break

## Truncate the length of input design sequences to match the length of the deep-sequencing reads

The sequence `CATATG` appears at about nucleotide 24-27 in the forward reads. Since the forward reads are only 150 nt, and since the ends of the first ten reads look low quality, I will trim the input design sequences to the first 30 amino acids (=90/3).

In [None]:
# Read in the designed sequences, keeping the untruncated sequence under a new name
df = pandas.read_csv('data/171201_Eva/designed_sequences.csv')
df = df.rename(columns={'protein_sequence':'full_sequence'})

# Truncate each sequence to its first 30 characters (30 amino acids)
df['protein_sequence'] = df['full_sequence'].map(
    lambda full_sequence: str(full_sequence)[:30]
)

# Write the names and truncated sequences to a new file
shortened_designed_sequences_file = os.path.join(
    resultsdir_parse_metadata, 'shortened_designed_sequences.csv'
)
print("Writing shortened design sequences to the file: {0}".format(
    shortened_designed_sequences_file
))
shortened_df = df[['name', 'protein_sequence']]
shortened_df.to_csv(shortened_designed_sequences_file, index=False)