# Code that reproduces the complete analysis from Rocklin et al., 2017, Science

In [28]:
# Import `Python` modules
import glob
import os
import sys

import pandas

In [41]:
# Input locations for the analysis
# `data_dir` holds the two CSV inputs; both file paths are built relative to it
data_dir = 'data/Rocklin_2017_Science/'
designed_sequences_file, experimental_summary_file = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        'designed_protein_sequences.csv',
        'experimental_summary.csv',
    )
)

# NOTE(review): the two paths below are absolute, so they presumably need to be
# edited when the notebook is run on a different system -- confirm
# Path to the PEAR executable that is handed to the script via `--pear_path`
pear_path = '/home/05402/haddox/software/pear/bin/pear'
# Directory that is searched for the FASTQ files
fastq_dir = '/work/05402/haddox/jupyter/sd2e-community/archive/ingest/Q0/sd2.biofab.upload/Rocklin_ProtStab/'

Call the Python script that computes EC50 values from the deep-sequencing data

In [42]:
# Assemble the full command line as a single space-separated string: the
# interpreter, the script, and then one "--flag value" pair per input variable
cmd = ' '.join(
    ['python', 'scripts/compute_ec50_values_from_deep_sequencing_data.py']
    + [
        '{0} {1}'.format(flag, value)
        for (flag, value) in [
            ('--designed_sequences_file', designed_sequences_file),
            ('--experimental_summary_file', experimental_summary_file),
            ('--fastq_dir', fastq_dir),
            ('--pear_path', pear_path),
        ]
    ]
)

# Run the assembled command in a shell using IPython's `!` escape; `{cmd}` is
# replaced by the value of the Python variable `cmd` before execution
! {cmd}

{'chymotrypsin', 'trypsin'}
{0, 1, 2, 3, 4, 5, 6}


The cells below are for testing parts of the script within this notebook

In [6]:
# Import `Python` modules
import os
import sys
import pandas

In [39]:
# Load the designed sequences into a dataframe indexed by protein sequence
designed_seqs_df = (
    pandas.read_csv(designed_sequences_file)
    .set_index('protein_sequence')
)

# Load the experimental summary file, then collect the distinct protease types
# and the distinct selection indices (the latter are stored in the
# `selection_strength` column) and print each set on its own line
summary_df = pandas.read_csv(experimental_summary_file)
proteases, selection_indices = (
    set(summary_df[column])
    for column in ('protease_type', 'selection_strength')
)
print(proteases, selection_indices, sep='\n')

{'chymotrypsin', 'trypsin'}
{0, 1, 2, 3, 4, 5, 6}


In [None]:
# Iterate through each sample in the experimental summary dataframe and compile
# a dictionary that maps each experiment name ("<protease type>_<selection
# index>") to a list of (R1 file, R2 file) tuples for that sample.
#
# Raises:
#   AssertionError: if two rows yield the same experiment name, or if the
#       pattern "_R1_" / "_R2_" does not appear exactly once in a file path.
#   ValueError: if no R1 file is found for a sample, or if the R2 partner of
#       an R1 file does not exist on disk.
FASTQ_files = {}
for (i, row) in summary_df.iterrows():

    # Get sample metadata. These are the same columns that are used to build
    # `proteases` and `selection_indices`: 'protease_type' (with an underscore)
    # and 'selection_strength', which holds the selection index
    protease_type = row['protease_type']
    selection_index = row['selection_strength']
    experiment_name = '{0}_{1}'.format(protease_type, selection_index)
    # NOTE(review): presumably the FASTQ file names use hyphens where the
    # summary file uses underscores -- confirm
    fastq_id = row['fastq_id'].replace('_', '-')

    # Each experiment may only be registered once
    assert experiment_name not in FASTQ_files, \
        "Duplicate experiment name: {0}".format(experiment_name)

    # Find the R1 files for this sample. `glob` returns matches in arbitrary
    # order, so sort them to make the result reproducible between runs
    r1_files = sorted(glob.glob(
        os.path.join(fastq_dir, '{0}*_R1_*.fastq*'.format(fastq_id))
    ))

    # Make sure that at least one R1 file was found
    if len(r1_files) == 0:
        raise ValueError(
            "Failed to find FASTQ files for the fastq_id: {0}".format(fastq_id)
        )

    # Derive the name of each R2 file from its R1 partner, which guarantees
    # that there are as many R2 names as R1 files
    r2_files = [f.replace('_R1_', '_R2_') for f in r1_files]

    # Make sure that the patterns "_R1_" and "_R2_" appear exactly once per
    # path (otherwise the substitution above is ambiguous) and that every
    # derived R2 file actually exists
    for (f1, f2) in zip(r1_files, r2_files):
        assert f1.count('_R1_') == f2.count('_R2_') == 1, \
            "The string '_R1_' or '_R2_' appear multiple times in file name"
        if not os.path.isfile(f2):
            raise ValueError(
                "Failed to find a matching R2 file for: {0}".format(f1)
            )

    # Only store the pairs once they have passed all of the checks above
    FASTQ_files[experiment_name] = list(zip(r1_files, r2_files))