# Infer EC50 values from data generated using the high-throughput stability assay for the `Inna April 2016` chip

## Import `Python` modules

In [1]:
import os
import sys
import re
import glob

import xml.etree.ElementTree as ET
sys.path.append("/home/jupyter/tacc-work/jupyter_packages/lib/python2.7/site-packages")
from FlowCytometryTools import *

import numpy as np
import pandas
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.5)

# Set up the output directories, creating any that do not exist yet
resultsdir = "results/Inna_April_2016"
resultsdir_parse_metadata = os.path.join(resultsdir, 'parse_metadata')
dirs = [resultsdir, resultsdir_parse_metadata]
for dir_i in dirs:
    if os.path.isdir(dir_i):
        continue
    os.makedirs(dir_i)

## Specify all inputs

In [2]:
# Directory holding the FASTQ files
fastq_dir = "/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/ngs_data/run_4"

# Parent directory of the per-job directories that hold the FCS files (one part
# of the FACS data). The job IDs and the job directories are each stored as a
# comma-delimited string.
facs_dir = "/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/facs_data/production_data/"
job_ids = ["Job_56649", "Job_56497", "Job_56243"]
job_dirs = [os.path.join(facs_dir, job_id) for job_id in job_ids]
facs_dirs = ",".join(job_dirs)
job_ids = ",".join(job_ids)

# FACS channel to investigate
facs_channel_name = u"FITC-A"

# XML files with the other part of the FACS data, as a comma-delimited string
xml_pattern = os.path.join(facs_dir, "sort_reports", "*/*.xml")
xml_files = ",".join(glob.glob(xml_pattern))

# Column-specific values to analyze in the metadata files
library_name = "Inna April 2016"
ignore_aq_item_ids = "118852"

# Path of the output metadata CSV file
output_file = os.path.join(resultsdir_parse_metadata, "experimental_metadata_from_script.csv")

Parse the comma-delimited input arguments into lists, as if they had been passed to a standalone script

In [3]:
def _split_arg(value):
    """Convert a comma-delimited string argument into a list of its items.

    Empty items are dropped, so an empty string gives `[]` rather than `['']`.
    A value without a `split` method (e.g., the list produced by an earlier
    run of this cell) is returned as a list with its items unchanged, which
    makes the cell safe to re-run.
    """
    if hasattr(value, 'split'):
        return [item for item in value.split(',') if item]
    return list(value)

job_ids = _split_arg(job_ids)
facs_dirs = _split_arg(facs_dirs)
xml_files = _split_arg(xml_files)
ignore_aq_item_ids = _split_arg(ignore_aq_item_ids)

## Read in data on location of deep-sequencing data

In [4]:
# Read in the CSV file that summarizes the FASTQ files. `aq_item_id` is read as
# a string so that it can be compared with the string IDs in
# `ignore_aq_item_ids` and joined into a path below, regardless of whether
# pandas would otherwise infer a numeric type for the column.
fastq_summary_file = os.path.join(fastq_dir, 'run_4.csv')
fastq_df = pandas.read_csv(fastq_summary_file, dtype={'aq_item_id': str})

# Remove entries to ignore
indices_to_drop = fastq_df.index[fastq_df['aq_item_id'].isin(ignore_aq_item_ids)]
fastq_df = fastq_df.drop(indices_to_drop)

# Build the FASTQ ID of each sample from its item ID and filename prefix
fastq_df['fastq_id'] = fastq_df.apply(
    lambda row: os.path.join(row['aq_item_id'], row['filename_prefix']), axis=1
)

# Normalize the protease names (lowercase) and strain names (no spaces)
fastq_df['protease'] = fastq_df['protease'].str.lower()
fastq_df['strain'] = fastq_df['strain'].str.replace(' ', '_')

# Index by protease and concentration, for merging with the FACS data
fastq_df = fastq_df.set_index(['protease', 'concentration'])

## For each sample in the experiment, quantify the number of total events and the number of events that pass the sorting threshold

In [5]:
def s_log(sample, channel_names):
    """Return a copy of `sample` with the given channels on a log10 scale.

    Args:
        sample: object providing a `copy()` method and a `data` dataframe.
        channel_names: names of the columns of `data` to transform.

    Returns:
        The copy of `sample`. In its `data`, the listed channels are
        log10-transformed, every `-inf` entry is replaced by -1, and every
        row that contains a missing value is removed.
    """
    log_sample = sample.copy()
    events = log_sample.data

    # Overwrite each requested channel with its log10 values
    for name in channel_names:
        events[name] = np.log10(events[name])

    # log10(0) gives -inf; substitute -1 for it, then discard rows with NaN
    events.replace(to_replace=-np.inf, value=-1, inplace=True)
    log_sample.data = events.dropna()

    return log_sample

Read in data from the `manifest.txt` files

In [6]:
# Read in data from the `manifest.txt` files, one per FACS job. The per-job
# dataframes are collected in a list and concatenated once, instead of being
# concatenated one at a time onto a growing dataframe. The loop variable is
# named `job_dir` so that it does not overwrite the `facs_dir` input.
manifest_dfs = []
for (job_id, job_dir) in zip(job_ids, facs_dirs):
    job_manifest_df = pandas.read_csv(os.path.join(job_dir, 'manifest.txt'))
    job_manifest_df['job_id'] = job_id
    job_manifest_df['directory'] = job_dir
    # The specimen and tube names are taken from fixed character positions of
    # the filename (characters 0-11 and 13-20, respectively)
    job_manifest_df['specimen'] = job_manifest_df['filename'].str[:12]
    job_manifest_df['tube'] = job_manifest_df['filename'].str[13:21]
    manifest_dfs.append(job_manifest_df)
manifest_df = pandas.concat(manifest_dfs)
manifest_df = manifest_df.set_index(['job_id', 'specimen', 'tube'])

In [7]:
# Quantify the total number of events and events passing the gate

# An event passes the gate if its log10-transformed signal in the channel
# `facs_channel_name` is greater than this value
fitc_gate_log10_threshold = 3.0

fitc_data_dict = {}  # NOTE(review): never populated in this cell
total_number_of_recorded_events = []
number_of_events_passing_fitc_gate = []
for (i, row) in manifest_df.iterrows():

    # Read in the data for the appropriate channel and transform it to a log10 scale
    fcs_file = os.path.join(row['directory'], row['filename'])
    sample = FCMeasurement(ID='Test Sample', datafile=fcs_file)
    sample = s_log(sample, [facs_channel_name])
    df = sample.data

    # Count the number of total events and events passing the gate, using the
    # vectorized sum of the boolean mask
    total_number_of_recorded_events.append(len(df))
    number_of_events_passing_fitc_gate.append(
        int((df[facs_channel_name] > fitc_gate_log10_threshold).sum())
    )

# Append the FACS data to the manifest dataframe
manifest_df['total_number_of_recorded_events'] = total_number_of_recorded_events
manifest_df['number_of_events_passing_fitc_gate'] = number_of_events_passing_fitc_gate
manifest_df['fraction_collected'] = (
    manifest_df['number_of_events_passing_fitc_gate']
    / manifest_df['total_number_of_recorded_events']
)

# Columns of interest when inspecting the result
columns_to_show = [
    'strain', 'protease', 'concentration', 'total_number_of_recorded_events',
    'number_of_events_passing_fitc_gate', 'fraction_collected'
]
#manifest_df[columns_to_show]



Read in data from the XML files

In [8]:
# Compile data in each XML file

# Dataframe column -> XPath of the XML element whose `value` attribute holds it
metadata_xpaths = [
    ('job_id', ".//*[@name='Experiment']"),
    ('specimen', ".//*[@name='Specimen']"),
    ('tube', ".//*[@name='Tube']"),
]
count_keys = ['cells_collected', 'intended_number_of_cells_collected']

xml_data_dict = {}
for key in [column for (column, _) in metadata_xpaths] + count_keys:
    xml_data_dict[key] = []

# Matches text of the form "<word> : <collected> / <intended>"
cells_collected_pattern = re.compile(r'\w+ : (?P<cells_collected>\d+) / (?P<intended_number_of_cells_collected>\d+)')

for filename in xml_files:
    tree = ET.parse(filename)
    root = tree.getroot()

    # Experiment metadata: `value` attribute of the first matching element
    for (key, xpath) in metadata_xpaths:
        xml_data_dict[key].append(root.findall(xpath)[0].get('value'))

    # Experiment counts: parsed from the text of the first element with
    # col='1' under the fifth child of the root; None if the text does not match
    cells_collected_data = root[4].findall(".//*[@col='1']")[0].text
    match = cells_collected_pattern.match(cells_collected_data)
    if match:
        counts = [int(match.group(key)) for key in count_keys]
    else:
        counts = [None for key in count_keys]
    for (key, count) in zip(count_keys, counts):
        xml_data_dict[key].append(count)

# Convert data to a dataframe indexed like the manifest dataframe
xml_data_df = pandas.DataFrame(xml_data_dict).set_index(['job_id', 'specimen', 'tube'])
#xml_data_df

Merge the FACS data from the two input sources

In [9]:
# Left-join the XML-derived data onto the manifest rows via their shared index
merged_facs_df = manifest_df.merge(xml_data_df, how='left', left_index=True, right_index=True)

# Keep only rows for the library of interest, then index them by protease and concentration
is_library = merged_facs_df['strain'] == library_name
facs_df = merged_facs_df[is_library].set_index(['protease', 'concentration'])

columns_to_show = ['cells_collected', 'fraction_collected']
#facs_df[columns_to_show]

Merge the FACS and deep-sequencing data

In [10]:
# Outer-join the deep-sequencing and FACS data on (protease, concentration),
# keep the columns to write, and turn the index back into ordinary columns
columns_to_write = ['fastq_id', 'cells_collected', 'fraction_collected']
df = (
    fastq_df
    .merge(facs_df, how="outer", left_index=True, right_index=True)
    .loc[:, columns_to_write]
    .reset_index()
)
df

Unnamed: 0,protease,concentration,fastq_id,cells_collected,fraction_collected
0,chymotrypsin,9.5,118855/A_Aprilexp1round1_4,1200000.0,0.189052
1,chymotrypsin,28.0,118856/A_Aprilexp1round1_5,1200000.0,0.174285
2,chymotrypsin,83.0,118863/A_Aprilexp1round2_3,5641051.0,0.636093
3,chymotrypsin,250.0,118864/A_Aprilexp1round2_4,5631873.0,0.50509
4,chymotrypsin,750.0,118859/A_Aprilexp1round3_3,2564592.0,0.305309
5,chymotrypsin,2250.0,118860/A_Aprilexp1round3_4,1127664.0,0.128904
6,trypsin,0.0,naive/run2-09-jan25-2018_S9,1200000.0,0.208038
7,trypsin,9.5,118853/A_Aprilexp1round1_2,1200000.0,0.182106
8,trypsin,28.0,118854/A_Aprilexp1round1_3,1200000.0,0.166087
9,trypsin,83.0,118861/A_Aprilexp1round2_1,7560000.0,0.628806


Add a row for a naive sample for chymotrypsin, reusing the deep-sequencing and FACS data of the naive trypsin sample

In [11]:
# Find the naive (zero-concentration) trypsin sample by its labels rather than
# by its row position, so that this step does not depend on the row order
is_naive = df['concentration'] == 0
naive_trypsin_df = df[is_naive & (df['protease'] == 'trypsin')]
if len(naive_trypsin_df) != 1:
    raise ValueError(
        "Expected exactly one naive trypsin sample, but found {0}".format(len(naive_trypsin_df))
    )

# Copy that row, relabeling it as a chymotrypsin sample
df_with_extra_row = naive_trypsin_df.assign(protease='chymotrypsin').reset_index(drop=True)

# Only add the row if df does not already have a naive chymotrypsin sample, so
# that re-running this cell does not duplicate the row
if not (is_naive & (df['protease'] == 'chymotrypsin')).any():
    df = pandas.concat([df, df_with_extra_row])
df_with_extra_row

Show concatenated dataframe, sorted by protease and protease concentration.

In [12]:
# Order the rows by protease, then by concentration within each protease
df = df.sort_values(by=['protease', 'concentration'])
df

Manually add missing columns

In [13]:
# Manually specified values for one block of seven rows; each list is repeated
# twice to fill the whole column.
# NOTE(review): assumes df holds two groups of seven rows in matching order -- confirm
n_repeats = 2
selection_strengths = list(range(0, 7))
conc_factors = ['', '3', '3', '3', '3', '3', '3']
parents = ['', '0', '0', '2', '2', '4', '4']
parent_expressions = ['', 0.208038, 0.208038, '', '', '', '']

# Add columns
df['experiment_id'] = 'Inna_April_2016_rep1'
df['selection_strength'] = n_repeats * selection_strengths
df['conc_factor'] = n_repeats * conc_factors
df['parent'] = n_repeats * parents
df['parent_expression'] = n_repeats * parent_expressions

# Rename columns
df = df.rename(columns={'protease': 'protease_type'})

# Show the columns in the order in which they will be reported
column_order = [
    'experiment_id',
    'protease_type',
    'concentration',
    'selection_strength',
    'parent',
    'conc_factor',
    'fastq_id',
    'parent_expression',
    'fraction_collected',
    'cells_collected',
]
df[column_order]

Unnamed: 0,experiment_id,protease_type,concentration,selection_strength,parent,conc_factor,fastq_id,parent_expression,fraction_collected,cells_collected
0,Inna_April_2016_rep1,chymotrypsin,0.0,0,,,naive/run2-09-jan25-2018_S9,,0.208038,1200000.0
0,Inna_April_2016_rep1,chymotrypsin,9.5,1,0.0,3.0,118855/A_Aprilexp1round1_4,0.208038,0.189052,1200000.0
1,Inna_April_2016_rep1,chymotrypsin,28.0,2,0.0,3.0,118856/A_Aprilexp1round1_5,0.208038,0.174285,1200000.0
2,Inna_April_2016_rep1,chymotrypsin,83.0,3,2.0,3.0,118863/A_Aprilexp1round2_3,,0.636093,5641051.0
3,Inna_April_2016_rep1,chymotrypsin,250.0,4,2.0,3.0,118864/A_Aprilexp1round2_4,,0.50509,5631873.0
4,Inna_April_2016_rep1,chymotrypsin,750.0,5,4.0,3.0,118859/A_Aprilexp1round3_3,,0.305309,2564592.0
5,Inna_April_2016_rep1,chymotrypsin,2250.0,6,4.0,3.0,118860/A_Aprilexp1round3_4,,0.128904,1127664.0
6,Inna_April_2016_rep1,trypsin,0.0,0,,,naive/run2-09-jan25-2018_S9,,0.208038,1200000.0
7,Inna_April_2016_rep1,trypsin,9.5,1,0.0,3.0,118853/A_Aprilexp1round1_2,0.208038,0.182106,1200000.0
8,Inna_April_2016_rep1,trypsin,28.0,2,0.0,3.0,118854/A_Aprilexp1round1_3,0.208038,0.166087,1200000.0


Write the above dataframe to an output file

In [14]:
# Write the selected columns, without the row index, to the output CSV file
metadata_to_write = df[column_order]
print("Writing metadata to the experimental summary file: {0}".format(output_file))
metadata_to_write.to_csv(output_file, index=False)

Writing metadata to the experimental summary file: results/Inna_April_2016/parse_metadata/experimental_metadata_from_script.csv


Past code from Devin

In [None]:
# Load a single FCS file and pick the channel to inspect: a requested channel
# name that is also present in the sample
datafile = "/work/05402/haddox/jupyter/sd2e-community/shared-q1-workshop/strcklnd/facs_data/production_data/Job_56243/Specimen_001_Tube_006_006.fcs"
channel_name = u'FITC-A'
possible_channel_names = [channel_name]
sample = FCMeasurement(ID='Test Sample', datafile=datafile)
shared_channel_names = set(possible_channel_names) & set(sample.channel_names)
channel_name = list(shared_channel_names).pop(0)

# Log10-transform the channel, then count all events and those above 3.0
sample = s_log(sample, [channel_name])
df = sample.data
is_high_fitc = df[channel_name] > 3.0
all_events = df.count()
high_fitc = df[is_high_fitc].count()
print(all_events[channel_name])
print(high_fitc[channel_name])

# Plot the distribution of the log10-transformed channel values
sns.distplot(df[channel_name])

In [None]:
# Show the channel names of `sample`
sample.channel_names

Code used to install the `flowcytometrytools` package.

In [None]:
# Runs `pip install --user` for `flowcytometrytools` with PYTHONUSERBASE set to
# ~/tacc-work/jupyter_packages.
# NOTE(review): the package version is not pinned.
!PYTHONUSERBASE=~/tacc-work/jupyter_packages pip install --user flowcytometrytools