In [None]:
import pandas as pd
import numpy as np

import bokeh
from bokeh.models import FactorRange, ColumnDataSource, LinearColorMapper,  ColorBar
from bokeh.transform import transform

import iqplot

from tqdm.notebook import tqdm

import Bio.Seq

bokeh.io.output_notebook()

Function to determine the amino acid sequence encoded by a nucleotide sequence.

In [None]:
# Translate every nucleotide sequence (column 'barcode') of a dataframe and return a dataframe of the same length that carries the amino acid sequence in a column 'aa'.

def data_ops(dataframe):
    """Return a copy of ``dataframe`` with an amino acid column ``'aa'`` added.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'sample', 'barcode' and 'counts'. Every
        'barcode' entry is passed to ``Bio.Seq.translate``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns ['sample', 'barcode', 'aa', 'counts']
        in that order, with the index of the input. The input frame itself is
        left unmodified.
    """
    # Translate the values directly instead of looking each row up by label,
    # so frames whose index is not 0..n-1 (e.g. after filtering) also work.
    aa_seq = [str(Bio.Seq.translate(barcode)) for barcode in dataframe['barcode']]

    columns = ['sample', 'barcode', 'aa', 'counts']

    # assign() builds a new frame, so the caller's dataframe is not altered.
    return dataframe.assign(aa = aa_seq)[columns]

The purpose of this code is to analyze the sequencing data from CAP-B comparison experiments. First, we want to load the csv files containing count data (generated with `FASTQ_barcode_counting.ipynb`) and concatenate the files into a single dataframe. 

In [None]:
name_prefix = ['CAP-B-pool', 'CREATE-Brain-1', 'CREATE-Brain-2', 'CREATE-Liver-1', 'CREATE-Liver-2', 'T7-Brain-1', 'T7-Brain-2', 'T7-Liver-1', 'T7-Liver-2']

# Read the count table of every sample, tag its rows with the sample name and
# collect the frames in a list. They are concatenated once at the end, because
# calling pd.concat inside the loop re-copies all earlier rows on every pass.
sample_frames = []

for sample_name in name_prefix:
    # Count files are stored as analysis/<sample name>_counts.csv
    path = 'analysis/' + sample_name

    # Read in the count file
    df = pd.read_csv(path + '_counts.csv')

    # Add the sample name as the first column of the frame
    df.insert(0, column = 'sample', value = sample_name)

    sample_frames.append(df)

# Stack all samples into one frame with a fresh 0..n-1 index
df_output = pd.concat(sample_frames, ignore_index = True)

We can examine the dataframe:

In [None]:
# Display the combined count table that was built from all sample files
df_output

Next, we can read in the sequences that were used to assemble the library. Sequences not present here will be discarded (as they are intrinsically contamination of our library). The resulting dataframe is called `df_validated`.

In [None]:
# Table of the sequences that were used to assemble the library
df_lut = pd.read_csv('analysis/Barcode_sequences.csv')

# Flag every row whose barcode is listed in the 'CAP-B Sequence' column
library_barcodes = df_lut['CAP-B Sequence'].values
inds = df_output['barcode'].isin(library_barcodes)

# Keep only the flagged rows and renumber them from zero
df_validated = df_output[inds].reset_index(drop = True)

df_validated.head()

We want to ensure that negative data can be plotted, so we create a dataframe that contains zero counts when a sequence doesn't appear in a tissue. This will enable plotting later on.

In [None]:
# Library barcodes in look-up-table order. dict.fromkeys drops repeated
# entries while keeping the position of the first occurrence.
library_barcodes = list(dict.fromkeys(df_lut['CAP-B Sequence'].values))

sample_frames = []

# Loop through each sample
for sample in name_prefix:
    # Rows of the validated data that belong to this sample
    inds = df_validated['sample'] == sample

    # Start with a count of zero for every library barcode ...
    barcode_counts = dict.fromkeys(library_barcodes, 0)

    # ... and overwrite the entries of the barcodes observed in this sample,
    # leaving zeros where there are no counts.
    barcode_counts.update(zip(df_validated.loc[inds, 'barcode'], df_validated.loc[inds, 'counts']))

    # Create a dataframe for this sample from the filled dictionary
    sample_frames.append(pd.DataFrame({
        'sample': sample,
        'barcode': list(barcode_counts.keys()),
        'counts': list(barcode_counts.values()),
    }))

# Concatenate once (instead of growing the frame inside the loop) and
# renumber the rows from zero.
df_validated_thorough = pd.concat(sample_frames, ignore_index = True)

df_validated_thorough = df_validated_thorough[['sample','barcode','counts']]

df_validated_thorough.head()

We can then confirm that each sample has the correct number of sequences.

In [None]:
# Print the number of rows held for each sample, in order of first appearance
for sample, sample_rows in df_validated_thorough.groupby('sample', sort = False):
    print(len(sample_rows))

We can then determine the amino acid sequence of every barcode in each sample. We place the result in a dataframe named `df_validated` (overwriting the previous version).

In [None]:
# Add the amino acid column 'aa' to the zero-filled count table; this rebinds df_validated
df_validated = data_ops(df_validated_thorough)

We can then calculate the enrichment of our sequences.

In [None]:
# Counts of every barcode in the 'CAP-B-pool' sample, indexed by barcode.
# They are collected before the loop so that the calculation does not depend
# on 'CAP-B-pool' being the first entry of name_prefix.
pool_rows = df_validated.loc[df_validated['sample'] == 'CAP-B-pool']
pool_counts = pd.Series(pool_rows['counts'].values, index = pool_rows['barcode'].values)
# Should a barcode be listed twice, keep its last entry (as a dictionary would)
pool_counts = pool_counts[~pool_counts.index.duplicated(keep = 'last')]
pool_sum = pool_rows['counts'].sum()

# Prevalence of each barcode in the pool. A barcode with zero pool counts is
# treated as if it were present at a single read, which avoids dividing by zero.
pool_fraction = pool_counts.where(pool_counts != 0, 1) / pool_sum

enrichment_frames = []

# Iterate through the different sample names
for sample in name_prefix:

    if sample != 'CAP-B-pool':
        sample_rows = df_validated.loc[df_validated['sample'] == sample]
        barcodes = sample_rows['barcode'].values

        # Prevalence of each barcode within this sample
        bc_sum = sample_rows['counts'].sum()
        sample_fraction = sample_rows['counts'].values / bc_sum

        # Enrichment compares the prevalence within the sample to the
        # prevalence in the pool, barcode by barcode.
        enrichment = sample_fraction / pool_fraction.reindex(barcodes).values

        enrichment_frames.append(pd.DataFrame({
            'barcode': barcodes,
            'enrichment': enrichment,
            'sample': sample,
        }))

    print(sample, 'complete')

# Concatenate once and renumber the rows from zero
df_enrichment = pd.concat(enrichment_frames, ignore_index = True)
df_enrichment.head()

Next we want to organize this dataframe.

In [None]:
# Include selection, tissue, and replicate information.
# Sample names have the form '<selection>-<tissue>-<replicate>'.
sample_parts = df_enrichment['sample'].str.split('-')

df_enrichment = df_enrichment.assign(
    selection = sample_parts.str[0],
    tissue = sample_parts.str[1],
    replicate = sample_parts.str[2].astype(int),
)

# Include count and amino acid data
df_exp = pd.merge(df_enrichment, df_validated, on = ['sample','barcode'])

# Reorganize that data
df_exp = df_exp[['sample','selection', 'tissue','replicate','barcode','aa','counts','enrichment']]

# Sort once on all four keys, with 'aa' as the innermost key. Sorting on 'aa'
# first and re-sorting afterwards only works if the second sort preserves the
# order produced by the first one.
df_exp = df_exp.sort_values(['selection','tissue','replicate','aa'])
df_exp = df_exp.reset_index(drop=True)

# Display
df_exp.head()

Now we can start exploring the data by plotting. First, we compare the enrichment of each sequence between two samples. We first do this on linear axes.

In [None]:
sample_1 = 'CREATE-Brain-1'
sample_2 = 'CREATE-Brain-2'

# Put the two samples side by side with one row per barcode, so that every
# point pairs the enrichment of the same sequence. Pairing the two columns by
# row position would rely on both samples being stored in the same order.
paired = df_exp.loc[df_exp['sample'].isin([sample_1, sample_2])].pivot(index = 'barcode', columns = 'sample', values = 'enrichment')

p = bokeh.plotting.figure(title='Enrichment', width=300, height=300, x_axis_label = sample_1,  y_axis_label = sample_2)

p.scatter(paired[sample_1], paired[sample_2], alpha = 0.2)

bokeh.io.show(p)

Most of the data lies close to one of the axes. This indicates a lack of reproducibility between these samples. We can plot the same data on log axes, which leaves out sequences that are zero in either sample, to see whether this persists.

In [None]:
# One row per barcode with a column per sample, so that x and y of every
# point belong to the same sequence (sample_1 and sample_2 are set in the
# linear-plot cell).
paired = df_exp.loc[df_exp['sample'].isin([sample_1, sample_2])].pivot(index = 'barcode', columns = 'sample', values = 'enrichment')

p = bokeh.plotting.figure(title='Enrichment', width=300, height=300, x_axis_label = sample_1,  y_axis_label = sample_2, x_axis_type = 'log', y_axis_type = 'log', x_range=(0.01, 1000), y_range=(0.01, 1000))

p.scatter(paired[sample_1], paired[sample_2], alpha = 0.2)

# Toggle this if svg is desired
#p.output_backend = 'svg'

bokeh.io.show(p)

We can see four clusters in the data, corresponding to whether the sequence is high or low in either sample.

Next, we want to isolate sequences that are enriched across all brain measurements.

In [None]:
# For each brain sample, gather the set of barcodes whose enrichment exceeds 1
CREATE_Brain_1_enriched, CREATE_Brain_2_enriched, T7_Brain_1_enriched, T7_Brain_2_enriched = (
    set(df_exp.loc[(df_exp['sample'] == brain_sample) & (df_exp['enrichment'] > 1), 'barcode'].values)
    for brain_sample in ('CREATE-Brain-1', 'CREATE-Brain-2', 'T7-Brain-1', 'T7-Brain-2')
)

# Barcodes that are enriched in every one of the four brain samples
cross_enriched = set.intersection(
    CREATE_Brain_1_enriched,
    CREATE_Brain_2_enriched,
    T7_Brain_1_enriched,
    T7_Brain_2_enriched,
)

# Keep all rows (from any sample) that belong to those barcodes
inds = df_exp['barcode'].isin(cross_enriched)

df_brain_enriched = df_exp[inds]

Then, we want to identify which of those sequences are unenriched across all liver measurements.

In [None]:
# For each liver sample, gather the set of barcodes whose enrichment exceeds 1
CREATE_Liver_1_enriched, CREATE_Liver_2_enriched, T7_Liver_1_enriched, T7_Liver_2_enriched = (
    set(df_exp.loc[(df_exp['sample'] == liver_sample) & (df_exp['enrichment'] > 1), 'barcode'].values)
    for liver_sample in ('CREATE-Liver-1', 'CREATE-Liver-2', 'T7-Liver-1', 'T7-Liver-2')
)

# Barcodes that are enriched in at least one of the four liver samples
liver_enriched = set.union(
    CREATE_Liver_1_enriched,
    CREATE_Liver_2_enriched,
    T7_Liver_1_enriched,
    T7_Liver_2_enriched,
)

# Remove those barcodes from the brain-enriched rows
inds = df_brain_enriched['barcode'].isin(liver_enriched)

df_brain_not_liver = df_brain_enriched.loc[~inds]

df_brain_not_liver.head()

We can save those sequences for future measurement.

In [None]:
# Write the brain-enriched, liver-unenriched rows to a csv file without the row index
df_brain_not_liver.to_csv('analysis/brain_enriched.csv', index=False)

We can assign an index to each of these sequences and determine the log enrichment for convenient plotting.

In [None]:
# Number the distinct barcodes 0, 1, 2, ... in order of first appearance.
# factorize returns one code per row plus the distinct barcodes themselves.
index_list, unique_barcodes = pd.factorize(df_brain_not_liver['barcode'])

# Barcode -> index look-up
index = dict(zip(unique_barcodes, range(len(unique_barcodes))))

# Work on an independent copy so that adding columns does not write into a
# slice of another dataframe (which triggers SettingWithCopyWarning).
df_brain_not_liver = df_brain_not_liver.copy()

df_brain_not_liver['index'] = index_list

#Create log enrichment values
# An enrichment of zero gives -inf; the divide warning is silenced here
# instead of being printed for an expected case.
with np.errstate(divide = 'ignore'):
    df_brain_not_liver['log enrichment'] = np.log(df_brain_not_liver['enrichment'])

We want to drop any values that are -inf in the log enrichment, so that we can plot.

In [None]:
# Flag rows whose log enrichment lies below -10; this includes the -inf
# values that result from taking the log of a zero enrichment.
inds = df_brain_not_liver['log enrichment'].lt(-10)

# Keep everything that was not flagged
df_brain_not_liver = df_brain_not_liver[~inds]

Create a heatmap of these sequences:

In [None]:
# Colour scale spanning the full range of log enrichment values
mapper = LinearColorMapper(
    palette='Viridis256', low=df_brain_not_liver['log enrichment'].min(), high=df_brain_not_liver['log enrichment'].max())

# A categorical axis takes string factors, so the row index is expressed as
# text both in the axis range and in the plotted data. The factors are taken
# from the index values actually present, so every row lands on a factor.
row_factors = [str(value) for value in sorted(df_brain_not_liver['index'].unique())]
heatmap_data = df_brain_not_liver.assign(row = df_brain_not_liver['index'].astype(str))

# Define a figure
# (width/height are used as in the other figures of this notebook;
# plot_width/plot_height are not accepted by Bokeh 3)
p = bokeh.plotting.figure(
    width=800,
    height=800,
    x_range=list(df_brain_not_liver["sample"].drop_duplicates()),
    y_range=row_factors,
    toolbar_location='right',
    tools = "pan,wheel_zoom,box_zoom,reset,save",
    x_axis_location="above")

# Create rectangle for heatmap
p.rect(
    x="sample",
    y="row",
    width=1,
    height=1,
    source=ColumnDataSource(heatmap_data),
    line_color=None,
    fill_color=transform('log enrichment', mapper))

# Add legend
color_bar = ColorBar(
    color_mapper=mapper,
    location=(0, 0))

p.add_layout(color_bar, 'right')

p.output_backend = 'svg'

bokeh.io.show(p)

We also want to examine the presence of validated sequences within our dataset. We can examine our `df_exp` dataframe for the sequences of PHP.eB, CAP-B1, CAP-B2, CAP-B8, CAP-B10, CAP-B18, and CAP-B22. 

In [None]:
# Amino acid sequences to look up in the experiment table
validated_aa = ['NGSGQNQ','LQTSSPG', 'QQGKQSV','GSGKTAA','DGAATKN','GTGTSVL','DGQSSKS']

# Rows of df_exp whose amino acid sequence is one of those listed above
inds = df_exp['aa'].isin(validated_aa)

df_CAP = df_exp[inds]

We can plot this data using a scatter plot. 

In [None]:
# Order of the amino acid sequences along the categorical x axis
aa_order = ['NGSGQNQ','LQTSSPG', 'QQGKQSV','GSGKTAA','DGAATKN','GTGTSVL','DGQSSKS']

p = bokeh.plotting.figure(width=400, height=300, x_range=aa_order, y_axis_type = 'log', y_range=(0.005, 30))

# Colour used for each replicate of the T7 liver samples
replicate_colors = {1: '#443982', 2: '#35B778'}

for rep, rep_color in replicate_colors.items():
    # Rows with an enrichment of zero are left out, as zero cannot be drawn on a log axis
    inds = (df_CAP['selection'] == 'T7') & (df_CAP['tissue'] == 'Liver') & (df_CAP['enrichment'] != 0) & (df_CAP['replicate'] == rep)

    # scatter (default marker: circle) is used as in the other plots of this
    # notebook; circle() with a size argument is deprecated in recent Bokeh.
    p.scatter(
        x="aa",
        y="enrichment",
        source=ColumnDataSource(df_CAP.loc[inds]),
        color = rep_color,
        size = 5)

p.output_backend = 'svg'

bokeh.io.show(p)