In [None]:
import numpy as np
import pandas as pd
import bokeh
from bokeh.models import FactorRange, ColumnDataSource, LinearColorMapper,  ColorBar
from bokeh.transform import transform

from tqdm.notebook import tqdm

import scipy.stats as st
import iqplot

import Bio.Seq

bokeh.io.output_notebook()

The following code is used to quantify sequencing counts and the diversity within them. 

In [None]:
def data_ops(dataframe, site, stage):
    """Translate barcodes to amino acids and tag every row with metadata.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a 'barcode' column of nucleotide sequences and a
        'counts' column.
    site : object
        Value written to the 'site' column of every row.
    stage : object
        Value written to the 'stage' column of every row.

    Returns
    -------
    pandas.DataFrame
        The columns 'site', 'stage', 'barcode', 'aa' and 'counts', in that
        order.

    Notes
    -----
    The 'aa', 'site' and 'stage' columns are also added to the dataframe
    that was passed in.
    """
    # Translate the iterated values directly. Looking each barcode up again
    # with dataframe['barcode'][i] is a label-based lookup, so it only works
    # when the index happens to be 0..n-1 (e.g. not after filtering).
    # Iterating the Series (instead of an enumerate object) also lets tqdm
    # see the length and display a proper progress bar.
    aa_seqs = [
        str(Bio.Seq.translate(barcode))
        for barcode in tqdm(dataframe['barcode'], total=len(dataframe))
    ]

    dataframe['aa'] = aa_seqs
    dataframe['site'] = site
    dataframe['stage'] = stage

    columns = ['site', 'stage', 'barcode', 'aa', 'counts']

    return dataframe[columns]

We want to create arrays so we can determine how diverse our libraries are compared to wild type AAV9.

In [None]:
# AAV9 amino acid sequence at each of the three substituted regions,
# kept as the reference for comparison
aav9_452 = 'NGSGQNQ'
aav9_492 = 'TVTQNNN'
aav9_585 = 'QSAQAQA'

# Amino acid alphabet: the twenty one-letter codes in alphabetical order,
# followed by '*'
aa_list = list('ACDEFGHIKLMNPQRSTVWY*')

# Lookup from each symbol to its position in aa_list
AA_dict = {symbol: position for position, symbol in enumerate(aa_list)}

The first time we load the files, we need to convert each nucleotide sequence to amino acids and add the metadata. We can then concatenate the individual dataframes into a single dataframe. This code is commented out because these steps are no longer required: the resulting dataframe is saved as a CSV for future use.

In [None]:
# df_2site_ll = pd.read_csv('analysis/2sll_counts.csv')
# df_2site_ll = data_ops(df_2site_ll, '2s', 'll')

# df_2site_vg = pd.read_csv('analysis/2svg_counts.csv')
# df_2site_vg = data_ops(df_2site_vg, '2s', 'vg')

# df_3site_ll = pd.read_csv('analysis/3sll_counts.csv')
# df_3site_ll = data_ops(df_3site_ll, '3s', 'll')

# df_3site_vg = pd.read_csv('analysis/3svg_counts.csv')
# df_3site_vg = data_ops(df_3site_vg, '3s', 'vg')

# df = pd.concat([df_2site_ll, df_2site_vg, df_3site_ll, df_3site_vg], ignore_index=True)

The individual variable regions are then separated into columns.

In [None]:
# df['452-458 nucleotide'] = df['barcode'].str[:21].values
# df['492-498 nucleotide'] = df['barcode'].str[21:42].values
# df['585-591 nucleotide'] = df['barcode'].str[42:].values

# df['452-458 aa'] = df['aa'].str[:7].values
# df['492-498 aa'] = df['aa'].str[7:14].values
# df['585-591 aa'] = df['aa'].str[14:].values

# df = df[['site', 'stage', '452-458 nucleotide', '492-498 nucleotide', '585-591 nucleotide', '452-458 aa', '492-498 aa', '585-591 aa', 'counts']]

The dataframe is saved as a CSV so that the translation and formatting steps above do not have to be rerun.

In [None]:
# df.to_csv('analysis/diversity.csv', index=False)

The dataframe can then be loaded from csv.

In [None]:
# Load the pre-processed table of sequences and counts
df = pd.read_csv(filepath_or_buffer='analysis/diversity.csv')

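We can then use the sequence at positions 492-498 to keep only the reads that are consistent with each library: 2-site (`2s`) reads must carry the unmodified AAV9 sequence `TVTQNNN` there, while 3-site (`3s`) reads are kept unless that region reads `GGGGGGG`.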

In [None]:
# 2-site rows are kept only when the 492-498 region equals 'TVTQNNN'
keep_2s = (df['site'] == '2s') & (df['492-498 aa'] == 'TVTQNNN')
# 3-site rows are kept unless the 492-498 region reads 'GGGGGGG'
# NOTE(review): presumably 'GGGGGGG' marks an unmodified template - confirm
keep_3s = (df['site'] == '3s') & (df['492-498 aa'] != 'GGGGGGG')

inds = keep_2s | keep_3s
df = df.loc[inds].reset_index(drop=True)
df.head()

In [None]:
# Library ('site' column) and stage ('stage' column) to analyse
library, stage = '2s', 'vg'

# Boolean mask over df picking out the rows for that library and stage
inds = df['stage'].eq(stage) & df['site'].eq(library)

First, we want to examine the depth of the library and the corresponding counts. To plot this as an ECDF, we subsample 10000 rows: a random sample of this size closely approximates the empirical cumulative distribution of the full data while keeping the plot small.

In [None]:
# Rows for the selected library and stage
df_sel = df.loc[inds]

# ECDF of the counts on a log x-axis. Draw at most 10000 rows:
# DataFrame.sample raises ValueError when asked for more rows than are
# available (sampling is without replacement), which would otherwise break
# this cell for small selections.
p = iqplot.ecdf(
    data=df_sel.sample(min(10000, len(df_sel))),
    q='counts',
    x_axis_type="log",
    style='staircase',
    palette="#5A2995",
)

# Render with the SVG backend
p.output_backend = 'svg'

bokeh.io.show(p)

We can then run each library and stage to determine a heatmap of the diversity at each position and the cumulative mutations within each sequence. First, we set the library and examine the Hamming distance within it.

To determine the Hamming distance, we compare each complete variant sequence (the three regions joined together) with the corresponding sequence of AAV9.

In [None]:
# Reference string: the three AAV9 regions joined in the same order as the
# columns that are concatenated below
aav9_vr = aav9_452 + aav9_492 + aav9_585

# Element-wise concatenation of the three region columns gives one string per
# row; the set keeps each distinct sequence once
first, middle, last = (
    df.loc[inds, column].values
    for column in ('452-458 aa', '492-498 aa', '585-591 aa')
)
lst = set(first + middle + last)

# Number of the 21 positions at which each sequence differs from AAV9
hamming_dist = {
    sequence: sum(sequence[pos] != aav9_vr[pos] for pos in range(21))
    for sequence in tqdm(lst)
}

df_hd = pd.DataFrame(hamming_dist.items(), columns=['sequence', 'hamming_dist'])

We can plot the Hamming distance as an ECDF to see the extent of the diversity.

In [None]:
# ECDF of the Hamming distances. Draw at most 5000 sequences:
# DataFrame.sample raises ValueError when asked for more rows than df_hd
# contains, which would otherwise break this cell for small libraries.
# NOTE(review): distances over 21 positions can reach 21, so the fixed
# x_range of (0, 15) hides any values above 15 - confirm this is intended.
p = iqplot.ecdf(
    df_hd.sample(min(5000, len(df_hd))),
    q='hamming_dist',
    style='staircase',
    palette="#5A2995",
    x_range=(0, 15),
)

# Render with the SVG backend
p.output_backend = 'svg'
bokeh.io.show(p)

We can then examine the diversity at each site by examining amino acid identity by position with a heatmap. To do this, we must examine each site individually. 

In [None]:
# Column holding the region to analyse, and the residue number of its first
# position (used to label the heatmap columns)
site, start = '452-458 aa', 452

We can loop through each sequence and tally, for every position, which residue appears there.

In [None]:
# One row per symbol (row index taken from AA_dict), one column per position
# of the seven-residue region
total_aa_matrix = np.zeros((21, 7), dtype=int)

for sequence in df.loc[inds, site]:
    for position in range(7):
        residue = sequence[position]
        # Characters that are not in the amino acid list are not counted
        if residue not in aa_list:
            continue
        total_aa_matrix[AA_dict[residue], position] += 1

This matrix can be converted to frequencies by dividing each column by its total, i.e. the sum over all amino acids at that position.

In [None]:
# Total number of counted residues at each of the seven positions
position_totals = total_aa_matrix.sum(axis=0)

# Broadcasting divides every column by its own total, so each column holds
# the residue frequencies at that position
aa_freq_matrix = total_aa_matrix / position_totals

We can convert this to a dataframe to enable heatmap plotting.

In [None]:
# Column labels: the seven residue numbers of the region, as strings
position_labels = np.arange(start, start + 7, 1).astype(str)

# Wide table: one row per amino acid, one column per position
df_hm = pd.DataFrame(
    aa_freq_matrix,
    index=pd.Index(aa_list, name='Amino Acid'),
    columns=pd.Index(position_labels, name='Position'),
)

# Long format: one row per (amino acid, position) pair, frequency in 'value'
df_hm = df_hm.stack().rename("value").reset_index()

Next, we can construct our heatmap plot. 

In [None]:
# Map frequencies linearly onto the Magma palette across the observed range
mapper = LinearColorMapper(
    palette='Magma256', low=df_hm['value'].min(), high=df_hm['value'].max())

# Define a figure with categorical axes: positions on x, amino acids on y.
# `width`/`height` are used because `plot_width`/`plot_height` were
# deprecated in Bokeh 2.4 and removed in Bokeh 3.0.
p = bokeh.plotting.figure(
    width=500,
    height=500,
    x_range=list(df_hm["Position"].drop_duplicates()),
    # Reversed so the first amino acid in the list is drawn at the top
    y_range=list(reversed(list(df_hm["Amino Acid"].drop_duplicates()))),
    toolbar_location='right',
    tools = "pan,wheel_zoom,box_zoom,reset,save",
    x_axis_location="above")

# Create one unit-sized rectangle per (position, amino acid) cell, coloured
# by its frequency
p.rect(
    x="Position",
    y="Amino Acid",
    width=1,
    height=1,
    source=ColumnDataSource(df_hm),
    line_color=None,
    fill_color=transform('value', mapper))

# Add a colour bar as the legend for the frequency scale
color_bar = ColorBar(
    color_mapper=mapper,
    location=(0, 0))

p.add_layout(color_bar, 'right')

# Render with the SVG backend
p.output_backend = 'svg'

bokeh.io.show(p)

These steps can be repeated for each library (`2s` or `3s`), stage (`ll`, linear library, or `vg`, viral genome), and site to generate the corresponding plots.