In [None]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
if sys.version_info[0] < 3:
    # Format the message before handing it to print(): applying % to the
    # *result* of a print() call fails wherever print is a function, because
    # print() returns None.
    print("Python 3 is required, but you are using Python %i.%i.%i"
          % tuple(sys.version_info[:3]))
    sys.exit(1)

In [None]:
# Retrieve the specific functions from ind and proteins.py
# The InDelScanner checkout must be importable. Its location can be overridden
# with the INDELSCANNER_PATH environment variable; the original hard-coded
# location is kept as the default so existing setups keep working.
indels_path = os.environ.get("INDELSCANNER_PATH", "/home/mp/InDelScanner")  # /PATH/TO/InDelScanner
if indels_path not in sys.path:
    sys.path.append(indels_path)
from indels.ind import trim_read, findEnds, endMatch, findGap, gapAlign
from ipynb.fs.defs.Library_diversity import convert_variant_to_dict, single_fraction_enrichment

In [None]:
# Work from the project data directory so the relative file names below resolve.
# NOTE(review): this is a machine-specific absolute path; it has to be adjusted
# before the notebook can run anywhere else.
os.chdir("/mnt/c/Users/Maya/Documents/03_Kinases/")

# `mek` is turned into the per-variant read-count table (df_all) further down.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that come from a trusted source.
with open('mek.pickle', 'rb') as f:
    mek = pickle.load(f)

# presumably reference data for all kinases (inferred from the file name only) - TODO confirm
with open('kinases_all_ref.pickle', 'rb') as f:
    all_ref = pickle.load(f)

In [None]:
# Set general restrictions stemming from SpliMLiB library design:
# for each randomised position, the residues allowed by the library
# ('Δ' stands for a deletion / absent residue).
aa_2 = ['A', 'Δ']
aa_12 = list('AGPYDKMVILFW')
aa_13 = [*aa_12, 'Δ']

# Positions 6, 9, 11 and 13 share the 12-residue set; 7a and 8a may also be deleted.
splimlib = {position: aa_12 for position in ('6', '9', '11', '13')}
splimlib.update({'7a': aa_13, '8a': aa_2})

In [None]:
# Build the read-count table: missing counts become 0 and rows are ordered by
# descending read count in the high, then medium, then low gate.
df_all = (
    pd.DataFrame.from_dict(mek)
    .fillna(0)
    .sort_values(by=['high', 'med', 'low-t'], ascending=False)
)

## Set up the dataset: collect SpliMLiB variants with ≥10/5/3 reads in H/M/L gates

In [None]:
# Preview the variants with the most high-gate reads (df_all is sorted descending).
df_all.head()

In [None]:
# Wild-type sequence at the randomised positions.
# NOTE(review): this short label omits positions 7a/8a; a later cell redefines
# wt_short with those positions included.
wt_short = '6P/9I/11L/13P'
wt = {
    '6': 'P',
    '7a': 'Δ',
    '8a': 'Δ',
    '9': 'I',
    '11': 'L',
    '13': 'P',
}

In [None]:
# Summarise every variant that passes at least one gate threshold
# (>=10 high OR >=5 medium OR >=3 low reads).
# The masks must be combined with the element-wise `|`: chaining callables with
# Python's `or` simply evaluates to the first callable, so only the high-gate
# filter would be applied.
df_all.loc[
    (df_all['high'] >= 10) | (df_all['med'] >= 5) | (df_all['low-t'] >= 3), :
].describe()

Select all variants that appear in the Venn circle diagram: 10+ reads in high gate OR 5+ in medium OR 3+ in low gate.

In [None]:
# One subset per gate, each with its own minimum read count.
l = df_all.loc[df_all['low-t'] >= 3, :]
m = df_all.loc[df_all['med'] >= 5, :]
h = df_all.loc[df_all['high'] >= 10, :]

In [None]:
# Stack the three gate subsets (high first, then medium, then low).
# A variant that passes more than one threshold appears once per subset here,
# so the index contains repeats; duplicates are removed later with
# drop_duplicates() when df_splimlib is built.
all_filtered_df = pd.concat([h, m, l], join='outer')

In [None]:
# Wild-type sequence at the randomised positions (same values as defined above;
# `wt` is the reference for the Hamming distance computed below).
wt_short = '6P/9I/11L/13P'
wt = {
    '6': 'P',
    '7a': 'Δ',
    '8a': 'Δ',
    '9': 'I',
    '11': 'L',
    '13': 'P',
}

Add Hamming distance for variants vs WT in this dataset.

In [None]:
def hamming_distance(s1, s2, splimlib):
    """Count the library positions at which two variants differ.

    Args:
        s1: mapping from position label to residue for the first variant.
        s2: mapping from position label to residue for the second variant.
        splimlib: mapping whose keys are the positions to compare.

    Returns:
        The number of positions in ``splimlib`` where ``s1`` and ``s2`` hold
        different residues, plus 1 if either mapping does not have exactly
        ``len(splimlib)`` entries. 0 means the residues match at all positions.

    Raises:
        KeyError: if a position in ``splimlib`` is missing from ``s1`` or ``s2``.
    """
    n_positions = len(splimlib)
    sizes_ok = len(s1) == n_positions and len(s2) == n_positions
    size_penalty = 0 if sizes_ok else 1
    mismatches = sum(1 for position in splimlib if s1[position] != s2[position])
    return size_penalty + mismatches

In [None]:
# Hamming distance of every filtered variant from the wild-type sequence,
# keyed by the variant's index label.
H_dist = {
    name: hamming_distance(wt, convert_variant_to_dict(name), splimlib)
    for name in all_filtered_df.index
}

# Assigning a Series aligns on the index, so repeated index labels all receive
# the same distance.
all_filtered_df['Hamming'] = pd.Series(H_dist)

Check that we are only considering variants that fit the expected SpliMLiB pattern.

In [None]:
def is_variant_in_expected_set(m_by_pos, splimlib):
    """Check whether a variant only uses positions and residues of the library.

    Args:
        m_by_pos: mapping from position label to the residue found there.
        splimlib: mapping from position label to the collection of residues
            allowed at that position.

    Returns:
        True if every position of ``m_by_pos`` is a key of ``splimlib`` and its
        residue is among the allowed ones (also True for an empty mapping);
        False otherwise.
    """
    return all(
        position in splimlib and residue in splimlib[position]
        for position, residue in m_by_pos.items()
    )

In [None]:
# For every filtered variant, record (a) whether it fits the library design and
# (b) a canonical 'position+residue' string joined by '/', in position order.
position_order = ['6', '7a', '8a', '9', '11', '13']
mutations_all_positions = {}
expected = {}

for name in all_filtered_df.index:
    residues = convert_variant_to_dict(name)
    expected[name] = is_variant_in_expected_set(residues, splimlib)
    mutations_all_positions[name] = '/'.join(
        position + residues[position] for position in position_order
    )


In [None]:
# Attach the canonical variant string; the Series is aligned on the index labels.
all_filtered_df['variants'] = pd.Series(mutations_all_positions)

In [None]:
# Flag the variants that match the expected library pattern (aligned on the index labels).
all_filtered_df['in_lib'] = pd.Series(expected)

In [None]:
# Size before restricting to library variants (still includes rows repeated by the concat).
all_filtered_df.shape

In [None]:
# Keep only variants that fit the library design, drop the helper flag, and
# collapse rows that are exact duplicates (a variant passing several gate
# thresholds was stacked once per gate subset).
df_splimlib = (
    all_filtered_df
    .loc[all_filtered_df['in_lib'].eq(True)]
    .drop(columns=['in_lib'])
    .drop_duplicates()
)

In [None]:
# Summary statistics of the library-only dataset.
df_splimlib.describe()

In [None]:
# First rows of the library-only dataset.
df_splimlib.head()

In [None]:
# Last rows of the library-only dataset.
df_splimlib.tail()

### Examine the combined enrichment distributions

In [None]:
# Convert the read counts into per-variant percentages (rounded to 3 decimals).
# Work on a deep copy so df_splimlib itself stays untouched.
df = df_splimlib.copy(deep=True)
df['total_reads'] = df[['high', 'med', 'low-t']].sum(axis=1)
df[['high_per', 'med_per', 'low_per']] = (
    df[['high', 'med', 'low-t']]
    .div(df['total_reads'], axis=0)
    .mul(100)
    .round(3)
)

In [None]:
# Sort the variants by descending share of reads in the high, then medium,
# then low gate. reset_index(drop=True) discards the previous index labels and
# numbers the rows 0..N-1 in the sorted order.
df = (
    df
    .sort_values(by=['high_per', 'med_per', 'low_per'], ascending=False)
    .reset_index(drop=True)
)

In [None]:
# NOTE(review): the preceding reset_index(drop=True) discards the old index
# instead of turning it into an 'index' column, so unless df already has a
# column literally named 'index' this rename has no effect - confirm intent.
df = df.rename(columns={'index': 'variant'})

In [None]:
# Variants with the largest share of reads in the high gate.
df.head()

Let's have a look at the WT values.

In [None]:
# Wild-type label in the same full format as the 'variants' column
# (all six positions, including the deleted 7a/8a).
wt_short = '6P/7aΔ/8aΔ/9I/11L/13P'
wt = {
    '6': 'P',
    '7a': 'Δ',
    '8a': 'Δ',
    '9': 'I',
    '11': 'L',
    '13': 'P',
}
df.loc[df['variants'].eq(wt_short)]

In [None]:
# Look up the 9A/11A double mutant (the same string is later stored as
# ilaa_short; presumably the negative control mentioned in the text - TODO confirm).
df.loc[df['variants'] == '6P/7aΔ/8aΔ/9A/11A/13P']

Next let's check the relationship between the high gate cutoff and the number of active variants.

In [None]:
from collections import OrderedDict

In [None]:
# For each candidate minimum high-gate read count (5..200 inclusive), count
# how many variants reach it. Keys stay in increasing cutoff order.
cutoff_choices = OrderedDict(
    (min_reads, len(df.loc[df['high'] >= min_reads]))
    for min_reads in range(5, 201)
)


In [None]:
fig, ax = plt.subplots(1, 1, figsize=(5, 5))
ax.plot(cutoff_choices.keys(), cutoff_choices.values())

# Pin the lower y limit to zero while keeping the auto-scaled upper limit.
y_lims = ax.get_ylim()
ax.set_ylim([0, y_lims[1]])

# Mark two candidate cutoffs. ymax is a fraction of the axis height, so each
# count is divided by the upper y limit.
ax.axvline(x=10, ymin=0, ymax=cutoff_choices[10] / y_lims[1], color='c', label='10+')
ax.axvline(x=51, ymin=0, ymax=cutoff_choices[51] / y_lims[1], color='m', label='51+')
ax.legend()

ax.set(
    xlabel='Minimum number of reads in high gate',
    ylabel='Number of active variants',
)

plt.show()

In [None]:
# Bar colours for the stacked plots below, applied in the plotted column order
# (high_per, med_per, low_per).
cols = ['mediumseagreen', 'gold', 'salmon']

In [None]:
# Stacked read-share bars for the first n*split variants of the sorted table:
# one panel per block of `split` rows, showing every 100th variant.
split = 10000
n = 5

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p, panel in enumerate(axes):
    start, stop = p*split, (p+1)*split
    df[['high_per', 'med_per', 'low_per']].iloc[start:stop:100].plot.bar(
        stacked=True, ax=panel, color=cols)
    panel.get_xaxis().set_ticklabels([])
    title = 'Every 100th variant, from variant ' + str(start) + ' to variant ' + str(stop)
    panel.set_title(title, position=(0.5, 0.9))

plt.tight_layout()
#plt.savefig('Read_distribution.png', dpi=300)

plt.show()

In [None]:
# Row at position 30,000 of the table sorted by descending high-gate share.
df.iloc[30000]

In [None]:
# Row at position 50,000 of the table sorted by descending high-gate share.
df.iloc[50000]

The probability that a variant is truly 'active' decreases going down the plots, with a clear trend:
- the top 30K variants have >70% reads in the high gate, thus we can consider them to have good activity against ERK2
- after the top 50K (sorted by % reads in high gate), there are <15% reads in high gate. However, there are many variants with a large proportion of reads in the medium gate, which are likely to have WT-like or slightly lower activity.

How to proceed? Construct two datasets for analysis:
- top: all variants with >70% reads in high gate, regardless of the other bins; this gives 30,534 variants.
- high: variants with >30% reads in high gate and few low reads, exact parameters to be determined

In [None]:
# Top dataset: at least 70% of the reads fall in the high gate.
df_top = df.loc[df['high_per'].ge(70)]

In [None]:
# Summary statistics of the Top dataset (the 'count' row gives its size).
df_top.describe()

The Top dataset has 30,534 variants.

In [None]:
# Variants just below the Top dataset (30% <= high share < 70%), ordered by
# ascending low-, then medium-, then high-gate share.
df_plot = (
    df.loc[df['high_per'].lt(70) & df['high_per'].ge(30)]
    .sort_values(by=['low_per', 'med_per', 'high_per'], ascending=True)
)

#os = 30000
split = 2000
n = 5

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p, panel in enumerate(axes):
    start, stop = p*split, (p+1)*split
    df_plot[['high_per', 'med_per', 'low_per']].iloc[start:stop:20].plot.bar(
        stacked=True, ax=panel, color=cols)
    #df_plot[['high_per', 'med_per', 'low_per']].iloc[os+start:os+stop:20].plot.bar(stacked=True, ax=panel, color = cols)

    panel.get_xaxis().set_ticklabels([])
    title = 'Below top dataset: every 20th variant'
    panel.set_title(title, position=(0.5, 0.9))

plt.tight_layout()

plt.show()

Again, there is no clear cutoff. Still, the variants with <40% reads in the low bin seem to have 50-60% of the reads in the high gate, which is again indicative of strong activity. Thus, define:
- high: 40% <= x < 70% reads in the high gate, <=40% reads in the low gate

In [None]:
# High dataset: 40% <= high share < 70% and at most 40% of reads in the low gate.
df_high = df.loc[
    df['high_per'].lt(70)
    & df['high_per'].ge(40)
    & df['low_per'].le(40)
]

In [None]:
# Summary statistics of the High dataset (the 'count' row gives its size).
df_high.describe()

The High dataset (likely active, but more borderline, variants) contains 5,717 variants.

Next, let's look at the opposite end of the distribution and find the very negative variants, building a Bottom dataset.

In [None]:
# Candidates for the Bottom dataset: at least 95% of reads in the low gate,
# ordered by descending low-, then medium-, then high-gate share.
df_plot = (
    df.loc[df['low_per'].ge(95)]
    .sort_values(by=['low_per', 'med_per', 'high_per'], ascending=False)
)

In [None]:
# Summary statistics of the >=95% low-gate variants (the 'count' row gives their number).
df_plot.describe()

In [None]:
# Stacked read-share bars for the >=95% low-gate variants: one panel per block
# of `split` rows, showing every 1000th variant.
split = 100000
n = 3

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p, panel in enumerate(axes):
    start, stop = p*split, (p+1)*split
    df_plot[['high_per', 'med_per', 'low_per']].iloc[start:stop:1000].plot.bar(
        stacked=True, ax=panel, color=cols)
    panel.get_xaxis().set_ticklabels([])
    title = 'Every 1000th variant, from variant ' + str(start) + ' to variant ' + str(stop)
    panel.set_title(title, position=(0.5, 0.9))

plt.tight_layout()
#plt.savefig('Read_distribution.png', dpi=300)

plt.show()

In this dataset, we have 266,628 variants that are clearly without activity in this assay (95+ percent reads in the low gate; even the negative control variant has 23% in the medium gate).

In [None]:
# Read distribution of the 9A/11A double mutant.
ilaa_short = '6P/7aΔ/8aΔ/9A/11A/13P'
df.loc[df['variants'].eq(ilaa_short)]

In [None]:
# Bottom dataset: at least 95% of reads in the low gate, most extreme first.
df_bottom = (
    df.loc[df['low_per'].ge(95)]
    .sort_values(by=['low_per', 'med_per', 'high_per'], ascending=False)
)

In [None]:
# Report the size of each dataset defined so far.
for label, subset in (
    ('All: ', df_splimlib),
    ('Top: ', df_top),
    ('High: ', df_high),
    ('Bottom: ', df_bottom),
):
    print(label, len(subset))

Thus, we have 203,078 variants left to classify. These are going to contain the mostly-negative variants (75%-95% low gate, similar to negative control variant), as well as the in-between medium variants.

Next, construct the fourth dataset of mostly-negative variants

In [None]:
# Mostly-negative candidates: 75% <= low share < 95%, ordered by descending
# low-, then medium-, then high-gate share.
df_plot = (
    df.loc[df['low_per'].ge(75) & df['low_per'].lt(95)]
    .sort_values(by=['low_per', 'med_per', 'high_per'], ascending=False)
)

In [None]:
# Number of variants with 75% <= low share < 95%.
len(df_plot)

In [None]:
# Stacked read-share bars for the 75-95% low-gate variants: one panel per block
# of `split` rows, showing every 100th variant.
split = 9000
n = 7

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p, panel in enumerate(axes):
    start, stop = p*split, (p+1)*split
    df_plot[['high_per', 'med_per', 'low_per']].iloc[start:stop:100].plot.bar(
        stacked=True, ax=panel, color=cols)
    panel.get_xaxis().set_ticklabels([])
    title = 'Every 100th variant, from variant ' + str(start) + ' to variant ' + str(stop)
    panel.set_title(title, position=(0.5, 0.9))

plt.tight_layout()
#plt.savefig('Read_distribution.png', dpi=300)

plt.show()

This looks reasonable. Now complete the stratification into five sets by labelling the intermediate variants.

In [None]:
# Default label; rows matching the bottom/low/top/high criteria are relabelled in the next cell.
df['set'] = 'medium'

In [None]:
# Relabel the rows that meet one of the four explicit criteria; every other row
# keeps the label it already has.
df.loc[df['low_per'].ge(95), 'set'] = 'bottom'
df.loc[df['low_per'].ge(75) & df['low_per'].lt(95), 'set'] = 'low'
df.loc[df['high_per'].ge(70), 'set'] = 'top'
df.loc[
    df['high_per'].lt(70) & df['high_per'].ge(40) & df['low_per'].le(40),
    'set',
] = 'high'

In [None]:
# Numeric version of the set label, using the same criteria:
# 1 = bottom, 2 = low, 3 = medium (default), 4 = high, 5 = top.
df['set_numeric'] = 3
df.loc[df['low_per'].ge(95), 'set_numeric'] = 1
df.loc[df['low_per'].ge(75) & df['low_per'].lt(95), 'set_numeric'] = 2
df.loc[df['high_per'].ge(70), 'set_numeric'] = 5
df.loc[
    df['high_per'].lt(70) & df['high_per'].ge(40) & df['low_per'].le(40),
    'set_numeric',
] = 4

In [None]:
# Count, number of distinct labels, and the most frequent set label.
df['set'].describe()

In [None]:
# Share of variants in each set.
df['set'].value_counts().plot.pie()
plt.show()

### Identify the distribution of single mutants across this distribution

In [None]:
# Single mutants (Hamming distance 1 from wild type), most active set first.
point = df.loc[df['Hamming'].eq(1)].sort_values(by='set_numeric', ascending=False)

In [None]:
# Number of single mutants in each set, then the overall total.
for label, set_name in (
    ('Top: ', 'top'),
    ('High: ', 'high'),
    ('Medium: ', 'medium'),
    ('Low: ', 'low'),
    ('Bottom: ', 'bottom'),
):
    print(label, len(point.loc[point['set'] == set_name]))
print('All: ', len(point))

In total, 57 point mutants relative to MKK1 wt sequence are possible; 53 of these have enough sequencing reads in at least one FACS gate.

The WT sequence is in the High set.

In [None]:
# Single mutants classified as 'top'.
point.loc[point['set'] == 'top']

The top set contains conservative mutations, largely large hydrophobic residues:
- 6: I (shifting the Φ-X-Φ motif to the start)
- 7a: K (extending the basic patch) or L (making an L-I-L motif in 7a/9/11)
- 8a: Δ (wt) only, indicating that the A insertion is contingent on other reorganisation
- 9: P or F, both large and hydrophobic
- 11: F or I, both large and hydrophobic
- 13: I, W, L, F, all large and hydrophobic

In [None]:
# Single mutants classified as 'high'.
point.loc[point['set'] == 'high']

In the high set, which also contains WT, we add:
- 11: M, W
- 13: V

In [None]:
# Single mutants classified as 'medium'.
point.loc[point['set'] == 'medium']

The medium set is the most diverse:
- 6: P + W, G, A, Y, V
- 7a: Δ + V, M
- 8a: Δ + A
- 9: I + V, Y, L, M
- 11: L + K, V
- 13: P + D, K, M, A, G

In [None]:
# Single mutants classified as 'low'.
point.loc[point['set'] == 'low']

In the low set we see mostly mutations at the start of the D domain, which disrupt the basic patch at the start, while the hydrophobic core is largely intact:
- 6: M, L, K, F, D
- 7a: I
- 8a: 
- 9: A
- 11: P
- 13: Y

In [None]:
# Single mutants classified as 'bottom'.
point.loc[point['set'] == 'bottom']

The bottom set introduces small or charged amino acids at the three positions that make the core of the Φ-X-Φ motif:
- 6: 
- 7a: P, G, D, F
- 8a: 
- 9: D, G, K, W
- 11: A, D, G
- 13: 

In [None]:
# Final table, now including the 'set' and 'set_numeric' columns.
df.head()

In [None]:
# Write the classified table as a zip-compressed CSV. The compression is stated
# explicitly instead of being inferred from the '.zip' suffix, and the archive
# member gets a '.csv' name (by default its name is derived from the archive
# file name). pd.read_csv('splimlib.zip') reads it back directly.
# Dict-valued `compression` requires pandas >= 0.25.
df.to_csv(
    'splimlib.zip',
    compression={'method': 'zip', 'archive_name': 'splimlib.csv'},
)