In [None]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
if sys.version_info[0] < 3:
    # The % formatting must be applied to the message string itself; applied to the
    # result of print(...) it fails whenever print is a function (it returns None).
    print("Python 3 is required, but you are using Python %i.%i.%i" % (
        sys.version_info[0], sys.version_info[1], sys.version_info[2]))
    sys.exit(1)

In [None]:
# Retrieve the specific functions from ind and proteins.py
indels_path = "/home/mp/InDelScanner"  # /PATH/TO/InDelScanner
# Register the InDelScanner checkout on the import path only once, so that
# re-running this cell does not append duplicate entries.
already_registered = indels_path in sys.path
if not already_registered:
    sys.path.append(indels_path)
from indels.ind import trim_read, findEnds, endMatch, findGap, gapAlign
from ipynb.fs.defs.Library_diversity import convert_variant_to_dict, single_fraction_enrichment

In [None]:
# All relative file names below are resolved against this working directory.
os.chdir("/mnt/c/Users/Maya/Documents/03_Kinases/")

# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('mek.pickle', 'rb') as mek_file:
    mek = pickle.load(mek_file)

with open('kinases_all_ref.pickle', 'rb') as ref_file:
    all_ref = pickle.load(ref_file)

In [None]:
# Residues allowed at each randomised position, as fixed by the SpliMLib library design.
# Keys are position labels; values are the lists of permitted residue symbols.
aa_2 = ['A', 'Δ']
aa_12 = list('AGPYDKMVILFW')
aa_13 = [*aa_12, 'Δ']
splimlib = {
    **{position: aa_12 for position in ('6', '9', '11', '13')},
    '7a': aa_13,
    '8a': aa_2,
}

### Examine the combined enrichment distributions

In [None]:
# Build the read-count table from the loaded `mek` dict, treat missing counts as 0,
# and rank rows by high-gate reads first, then medium, then low.
df_all = (
    pd.DataFrame.from_dict(mek)
    .fillna(0)
    .sort_values(by=['high', 'med', 'low-t'], ascending=False)
)

In [None]:
df_all.describe()

In [None]:
# Bar colours handed to the stacked read-count plots below, one per plotted column
# (presumably high / med / low-t in that order — TODO confirm against df_all's column order).
cols = ['mediumseagreen', 'gold', 'salmon']

In [None]:
# One panel per `split` consecutive rows of the sorted table; every 100th variant is drawn.
split = 10000
n = 5

fig, axes = plt.subplots(n, 1, figsize=(15, 4 * n))

for p, ax in enumerate(axes):
    start, stop = p * split, (p + 1) * split
    df_all.iloc[start:stop:100].plot.bar(stacked=True, ax=ax, color=cols)

    # Variant names are unreadable at this density, so the x tick labels are blanked.
    ax.get_xaxis().set_ticklabels([])
    title = 'Every 100th variant, from variant {} to variant {}'.format(start, stop)
    ax.set_title(title, position=(0.5, 0.9))

plt.tight_layout()
plt.savefig('Read_distribution.png', dpi=300)

plt.show()

The probability that a variant is truly 'active' decreases going down the plots, with a clear trend:
- the top 10K variants are 'active' with a high confidence, the appearance of variants in the medium and low gates is just experimental noise
- with somewhat more variability, the same can be said for variants 10K–20K, shown on the second plot. This covers variants with 100+ reads in the high gate
- looking at the last two plots, there is a high degree of variability in read distribution once we reach <20 reads in the high gate. There, further investigation is needed. Two avenues: a) check the variants from 20K onwards at higher resolution to pin down the cutoff high gate count, and b) take the variants with fewer high counts below that and re-sort according to medium gate distribution. Especially in the less active variants, that becomes the dominant factor.

In [None]:
# Zoom in on the table from row `offset` onwards: one panel per `split` rows, every 20th
# variant drawn. The start row is called `offset` (not `os`) so that the imported `os`
# module is not overwritten; otherwise re-running the os.chdir cell would fail.
offset = 20000
split = 2000
n = 5

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p in range(len(axes)):
    start = offset + p*split
    stop = offset + (p+1)*split
    df_all.iloc[start:stop:20].plot.bar(stacked=True, ax=axes[p], color=cols)

    # Variant names are unreadable at this density, so the x tick labels are blanked.
    axes[p].axes.get_xaxis().set_ticklabels([])
    title = 'Every 20th variant, from variant ' + str(start) + ' to variant ' + str(stop)
    axes[p].set_title(title, position=(0.5, 0.9))


plt.tight_layout()

plt.show()

These plots show that from variant 20K to variant 30K there is increasing variation in the frequencies in medium and low gates. On the whole, the precise activity level for some of the variants is less clear-cut because some are equally or more frequent in the medium gate - are they 'active' and depleted from the sort? Or are they truly less active variants?

So, variants with 50+ reads in high gate are 'active'. Let's add some mild filtering to these variants, requiring that the low-gate reads are below 50% of the combined medium + high reads (i.e. high + med > 2 × low-t).

In [None]:
# Variants with at least 50 high-gate reads whose combined high + medium reads
# exceed twice the low-gate reads.
# NOTE(review): the factor 2 admits low-t counts up to 50% of high + med, whereas the
# 10-49 read filter further down uses a factor of 5 (<20%) — confirm 2 is intended.
has_50_high = df_all['high'] >= 50
low_is_minor = (df_all['high'] + df_all['med']) > 2 * df_all['low-t']
df_50p = df_all.loc[has_50_high & low_is_minor]
df_50p.describe()

Then, let's try to stratify the distribution in variants below 50 reads in high gate.

In [None]:
# Zoom in on the table from row `offset` onwards: one panel per `split` rows, every 20th
# variant drawn. The start row is called `offset` (not `os`) so that the imported `os`
# module is not overwritten; otherwise re-running the os.chdir cell would fail.
offset = 30000
split = 2000
n = 3

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p in range(len(axes)):
    start = offset + p*split
    stop = offset + (p+1)*split
    df_all.iloc[start:stop:20].plot.bar(stacked=True, ax=axes[p], color=cols)

    # Variant names are unreadable at this density, so the x tick labels are blanked.
    axes[p].axes.get_xaxis().set_ticklabels([])
    title = 'Every 20th variant, from variant ' + str(start) + ' to variant ' + str(stop)
    axes[p].set_title(title, position=(0.5, 0.9))


plt.tight_layout()

plt.show()

Below 30 reads in high gate, things are starting to look iffy. Still, there is a proportion of variants with reads only (or nearly only) in high gate.
- if the high gate has ≥10 reads, accept the variant as long as <20% of high+medium gate reads appear in the low gate, and # high gate > # medium gate. Remkes, how many reads in medium gate are okay?

So let's pick out those variants.

In [None]:
# Variants with 10-49 high-gate reads (note: the lower bound is 10 despite the
# "20to50" name), more high than medium reads, and high + medium reads exceeding
# five times the low-gate reads.
high_10_to_49 = df_all['high'].isin(range(10, 50))
high_beats_med = df_all['high'] > df_all['med']
low_under_fifth = (df_all['high'] + df_all['med']) > 5 * df_all['low-t']
df_20to50 = df_all.loc[high_10_to_49 & high_beats_med & low_under_fifth]

In [None]:
df_20to50.describe()

In [None]:
# Plot the filtered 10-49 read variants: one panel per `split` rows, every 5th variant
# drawn. The start row is called `offset` (not `os`) so that the imported `os` module
# is not overwritten; otherwise re-running the os.chdir cell would fail.
offset = 0
split = 500
n = 5

fig, axes = plt.subplots(n, 1, figsize=(15, 4*n))

for p in range(len(axes)):
    start = offset + p*split
    stop = offset + (p+1)*split
    df_20to50.iloc[start:stop:5].plot.bar(stacked=True, ax=axes[p], color=cols)

    # Variant names are unreadable at this density, so the x tick labels are blanked.
    axes[p].axes.get_xaxis().set_ticklabels([])
    title = 'Every 5th variant, from variant ' + str(start) + ' to variant ' + str(stop)
    axes[p].set_title(title, position=(0.5, 0.9))


plt.tight_layout()

plt.show()

In [None]:
# Combine both filtered sets into the 'positive' dataset. DataFrame.append is
# deprecated and was removed in pandas 2.0; pd.concat stacks the rows the same way
# (original index labels kept, df_50p rows first).
df_pos = pd.concat([df_50p, df_20to50])
pos = df_pos.to_dict()
df_pos.describe()

In [None]:
df_pos.head()

#### Check if the final dataset contains any variants with mutations outside the designed positions

In [None]:
from ipynb.fs.defs.Library_diversity import convert_variant_to_dict

def n_altered_positions(short):
    """Return how many entries convert_variant_to_dict yields for a variant shorthand string."""
    by_position = convert_variant_to_dict(short)
    return len(by_position)

In [None]:
# Expose the index (variant shorthand) as a column, then count the positions
# recorded for each variant.
df_pos['short'] = df_pos.index
df_pos['nMuts'] = df_pos['short'].map(n_altered_positions)

In [None]:
df_pos[df_pos['nMuts'] > 6]

There are two such variants out of 32400, so we drop those.

In [None]:
# The two variants found above with more than six recorded positions.
# Only labels still present in the index are dropped, so this cell can be re-run
# safely; a plain drop() raises KeyError once the rows are already gone.
outside_design = ['6L/7aL/9A/9a*', '6L/7aI/9A/9a*']
df_pos = df_pos.drop(df_pos.index.intersection(outside_design))

Here, the filtering steps have been completed: the result is a dataset with 32.4K variants that we are confident to call active MEK1 variants. Next, I use these variants to explore the enrichment of different amino acids amongst positive variants.

### Identify the single mutants in this dataset

In [None]:
# Allowed residues per designed position (same content as the library design table
# defined near the top of the notebook).
aa_2 = ['A', 'Δ']
aa_12 = 'A G P Y D K M V I L F W'.split()
aa_13 = aa_12 + ['Δ']
pos_aa = dict([
    ('6', aa_12), ('9', aa_12), ('11', aa_12), ('13', aa_12),
    ('7a', aa_13), ('8a', aa_2),
])

In [None]:
def hamming_distance(s1, s2, splimlib):
    """Count the library positions at which two variants carry different residues.

    Args:
        s1, s2: dicts mapping position label -> residue symbol.
        splimlib: mapping whose keys are the positions to compare (its values are not used).

    Returns:
        int: the number of positions in ``splimlib`` where ``s1`` and ``s2`` differ,
        plus 1 if either dict does not have exactly ``len(splimlib)`` entries.
        0 means both variants have the expected size and match at every position.
    """
    # A variant with extra or missing positions is penalised once.
    d = 1 if (len(s1) != len(splimlib) or len(s2) != len(splimlib)) else 0

    for p in splimlib:
        # .get() so that a variant lacking a library position is counted as a mismatch
        # at that position instead of raising KeyError.
        if s1.get(p) != s2.get(p):
            d += 1
    return d

In [None]:
# Wild-type reference: shorthand name and residue per designed position.
wt_short = '6P/9I/11L/13P'
wt = dict(zip(
    ('6', '7a', '8a', '9', '11', '13'),
    ('P', 'Δ', 'Δ', 'I', 'L', 'P'),
))

In [None]:
# Shorthand names of all variants in the curated positive dataset, in table order.
positive_vars = list(df_pos.index)

In [None]:
# Keep the positive variants whose distance to wt is 0 or 1, i.e. the wt-matching
# variant (if present) and the point mutants.
single_pos_muts = [
    short
    for short in positive_vars
    if hamming_distance(convert_variant_to_dict(short), wt, wt) in (0, 1)
]
n_single = len(single_pos_muts)
print(n_single)

6: P,I = 2

7a: Δ,L,K = 3

9: I,P,F = 3

11: F,L,M,I = 4

13: P,I,L,W,V,F = 6


So the max number of additive variants would be 432 variants



In [None]:
2*3*3*4*6

### Check what reads distribution is in the other point mutants

In total, 57 point mutants relative to MKK1 wt sequence are possible. Of those, we observe 13 in the curated dataset.

In [None]:
11+12+1+11+11+11

In [None]:
single_pos_muts

In [None]:
def format_point_mutant(short):
    """Render a variant shorthand as a six-character label for figures.

    The residues at positions 6, 7a, 8a, 9, 11 and 13 (in that order) are
    concatenated, with 'Δ' written as '-'.
    """
    residue_at = convert_variant_to_dict(short)
    return ''.join(
        '-' if residue_at[position] == 'Δ' else residue_at[position]
        for position in ('6', '7a', '8a', '9', '11', '13')
    )

In [None]:
def add_nice_labels(df):
    """Return a copy of ``df`` with 'short' and 'fig_format' label columns added.

    Args:
        df: DataFrame indexed by variant shorthand.

    Returns:
        A new DataFrame with two extra columns: 'short' (the index values) and
        'fig_format' (the result of format_point_mutant on each shorthand).

    The input is copied first because callers pass a filtered slice of a larger
    frame; writing columns into such a slice triggers SettingWithCopyWarning and
    would otherwise modify the caller's object.
    """
    labelled = df.copy()
    labelled['short'] = labelled.index
    labelled['fig_format'] = labelled['short'].apply(format_point_mutant)
    return labelled

In [None]:
# Rows of the positive dataset that are single mutants, with figure labels attached.
is_single_mutant = df_pos.index.isin(single_pos_muts)
point_high_df = add_nice_labels(df_pos[is_single_mutant])

In [None]:
# Horizontal stacked bars: read counts of the point mutants in the active dataset.
fig, ax = plt.subplots(figsize=(3, 5))
point_high_df.plot(
    kind='barh',
    x='fig_format',
    stacked=True,
    color=cols,
    title='MKK1 point mutants in active dataset',
    ax=ax,
)
#plt.savefig('Point_mutants_active_h.svg', transparent=True)
plt.show()

In [None]:
# Vertical stacked bars: read counts of the point mutants in the active dataset.
fig, ax = plt.subplots(figsize=(5, 3))
point_high_df.plot(
    kind='bar',
    x='fig_format',
    stacked=True,
    color=cols,
    title='MKK1 point mutants in active dataset',
    ax=ax,
)
#plt.savefig('Point_mutants_active.svg', transparent=True)
plt.show()

In [None]:
def variant_is_allowed(m_by_pos, pos_aa):
    """Return True when a variant uses exactly the designed positions and residues.

    Args:
        m_by_pos: dict mapping position label -> residue symbol for one variant.
        pos_aa: dict mapping position label -> collection of residues allowed there.

    Returns:
        bool: False if the variant has a different number of positions than the
        design, names a position that is not in the design, or carries a residue
        that is not allowed at its position; True otherwise.
    """
    if len(m_by_pos) != len(pos_aa):
        return False
    # .get(p, ()) makes a position absent from the design a rejection rather than a
    # KeyError (possible when the counts match but the position labels differ).
    return all(aa in pos_aa.get(p, ()) for p, aa in m_by_pos.items())

In [None]:
all_vars = df_all.index.tolist()

# Point mutants (distance exactly 1 from wt) that conform to the library design
# but are absent from the positive single-mutant list.
single_neg_muts = []
for short in all_vars:
    s = convert_variant_to_dict(short)
    # Variants outside the design are treated as sequencing errors and skipped.
    conforms = variant_is_allowed(s, pos_aa)
    if conforms and hamming_distance(s, wt, wt) == 1 and short not in single_pos_muts:
        single_neg_muts.append(short)

n_neg_single = len(single_neg_muts)
print(n_neg_single)

In [None]:
print(single_neg_muts)

In [None]:
# Rows of the full table for the remaining point mutants, with figure labels attached.
is_neg_single = df_all.index.isin(single_neg_muts)
point_low_df = add_nice_labels(df_all[is_neg_single])

In [None]:
# Vertical stacked bars for the point mutants outside the active dataset; saved as SVG.
fig, ax = plt.subplots(figsize=(10, 3))
point_low_df.plot(
    kind='bar',
    x='fig_format',
    stacked=True,
    color=cols,
    title='MKK1 low activity point mutants',
    ax=ax,
)

# plt.xticks(rotation=45)
plt.savefig('Point_mutants_inactive.svg', transparent=True)
plt.show()

In [None]:
57-13-43

That leaves only a single point mutant unaccounted for.

In [None]:
# Horizontal version of the low-activity point mutant plot (not saved).
fig, ax = plt.subplots(figsize=(3, 10))
point_low_df.plot(
    kind='barh',
    x='fig_format',
    stacked=True,
    color=cols,
    title='MKK1 low activity point mutants',
    ax=ax,
)
#plt.savefig('Point_mutants_inactive.svg', transparent=True)
plt.show()

## Set up clean, full dataset for MAVE NN

In [None]:
df_all.describe()

In [None]:
df_all.head()

In [None]:
wt_short = '6P/9I/11L/13P'
wt = {
    '6': 'P', '7a': 'Δ', '8a': 'Δ',
    '9': 'I', '11': 'L', '13': 'P',
}

# Distance of every variant in the full table from wt, keyed by variant shorthand.
H_dist = {
    name: hamming_distance(wt, convert_variant_to_dict(name), splimlib)
    for name in df_all.index
}

In [None]:
# Summary of variants that pass at least one per-gate read threshold.
# Python's `or` between callables just returns the first one, so chaining lambdas
# with `or` applied only the high-gate condition; the three conditions have to be
# combined element-wise with `|` on boolean Series.
passes_any_gate = (df_all['high'] >= 10) | (df_all['med'] >= 5) | (df_all['low-t'] >= 3)
df_all.loc[passes_any_gate, :].describe()

In [None]:
# Per-gate subsets: variants reaching the minimum read count in the low (3),
# medium (5) and high (10) gate respectively.
l = df_all.loc[df_all['low-t'] >= 3, :]
m = df_all.loc[df_all['med'] >= 5, :]
h = df_all.loc[df_all['high'] >= 10, :]

In [None]:
l

In [None]:
# Union of the three per-gate subsets. A variant that passes more than one threshold
# is present in several of h, m and l (as identical rows taken from df_all), so the
# plain concatenation carries duplicate index labels and an inflated row count.
# Keep the first occurrence of each variant so the index is unique.
all_filtered_df = pd.concat([h, m, l], join='outer')
all_filtered_df = all_filtered_df.loc[~all_filtered_df.index.duplicated(keep='first')].copy()

In [None]:
wt_short = '6P/9I/11L/13P'
wt = {
    '6': 'P', '7a': 'Δ', '8a': 'Δ',
    '9': 'I', '11': 'L', '13': 'P',
}

# Distance of every threshold-passing variant from wt, keyed by variant shorthand.
H_dist = {
    name: hamming_distance(wt, convert_variant_to_dict(name), splimlib)
    for name in all_filtered_df.index
}

In [None]:
mutations_all_positions = {}
position_order = ['6', '7a', '8a', '9', '11', '13']
expected = {}

for n in all_filtered_df.index:
    s1 = convert_variant_to_dict(n)
    expected[n] = is_variant_in_expected_set(s1, splimlib)
    aa_seq = [p + s1[p] for p in position_order]
    mutations_all_positions[n] = '/'.join(aa_seq)


In [None]:
def is_variant_in_expected_set(m_by_pos, splimlib):
    for k, a in m_by_pos.items():
        if k not in splimlib.keys():
            return False
        elif a not in splimlib[k]:
            return False
    return True

In [None]:
# Attach the distance from wt; the Series is aligned to the frame on the variant index.
hamming_by_variant = pd.Series(H_dist)
all_filtered_df['Hamming'] = hamming_by_variant

In [None]:
# Attach the full per-position name; the Series is aligned to the frame on the variant index.
name_by_variant = pd.Series(mutations_all_positions)
all_filtered_df['Name'] = name_by_variant

In [None]:
# Attach the in-library flag; the Series is aligned to the frame on the variant index.
in_lib_by_variant = pd.Series(expected)
all_filtered_df['in_lib'] = in_lib_by_variant

In [None]:
all_filtered_df.shape

In [None]:
# Keep only variants inside the designed library, drop the helper flag column,
# and collapse rows that are exact duplicates of one another.
in_library = all_filtered_df['in_lib'] == True
df_splimlib = (
    all_filtered_df.loc[in_library]
    .drop(columns=['in_lib'])
    .drop_duplicates()
)

In [None]:
df_splimlib.to_csv('MKK1_all_SpliMLiB_variants.csv')

In [None]:
df_splimlib.shape