1. prepare the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# constant & dataframe

# Reference 6-residue sequence; every other sequence is described as a set of
# substitutions relative to this one.
prime_seq = "RAQLSQ"
# Site label for each of the six positions of prime_seq, in order; it is
# embedded in the mutation strings (e.g. position index 2 -> "SQ143L").
cite_list = ['74', '101', '143', '148', '173', '176']
# The 20 one-letter residue codes used when enumerating candidate sequences.
aa_order = ['L', 'S', 'E', 'A', 'G', 'P', 'V', 'K', 'R', 'T', 'D', 'Q', 'I', 'F', 'N', 'Y', 'H', 'C', 'M', 'W']

In [None]:
# Load the training table; later cells read its 'Sequence', 'Activity' and
# 'Selectivity' columns.
training_df = pd.read_csv('Mut_compe/training.csv', sep = ',')

In [None]:
# Rows of three measured variants that differ from prime_seq only at the third
# residue (Q -> L, R, V); they seed the imputed rows created in the next cell.
row_3L = training_df[training_df['Sequence'] == "RALLSQ"]
row_3R = training_df[training_df['Sequence'] == "RARLSQ"]
row_3V = training_df[training_df['Sequence'] == "RAVLSQ"]

In [None]:
# Impute values for three third-position variants (I, K, M) that have no measurement.
# k1 and k2 are hand-picked offsets added to the neighbouring variants' values.
k1,k2 = 0.05,0.1

# Each appended row is [id, sequence, activity, selectivity], derived from the
# L / R / V rows looked up above.
# NOTE(review): the lists are positional, so this assumes the CSV has exactly
# these four columns in this order - confirm against training.csv.
# NOTE(review): the cell is not idempotent; it appends at len(training_df) on
# every execution.
training_df.loc[len(training_df)] = [1594,"RAILSQ", (row_3L['Activity'].values[0]+row_3V['Activity'].values[0])*0.5, row_3L['Selectivity'].values[0]+k1]
training_df.loc[len(training_df)] = [1595,"RAKLSQ", row_3R['Activity'].values[0]+k1, row_3R['Selectivity'].values[0]+k2]
training_df.loc[len(training_df)] = [1596,"RAMLSQ", row_3V['Activity'].values[0]*0.5 + 0.5, row_3L['Selectivity'].values[0]*0.5+0.5]

In [None]:
# Natural-log transform of both targets (np.log is base e despite the "lg"
# prefix); predictions are mapped back with np.exp further down.
training_df['lg_Act'] = np.log(training_df['Activity'])
training_df['lg_Sel'] = np.log(training_df['Selectivity'])

In [None]:
# Display the training table including the imputed rows and log columns.
training_df

In [None]:
# Show the variant(s) whose selectivity is 8.816. A tolerance comparison is
# used instead of `==` because a float parsed from CSV is not guaranteed to be
# bit-identical to the Python literal.
training_df[np.isclose(training_df['Selectivity'], 8.816)]

In [None]:
# Activity vs. selectivity on the measured scale.
plt.scatter(training_df['Activity'],training_df['Selectivity'])
plt.xlabel("Activity")
plt.ylabel("Selectivity")
plt.show()

# The same relationship on the natural-log scale used for model fitting.
plt.scatter(training_df['lg_Act'],training_df['lg_Sel'])
plt.xlabel("lg_Act")
plt.ylabel("lg_Sel")
plt.show()

In [None]:
def get_mut_description(col_index, mut_aa):
    """Describe one position of a variant relative to ``prime_seq``.

    Args:
        col_index: zero-based position within the sequence.
        mut_aa: residue found at that position in the variant.

    Returns:
        ``""`` when the residue equals the reference residue; otherwise the
        substitution token ``"S<ref><site><mut>:"`` (note the trailing colon,
        which the caller uses as a separator).
    """
    ref_aa = prime_seq[col_index]
    if mut_aa != ref_aa:
        return f"S{ref_aa}{cite_list[col_index]}{mut_aa}:"
    return ""

In [None]:
def get_seq_description(seq):
    """Build the colon-separated substitution string for a 6-residue sequence.

    Args:
        seq: sequence whose first six residues are compared to the reference.

    Returns:
        The per-position substitution tokens concatenated with the final
        trailing colon removed, or ``np.nan`` when no position differs from
        the reference.
    """
    tokens = [get_mut_description(pos, seq[pos]) for pos in range(6)]
    joined = "".join(tokens)
    # Each token ends with ':'; strip the last one so the result has no trailing separator.
    return joined[:-1] if joined else np.nan

In [None]:
# Mutation string (relative to prime_seq) for every training sequence.
training_df['description'] = training_df['Sequence'].apply(get_seq_description)

2. prepare the model

from https://github.com/usnistgov/lantern.git


In [None]:
# Two-column frames fed to the lantern Dataset below: the mutation string plus
# one log-transformed target each (activity, selectivity).
predata_act = training_df[['description', 'lg_Act']]
predata_sel = training_df[['description', 'lg_Sel']]

In [None]:
# Rename the columns before building the lantern Dataset.
# NOTE(review): presumably 'substitutions' / 'phenotype' are the column names
# Dataset looks for by default - confirm against the lantern docs.
predata_act.columns = ['substitutions', 'phenotype']
predata_sel.columns = ['substitutions', 'phenotype']

In [None]:
from lantern.dataset import Dataset
# Wrap the activity frame in a lantern Dataset; later cells use its tokenizer
# and slice it to obtain the (X, y) tensors.
ds_act = Dataset(predata_act)
ds_act

3. build the model

In [None]:
# Number of latent dimensions passed (as K) to the activity basis and surface.
K_act = 6

In [None]:
from lantern.model.basis import VariationalBasis

# Basis for the activity model: maps tokenized mutations to latent coordinates
# (it is called as model_act.basis(X) further down).
basis_act = VariationalBasis.fromDataset(ds_act, K=K_act)

In [None]:
from lantern.model.surface import Phenotype

# Surface for the activity model: maps latent coordinates to the phenotype.
# NOTE(review): Ni is presumably the number of inducing points and inducScale
# their initial spread - confirm against the lantern docs.
surface_act = Phenotype.fromDataset(ds_act, K=K_act, Ni=200, inducScale=1.0)

In [None]:
from lantern.model import Model
from lantern.model.likelihood import GaussianLikelihood

# Combine basis, surface and a Gaussian likelihood into the activity model.
model_act = Model(basis_act, surface_act, GaussianLikelihood())

4. train the model

In [None]:
from torch.optim import Adam

# Loss object for the activity model; N is set to the dataset size.
loss_act = model_act.loss(N=len(ds_act))
# Full-batch training: take every observation at once.
Xtrain, ytrain = ds_act[: len(ds_act)]

# Number of optimisation steps.
E = 3000
optimizer_act = Adam(loss_act.parameters(), lr=0.01)
# Total loss recorded at every step.
hist_act = []
# Mean of basis_act.qalpha at every step, one column per latent dimension.
halpha = np.zeros((E, K_act))
# NOTE(review): no random seed is set in the visible cells, so results may
# differ between runs.

for i in range(E):

    optimizer_act.zero_grad()
    yhat = model_act(Xtrain)
    lss_act = loss_act(yhat, ytrain)
    # The loss comes back as a mapping of named terms; optimise their sum.
    total_act = sum(lss_act.values())
    total_act.backward()
    optimizer_act.step()

    hist_act.append(total_act.item())
    halpha[i, :] = basis_act.qalpha(detach=True).mean.numpy()


# Loss trajectory over the optimisation steps.
plt.figure(figsize=(4, 3), dpi=300)
plt.plot(hist_act)
plt.xlabel("epoch")
plt.ylabel("loss_act")

In [None]:
plt.figure(figsize=(4, 3), dpi=300)
# Reciprocal of the tracked qalpha means, one line per latent dimension, on a
# log y-axis.
# NOTE(review): the axis is labelled "variance", i.e. alpha is presumably a
# precision - confirm.
plt.plot(1/halpha)
plt.xlabel("epoch")
plt.ylabel("variance_act")
plt.semilogy()

# A trailing None keeps the cell from echoing the last call's return value.
None

5. analyze the model

The dimensionality calculations require the trained model and dataset. The number displayed as output is the total number of dimensions found in the model (this is also available as the attribute `K` of the returned `Dimensionality` object).

In [None]:
from lantern.model import dimensionality

# Dimensionality analysis of the trained activity model on its dataset.
dim_act = dimensionality(model_act, ds_act)
dim_act

To view the statistics used to determine the dimensionality (see LANTERN's associated manuscript for more details), there is a diagnostic plot available:

In [None]:
# Diagnostic plot of the statistics behind the activity dimensionality estimate.
dim_act.plotStatistics(nrow=1)

Finally, to see the variance learned for each dimension (with circles representing dimensions included according to the determined dimensionality), run:

In [None]:
# Learned variance per latent dimension of the activity basis.
dim_act.plotVariance(model_act.basis)

2. prepare the model(sel)

In [None]:
from lantern.dataset import Dataset
# Wrap the selectivity frame in a lantern Dataset; later cells use its
# tokenizer and slice it to obtain the (X, y) tensors.
ds_sel = Dataset(predata_sel)
ds_sel

3. build the model(sel)

In [None]:
# Number of latent dimensions passed (as K) to the selectivity basis and surface.
K_sel = 6

In [None]:
from lantern.model.basis import VariationalBasis

# Basis for the selectivity model: maps tokenized mutations to latent
# coordinates (it is called as model_sel.basis(X) further down).
basis_sel = VariationalBasis.fromDataset(ds_sel, K=K_sel)

In [None]:
from lantern.model.surface import Phenotype

# Surface for the selectivity model: maps latent coordinates to the phenotype.
# NOTE(review): Ni is presumably the number of inducing points and inducScale
# their initial spread - confirm against the lantern docs.
surface_sel = Phenotype.fromDataset(ds_sel, K=K_sel, Ni=200, inducScale=1.0)

In [None]:
from lantern.model import Model
from lantern.model.likelihood import GaussianLikelihood

# Combine basis, surface and a Gaussian likelihood into the selectivity model.
model_sel = Model(basis_sel, surface_sel, GaussianLikelihood())

4. train the model(sel)

In [None]:
from torch.optim import Adam

# Loss object for the selectivity model; N is set to the dataset size.
loss_sel = model_sel.loss(N=len(ds_sel))
# Full-batch training: take every observation at once.
Xtrain2, ytrain2 = ds_sel[: len(ds_sel)]

# Number of optimisation steps.
E = 3000
optimizer_sel = Adam(loss_sel.parameters(), lr=0.01)
# Total loss recorded at every step.
hist_sel = []
# Mean of basis_sel.qalpha at every step, one column per latent dimension.
halpha2 = np.zeros((E, K_sel))
# NOTE(review): no random seed is set in the visible cells, so results may
# differ between runs.

for i in range(E):

    optimizer_sel.zero_grad()
    yhat = model_sel(Xtrain2)
    lss = loss_sel(yhat, ytrain2)
    # The loss comes back as a mapping of named terms; optimise their sum.
    total_sel = sum(lss.values())
    total_sel.backward()
    optimizer_sel.step()

    hist_sel.append(total_sel.item())
    halpha2[i, :] = basis_sel.qalpha(detach=True).mean.numpy()

# Loss trajectory over the optimisation steps.
plt.figure(figsize=(4, 3), dpi=300)
plt.plot(hist_sel)
plt.xlabel("epoch")
plt.ylabel("loss_sel")

In [None]:
plt.figure(figsize=(4, 3), dpi=300)
# Reciprocal of the tracked qalpha means, one line per latent dimension, on a
# log y-axis.
# NOTE(review): the axis is labelled "variance", i.e. alpha is presumably a
# precision - confirm.
plt.plot(1/halpha2)
plt.xlabel("epoch")
plt.ylabel("variance_sel")
plt.semilogy()

# A trailing None keeps the cell from echoing the last call's return value.
None

In [None]:
# Dimensionality analysis of the trained selectivity model on its dataset
# (`dimensionality` was imported in the activity analysis cell).
dim_sel = dimensionality(model_sel, ds_sel)
dim_sel

To view the statistics used to determine the dimensionality (see LANTERN's associated manuscript for more details), there is a diagnostic plot available:

In [None]:
# Diagnostic plot of the statistics behind the selectivity dimensionality estimate.
dim_sel.plotStatistics(nrow=1)

Finally, to see the variance learned for each dimension (with circles representing dimensions included according to the determined dimensionality), run:

In [None]:
# Learned variance per latent dimension of the selectivity basis.
dim_sel.plotVariance(model_sel.basis)

5. Evaluate the model(act)

In [None]:
import numpy as np
import torch

# Indices of the first two latent dimensions in the activity basis' ordering.
z1, z2 = model_act.basis.order[:2]

# Tokenized mutations and targets (log activity) for all observations.
X, y = ds_act[:len(ds_act)]

# Latent embedding of every training point.
with torch.no_grad():
    Z = model_act.basis(X)

# to filter outliers, only plot the points within [q/2, 1-q/2] quantile of each latent dimension
q = 0.01

# number of surface points per axis
N = 100

# the meshgrid is used for surface plotting
Z1, Z2 = np.meshgrid(
    np.linspace(np.quantile(Z[:, z1], q/2), np.quantile(Z[:, z1], 1 - q/2), N),
    np.linspace(np.quantile(Z[:, z2], q/2), np.quantile(Z[:, z2], 1 - q/2), N )
)

# Grid points in latent space: z1/z2 vary over the mesh, all other dimensions stay 0.
Zpred = torch.zeros(N**2, model_act.basis.K)
Zpred[:, z1] = torch.from_numpy(Z1.ravel())
Zpred[:, z2] = torch.from_numpy(Z2.ravel())

# predict the surface
with torch.no_grad():
    fpred = model_act.surface(Zpred)
    
# Map the predicted mean through mean/std of the log-activity column and reshape for plotting.
# NOTE(review): this affine map is only right if the Dataset standardises the
# phenotype internally. No visible cell standardises lg_Act, and the prediction
# cells apply np.exp to the raw model output - confirm which is correct.
f = fpred.mean * predata_act["phenotype"].std() + predata_act["phenotype"].mean()
f = f.reshape(Z1.shape)

# Apply the same map to the observed targets used for colouring the scatter.
y = y * predata_act["phenotype"].std() + predata_act["phenotype"].mean()

In [None]:
plt.figure(figsize=(3,2), dpi=300)

# Filled contour of the predicted activity surface over the two plotted latent dimensions.
im = plt.contourf(Z1, Z2, f, levels=8)
plt.xlabel("$z_1$")
plt.ylabel("$z_2$")
plt.colorbar()

In [None]:
plt.figure(figsize=(3,2), dpi=300)

# Contour lines of the predicted surface with the training points overlaid at
# their latent coordinates, coloured by the rescaled target y.
im = plt.contour(Z1, Z2, f, levels=8)
plt.scatter(Z[:, z1].numpy(), Z[:, z2].numpy(), c = y, alpha=0.8, s=0.3, rasterized=True)

# re-apply the limits
plt.xlim(np.quantile(Z[:, z1], q/2), np.quantile(Z[:, z1], 1 - q/2))
plt.ylim(np.quantile(Z[:, z2], q/2), np.quantile(Z[:, z2], 1 - q/2))

plt.xlabel("$z_1$")
plt.ylabel("$z_2$")
plt.colorbar()

5. Evaluate the model(sel)

In [None]:
import numpy as np
import torch

# Indices of the first two latent dimensions in the selectivity basis' ordering.
z1, z2 = model_sel.basis.order[:2]

# Tokenized mutations and targets (log selectivity) for all observations.
X, y = ds_sel[:len(ds_sel)]

# Latent embedding of every training point.
with torch.no_grad():
    Z = model_sel.basis(X)

# to filter outliers, only plot the points within [q/2, 1-q/2] quantile of each latent dimension
q = 0.01

# number of surface points per axis
N = 100

# the meshgrid is used for surface plotting
Z1, Z2 = np.meshgrid(
    np.linspace(np.quantile(Z[:, z1], q/2), np.quantile(Z[:, z1], 1 - q/2), N),
    np.linspace(np.quantile(Z[:, z2], q/2), np.quantile(Z[:, z2], 1 - q/2), N )
)

# Grid points in latent space: z1/z2 vary over the mesh, all other dimensions stay 0.
Zpred = torch.zeros(N**2, model_sel.basis.K)
Zpred[:, z1] = torch.from_numpy(Z1.ravel())
Zpred[:, z2] = torch.from_numpy(Z2.ravel())

# predict the surface
with torch.no_grad():
    fpred = model_sel.surface(Zpred)
    
# Map the predicted mean through mean/std of the log-selectivity column and reshape for plotting.
# NOTE(review): this affine map is only right if the Dataset standardises the
# phenotype internally. No visible cell standardises lg_Sel, and the prediction
# cells apply np.exp to the raw model output - confirm which is correct.
f = fpred.mean * predata_sel["phenotype"].std() + predata_sel["phenotype"].mean()
f = f.reshape(Z1.shape)

# Apply the same map to the observed targets used for colouring the scatter.
y = y * predata_sel["phenotype"].std() + predata_sel["phenotype"].mean()

In [None]:
plt.figure(figsize=(3,2), dpi=300)

# Filled contour of the predicted selectivity surface over the two plotted latent dimensions.
im = plt.contourf(Z1, Z2, f, levels=8)
plt.xlabel("$z_1$")
plt.ylabel("$z_2$")
plt.colorbar()

In [None]:
plt.figure(figsize=(3,2), dpi=300)

# Contour lines of the predicted surface with the training points overlaid at
# their latent coordinates, coloured by the rescaled target y.
im = plt.contour(Z1, Z2, f, levels=8)
plt.scatter(Z[:, z1].numpy(), Z[:, z2].numpy(), c = y, alpha=0.8, s=0.3, rasterized=True)

# re-apply the limits
plt.xlim(np.quantile(Z[:, z1], q/2), np.quantile(Z[:, z1], 1 - q/2))
plt.ylim(np.quantile(Z[:, z2], q/2), np.quantile(Z[:, z2], 1 - q/2))

plt.xlabel("$z_1$")
plt.ylabel("$z_2$")
plt.colorbar()

6. Predict

In [None]:
# Load the sequences to predict; later cells read its 'Sequence' and
# 'SequenceID' columns.
test_set = pd.read_csv('Mut_compe/test.csv', sep = ',')
test_set

In [None]:
# Mutation string (relative to prime_seq) for every test sequence.
test_set['description'] = test_set['Sequence'].apply(get_seq_description)

In [None]:
# Tokenize the test-set mutation strings and embed them with the activity basis.
# NOTE(review): Z_preact is recomputed from scratch in the prediction cells
# further down, so this cell looks like an exploratory leftover.
X_pre = torch.stack([ds_act.tokenizer.tokenize(x) for x in test_set['description']])

with torch.no_grad():
    Z_preact = model_act.basis(X_pre)

In [None]:
# Tokenize the test-set mutation strings for the selectivity model. The
# selectivity dataset's own tokenizer is used so the token layout is guaranteed
# to match the basis it was built with (the later prediction cells pair
# ds_sel.tokenizer with model_sel in the same way).
list_of_tensors = [ds_sel.tokenizer.tokenize(x) for x in test_set['description']]
X_pre = torch.stack(list_of_tensors)

# Latent embedding of the test sequences under the selectivity basis.
with torch.no_grad():
    Z_presel = model_sel.basis(X_pre)

In [None]:
# Latent dimensions to plot, taken from the selectivity basis explicitly so the
# cell does not depend on whichever model last assigned z1 / z2.
z1, z2 = model_sel.basis.order[:2]

# to filter outliers, only plot the points within [q/2, 1-q/2] quantile of each latent dimension
q = 0.01

# number of surface points per axis
N = 100

# Mesh spanning the range covered by the *test-set* embedding Z_presel.
Z1, Z2 = np.meshgrid(
    np.linspace(np.quantile(Z_presel[:, z1], q/2), np.quantile(Z_presel[:, z1], 1 - q/2), N),
    np.linspace(np.quantile(Z_presel[:, z2], q/2), np.quantile(Z_presel[:, z2], 1 - q/2), N )
)

# Grid points in latent space: z1/z2 vary over the mesh, all other dimensions stay 0.
Zpred = torch.zeros(N**2, model_sel.basis.K)
Zpred[:, z1] = torch.from_numpy(Z1.ravel())
Zpred[:, z2] = torch.from_numpy(Z2.ravel())

# predict the surface
with torch.no_grad():
    fpred = model_sel.surface(Zpred)

# Map the predicted mean through mean/std of the log-selectivity column and reshape for plotting.
# NOTE(review): only right if the Dataset standardises the phenotype internally - confirm.
f = fpred.mean * predata_sel['phenotype'].std() + predata_sel['phenotype'].mean()
f = f.reshape(Z1.shape)

# `y` is deliberately left untouched: the evaluation cell has already mapped it
# through this affine transform, and applying it a second time (on every
# re-run) would distort the values. Nothing in this figure uses `y`.

In [None]:
plt.figure(figsize=(3,2), dpi=300)

# Filled contour of the selectivity surface over the test-set latent range.
im = plt.contourf(Z1, Z2, f, levels=8)
plt.xlabel("$z_1$")
plt.ylabel("$z_2$")
plt.colorbar()

Next, we add the scatter of measured datapoints for comparison, coloring them by their measured value.

In [None]:
# Measured vs. predicted targets for the activity training data.
X_act, y_act = ds_act[:len(ds_act)]

with torch.no_grad():
    Z_act = model_act.basis(X_act)
    # Keep only the first two ordered latent dimensions; all others are zeroed,
    # so the prediction reflects those two dimensions alone.
    Zpred = torch.zeros(len(Z_act), K_act)
    for i in model_act.basis.order[:2]:
        Zpred[:, i] = Z_act[:,i]

with torch.no_grad():
    f_act = model_act.surface(Zpred)

with torch.no_grad():
    Y = f_act.mean.numpy()

# x: dataset target, y: predicted surface mean.
plt.scatter(y_act, Y)

In [None]:
# Measured vs. predicted targets for the selectivity training data.
X_sel, y_sel = ds_sel[:len(ds_sel)]

with torch.no_grad():
    Z_sel = model_sel.basis(X_sel)
    # Keep only the first two ordered latent dimensions; all others are zeroed,
    # so the prediction reflects those two dimensions alone.
    Zpred = torch.zeros(len(Z_sel), K_sel)
    for i in model_sel.basis.order[:2]:
        Zpred[:, i] = Z_sel[:,i]

with torch.no_grad():
    f_sel = model_sel.surface(Zpred)

with torch.no_grad():
    Y = f_sel.mean.numpy()

# x: dataset target, y: predicted surface mean.
plt.scatter(y_sel, Y)

In [None]:
# Largest target value in the activity dataset tensor.
y_act.max()

In [None]:
# Inspect the surface output object computed for the selectivity training data.
f_sel

In [None]:
# Tokenize the test-set mutation strings once per model, each with its own tokenizer.
X_preact = torch.stack([ds_act.tokenizer.tokenize(desc) for desc in test_set['description']])
X_presel = torch.stack([ds_sel.tokenizer.tokenize(desc) for desc in test_set['description']])

# One forward pass through both models (basis -> surface -> predicted mean);
# gradients are not needed for inference.
with torch.no_grad():
    Z_preact = model_act.basis(X_preact)
    Z_presel = model_sel.basis(X_presel)

    f_preact = model_act.surface(Z_preact)
    f_presel = model_sel.surface(Z_presel)

    Y_act = f_preact.mean.numpy()
    Y_sel = f_presel.mean.numpy()

In [None]:
# Predicted log activity vs. predicted log selectivity from the single forward pass.
plt.scatter(Y_act,Y_sel)
plt.xlabel("pre_act(log)")
plt.ylabel("pre_sel(log)")

In [None]:
# Tokenize the test-set mutation strings once per model, each with its own tokenizer.
X_preact = torch.stack([ds_act.tokenizer.tokenize(x) for x in test_set['description']])
X_presel = torch.stack([ds_sel.tokenizer.tokenize(x) for x in test_set['description']])

# Number of forward passes to average over.
n_repeats = 10

# Size the accumulators from the test set itself rather than from Y_act / Y_sel
# left over from another cell: those names are reassigned with different
# lengths by the scanning cells, which would break this cell on a re-run.
Y_act_total = np.zeros(len(test_set))
Y_sel_total = np.zeros(len(test_set))

# Average the predicted means over repeated forward passes.
# NOTE(review): this only changes the result if the forward pass is stochastic
# (e.g. the models are still in training mode and the basis samples) - confirm.
with torch.no_grad():
    for i in range(n_repeats):
        Z_preact = model_act.basis(X_preact)
        Z_presel = model_sel.basis(X_presel)

        f_preact = model_act.surface(Z_preact)
        f_presel = model_sel.surface(Z_presel)

        Y_act_total += f_preact.mean.numpy()
        Y_sel_total += f_presel.mean.numpy()

Y_act_mean = Y_act_total / n_repeats
Y_sel_mean = Y_sel_total / n_repeats

In [None]:
# Averaged predictions (log scale): activity vs. selectivity.
plt.scatter(Y_act_mean,Y_sel_mean)
plt.xlabel("pre_act(log)")
plt.ylabel("pre_sel(log)")

In [None]:
# Tokenize the test-set mutation strings once per model, each with its own tokenizer.
X_preact = torch.stack([ds_act.tokenizer.tokenize(x) for x in test_set['description']])
X_presel = torch.stack([ds_sel.tokenizer.tokenize(x) for x in test_set['description']])

# Size the accumulators from the test set itself rather than from Y_act / Y_sel
# left over from another cell: those names are reassigned with different
# lengths by the scanning cells, which would break this cell on a re-run.
Y_act_total = np.zeros(len(test_set))
Y_sel_total = np.zeros(len(test_set))

# Number of forward passes to average over; these averages feed the exported predictions.
n = 100

# Average the predicted means over repeated forward passes.
# NOTE(review): this only changes the result if the forward pass is stochastic
# (e.g. the models are still in training mode and the basis samples) - confirm.
with torch.no_grad():
    for i in range(n):
        Z_preact = model_act.basis(X_preact)
        Z_presel = model_sel.basis(X_presel)

        f_preact = model_act.surface(Z_preact)
        f_presel = model_sel.surface(Z_presel)

        Y_act_total += f_preact.mean.numpy()
        Y_sel_total += f_presel.mean.numpy()

Y_act_mean = Y_act_total / n
Y_sel_mean = Y_sel_total / n

In [None]:
# Averaged predictions (log scale): activity vs. selectivity.
plt.scatter(Y_act_mean,Y_sel_mean)
plt.xlabel("pre_act(log)")
plt.ylabel("pre_sel(log)")

In [None]:
# Undo the natural-log transform applied to the training targets.
# NOTE(review): the surface plots above additionally map model output through
# the phenotype mean/std before use, while this cell does not - confirm which
# scale the model output is actually on.
test_set['Activity'] = np.exp(Y_act_mean)
test_set['Selectivity'] = np.exp(Y_sel_mean)
test_set

In [None]:
# Predicted activity vs. predicted selectivity on the original (non-log) scale.
plt.scatter(test_set['Activity'], test_set['Selectivity'])
plt.xlabel("pre_act")
plt.ylabel("pre_sel")

In [None]:
# Write the predictions to disk; the DataFrame index is written as the first
# column because index=False is not passed.
test_set[['SequenceID', 'Sequence', 'Activity', 'Selectivity']].to_csv('test_set_prediction.csv')

7. scanning best 96 sequences

In [None]:
# Every ordered pair of residues from aa_order (strings of length 2).
aa_multiple = [first + second for first in aa_order for second in aa_order]
# Every ordered triple: one residue followed by a pair.
aa_three = [single + pair for single in aa_order for pair in aa_multiple]
# Every ordered quadruple: a pair followed by a pair.
aa_four = [lead + trail for lead in aa_multiple for trail in aa_multiple]

In [None]:
# Sequences already present in the training table (including the imputed
# rows); the scans below skip these.
train_seq = list(training_df['Sequence'])

In [None]:
def mutate(seq, letter, num):
    """Return ``seq`` with the character at index ``num`` replaced by ``letter``.

    Args:
        seq: the sequence string to modify (not changed in place).
        letter: replacement string spliced in at position ``num``.
        num: zero-based index of the character to replace.

    Returns:
        A new string built from ``seq[:num]``, ``letter`` and ``seq[num+1:]``.
    """
    prefix = seq[:num]
    suffix = seq[num+1:]
    return "".join((prefix, letter, suffix))

In [None]:
import itertools

In [None]:
# Running top-50 tables (by predicted activity / selectivity) over all
# two-site variants of prime_seq that are not in the training data.
temp_act = pd.DataFrame()
temp_sel = pd.DataFrame()

# Set membership is O(1); the check is done once per candidate.
train_seq_set = set(train_seq)

for i in itertools.combinations([0,1,2,3,4,5],r = 2):
    # Two sites need residue *pairs*. Iterating the triples in aa_three would
    # produce every candidate 20 times, and those duplicates would then crowd
    # the top-50 tables.
    test_list = []
    for aaa in aa_multiple:
        seq = prime_seq
        for times,mut_site in enumerate(i):
            seq = mutate(seq, aaa[times], mut_site)

        if seq not in train_seq_set:
            test_list.append(seq)

    # torch.stack cannot handle an empty list; nothing to score for this pair of sites.
    if not test_list:
        continue

    test_df = pd.DataFrame(test_list, columns = ['Sequence'])
    test_df['description'] = test_df['Sequence'].apply(get_seq_description)

    X_preact = torch.stack([ds_act.tokenizer.tokenize(x) for x in test_df['description']])
    X_presel = torch.stack([ds_sel.tokenizer.tokenize(x) for x in test_df['description']])

    with torch.no_grad():
        Z_preact = model_act.basis(X_preact)
        Z_presel = model_sel.basis(X_presel)
        f_preact = model_act.surface(Z_preact)
        f_presel = model_sel.surface(Z_presel)
        Y_act = f_preact.mean.numpy()
        Y_sel = f_presel.mean.numpy()

    # Model output is on the natural-log scale; map back.
    test_df['Activity'] = np.exp(Y_act)
    test_df['Selectivity'] = np.exp(Y_sel)

    # Merge this batch into the running top 50. pd.concat replaces
    # DataFrame.append (removed in pandas 2.0). Sorting before drop_duplicates
    # keeps the best-scoring copy of a sequence that is reachable from more
    # than one site combination (a "mutation" to the reference residue).
    temp_act2 = test_df.sort_values(by = 'Activity', ascending = False).head(50)
    temp_act = (
        pd.concat([part for part in (temp_act, temp_act2) if not part.empty])
        .sort_values(by = 'Activity', ascending = False)
        .drop_duplicates(subset = 'Sequence')
        .head(50)
    )

    temp_sel2 = test_df.sort_values(by = 'Selectivity', ascending = False).head(50)
    temp_sel = (
        pd.concat([part for part in (temp_sel, temp_sel2) if not part.empty])
        .sort_values(by = 'Selectivity', ascending = False)
        .drop_duplicates(subset = 'Sequence')
        .head(50)
    )

    # Progress: current site pair and the three best values so far.
    print(i, list(temp_act['Activity'][:3]),list(temp_sel['Selectivity'][:3]))

temp_act_2 = temp_act.copy()
temp_sel_2 = temp_sel.copy()

In [None]:
# Running top-50 tables (by predicted activity / selectivity) over all
# three-site variants of prime_seq that are not in the training data.
temp_act = pd.DataFrame()
temp_sel = pd.DataFrame()

# Set membership is O(1); the check is done once per candidate.
train_seq_set = set(train_seq)

for i in itertools.combinations([0,1,2,3,4,5],r = 3):
    test_list = []
    for aaa in aa_three:
        seq = prime_seq
        for times,mut_site in enumerate(i):
            seq = mutate(seq, aaa[times], mut_site)

        if seq not in train_seq_set:
            test_list.append(seq)

    # torch.stack cannot handle an empty list; nothing to score for this site triple.
    if not test_list:
        continue

    test_df = pd.DataFrame(test_list, columns = ['Sequence'])
    test_df['description'] = test_df['Sequence'].apply(get_seq_description)

    X_preact = torch.stack([ds_act.tokenizer.tokenize(x) for x in test_df['description']])
    X_presel = torch.stack([ds_sel.tokenizer.tokenize(x) for x in test_df['description']])

    with torch.no_grad():
        Z_preact = model_act.basis(X_preact)
        Z_presel = model_sel.basis(X_presel)
        f_preact = model_act.surface(Z_preact)
        f_presel = model_sel.surface(Z_presel)
        Y_act = f_preact.mean.numpy()
        Y_sel = f_presel.mean.numpy()

    # Model output is on the natural-log scale; map back.
    test_df['Activity'] = np.exp(Y_act)
    test_df['Selectivity'] = np.exp(Y_sel)

    # Merge this batch into the running top 50. pd.concat replaces
    # DataFrame.append (removed in pandas 2.0). Sorting before drop_duplicates
    # keeps the best-scoring copy of a sequence that is reachable from more
    # than one site combination (a "mutation" to the reference residue).
    temp_act2 = test_df.sort_values(by = 'Activity', ascending = False).head(50)
    temp_act = (
        pd.concat([part for part in (temp_act, temp_act2) if not part.empty])
        .sort_values(by = 'Activity', ascending = False)
        .drop_duplicates(subset = 'Sequence')
        .head(50)
    )

    temp_sel2 = test_df.sort_values(by = 'Selectivity', ascending = False).head(50)
    temp_sel = (
        pd.concat([part for part in (temp_sel, temp_sel2) if not part.empty])
        .sort_values(by = 'Selectivity', ascending = False)
        .drop_duplicates(subset = 'Sequence')
        .head(50)
    )

    # Progress: current site triple and the three best values so far.
    print(i, list(temp_act['Activity'][:3]),list(temp_sel['Selectivity'][:3]))

temp_act_3 = temp_act.copy()
temp_sel_3 = temp_sel.copy()

In [None]:
# Running top-50 tables (by predicted activity / selectivity) over four-site
# variants of prime_seq that are not in the training data.
temp_act = pd.DataFrame()
temp_sel = pd.DataFrame()

# Index into aa_four at which this scan starts; earlier entries are skipped.
# NOTE(review): with a non-zero start the results cover only the tail of
# aa_four. Presumably the head was scanned in an earlier run - TODO confirm and
# merge those results, or set this to 0 for a complete scan.
START_TURN = 27000
# Progress is reported once every this many residue quadruples.
REPORT_EVERY = 1000

# Set membership is O(1); the check is done once per candidate.
train_seq_set = set(train_seq)
# The 15 site combinations do not depend on the residues; build them once.
site_combos = list(itertools.combinations([0,1,2,3,4,5],r = 4))

for turn, aaa in enumerate(aa_four):
    if turn < START_TURN:
        continue

    # Place the same four residues at every combination of four sites.
    test_list = []
    for i in site_combos:
        seq = prime_seq
        for times,mut_site in enumerate(i):
            seq = mutate(seq, aaa[times], mut_site)

        if seq not in train_seq_set:
            test_list.append(seq)

    # torch.stack cannot handle an empty list; nothing to score for this quadruple.
    if not test_list:
        continue

    test_df = pd.DataFrame(test_list, columns = ['Sequence'])
    test_df['description'] = test_df['Sequence'].apply(get_seq_description)

    X_preact = torch.stack([ds_act.tokenizer.tokenize(x) for x in test_df['description']])
    X_presel = torch.stack([ds_sel.tokenizer.tokenize(x) for x in test_df['description']])

    with torch.no_grad():
        Z_preact = model_act.basis(X_preact)
        Z_presel = model_sel.basis(X_presel)
        f_preact = model_act.surface(Z_preact)
        f_presel = model_sel.surface(Z_presel)
        Y_act = f_preact.mean.numpy()
        Y_sel = f_presel.mean.numpy()

    # Model output is on the natural-log scale; map back.
    test_df['Activity'] = np.exp(Y_act)
    test_df['Selectivity'] = np.exp(Y_sel)

    # Merge this batch into the running top 50. pd.concat replaces
    # DataFrame.append (removed in pandas 2.0). Sorting before drop_duplicates
    # keeps the best-scoring copy of a sequence produced more than once.
    temp_act2 = test_df.sort_values(by = 'Activity', ascending = False).head(50)
    temp_act = (
        pd.concat([part for part in (temp_act, temp_act2) if not part.empty])
        .sort_values(by = 'Activity', ascending = False)
        .drop_duplicates(subset = 'Sequence')
        .head(50)
    )

    temp_sel2 = test_df.sort_values(by = 'Selectivity', ascending = False).head(50)
    temp_sel = (
        pd.concat([part for part in (temp_sel, temp_sel2) if not part.empty])
        .sort_values(by = 'Selectivity', ascending = False)
        .drop_duplicates(subset = 'Sequence')
        .head(50)
    )

    # Throttled progress report instead of one line per quadruple.
    if turn % REPORT_EVERY == 0:
        print(turn,aaa)

temp_act_4_2 = temp_act.copy()
temp_sel_4_2 = temp_sel.copy()

In [None]:
# Overall top 50 across the two-, three- and four-site scans.
# pd.concat replaces DataFrame.append (removed in pandas 2.0). The same
# sequence can appear in several scans (a site "mutated" to the reference
# residue), so after sorting only the best-scoring copy of each is kept.
temp_act_all = (
    pd.concat([temp_act_2, temp_act_3, temp_act_4_2])
    .sort_values(by = 'Activity', ascending = False)
    .drop_duplicates(subset = 'Sequence')
    .head(50)
)
temp_sel_all = (
    pd.concat([temp_sel_2, temp_sel_3, temp_sel_4_2])
    .sort_values(by = 'Selectivity', ascending = False)
    .drop_duplicates(subset = 'Sequence')
    .head(50)
)

In [None]:
# Flag candidates whose third residue is not I, K or M, then export only the
# flagged rows.
# NOTE(review): I / K / M at the third position are the residues whose training
# values were imputed by hand earlier - presumably why they are excluded; confirm.
excluded_third = ["I", "K", "M"]
for screened in (temp_act_all, temp_sel_all):
    screened['cotE'] = screened['Sequence'].apply(lambda s: s[2] not in excluded_third)

temp_act_all[temp_act_all['cotE']].to_csv('act_screen_234_withoutE.csv')
temp_sel_all[temp_sel_all['cotE']].to_csv('sel_screen_234_withoutE.csv')