## INTRODUCTION



**UmojaHack Africa 2022: African Snake Antivenom Binding Challenge (ADVANCED)**

Snake bites cause more than 100 000 deaths and more than 300 000 permanent disfigurements every year, caused by toxin proteins in snake venom. Injury and death from snake bites is more common in poor socioeconomic conditionss and has historically received limited funding for discovery, development, and delivery of new treatment options.

 
 
**AIM:**
In this challenge, you are tasked to build a machine learning model to predict how strongly a given string of amino acids from a snake venom toxin protein binds to eight different commercial antivenom antibodies.

##IMPORTING REQUIRED LIBRARIES.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, random

from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, Flatten, Activation, Conv1D, Add, MaxPooling1D, BatchNormalization, Concatenate
from keras.layers import Embedding, Bidirectional, CuDNNLSTM, GlobalMaxPooling1D
from keras import layers

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import tensorflow as tf

In [None]:
def tf_seed(seed=0):
    os.environ['PYTHONHASHSEED'] = str(seed)
    # #For working on GPU from Tensorflow.
    # if tf.test.gpu_device_name() == '/device:GPU:0':
    #os.environ['TF_DETERMINISTIC_OPS']=str(seed)
    # # For working on CPU on Tensorflow.
    # else:
    #os.environ['CUDA_VISSIBLE_DEVICE'] = ''
    # #python_random.seed(seed)
    
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)
seed = 2022
tf_seed(seed)

## READING THE DATA.

In [None]:
df_train = pd.read_csv('/content/drive/MyDrive/Umoja22/data/Train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Umoja22/data/Test.csv', index_col='ID')
df_sub = pd.read_csv('/content/drive/MyDrive/Umoja22/data/SampleSubmission.csv')


In [None]:
df_train.head()

Unnamed: 0,ID,Toxin_UniprotID,Position_start,Position_end,Antivenom,Toxin_K_mer,Signal,Genus,Species,ProteinFam,ProteinSubFam,ProteinSubSubFam
0,P07037_Bioclone_1,P07037,1,16,Bioclone,NLYQFKNMIQCTVPNR,-0.53,Aspidelaps,Aspidelaps_scutatus,Phospholipase_A2,Group_I_subfamily,D49_sub_subfamily
1,P07037_Bioclone_2,P07037,2,17,Bioclone,LYQFKNMIQCTVPNRS,-1.0,Aspidelaps,Aspidelaps_scutatus,Phospholipase_A2,Group_I_subfamily,D49_sub_subfamily
2,P07037_Bioclone_3,P07037,3,18,Bioclone,YQFKNMIQCTVPNRSW,-0.21,Aspidelaps,Aspidelaps_scutatus,Phospholipase_A2,Group_I_subfamily,D49_sub_subfamily
3,P07037_Bioclone_4,P07037,4,19,Bioclone,QFKNMIQCTVPNRSWW,-0.3,Aspidelaps,Aspidelaps_scutatus,Phospholipase_A2,Group_I_subfamily,D49_sub_subfamily
4,P07037_Bioclone_5,P07037,5,20,Bioclone,FKNMIQCTVPNRSWWH,-1.0,Aspidelaps,Aspidelaps_scutatus,Phospholipase_A2,Group_I_subfamily,D49_sub_subfamily


In [None]:
df_train.tail()

Unnamed: 0,ID,Toxin_UniprotID,Position_start,Position_end,Antivenom,Toxin_K_mer,Signal,Genus,Species,ProteinFam,ProteinSubFam,ProteinSubSubFam
124112,M5BGY5_VINS_Central_Africa_92,M5BGY5,92,107,VINS_Central_Africa,DMNDYCTGTTPDCPRN,1.06,Atheris,Atheris_chlorechis,Disintegrin,Dimeric_disintegrin_subfamily,
124113,M5BGY5_VINS_Central_Africa_93,M5BGY5,93,108,VINS_Central_Africa,MNDYCTGTTPDCPRNP,0.8,Atheris,Atheris_chlorechis,Disintegrin,Dimeric_disintegrin_subfamily,
124114,M5BGY5_VINS_Central_Africa_94,M5BGY5,94,109,VINS_Central_Africa,NDYCTGTTPDCPRNPY,0.43,Atheris,Atheris_chlorechis,Disintegrin,Dimeric_disintegrin_subfamily,
124115,M5BGY5_VINS_Central_Africa_95,M5BGY5,95,110,VINS_Central_Africa,DYCTGTTPDCPRNPYK,1.46,Atheris,Atheris_chlorechis,Disintegrin,Dimeric_disintegrin_subfamily,
124116,M5BGY5_VINS_Central_Africa_96,M5BGY5,96,111,VINS_Central_Africa,YCTGTTPDCPRNPYKD,1.47,Atheris,Atheris_chlorechis,Disintegrin,Dimeric_disintegrin_subfamily,


In [None]:
df_test.shape

(36540, 10)

In [None]:
col_with_missing = [col for col in df_train.columns if df_train[col].isnull().any()]

def fill_mode(df, x=[]):
    """
    iterate over each x(list in the da,
    pick the most frequent entry in the column 
    make a dictionary of each column with the mode entry.
    """
    dict_mode = {}
    
    for i in x:
        mod = df[i].mode()
        dict_mode[i] = mod[0]
    return dict_mode
  
train_mode = fill_mode(df_train, col_with_missing)
test_mode = fill_mode(df_test, col_with_missing)


for i in train_mode:
    print(i)
    df_train[i].fillna(train_mode[i], inplace=True)


for i in test_mode:
    print(i)
    df_test[i].fillna(test_mode[i], inplace=True)

ProteinSubFam
ProteinSubSubFam
ProteinSubFam
ProteinSubSubFam


## Data Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
od = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_cols =['Antivenom']
df_cat = pd.DataFrame(od.fit_transform(df_train[cat_cols]))
df_cat

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
124112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
124113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
124114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
124115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
dft_cat = pd.DataFrame(od.transform(df_test[cat_cols]))
dft_cat

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
36535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36536,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
36538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
tk = list(df_train['Toxin_K_mer'])
tkt = list(df_test['Toxin_K_mer'])

len(tkt)

36540

In [None]:
for i in range(len(tk)):
  tk[i] = ' '.join(tk[i])

for i in range(len(tkt)):
  tkt[i] = ' '.join(tkt[i])
  
tk[0]

'N L Y Q F K N M I Q C T V P N R'

In [None]:
num_f = pd.DataFrame(MinMaxScaler().fit_transform(df_train[['Position_start', 'Position_end']]))
num_ft = pd.DataFrame(MinMaxScaler().fit_transform(df_test[['Position_start', 'Position_end']]))

In [None]:
num_ft.shape

(36540, 2)

In [None]:
df1 = pd.DataFrame({'Position_start': np.array(num_f[0]),'Position_end': num_f[1], 'Signal': df_train['Signal']})
# df1 = pd.get_dummies(df1)

In [None]:
df1['Toxin_K_mer'] = tk
df1 = pd.concat([df1, df_cat], axis=1)
df1.head()

Unnamed: 0,Position_start,Position_end,Signal,Toxin_K_mer,0,1,2,3,4,5,6,7
0,0.0,0.0,-0.53,N L Y Q F K N M I Q C T V P N R,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.001684,0.001684,-1.0,L Y Q F K N M I Q C T V P N R S,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.003367,0.003367,-0.21,Y Q F K N M I Q C T V P N R S W,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.005051,0.005051,-0.3,Q F K N M I Q C T V P N R S W W,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.006734,0.006734,-1.0,F K N M I Q C T V P N R S W W H,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
np.array(num_ft[0]).shape

(36540,)

In [None]:
test_df = pd.DataFrame({'Position_start': np.array(num_ft[0]),'Position_end': np.array(num_ft[1])})
# test_df = pd.get_dummies(test_df)
test_df.shape

(36540, 2)

In [None]:
test_df['Toxin_K_mer'] = tkt
test_df = pd.concat([test_df, dft_cat], axis=1)
test_df.head()

Unnamed: 0,Position_start,Position_end,Toxin_K_mer,0,1,2,3,4,5,6,7
0,0.0,0.0,R K C L N T P L P L F Y K T C P,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.002004,0.002004,K C L N T P L P L F Y K T C P E,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.004008,0.004008,C L N T P L P L F Y K T C P E G,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006012,0.006012,L N T P L P L F Y K T C P E G K,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.008016,0.008016,N T P L P L F Y K T C P E G K D,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.model_selection import train_test_split 

train, val = train_test_split(df1, test_size=0.2, random_state=seed)

print(len(train), 'Train samples')
print(len(val), 'Val samples')

99293 Train samples
24824 Val samples


### Splitting data features.

In [None]:
train_text = train['Toxin_K_mer']
train_y = train['Signal']
train_x = train.drop(['Toxin_K_mer', 'Signal'], axis=1)
# train_x = pd.get_dummies(train_x)

val_text = val['Toxin_K_mer']
val_y = val['Signal']
val_x = val.drop(['Toxin_K_mer', 'Signal'], axis=1)

test_text = test_df['Toxin_K_mer']
test_x = test_df.drop('Toxin_K_mer', axis=1)
test_x = pd.get_dummies(test_x)

train_x

Unnamed: 0,Position_start,Position_end,0,1,2,3,4,5,6,7
96237,0.148148,0.148148,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
101027,0.085859,0.085859,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
110364,0.112795,0.112795,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120790,0.048822,0.048822,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
97537,0.526936,0.526936,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
46769,0.005051,0.005051,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
103024,0.028620,0.028620,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
16557,0.705387,0.705387,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1244,0.013468,0.013468,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
from keras.layers.preprocessing.text_vectorization import TextVectorization
vectorize_layer = TextVectorization(max_tokens=21)
vectorize_layer.adapt(df1['Toxin_K_mer'])

In [None]:
train_seq = vectorize_layer(train_text)
val_seq = vectorize_layer(val_text)
test_seq = vectorize_layer(test_text)

train_seq[0]

<tf.Tensor: shape=(16,), dtype=int64, numpy=array([16,  7,  7, 11,  4, 20, 17, 17, 20,  4, 16, 16,  6, 18,  8, 12])>

In [None]:
train_seq.shape

TensorShape([99293, 16])

## Modelling.

In [73]:
l2=tf.keras.regularizers.l2(0.05)

input_a = Input(shape=(16))
# vectorize = vectorize_layer(input_a)
input_b = Input(shape=(len(train_x.columns),))
embed = Embedding(21, 30, input_length=16, embeddings_regularizer=l2)(input_a)
bi_rnn = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(embed)
bi_rnn = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(bi_rnn)
bi_rnn = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(bi_rnn)
# bn = layers.BatchNormalization()(bi_rnn)
concat = layers.concatenate([embed, bi_rnn])

bi_rnn_1 = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(bi_rnn)
bi_rnn = Bidirectional(CuDNNLSTM(2, return_sequences=True, recurrent_regularizer=l2))(bi_rnn_1)
bi_rnn = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(bi_rnn)
concat = layers.concatenate([concat, bi_rnn])

bi_rnn = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(concat)
bi_rnn = Bidirectional(CuDNNLSTM(32, return_sequences=True, recurrent_regularizer=l2))(bi_rnn)
bi_rnn = Bidirectional(CuDNNLSTM(32, recurrent_regularizer=l2))(bi_rnn)
bn = layers.BatchNormalization()(bi_rnn)
merge = layers.concatenate([input_b, bn])

L1 = layers.Dense(64, activation='relu', kernel_regularizer=l2)(merge)
# L2 = layers.Dense(64, activation='relu', kernel_regularizer=l2)(L1)
# L2 = layers.Dense(32, activation='relu', kernel_regularizer=l2)(L2)
# L2 = layers.Dense(32, activation='relu', kernel_regularizer=l2)(L2)

bn = layers.BatchNormalization()(merge)
#ld = layers.Dropout(0.1)(bn)
lf = layers.Flatten()(bn)
L3 = layers.Dense(128, activation='relu', kernel_regularizer=l2)(lf)
L3 = layers.Dense(64, activation='relu', kernel_regularizer=l2)(L3)

output_layer = layers.Dense(1)(L3)

modelf = tf.keras.Model(inputs=[input_a, input_b], outputs=output_layer)

In [74]:
modelf.compile(loss='mse',
             optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
             metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [75]:
modelf.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_19 (InputLayer)          [(None, 16)]         0           []                               
                                                                                                  
 embedding_11 (Embedding)       (None, 16, 30)       630         ['input_19[0][0]']               
                                                                                                  
 bidirectional_81 (Bidirectiona  (None, 16, 64)      16384       ['embedding_11[0][0]']           
 l)                                                                                               
                                                                                                  
 bidirectional_82 (Bidirectiona  (None, 16, 64)      25088       ['bidirectional_81[0][0]'] 

In [76]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_root_mean_squared_error', mode='min', patience=2, verbose=1, factor=0.9, min_lr=0.000001)
modelf.fit([train_seq, train_x], train_y, epochs=100, batch_size=300, validation_data=([val_seq, val_x], val_y), callbacks=[early_stop, reduce_lr])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0044999998994171625.
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.004049999825656414.
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0036449996754527093.
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 38: ReduceLROnPlateau reducing learning rate to 0.0032804996240884065.
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 47: ReduceLROnPlateau reducing learning rate to 0.0029524497454985975.
Epoch 48/100
Epo

<keras.callbacks.History at 0x7fd2f00e68d0>

In [77]:
modelf.evaluate([val_seq, val_x], val_y)



[0.23377768695354462, 0.4559616446495056]

In [None]:
#modelf.save_weights('/content/drive/MyDrive/Umoja22/model2')

### Prediction.

In [None]:
pred = modelf.predict([test_seq, test_x])

In [None]:
pred

array([[-0.6034039 ],
       [-0.6352216 ],
       [-0.6688267 ],
       ...,
       [ 0.3144092 ],
       [ 0.57993245],
       [ 0.5071994 ]], dtype=float32)

In [None]:
df_test['prediction2'] = pred

sub = pd.DataFrame(df_test['prediction2'], index=df_test.index, columns=['Signal'])
sub['Signal'] = pred

In [None]:
df_test

Unnamed: 0_level_0,Toxin_UniprotID,Position_start,Position_end,Antivenom,Toxin_K_mer,Genus,Species,ProteinFam,ProteinSubFam,ProteinSubSubFam,prediction2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
P19003_Bioclone_1,P19003,1,16,Bioclone,RKCLNTPLPLFYKTCP,Aspidelaps,Aspidelaps_scutatus,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XVI_sub_subfamily,-0.603404
P19003_Bioclone_2,P19003,2,17,Bioclone,KCLNTPLPLFYKTCPE,Aspidelaps,Aspidelaps_scutatus,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XVI_sub_subfamily,-0.635222
P19003_Bioclone_3,P19003,3,18,Bioclone,CLNTPLPLFYKTCPEG,Aspidelaps,Aspidelaps_scutatus,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XVI_sub_subfamily,-0.668827
P19003_Bioclone_4,P19003,4,19,Bioclone,LNTPLPLFYKTCPEGK,Aspidelaps,Aspidelaps_scutatus,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XVI_sub_subfamily,1.092501
P19003_Bioclone_5,P19003,5,20,Bioclone,NTPLPLFYKTCPEGKD,Aspidelaps,Aspidelaps_scutatus,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XVI_sub_subfamily,0.481626
...,...,...,...,...,...,...,...,...,...,...,...
P01405_VINS_Central_Africa_41,P01405,41,56,VINS_Central_Africa,PKKEIFRKSIHCCRSD,Dendroaspis,Dendroaspis_viridis,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XI_sub_subfamily,1.143137
P01405_VINS_Central_Africa_42,P01405,42,57,VINS_Central_Africa,KKEIFRKSIHCCRSDK,Dendroaspis,Dendroaspis_viridis,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XI_sub_subfamily,0.240552
P01405_VINS_Central_Africa_43,P01405,43,58,VINS_Central_Africa,KEIFRKSIHCCRSDKC,Dendroaspis,Dendroaspis_viridis,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XI_sub_subfamily,0.314409
P01405_VINS_Central_Africa_44,P01405,44,59,VINS_Central_Africa,EIFRKSIHCCRSDKCN,Dendroaspis,Dendroaspis_viridis,Snake_three_finger_toxin,Short_chain_subfamily,Orphan_group_XI_sub_subfamily,0.579932


In [None]:
sub.to_csv('/content/sample_data/umT.csv')