<a href="https://colab.research.google.com/github/bouzayeniiheb/UmojaHack-Tunisia-InstaDeep-Kinase-Classification-Challenge-by-UmojaHack-Africa/blob/main/final_sub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# **UmojaHack Tunisia: InstaDeep Kinase Classification Challenge by UmojaHack Africa** 

---



This hackathon was a great opportunity to further develop our skills. The goal of this competition is to build a model that assigns a kinase type (as defined by its Enzyme Commission number) to an arbitrary sequence of amino acids. All the sequences in both the training and test datasets are complete sequences of protein kinases. Each comprises up to 560 positions, and each position can take one of 20 values (there are 20 standard amino acids produced in eukaryotic cells).

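As each letter in an amino acid sequence represents a physical structure (one amino acid), these sequences can be augmented by converting each letter into a numerical representation of that amino acid. There are several ways to do this; in this notebook each letter is mapped to an integer index, and the model learns an embedding vector for every index.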

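Our solution is an ensemble of 7 deep-learning models: the class probabilities predicted by 7 separate training runs are averaged to produce the final submission.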


In [1]:
import pandas as  pd 
import numpy as np 
import tensorflow as tf
import tensorflow_datasets as tfds
import os 
from sklearn.model_selection import train_test_split
#os.environ["CUDA_VISIBLE_DEVICES"] = "3" # change it to "0" if you have only one GPU, or to the index of the GPU that you would like to use

In [2]:
# Mount Google Drive under /content/drive; the competition CSVs are read from that
# mount point in a later cell. The google.colab module is only available on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Data processing

In [3]:
def write_to_txt(file_name,column):
    """Write every element of ``column`` to ``file_name``, one element per line.

    Args:
        file_name: Path of the text file to create (an existing file is overwritten).
        column: Any iterable; each element is rendered with ``str()``.

    Notes:
        * An f-string is used instead of ``"%s\\n" % item``: the ``%`` operator
          unpacks tuple items, so a one-element tuple lost its parentheses and a
          longer tuple raised ``TypeError``.
        * The file is written as UTF-8 so it matches the reader side, which calls
          ``bytes.decode()`` (UTF-8 by default) on each line.
    """
    with open(file_name, 'w', encoding='utf-8') as f:
        f.writelines(f"{item}\n" for item in column)

In [4]:
# Load the competition train and test tables from the mounted Drive folder.
# Both are later accessed through their "Sequence" column; train also carries "target".
train=pd.read_csv("/content/drive/My Drive/UmojaHackTunisia/UmojaHackTun/train.csv")
test=pd.read_csv("/content/drive/My Drive/UmojaHackTunisia/UmojaHackTun/test.csv")

In [5]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,ID,Sequence,target
0,ID_train_0,MVDGVMILPVLVMIAFPFPSMEDEKPKVNPKLYMCVCEGLSCGDEA...,0
1,ID_train_1,MAQKENAYPWPYGSKTSQSGLNTLSQRVLRKEPATTSALALVNRFN...,1
2,ID_train_2,MRLWPRSLFGRLVLILVSGMLAAQILTSSIWYDVRHSQVLEIPTRL...,2
3,ID_train_3,MNSIVKIMKMKQITYKLFMTTSLILLSFAVLIYLTLYFFLPTFYEQ...,2
4,ID_train_4,MKLIYQNVLSFLLIIVTTISIIGYSEIGYARNQAYTQNYQRMESYA...,2


In [6]:
# Length limit consulted by the truncation step in the next cell.
# NOTE(review): the introduction mentions up to 560 positions while this comment says 550 - confirm against the data.
max_seq_length=550# max seq length in this data set is 550 

In [7]:
# Split the labelled data into train (90%) and validation (10%) with a fixed seed.
# NOTE: `train` is rebound to the 90% split, so re-running this cell splits the
# already reduced frame again; re-run the CSV loading cell first.
train,val=train_test_split(train,test_size=0.1,random_state=1994)
# Give each split its own frame so the column assignments below cannot raise
# pandas' SettingWithCopyWarning.
train,val=train.copy(),val.copy()

# Reduce sequence length. Truncation only matters when the configured limit is
# BELOW the dataset maximum (550); the previous `>` comparison could never
# shorten a sequence because slicing past the end of a string is a no-op.
if max_seq_length<550:
    train["Sequence"]=train["Sequence"].str[:max_seq_length]
    val["Sequence"]=val["Sequence"].str[:max_seq_length]
    test["Sequence"]=test["Sequence"].str[:max_seq_length]

In [8]:
# Dump the Sequence column of every split to its own text file, one sequence per
# line; the tf.data loaders defined further down read these files line by line.
write_to_txt(file_name="/content/train.txt",column=train["Sequence"])
write_to_txt(file_name="/content/test.txt",column=test["Sequence"])
write_to_txt(file_name="/content/val.txt",column=val["Sequence"])

In [9]:
# Keep only the target column of each split and persist it without the index,
# so row i of the CSV lines up with line i of the matching sequence text file.
train_label=train.loc[:,["target"]].copy()
val_label=val.loc[:,["target"]].copy()
train_label.to_csv("/content/train_label.csv",index=False)
val_label.to_csv("/content/val_label.csv",index=False)

### Data loaders 

In [10]:
# Reload the label files written by the previous cell; this rebinds
# train_label / val_label to frames with a fresh 0..n-1 index.
train_label=pd.read_csv("/content/train_label.csv")
val_label=pd.read_csv("/content/val_label.csv")

In [11]:
# Batch sizes for the two loaders.
train_batch_size=512
val_batch_size=512

# Number of distinct label values present in the training split.
# NOTE(review): used as the one-hot depth, which presumes labels are 0..K-1 and
# that every class occurs in the training split - confirm.
number_of_class=train_label["target"].nunique()

# Ceiling division: a trailing partial batch still counts as one step.
train_steps=(len(train_label)+train_batch_size-1)//train_batch_size
val_steps=(len(val_label)+val_batch_size-1)//val_batch_size

In [12]:
# Alphabet of residue codes that may appear in a sequence.
voc_set=set(['P', 'V', 'I', 'K', 'N', 'B', 'F', 'Y', 'E', 'W', 'R', 'D', 'X', 'S', 'C', 'U', 'Q', 'A', 'M', 'H', 'L', 'G', 'T'])
# Map every code to an integer id starting at 1; id 0 is left free for padding.
# The ids are assigned over the SORTED alphabet: iterating the set directly gives
# an order that changes between interpreter runs (string hashing is randomised),
# which would silently change the token ids a saved model was trained with.
voc_set_map={ k:v for v , k in enumerate(sorted(voc_set),start=1)}

In [13]:
def encode(text_tensor, label):
    """Translate one sequence into its integer ids (eager helper for tf.py_function).

    Args:
        text_tensor: Eager string tensor; its bytes are decoded into text.
        label: Passed through untouched.

    Returns:
        ``(ids, label)`` where ``ids`` is a list with one ``voc_set_map`` id per
        character of the decoded text.

    Raises:
        KeyError: if the text contains a character missing from ``voc_set_map``.
    """
    sequence = text_tensor.numpy().decode()
    token_ids = []
    for residue in sequence:
        token_ids.append(voc_set_map[residue])
    return token_ids, label
def encode_map_fn(text, label):
    """Dataset ``map`` function: encode a (sequence, label) pair for training.

    Runs ``encode`` through ``tf.py_function`` and one-hot encodes the label.

    Returns:
        ``(token_ids, one_hot_label)``: an int64 vector of unspecified length and
        a vector of length ``number_of_class``.
    """
    token_ids, class_id = tf.py_function(
        encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )
    # tf.py_function returns tensors without static shape information, so the
    # shapes are declared by hand here.
    token_ids.set_shape([None])

    one_hot_label = tf.one_hot(class_id, number_of_class)
    one_hot_label.set_shape([number_of_class])

    return token_ids, one_hot_label
def get_data_loader(file,batch_size,labels):
    """Build the shuffled, endlessly repeating training pipeline.

    Args:
        file: Text file with one sequence per line.
        batch_size: Number of examples per padded batch.
        labels: Frame whose ``target`` column holds one label per line of
            ``file``, in the same order.

    Returns:
        A ``tf.data.Dataset`` yielding ``(padded_token_ids, one_hot_labels)``
        batches forever.

    Notes:
        The order is shuffle -> map -> batch -> repeat. Repeating *before*
        shuffling (the previous order) mixes examples from neighbouring epochs
        in the shuffle buffer, so a window of ``steps_per_epoch`` batches could
        contain duplicates and miss other examples. With the whole-dataset
        buffer placed before ``repeat`` every pass yields each example exactly
        once (reshuffled on each pass), and one pass is exactly
        ``ceil(len(labels) / batch_size)`` batches.
    """
    label_data=tf.data.Dataset.from_tensor_slices(labels.target)
    data_set=tf.data.TextLineDataset(file)
    data_set=tf.data.Dataset.zip((data_set,label_data))

    # Buffer as large as the dataset gives a full uniform shuffle of each pass.
    data_set=data_set.shuffle(len(labels))
    data_set=data_set.map(encode_map_fn,tf.data.experimental.AUTOTUNE)
    # Sequences differ in length; padded_batch pads each batch to its longest member.
    data_set=data_set.padded_batch(batch_size)
    data_set=data_set.repeat()
    data_set=data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set


def get_data_loader_test(file,batch_size,labels):
    """Build a single-pass, order-preserving pipeline (no shuffle, no repeat).

    Args:
        file: Text file with one sequence per line.
        batch_size: Number of examples per padded batch.
        labels: Frame whose ``target`` column is zipped with the lines of ``file``.

    Returns:
        A ``tf.data.Dataset`` of ``(padded_token_ids, one_hot_labels)`` batches
        in file order.
    """
    sequences=tf.data.TextLineDataset(file)
    targets=tf.data.Dataset.from_tensor_slices(labels.target)
    return (
        tf.data.Dataset.zip((sequences,targets))
        .map(encode_map_fn,tf.data.experimental.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

In [14]:
# Training stream: shuffled and repeating, consumed steps_per_epoch batches at a time.
train_dl=get_data_loader("/content/train.txt",train_batch_size,train_label)
# Validation stream: a deterministic single pass over the whole validation file,
# so val_loss is computed on the same examples after every epoch. It is batched
# with val_batch_size, the value val_steps was derived from (the previous call
# used the shuffled, repeating loader with train_batch_size).
val_dl=get_data_loader_test("/content/val.txt",val_batch_size,val_label)

### Model 

In [18]:
from tensorflow.keras.layers import Input,Dense,Dropout,Embedding,Concatenate,Flatten,LSTM ,Bidirectional
from tensorflow.keras.activations import relu ,sigmoid,softmax
from tensorflow.keras.losses import CategoricalCrossentropy
def model():
    """Build, compile and summarise the bidirectional-LSTM sequence classifier.

    Architecture: Embedding(64) -> Bidirectional(LSTM(256)) -> Dense(512, relu)
    -> Dropout(0.1) -> Dense(256, relu) -> Dense(number_of_class, softmax).
    Compiled with Adam (learning rate 0.001), categorical cross-entropy and a
    categorical-accuracy metric named "Acc".

    Returns:
        The compiled ``tf.keras.Model``; its input is keyed by the name "sequnce".
    """
    prefix="seq"
    drop_prob=0.1
    step_size=0.001

    # Variable-length sequence of integer ids.
    sequnce=Input([None],name="sequnce")

    # One row per vocabulary id plus one extra row for id 0.
    # NOTE(review): mask_zero is not set, so padded positions are presumably
    # processed by the LSTM like real tokens - confirm this is intended.
    embed=Embedding(input_dim=len(voc_set)+1,output_dim=64,name="emb_layer")

    recurrent=LSTM(units=256,name="lstm_2",return_sequences=False)
    bilstm=Bidirectional(recurrent,name="bidirectional_2")

    hidden_1=Dense(units=512,activation=relu,kernel_regularizer=None,bias_regularizer=None,name=prefix+"_dense_layer_1")
    hidden_2=Dense(units=256,activation=relu,kernel_regularizer=None,bias_regularizer=None,name=prefix+"_dense_layer_2")

    classifier=Dense(units=number_of_class,activation=softmax,kernel_regularizer=None,bias_regularizer=None,name=prefix+"_dense_layer_output")

    drop=Dropout(drop_prob)

    # Forward pass, one layer per line.
    features=embed(sequnce)
    features=bilstm(features)
    features=hidden_1(features)
    features=drop(features)
    features=hidden_2(features)
    # The last layer applies softmax, so these are class probabilities.
    probabilities=classifier(features)

    network=tf.keras.Model(inputs={"sequnce":sequnce, },outputs=probabilities)
    adam=tf.keras.optimizers.Adam(learning_rate=step_size)
    network.compile(optimizer=adam, loss=CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy(name="Acc")])
    network.summary()
    return network
    

In [19]:
# Build the network. NOTE: this rebinds the name `model` from the factory function
# to the object it returns, so the cell defining `model()` has to be re-run before
# this cell can be executed a second time.
model=model()

Model: "functional_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequnce (InputLayer)         [(None, None)]            0         
_________________________________________________________________
emb_layer (Embedding)        (None, None, 64)          1536      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               657408    
_________________________________________________________________
seq_dense_layer_1 (Dense)    (None, 512)               262656    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
seq_dense_layer_2 (Dense)    (None, 256)               131328    
_________________________________________________________________
seq_dense_layer_output (Dens (None, 8)                

In [34]:
# Import the callbacks from tensorflow.keras, the same package the model's layers
# and losses come from; mixing in the standalone `keras` package can pair
# incompatible callback and model classes depending on the installed versions.
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Stop training once val_loss has failed to improve for 10 epochs.
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='min')
# Write '.mdl_wts.hdf5' only on epochs where val_loss reaches a new minimum.
mcp_save = ModelCheckpoint('.mdl_wts.hdf5', save_best_only=True, monitor='val_loss', mode='min')

In [210]:
# Number of training epochs for this run. The original placeholder
# (`epochs=#to be defined,`) was a syntax error: the comment swallowed both the
# value and the trailing comma. The submission files averaged at the end of the
# notebook are named for runs of 15, 20, 25, 28, 29, 40 and 43 epochs; set this
# value per run.
epochs=15

# steps_per_epoch / validation_steps bound how many batches are drawn per epoch.
# NOTE: the earlyStopping and mcp_save callbacks created in the previous cell are
# not passed to fit here.
history = model.fit(train_dl,
                    validation_data=val_dl,
                    epochs=epochs,
                    verbose=1,
                    validation_steps=val_steps,
                    steps_per_epoch=train_steps
                   )



In [211]:
def encode_test(text_tensor):
    """Translate one unlabelled sequence into its integer ids.

    Args:
        text_tensor: Eager string tensor; its bytes are decoded into text.

    Returns:
        A list with one ``voc_set_map`` id per character of the decoded text.

    Raises:
        KeyError: if the text contains a character missing from ``voc_set_map``.
    """
    sequence = text_tensor.numpy().decode()
    token_ids = []
    for residue in sequence:
        token_ids.append(voc_set_map[residue])
    return token_ids
def encode_map_fn_test(text):
    """Dataset ``map`` function for unlabelled data: encode a single sequence.

    Returns:
        An int64 vector of unspecified length holding the sequence's ids.
    """
    token_ids = tf.py_function(encode_test, inp=[text], Tout=tf.int64)
    # tf.py_function returns a tensor without static shape information, so the
    # rank is declared by hand here.
    token_ids.set_shape([None])
    return token_ids

def get_test_data_loader(file,batch_size):
    """Build a single-pass, label-free pipeline over a sequence text file.

    Args:
        file: Text file with one sequence per line.
        batch_size: Number of sequences per padded batch.

    Returns:
        A ``tf.data.Dataset`` of padded id batches in file order.
    """
    return (
        tf.data.TextLineDataset(file)
        .map(encode_map_fn_test,tf.data.experimental.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

In [212]:
  # Reload the test table and attach a constant placeholder target: the loader
  # used below zips every sequence with a label, and predict() only reads the inputs.
  test=pd.read_csv("/content/drive/My Drive/UmojaHackTunisia/UmojaHackTun/test.csv").assign(target=0)
  # Sequences come from the text file written earlier, batched 512 at a time.
  test_dl=get_data_loader_test("/content/test.txt",512,test)
  # One row of class probabilities per test sequence, in file order.
  test_pred=model.predict(test_dl,verbose=True)



In [213]:
# Submission frame: the test IDs followed by one probability column per class,
# named target_0 .. target_{number_of_class-1}.
sub=test[["ID"]].assign(
    **{"target_{}".format(class_idx): test_pred[:,class_idx] for class_idx in range(number_of_class)}
)

In [214]:
# Save this run's predictions. NOTE: the ensembling cells below read files named
# '/content/sub_<N>epoch_lr0.001.csv', not "sub.csv"; no cell in this notebook
# performs that renaming.
sub.to_csv("sub.csv",index=False)

In [240]:
# Load the seven per-run submission files that make up the ensemble.
df1,df2,df3,df4,df5,df6,df7=(
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sub_15epoch_lr0.001.csv',
        '/content/sub_20epoch_lr0.001.csv',
        '/content/sub_25epoch_lr0.001.csv',
        '/content/sub_28epoch_lr0.001.csv',
        '/content/sub_29epoch_lr0.001.csv',
        '/content/sub_40epoch_lr0.001.csv',
        '/content/sub_43epoch_lr0.001.csv',
    )
)



In [241]:
# Strip the ID column from every member so only the probability columns remain
# and the frames can be added element-wise.
df1,df2,df3,df4,df5,df6,df7=(
    member.drop(['ID'],axis=1)
    for member in (df1,df2,df3,df4,df5,df6,df7)
)



In [242]:
# Element-wise mean of the seven probability tables.
# NOTE: the result is stored under the name `sum`, which hides Python's built-in
# sum() for the rest of the session; later cells rely on this name.
sum=(df1+df2+df3+df4+df5+df6+df7).div(7)
sum.head()

Unnamed: 0,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7
0,2.743778e-09,6.167194e-07,0.999999,1.563821e-08,2.751316e-10,1.209903e-12,1.622911e-12,5.024944e-11
1,5.631812e-09,8.330158e-07,0.999999,2.671227e-08,2.675956e-10,1.790893e-12,1.584419e-12,1.188894e-10
2,4.817937e-07,0.0001190297,0.999877,3.059235e-06,2.719024e-08,5.503442e-11,2.030765e-10,1.144229e-09
3,3.69823e-09,9.452509e-07,0.999999,5.241113e-08,9.162993e-10,7.782105e-12,2.914901e-12,1.210879e-10
4,0.0009181092,0.9816592,0.002884,0.0108848,7.189217e-08,1.941844e-05,0.0003881604,0.003245914


In [243]:
# Put the test IDs back as the first column of the averaged table.
# The guard keeps the cell re-runnable: DataFrame.insert raises ValueError when
# the column already exists, which is what a second execution used to hit.
if 'ID' not in sum.columns:
    sum.insert(0,'ID',sub['ID'])

In [244]:
# Preview the averaged table now that it includes the ID column.
sum.head()

Unnamed: 0,ID,target_0,target_1,target_2,target_3,target_4,target_5,target_6,target_7
0,ID_test_0,2.743778e-09,6.167194e-07,0.999999,1.563821e-08,2.751316e-10,1.209903e-12,1.622911e-12,5.024944e-11
1,ID_test_1,5.631812e-09,8.330158e-07,0.999999,2.671227e-08,2.675956e-10,1.790893e-12,1.584419e-12,1.188894e-10
2,ID_test_2,4.817937e-07,0.0001190297,0.999877,3.059235e-06,2.719024e-08,5.503442e-11,2.030765e-10,1.144229e-09
3,ID_test_3,3.69823e-09,9.452509e-07,0.999999,5.241113e-08,9.162993e-10,7.782105e-12,2.914901e-12,1.210879e-10
4,ID_test_4,0.0009181092,0.9816592,0.002884,0.0108848,7.189217e-08,1.941844e-05,0.0003881604,0.003245914


In [245]:
# Write the ensembled predictions (ID plus averaged probabilities) without the index.
sum.to_csv("final_sub.csv",index=False)