In [None]:
import pandas as  pd 
import numpy as np 
import tensorflow as tf
import tensorflow_datasets as tfds
import os 
from sklearn.model_selection import train_test_split
os.environ["CUDA_VISIBLE_DEVICES"] = "3" # change it to "0" if you have only one GPU, or to the index of the GPU that you would like to use

### Data processing

In [None]:
def write_to_txt(file_name,column):
    """Write every element of ``column`` to ``file_name``, one element per line.

    Args:
        file_name: Destination path. Any missing parent directory is created,
            so writing into a fresh output folder does not fail.
        column: Iterable of values (e.g. a pandas Series); each value is
            written using its string representation followed by a newline.
    """
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        # A bare file name has no parent directory to create.
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, 'w') as f:
        f.writelines("{}\n".format(item) for item in column)

In [None]:
# Load the training and test tables from the local Zindi data folder
# (paths are relative to the notebook's working directory).
train=pd.read_csv("./Zindi/front/train.csv")
test=pd.read_csv("./Zindi/front/test.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Upper bound on sequence length used by the truncation step below; the
# longest sequence in this data set is reported to be 550 characters.
max_seq_length=550

In [None]:
# Hold out 10% of the training rows for validation; the fixed seed keeps the
# split reproducible. The explicit copies make the two splits independent
# frames, so the column assignment below does not trigger pandas'
# SettingWithCopyWarning.
train,val=train_test_split(train,test_size=0.1,random_state=1994)
train=train.copy()
val=val.copy()

# Truncate every sequence to at most `max_seq_length` characters.
# The previous guard (`max_seq_length>550`) was inverted: it only fired when
# the limit exceeded the longest sequence, i.e. when truncation could not
# change anything, and skipped it whenever a shorter limit was requested.
# Slicing is a no-op for sequences that are already short enough, so no
# guard is needed at all.
for frame in (train,val,test):
    frame["Sequence"]=frame["Sequence"].str[:max_seq_length]

In [None]:
# Write the Sequence column of each split to its own text file,
# one sequence per line.
for out_path, frame in (
    ("proc_data/train.txt", train),
    ("proc_data/test.txt", test),
    ("proc_data/val.txt", val),
):
    write_to_txt(out_path, frame["Sequence"])

In [None]:
# Keep only the target column of each split and persist it as CSV
# (without the index) next to the sequence text files.
train_label=train.loc[:, ["target"]].copy()
val_label=val.loc[:, ["target"]].copy()
for label_frame, csv_path in (
    (train_label, "./proc_data/train_label.csv"),
    (val_label, "./proc_data/val_label.csv"),
):
    label_frame.to_csv(csv_path, index=False)

### Data loaders 

In [None]:
# Reload the label tables written during data processing.
train_label, val_label = (
    pd.read_csv(csv_path)
    for csv_path in ("./proc_data/train_label.csv", "./proc_data/val_label.csv")
)

In [None]:
# Batch sizes for the two pipelines.
train_batch_size = val_batch_size = 512
# Number of distinct classes observed in the training labels.
number_of_class = train_label["target"].nunique()
# Batches needed to cover each split once (ceiling division, so a final
# partial batch still counts as a step).
train_steps = (len(train_label) + train_batch_size - 1) // train_batch_size
val_steps = (len(val_label) + val_batch_size - 1) // val_batch_size

In [None]:
# Alphabet of symbols that may appear in a sequence.
voc_set=set(['P', 'V', 'I', 'K', 'N', 'B', 'F', 'Y', 'E', 'W', 'R', 'D', 'X', 'S', 'C', 'U', 'Q', 'A', 'M', 'H', 'L', 'G', 'T'])
# Map each symbol to an integer id starting at 1; id 0 is left free because
# it is the value used to pad batches.
# The ids are assigned in sorted order: iterating the set directly depends on
# string hashing, which can differ between interpreter sessions, so the same
# symbol could receive a different id after a kernel restart.
voc_set_map={ symbol:index for index , symbol in enumerate(sorted(voc_set),start=1)}

In [None]:
def encode(text_tensor, label):
    """Convert one raw sequence into a list of integer symbol ids.

    Args:
        text_tensor: Eager string tensor; its bytes are decoded to text.
        label: Passed through unchanged.

    Returns:
        ``(ids, label)`` where ``ids`` holds the ``voc_set_map`` id of every
        character of the sequence, in order.

    Raises:
        KeyError: If the sequence contains a character missing from
            ``voc_set_map``.
    """
    sequence = text_tensor.numpy().decode()
    encoded_text = list(map(voc_set_map.__getitem__, sequence))
    return encoded_text, label
def encode_map_fn(text, label):
    """Dataset ``map`` wrapper around ``encode`` that also one-hot encodes the label.

    Args:
        text: Scalar string tensor holding one sequence.
        label: Integer class index for that sequence.

    Returns:
        ``(token_ids, one_hot_label)``: an int64 vector of unknown length and
        a vector of length ``number_of_class``.
    """
    token_ids, class_index = tf.py_function(
        func=encode,
        inp=[text, label],
        Tout=(tf.int64, tf.int64),
    )
    # py_function does not set static shapes on its outputs, so restore
    # them by hand for the downstream batching step.
    token_ids.set_shape([None])
    one_hot_label = tf.one_hot(class_index, depth=number_of_class)
    one_hot_label.set_shape([number_of_class])
    return token_ids, one_hot_label
def get_data_loader(file,batch_size,labels):
    """Build the infinite, shuffled training pipeline.

    Args:
        file: Path to a text file with one sequence per line.
        batch_size: Number of examples per padded batch.
        labels: DataFrame whose ``target`` column holds the class index of
            each line of ``file``, in the same order.

    Returns:
        A ``tf.data.Dataset`` yielding ``(padded_token_ids, one_hot_labels)``
        batches forever; callers must bound it with ``steps_per_epoch``.
    """
    label_data=tf.data.Dataset.from_tensor_slices(labels.target)
    data_set=tf.data.TextLineDataset(file)
    data_set=tf.data.Dataset.zip((data_set,label_data))

    # Shuffle BEFORE repeating: each pass over the data is then a full
    # permutation of all examples. With repeat() first, the shuffle buffer
    # mixes examples from consecutive passes, so some examples can be seen
    # twice before others are seen once.
    data_set = data_set.shuffle(len(labels))
    data_set=data_set.repeat()
    data_set=data_set.map(encode_map_fn,tf.data.experimental.AUTOTUNE)
    # Sequences have different lengths; pad each batch to its longest member.
    data_set=data_set.padded_batch(batch_size)
    data_set = data_set.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data_set


def get_data_loader_test(file,batch_size,labels):
    """Build a single-pass, order-preserving pipeline of labelled batches.

    Args:
        file: Path to a text file with one sequence per line.
        batch_size: Number of examples per padded batch.
        labels: DataFrame whose ``target`` column is paired line by line
            with ``file``.

    Returns:
        A finite ``tf.data.Dataset`` of ``(padded_token_ids, one_hot_labels)``
        batches, in file order (no shuffling, no repetition).
    """
    sequences=tf.data.TextLineDataset(file)
    targets=tf.data.Dataset.from_tensor_slices(labels.target)
    return (
        tf.data.Dataset.zip((sequences,targets))
        .map(encode_map_fn,tf.data.experimental.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

In [None]:
# Training pipeline: shuffled and repeated, bounded by `steps_per_epoch` in fit().
train_dl=get_data_loader("proc_data/train.txt",train_batch_size,train_label)
# Validation pipeline: finite and unshuffled, so every validation example is
# scored exactly once per evaluation. It uses `val_batch_size`, the value
# `val_steps` was derived from (previously `train_batch_size` was passed).
val_dl=get_data_loader_test("proc_data/val.txt",val_batch_size,val_label)

### Model 

In [None]:
from tensorflow.keras.layers import Input,Dense,Dropout,Embedding,Concatenate,Flatten,LSTM ,Bidirectional
from tensorflow.keras.activations import relu ,sigmoid,softmax
from tensorflow.keras.losses import CategoricalCrossentropy
def model():
    """Build and compile the sequence classifier.

    Architecture: Embedding (64-d) -> bidirectional LSTM (256 units per
    direction, final state only) -> Dense(512, relu) -> Dropout(0.1) ->
    Dense(256, relu) -> Dense(number_of_class, softmax).

    Relies on the notebook-level names ``voc_set`` and ``number_of_class``.

    Returns:
        A compiled ``tf.keras.Model`` (Adam with learning rate 0.001,
        categorical cross-entropy loss, categorical accuracy reported as
        "Acc"). The model summary is printed as a side effect.
    """
    name="seq"
    dropout_rate=0.1
    learning_rate=0.001
    sequence_input=Input([None],name="sequnce")

    # Symbol ids start at 1 and id 0 is the batch padding value, hence the
    # extra embedding row. mask_zero=True makes the recurrent layer skip the
    # padded positions; without it the LSTM also runs over the padding of
    # every sequence shorter than the longest one in its batch.
    EMB_layer=Embedding(input_dim=len(voc_set)+1,output_dim=64,mask_zero=True,name="emb_layer")

    LSTM_layer_2=LSTM(units=256,name="lstm_2",return_sequences=False)
    BIDIR_layer_2=Bidirectional(LSTM_layer_2,name="bidirectional_2")

    Dens_layer_1=Dense(units=512,activation=relu,kernel_regularizer=None,bias_regularizer=None,name=name+"_dense_layer_1")
    Dens_layer_2=Dense(units=256,activation=relu,kernel_regularizer=None,bias_regularizer=None,name=name+"_dense_layer_2")

    output=Dense(units=number_of_class,activation=softmax,kernel_regularizer=None,bias_regularizer=None,name=name+"_dense_layer_output")

    dropout_1=Dropout(dropout_rate)

    # Wire the layers together.
    embedded=EMB_layer(sequence_input)
    encoded=BIDIR_layer_2(embedded)
    hidden=Dens_layer_2(dropout_1(Dens_layer_1(encoded)))
    # The output layer applies softmax, so these are class probabilities
    # (not logits), matching CategoricalCrossentropy's default from_logits=False.
    probabilities=output(hidden)

    seq_model=tf.keras.Model(inputs={"sequnce":sequence_input, },outputs=probabilities)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    seq_model.compile(optimizer=optimizer, loss=CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy(name="Acc")])
    seq_model.summary()
    return seq_model
    

In [None]:
# Build and compile the network. NOTE: this rebinds the name `model` from the
# builder function to the Keras model instance, so re-running this cell
# requires re-running the cell that defines `model()` first.
model=model()

In [None]:
# You can add an early-stopping callback and save the best model to improve your score.

In [None]:
# Train for 5 epochs. The step counts bound each training / validation pass
# to one sweep over the corresponding split.
history = model.fit(
    train_dl,
    epochs=5,
    steps_per_epoch=train_steps,
    validation_data=val_dl,
    validation_steps=val_steps,
    verbose=1,
)

In [None]:
def encode_test(text_tensor):
    """Convert one unlabelled raw sequence into a list of integer symbol ids.

    Args:
        text_tensor: Eager string tensor; its bytes are decoded to text.

    Returns:
        The ``voc_set_map`` id of every character of the sequence, in order.

    Raises:
        KeyError: If the sequence contains a character missing from
            ``voc_set_map``.
    """
    return [voc_set_map[symbol] for symbol in text_tensor.numpy().decode()]
def encode_map_fn_test(text):
    """Dataset ``map`` wrapper around ``encode_test`` for unlabelled sequences.

    Args:
        text: Scalar string tensor holding one sequence.

    Returns:
        An int64 vector of symbol ids with unknown static length.
    """
    token_ids = tf.py_function(func=encode_test, inp=[text], Tout=tf.int64)
    # py_function does not set a static shape on its output; declare the
    # result as a 1-D vector of unknown length.
    token_ids.set_shape([None])
    return token_ids

def get_test_data_loader(file,batch_size):
    """Build a single-pass pipeline of unlabelled, padded batches.

    Args:
        file: Path to a text file with one sequence per line.
        batch_size: Number of sequences per padded batch.

    Returns:
        A finite ``tf.data.Dataset`` yielding padded id batches in file order.
    """
    return (
        tf.data.TextLineDataset(file)
        .map(encode_map_fn_test,tf.data.experimental.AUTOTUNE)
        .padded_batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )

In [None]:
# Reload the test table and attach a constant placeholder `target` column so
# the labelled loader can be reused; the zeros carry no information.
test=pd.read_csv("./Zindi/front/test.csv").assign(target=0)
test_dl=get_data_loader_test("proc_data/test.txt",512,test)
# One row of class probabilities per test sequence.
test_pred=model.predict(test_dl,verbose=True)

In [None]:
# Submission table: the ID column followed by one probability column per class.
sub=test[["ID"]].assign(
    **{
        "target_{}".format(class_idx): test_pred[:, class_idx]
        for class_idx in range(number_of_class)
    }
)

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv("sub.csv",index=False)