In [29]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras import layers as L
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import plotly.express as px

In [2]:
# Directory holding train.json and test.json.
PATH = './stanford-covid-vaccine/'

# Both files are read as newline-delimited JSON (lines=True).
train, test = (
    pd.read_json(os.path.join(PATH, filename), lines=True)
    for filename in ('train.json', 'test.json')
)

# Keep only the training rows whose SN_filter flag equals 1.
train = train.loc[train['SN_filter'] == 1]
train.shape

(1589, 19)

In [4]:
# Target columns: these are stacked into the array the model is fitted against.
pred_cols = [
    'reactivity',
    'deg_Mg_pH10',
    'deg_Mg_50C',
]
# Text columns that get one-hot encoded and used as model input.
input_cols = [
    'sequence',
    'structure',
    'predicted_loop_type',
]

In [5]:
# Split the test frame by sequence length: 130-long rows vs. 107-long rows.
test_private = test[test['seq_length'] == 130]
test_public = test[test['seq_length'] == 107]

In [7]:
def preprocess_inputs(df,input_cols):
    """
    One-hot encode each requested text column, character by character.

    A fresh character-level tokenizer is fitted on every column of `df`, so
    the character -> index mapping is derived from the data in `df` itself.

    Returns a list with one array per entry of `input_cols`. Index 0 of the
    categorical axis is sliced away before returning.
    """
    def encode(column):
        texts = np.asarray(df[column])
        char_tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
        char_tokenizer.fit_on_texts(texts)
        token_ids = char_tokenizer.texts_to_sequences(texts)
        one_hot = keras.utils.to_categorical(token_ids)
        # Drop category 0 so only the fitted characters' channels remain.
        return np.asarray(one_hot[:, :, 1:])

    return [encode(column) for column in input_cols]

In [8]:
def merge_inputs(inputs, length):
    """
    Merges the one-hot inputs by columns
    Also snips seq length's till desired amount

    inputs: sequence of array-likes, one per feature, each indexed as
        [sample][position][channel]. Any number of features is accepted;
        the sample count is taken from the first feature.
    length: number of leading positions to keep from every sample.

    Returns an np.ndarray of shape (samples, kept_positions, total_channels),
    where total_channels is the sum of the channel counts of all features.
    """
    n_samples = len(inputs[0])
    merged = [
        # Trim every feature to `length` positions, then join along channels.
        np.concatenate([feature[i][0:length] for feature in inputs], axis=1)
        for i in range(n_samples)
    ]
    return np.asarray(merged)

In [14]:
def preprocess_results(df, results):
    """
    Makes sure that the results are in the appropriate format:
        [layers,lines,columns] in an np array

    df: DataFrame whose `results` columns each hold, per row, a sequence of
        numbers; all sequences must have the same length.
    results: list of column names to extract (any number of columns).

    Returns a float np.ndarray of shape (rows, positions, len(results)).
    """
    columns = list(results)
    if len(df) == 0:
        # Nothing to stack; keep the 3-D layout so callers can still index by axis.
        return np.zeros((0, 0, len(columns)))
    per_column = [
        # (rows, positions) matrix for one target column.
        np.stack([np.asarray(values, dtype=float) for values in df[column]])
        for column in columns
    ]
    # Stacking on the last axis yields (rows, positions, columns).
    return np.stack(per_column, axis=-1)

In [15]:
# One-hot encode every input column, then join the encodings per position,
# keeping only the first 68 positions of each sequence.
inputs = preprocess_inputs(df=train, input_cols=input_cols)
inputs_simple = merge_inputs(inputs=inputs, length=68)
inputs_simple.shape

(1589, 68, 14)

In [16]:
# Stack the target columns into a single dense array for training.
expected_results = preprocess_results(df=train, results=pred_cols)
expected_results.shape

(1589, 68, 3)

In [17]:
def MCRMSE(y_true, y_pred):
    """
    Mean column-wise root mean squared error.

    The squared error is averaged over axes 0 and 1, leaving one MSE per
    entry of the last axis; the square roots of those are then averaged.
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_mse = tf.reduce_mean(squared_error, axis=(0, 1))
    per_column_rmse = tf.sqrt(per_column_mse)
    return tf.reduce_mean(per_column_rmse, axis=-1)

In [19]:
# MODEL TRAIN

# Three stacked bidirectional GRUs followed by a per-position linear head
# with 3 outputs. The input tensor gets its own name so that running this
# cell does not overwrite the notebook-level `inputs` list of one-hot arrays.
model_input = L.Input(shape = (68, 14))
x1 = L.Bidirectional(L.GRU(units = 300, dropout = 0.5, return_sequences = True))(model_input)
x2 = L.Bidirectional(L.GRU(units = 150, dropout = 0.5, return_sequences = True))(x1)
x3 = L.Bidirectional(L.GRU(units = 70,  dropout = 0.5, return_sequences = True))(x2)
out_layer = L.Dense(3, activation = 'linear')(x3)

model = tf.keras.Model(inputs = model_input, outputs = out_layer)
# Optimised on MSE; MCRMSE is only tracked as a metric. `metrics` is passed
# as a list, which is the form the Keras compile() API documents.
model.compile(tf.keras.optimizers.Adam(), loss = 'mse', metrics = [MCRMSE])
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 68, 14)]          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 68, 600)           568800    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 68, 300)           676800    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 68, 140)           156240    
_________________________________________________________________
dense_1 (Dense)              (None, 68, 3)             423       
Total params: 1,402,263
Trainable params: 1,402,263
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Hold out 10% of the samples for validation; the fixed random_state keeps
# the split identical across runs.
x_train, x_val, y_train, y_val = train_test_split(
    inputs_simple, expected_results, test_size=0.1, random_state=34
)

In [33]:
# Stop once val_loss has not improved for 10 epochs. restore_best_weights
# rolls the model back to the best epoch; without it the model would keep
# the weights from the last (i.e. 10 epochs past the best) epoch.
callback = [
    tf.keras.callbacks.EarlyStopping(
        monitor = "val_loss",
        patience = 10,
        restore_best_weights = True,
    )
]
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=100,
    callbacks = callback,
    shuffle = True,
    verbose=1
)

Train on 1430 samples, validate on 159 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100


KeyboardInterrupt: 

In [None]:
# Training vs. validation loss per epoch. The model is compiled with
# loss = 'mse', so the 'loss' / 'val_loss' series are MSE values and the
# value axis is labelled accordingly (not MCRMSE, which is a separate metric).
fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MSE loss'},
    title='BiGRU Training History'
)
fig.show()