In [11]:
import json

import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split
import os

In [2]:
tf.random.set_seed(2020)
np.random.seed(2020)

In [3]:
# This will tell us the columns we are predicting
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']

In [4]:
y_true = tf.random.normal((32, 68, 3))
y_pred = tf.random.normal((32, 68, 3))

In [5]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

In [6]:
def gru_layer(hidden_dim, dropout):
    return L.Bidirectional(L.GRU(
        hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))

In [7]:
def build_model(embed_size, seq_len=107, pred_len=68, dropout=0.5, 
                sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3):
    inputs = L.Input(shape=(seq_len, 3))
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)
    
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3])
    )
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)
    
    for x in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)
    
    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='linear')(truncated)
    
    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE)
    
    return model

In [8]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

In [9]:
def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    return pandas_list_to_array(
        df[cols].applymap(lambda seq: [token2int[x] for x in seq])
    )

In [12]:
train = pd.read_json(os.path.join('E:\Datasets\MRNA','train.json') , lines = True)
test = pd.read_json(os.path.join('E:\Datasets\MRNA','test.json') , lines = True)
sample_sub = pd.read_csv(os.path.join('E:\Datasets\MRNA','sample_submission.csv'))

In [13]:
train = train.query("signal_to_noise >= 1")

In [14]:
# We will use this dictionary to map each character to an integer
# so that it can be used as an input in keras
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

train_inputs = preprocess_inputs(train, token2int)
train_labels = pandas_list_to_array(train[pred_cols])

In [15]:
x_train, x_val, y_train, y_val = train_test_split(
    train_inputs, train_labels, test_size=.1, random_state=34, stratify=train.SN_filter)

In [16]:
public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

public_inputs = preprocess_inputs(public_df, token2int)
private_inputs = preprocess_inputs(private_df, token2int)

In [17]:
model = build_model(embed_size=len(token2int))
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 107, 3)]          0         
_________________________________________________________________
embedding (Embedding)        (None, 107, 3, 200)       2800      
_________________________________________________________________
tf_op_layer_Reshape (TensorF [(None, 107, 600)]        0         
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 107, 600)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 107, 512)          1317888   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 107, 512)          1182720   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 107, 512)          118272

In [None]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=75,
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Train on 1887 samples, validate on 210 samples
Epoch 1/75
1887/1887 - 73s - loss: 0.4536 - val_loss: 0.3811
Epoch 2/75
1887/1887 - 31s - loss: 0.3852 - val_loss: 0.3557
Epoch 3/75
1887/1887 - 31s - loss: 0.3623 - val_loss: 0.3424
Epoch 4/75
1887/1887 - 31s - loss: 0.3494 - val_loss: 0.3271
Epoch 5/75
1887/1887 - 35s - loss: 0.3396 - val_loss: 0.3208
Epoch 6/75
1887/1887 - 42s - loss: 0.3323 - val_loss: 0.3163
Epoch 7/75
1887/1887 - 47s - loss: 0.3250 - val_loss: 0.3140
Epoch 8/75
1887/1887 - 42s - loss: 0.3165 - val_loss: 0.2995
Epoch 9/75
1887/1887 - 35s - loss: 0.3087 - val_loss: 0.2993
Epoch 10/75
1887/1887 - 36s - loss: 0.3039 - val_loss: 0.2999
Epoch 11/75
1887/1887 - 35s - loss: 0.2970 - val_loss: 0.2809
Epoch 12/75
1887/1887 - 44s - loss: 0.2904 - val_loss: 0.2811
Epoch 13/75
1887/1887 - 39s - loss: 0.2837 - val_loss: 0.2712
Epoch 14/75
1887/1887 - 40s - loss: 0.2769 - val_loss: 0.2663
Epoch 15/75
1887/1887 - 40s - loss: 0.2697 - val_loss: 0.2577
Epoch 16/75
1887/1887 - 46s - lo

In [None]:
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [None]:
# Caveat: The prediction format requires the output to be the same length as the input,
# although it's not the case for the training data.
model_public = build_model(seq_len=107, pred_len=107, embed_size=len(token2int))
model_private = build_model(seq_len=130, pred_len=130, embed_size=len(token2int))

model_public.load_weights('model.h5')
model_private.load_weights('model.h5')

In [None]:
public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

In [None]:
preds_ls = []

for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

In [None]:
submission = sample_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
submission.to_csv('submission.csv', index=False)