In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers as L
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import plotly.express as px

In [2]:
PATH = './stanford-covid-vaccine/'
train = pd.read_json(os.path.join(PATH,'train.json'),lines=True)
train = train[train.SN_filter == 1]
train.shape

(1589, 19)

In [3]:
train.head(2)

Unnamed: 0,index,id,sequence,structure,predicted_loop_type,signal_to_noise,SN_filter,seq_length,seq_scored,reactivity_error,deg_error_Mg_pH10,deg_error_pH10,deg_error_Mg_50C,deg_error_50C,reactivity,deg_Mg_pH10,deg_pH10,deg_Mg_50C,deg_50C
0,0,id_001f94081,GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...,.....((((((.......)))).)).((.....((..((((((......,EEEEESSSSSSHHHHHHHSSSSBSSXSSIIIIISSIISSSSSSHHH...,6.894,1,107,68,"[0.1359, 0.20700000000000002, 0.1633, 0.1452, ...","[0.26130000000000003, 0.38420000000000004, 0.1...","[0.2631, 0.28600000000000003, 0.0964, 0.1574, ...","[0.1501, 0.275, 0.0947, 0.18660000000000002, 0...","[0.2167, 0.34750000000000003, 0.188, 0.2124, 0...","[0.3297, 1.5693000000000001, 1.1227, 0.8686, 0...","[0.7556, 2.983, 0.2526, 1.3789, 0.637600000000...","[2.3375, 3.5060000000000002, 0.3008, 1.0108, 0...","[0.35810000000000003, 2.9683, 0.2589, 1.4552, ...","[0.6382, 3.4773, 0.9988, 1.3228, 0.78770000000..."
2,2,id_006f36f57,GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...,.....((((.((.....((((.(((.....)))..((((......)...,EEEEESSSSISSIIIIISSSSMSSSHHHHHSSSMMSSSSHHHHHHS...,8.8,1,107,68,"[0.0931, 0.13290000000000002, 0.11280000000000...","[0.1365, 0.2237, 0.1812, 0.1333, 0.1148, 0.160...","[0.17020000000000002, 0.178, 0.111, 0.091, 0.0...","[0.1033, 0.1464, 0.1126, 0.09620000000000001, ...","[0.14980000000000002, 0.1761, 0.1517, 0.116700...","[0.44820000000000004, 1.4822, 1.1819, 0.743400...","[0.2504, 1.4021, 0.9804, 0.49670000000000003, ...","[2.243, 2.9361, 1.0553, 0.721, 0.6396000000000...","[0.5163, 1.6823000000000001, 1.0426, 0.7902, 0...","[0.9501000000000001, 1.7974999999999999, 1.499..."


# Reshape des data pour Deeplearning

### Dictionnaire référençant chaque caractères des inputs (seq, structure, loop)

In [4]:
# key : caractères, value : chiffre de 0 à 13
inputs_dict = {x:i for i, x in enumerate('().ACGUBEHIMSX')}
print(inputs_dict)

{'(': 0, ')': 1, '.': 2, 'A': 3, 'C': 4, 'G': 5, 'U': 6, 'B': 7, 'E': 8, 'H': 9, 'I': 10, 'M': 11, 'S': 12, 'X': 13}


### 3 inputs et 3 outputs

In [5]:
# on selectionne les colonnes inputs et outputs dans les data
outputs = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
inputs = ['sequence', 'structure', 'predicted_loop_type']

train_labels = train[outputs]
train_inputs = train[inputs]

print(train_inputs)

                                               sequence  \
0     GGAAAAGCUCUAAUAACAGGAGACUAGGACUACGUAUUUCUAGGUA...   
2     GGAAAGUGCUCAGAUAAGCUAAGCUCGAAUAGCAAUCGAAUAGAAU...   
5     GGAAAGCGCCGCGGCGGUAGCGGCAGCGAGGAGCGCUACCAAGGCA...   
6     GGAAAACAAUUGCAUCGUUAGUACGACUCCACAGCGUAAGCUGUGG...   
7     GGAAAUCAUCGAGGACGGGUCCGUUCAGCACGCGAAAGCGUCGUGA...   
...                                                 ...   
2392  GGAAACUCCACAUCUCUACGCCACGAAAGUGGGUAGGAUGGGAGAG...   
2393  GGAAAAGAUCGAUAGGUACGUGGGUUCAUGUAGGAAACUAGCUGGC...   
2395  GGAAAAUAGCAGAGGAAAUACUAGAGCAAUUGCAAAGGCCGAUCAU...   
2396  GGAAAACAAAAACAAACAACAAAAACAAACAACAAAAACAAACAAC...   
2399  GGAAAGCUAGGACGUGGGAGCGUAGCUCUCCACACGGGUACGCCAA...   

                                              structure  \
0     .....((((((.......)))).)).((.....((..((((((......   
2     .....((((.((.....((((.(((.....)))..((((......)...   
5     .....(.(((((.(((((((((...........)))))))..(((....   
6     .........((((((((......((((((((((((....)))))))...

In [6]:
# pour tester le modèle on ne sélectionne pas toutes les data
#SEED = 1
#tf.random.set_seed(SEED)
#np.random.seed(SEED)
#n = 200
#train_labels = train_labels.sample(n)
#train_inputs = train_inputs.sample(n)
#train_labels = train_labels.head(10)
#train_inputs = train_inputs.head(10)

### Transforme numpy to array et les caractères en chiffres

In [8]:
def preprocess_inputs(df,input_cols):
    """
    Converts inputs into one-hot
    """
    output = []
    for i in range(len(input_cols)):
        tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
        tokenizer.fit_on_texts(np.asarray(df[input_cols[i]]))
        tmp = tokenizer.texts_to_sequences(np.asarray(df[input_cols[i]]))
        output.append(np.asarray(keras.utils.to_categorical(tmp)[:,:,1:]))
    return output

In [None]:
inputs = preprocess_inputs(train,input_cols)

In [18]:
train_inputs.shape, train_labels.shape

((1589, 107, 3), (1589, 68, 3))

In [19]:
x_train, x_val, y_train, y_val = train_test_split(train_inputs, train_labels, test_size=.1, random_state=34)

In [20]:
print(f"x_train = {x_train.shape}")
print(f"x_val   = {x_val.shape}")
print(f"y_train = {y_train.shape}") # il va falloir enlever bases de 68 à 107
print(f"y_val   = {y_val.shape}")

x_train = (1430, 107, 3)
x_val   = (159, 107, 3)
y_train = (1430, 68, 3)
y_val   = (159, 68, 3)


In [11]:
#Define Loss Function
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=(0, 1))
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=-1)

### Construction des modèles (GRU, GRU bidirectionnel, LSTM et LSTM bidirectionnel)

In [12]:
seq_len=107
hidden_dim= ?
dropout = 0.5
pred_len= 68
embed_size = len(inputs_dict)
output_dim = ?
out_dim = 3

inputs = L.Input(shape = (seq_len, 3))
embed_layer = L.Embedding(input_dim = embed_size, output_dim = output_dim)(inputs)
x1 = L.Bidirectional(L.GRU(hidden_dim, dropout = dropout, return_sequences = True))(inputs)
x2 = L.Bidirectional(L.GRU(hidden_dim, dropout = dropout, return_sequences = True))(x1)
x3 = L.Bidirectional(L.GRU(hidden_dim, dropout = dropout, return_sequences = True))(x2)
truncated = x2[:, :pred_len]
out_layer = L.Dense(out_dim, activation = 'linear')(truncated)

model = tf.keras.Model(inputs = inputs, outputs = out)
model.compile(keras.optimizers.Adam(), loss = 'accuracy')
#model.compile(keras.optimizers.Adam(), loss = MCRMSE)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 107, 3)]          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 107, 512)          400896    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 107, 512)          1182720   
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 68, 512)]         0         
_________________________________________________________________
dense (Dense)                (None, 68, 3)             1539      
Total params: 1,585,155
Trainable params: 1,585,155
Non-trainable params: 0
_________________________________________________________________


In [None]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=75,
    verbose=2
)

Train on 180 samples, validate on 20 samples
Epoch 1/75
180/180 - 23s - loss: 1.6640 - val_loss: 1.2598
Epoch 2/75
180/180 - 10s - loss: 1.0232 - val_loss: 1.2249
Epoch 3/75
180/180 - 9s - loss: 0.8937 - val_loss: 0.6768
Epoch 4/75
180/180 - 9s - loss: 0.7177 - val_loss: 0.9435
Epoch 5/75
180/180 - 10s - loss: 0.6347 - val_loss: 0.5367
Epoch 6/75
180/180 - 11s - loss: 0.6123 - val_loss: 0.6500
Epoch 7/75
180/180 - 10s - loss: 0.5659 - val_loss: 0.5524
Epoch 8/75
180/180 - 12s - loss: 0.5535 - val_loss: 0.6133
Epoch 9/75
180/180 - 10s - loss: 0.5295 - val_loss: 0.5490
Epoch 10/75
180/180 - 11s - loss: 0.5206 - val_loss: 0.5103
Epoch 11/75
180/180 - 11s - loss: 0.5121 - val_loss: 0.5436
Epoch 12/75
180/180 - 10s - loss: 0.5055 - val_loss: 0.5870
Epoch 13/75
180/180 - 10s - loss: 0.4965 - val_loss: 0.5011
Epoch 14/75
180/180 - 11s - loss: 0.4904 - val_loss: 0.5335
Epoch 15/75
180/180 - 10s - loss: 0.4894 - val_loss: 0.5215
Epoch 16/75
180/180 - 10s - loss: 0.4849 - val_loss: 0.5326
Epoch 

In [None]:
fig = px.line(
    history.history, 
    y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='BiGRU Training History'
)
fig.show()