In [None]:
pip install tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (591 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m591.0/591.0 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.20.0 typeguard-2.13.3


In [None]:
import pandas as pd, numpy as np
import math, json, gc, random, os, sys
from matplotlib import pyplot as plt
from tqdm import tqdm

#tensorflow deep learning basics
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L

#for model evaluation
from sklearn.model_selection import train_test_split, KFold,  StratifiedKFold


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Load the train/test records (JSON Lines files, one record per line) and the
# label table for the private test set.
train = pd.read_json('train.json', lines=True)
test = pd.read_json('test.json', lines=True)
private_lab = pd.read_csv("private_test_labels.csv")

In [None]:
# NOTE(review): re-reads the same 'train.json' that was already loaded into `train`.
trainn = pd.read_json('train.json', lines=True)
# Only the last expression of a cell is displayed, so this shape is not shown.
trainn.shape
private_lab.shape

(2493, 19)

In [None]:
#target columns
# The three label columns read from `train` when building train_labels_all.
target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
# Five-column variant; positions 0, 1 and 3 of this list are exactly target_cols.
# NOTE(review): presumably the order of the 5 model outputs — confirm against the training code.
target_colss = ['reactivity', 'deg_Mg_pH10', 'deg_pH10', 'deg_Mg_50C', 'deg_50C']

In [None]:
# Vocabulary lookup: every character of the alphabet string maps to its position in it.
token2int = {symbol: code for code, symbol in enumerate('().ACGUBEHIMSX')}

def get_pair_index_structure(structure):
    """Return, for every position of a dot-bracket string, the index of its partner.

    Brackets are matched with a stack, so each ")" pairs with the most recently
    opened, still unmatched "(". This handles side-by-side stems such as "()()"
    as well as fully nested structures.

    Parameters
    ----------
    structure : str or sequence of str
        Dot-bracket notation. "(" opens a pair, ")" closes one; every other
        character is treated as unpaired.

    Returns
    -------
    numpy.ndarray
        Integer array of ``len(structure)``. Element ``i`` is the index paired
        with position ``i``, or -1 when position ``i`` is unpaired.

    Raises
    ------
    ValueError
        If a ")" has no matching "(" or a "(" is never closed.
    """
    pair_structure = np.full(len(structure), -1, dtype=int)
    open_positions = []  # indices of "(" still waiting for their ")"

    for position, symbol in enumerate(structure):
        if symbol == "(":
            open_positions.append(position)
        elif symbol == ")":
            if not open_positions:
                raise ValueError(
                    "unbalanced structure: ')' at index %d has no matching '('" % position
                )
            start = open_positions.pop()
            pair_structure[start] = position
            pair_structure[position] = start

    if open_positions:
        raise ValueError(
            "unbalanced structure: '(' at index %d is never closed" % open_positions[-1]
        )

    return pair_structure

In [None]:
def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
    """Encode the string columns of ``df`` as an integer token array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one string per cell in each of ``cols``.
    cols : list of str
        Columns to encode; each becomes one feature channel. The default list
        is only read, never mutated.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, string_length, len(cols)) whose entries are the
        ``token2int`` codes of the characters. This assumes every string has the
        same length; otherwise numpy cannot build a rectangular array.

    Raises
    ------
    KeyError
        If a character is not a key of ``token2int``.
    """
    # Plain comprehensions instead of DataFrame.applymap, which newer pandas
    # releases deprecate; the result is the same nested list of codes.
    encoded = [
        [[token2int[symbol] for symbol in text] for text in row]
        for row in df[cols].values.tolist()
    ]
    # (row, column, position) -> (row, position, column)
    return np.transpose(np.array(encoded), (0, 2, 1))

# Token-encoded inputs for every training row.
train_inputs_all = preprocess_inputs(train)
# Labels for the columns in target_cols, transposed from (row, target, position)
# to (row, position, target) so they line up with the input layout.
train_labels_all = np.array(train[target_cols].values.tolist()).transpose((0, 2, 1))

In [None]:
# custom loss_fnc
def MCRMSE(y_true, y_pred):
    """Mean column-wise RMSE.

    The squared error is averaged over axis 1, square-rooted to give one RMSE
    per remaining column, and those RMSEs are then averaged (again over axis 1
    of the reduced tensor), leaving one loss value per leading-axis entry.
    """
    squared_error = tf.square(y_true - y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
    return tf.reduce_mean(per_column_rmse, axis=1)

def gru_layer(hidden_dim, dropout):
    """Create a bidirectional GRU layer that returns the full output sequence.

    ``hidden_dim`` is the unit count of the wrapped GRU and ``dropout`` is
    forwarded to it; kernels use the 'orthogonal' initializer.
    """
    recurrent = tf.keras.layers.GRU(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

def lstm_layer(hidden_dim, dropout):
    """Create a bidirectional LSTM layer that returns the full output sequence.

    ``hidden_dim`` is the unit count of the wrapped LSTM and ``dropout`` is
    forwarded to it; kernels use the 'orthogonal' initializer.
    """
    recurrent = tf.keras.layers.LSTM(
        hidden_dim,
        dropout=dropout,
        return_sequences=True,
        kernel_initializer='orthogonal',
    )
    return tf.keras.layers.Bidirectional(recurrent)

def build_model(model_type=1,seq_len=107, pred_len=68, dropout=0.4,
                embed_dim=100, hidden_dim=128):
    """Build and compile a three-layer bidirectional recurrent model.

    Parameters
    ----------
    model_type : int
        Selects the recurrent stack, applied in this order:
        0 = GRU-GRU-GRU, 1 = LSTM-LSTM-LSTM, 2 = GRU-LSTM-LSTM,
        3 = LSTM-GRU-GRU, 4 = LSTM-GRU-LSTM.
    seq_len : int
        Number of positions per input sequence; the input shape is (seq_len, 3).
    pred_len : int
        Number of leading positions that predictions are produced for.
    dropout : float
        Dropout rate forwarded to every recurrent layer.
    embed_dim : int
        Embedding size for each of the 3 token channels.
    hidden_dim : int
        Units of each recurrent layer (per direction).

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam and the MCRMSE loss; the final Dense layer
        emits 5 values for each of the first ``pred_len`` positions.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of 0-4. Previously this fell through the
        if/elif chain and failed later on an unbound local.
    """
    # Layer factories per model_type, in the order they are stacked.
    recurrent_stacks = {
        0: (gru_layer, gru_layer, gru_layer),
        1: (lstm_layer, lstm_layer, lstm_layer),
        2: (gru_layer, lstm_layer, lstm_layer),
        3: (lstm_layer, gru_layer, gru_layer),
        4: (lstm_layer, gru_layer, lstm_layer),
    }
    # Validate before any layer is created.
    if model_type not in recurrent_stacks:
        raise ValueError(
            "model_type must be one of %s, got %r"
            % (sorted(recurrent_stacks), model_type)
        )

    inputs = tf.keras.layers.Input(shape=(seq_len, 3))

    embed = tf.keras.layers.Embedding(input_dim=len(token2int), output_dim=embed_dim)(inputs)
    # Merge the per-channel embeddings: (batch, seq_len, 3, embed_dim)
    # becomes (batch, seq_len, 3 * embed_dim).
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3]))

    reshaped = tf.keras.layers.SpatialDropout1D(.2)(reshaped)

    hidden = reshaped
    for make_layer in recurrent_stacks[model_type]:
        hidden = make_layer(hidden_dim, dropout)(hidden)

    #only making predictions on the first part of each sequence
    truncated = hidden[:, :pred_len]

    out = tf.keras.layers.Dense(5, activation='linear')(truncated)

    model = tf.keras.Model(inputs=inputs, outputs=out)

    adam = tf.optimizers.Adam()
    model.compile(optimizer=adam, loss=MCRMSE)

    return model

In [None]:
#test values
# Ids in the order they appear in the private label table.
seq_id = list(private_lab['id'].values)
# Keep only the test records whose seq_length is 130 ...
private_df = test.query("seq_length == 130").copy()
# ... and, of those, only the ones whose id appears in the label table.
private_df = private_df.loc[private_df['id'].isin(seq_id)]
# Reorder the rows to follow seq_id: map each id to its position in seq_id, sort
# by that position, and reindex the frame with the resulting index labels.
private_df = private_df.reindex(private_df['id'].map(dict(zip(seq_id, range(len(seq_id))))).sort_values().index)

In [None]:
# Token-encode the aligned private test rows; the shape is displayed as a sanity check.
private_inputs = preprocess_inputs(private_df)
private_inputs.shape

(2493, 130, 3)

In [None]:
# Pull the three label columns out of the private label table as arrays.
# NOTE(review): presumably each element is a bracketed, comma-separated string
# (that is what procc parses) — confirm against the CSV.
react = np.asarray(private_lab['reactivity'].tolist())
mg_ph = np.asarray(private_lab['deg_Mg_pH10'].tolist())
mg_50 = np.asarray(private_lab['deg_Mg_50C'].tolist())

In [None]:
def procc(a):
    """Parse stringified numeric lists into lists of floats.

    Parameters
    ----------
    a : iterable of str
        Strings such as ``"[0.1, 2.5, 3]"``: one enclosing character on each
        side and comma-separated numbers in between. Spaces are ignored.

    Returns
    -------
    list of list of float
        One list per input string, in input order. An empty list string
        (``"[]"``) yields an empty list instead of raising.

    Raises
    ------
    ValueError
        If a comma-separated item cannot be converted to float.
    """
    parsed = []
    for text in a:
        # Drop spaces, then strip the enclosing brackets.
        inner = text.replace(" ", "")[1:-1]
        # "".split(',') would give [''] and break float(), so treat it as empty.
        parsed.append([float(item) for item in inner.split(',')] if inner else [])
    return parsed
# Rebind each name from an array of strings to a list of float lists.
# NOTE(review): not idempotent — procc calls .replace on every element, so
# re-running this cell after the rebinding fails; re-run the cell that creates
# react / mg_ph / mg_50 first.
mg_ph= procc(mg_ph)
react = procc(react)
mg_50 = procc(mg_50)

In [None]:
# Stack the targets along a new last axis, in the order
# (reactivity, deg_Mg_pH10, deg_Mg_50C); the shape is displayed as a check.
Y_act = np.dstack((react,mg_ph,mg_50))
Y_act.shape

(2493, 102, 3)

In [None]:
import math
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
def metrics(con, y_pred):
    """Return (RMSE, MSE, MAE) computed over all elements of two arrays.

    Both arrays are reshaped to a single row before being handed to
    scikit-learn; ``con`` is passed in the ground-truth position and
    ``y_pred`` in the prediction position.
    """
    truth_row = con.reshape(1, -1)
    pred_row = y_pred.reshape(1, -1)
    MSE = mean_squared_error(truth_row, pred_row)
    MAE = mean_absolute_error(truth_row, pred_row)
    return math.sqrt(MSE), MSE, MAE
def metrics_p(con, y_pred):
    """Return (RMSE, MSE, MAE) for two arrays, passed to scikit-learn as given.

    ``con`` goes in the ground-truth position and ``y_pred`` in the prediction
    position; unlike ``metrics`` no reshaping is done.
    """
    MSE = mean_squared_error(con, y_pred)
    MAE = mean_absolute_error(con, y_pred)
    return math.sqrt(MSE), MSE, MAE

MODEL 1

In [None]:
# Base model 1: model_type=0 (GRU-GRU-GRU stack in build_model), sized for
# 130-position inputs with predictions for the first 102 positions.
model_long = build_model(model_type=0, seq_len=130, pred_len=102)
model_long.load_weights('model_1.h5')
# `private_preds` is overwritten by every model cell; the next cell slices
# y_pred_1 out of it, so run that cell before the next model is loaded.
private_preds = model_long.predict(private_inputs)




In [None]:
# Keep output channels 0, 1 and 3 of the 5 produced by the model's final Dense layer.
# NOTE(review): presumably these are reactivity, deg_Mg_pH10 and deg_Mg_50C in
# target_colss order — confirm against the training code.
y_pred_1 = private_preds[:,:,[0,1,3]]
print(y_pred_1)

[[[ 5.46530664e-01  7.80686498e-01  6.86655283e-01]
  [ 1.47229731e+00  2.16408491e+00  2.48203444e+00]
  [ 1.38016868e+00  9.02493060e-01  1.46030700e+00]
  ...
  [ 2.35394895e-01  1.79715052e-01  8.08080658e-02]
  [ 2.09073260e-01  3.44412088e-01  1.37444794e-01]
  [ 5.30746698e-01  3.34214032e-01  1.71151966e-01]]

 [[ 5.27800739e-01  7.51746535e-01  6.76827371e-01]
  [ 1.45931017e+00  2.12267876e+00  2.53965354e+00]
  [ 1.37361860e+00  8.41801226e-01  1.38248539e+00]
  ...
  [ 2.01375231e-01  2.18933567e-01  6.36998489e-02]
  [ 2.57670730e-01  5.74092746e-01  1.52191475e-01]
  [ 1.72275692e-01  3.46156836e-01  1.48285851e-01]]

 [[ 4.83320385e-01  6.86726153e-01  6.22647107e-01]
  [ 1.33059597e+00  1.87175024e+00  2.26334667e+00]
  [ 1.29895890e+00  7.50605166e-01  1.31008554e+00]
  ...
  [ 1.16388202e-01  1.96831360e-01  1.10957865e-02]
  [ 1.56922773e-01  2.56162941e-01  2.22161952e-02]
  [ 3.88544172e-01  2.94391394e-01  7.94772431e-02]]

 ...

 [[ 4.88203019e-01  5.53245366e-01

MODEL 2

In [None]:
# Base model 2: model_type=1 (LSTM-LSTM-LSTM stack in build_model).
model_long = build_model(model_type=1, seq_len=130, pred_len=102)
model_long.load_weights('model_2.h5')
private_preds = model_long.predict(private_inputs)
# Keep output channels 0, 1 and 3 of the 5 produced by the final Dense layer.
y_pred_2 = private_preds[:,:,[0,1,3]]




In [None]:
# Display model 2's selected predictions (numpy abbreviates the repr).
y_pred_2

array([[[0.52701294, 0.58573276, 0.5228815 ],
        [1.3493695 , 2.1207805 , 2.4572265 ],
        [1.229667  , 0.57599634, 1.1810416 ],
        ...,
        [1.1325841 , 0.9617695 , 1.3727934 ],
        [0.56634355, 1.3136518 , 1.4418216 ],
        [0.63912493, 0.787177  , 0.8481913 ]],

       [[0.51120126, 0.6296341 , 0.50758684],
        [1.1918519 , 2.1052551 , 2.3624935 ],
        [1.23613   , 0.7714936 , 1.2528809 ],
        ...,
        [1.5413116 , 1.2044694 , 0.88798386],
        [0.769369  , 2.135904  , 1.7384703 ],
        [0.53039306, 1.3737545 , 1.1469059 ]],

       [[0.49089018, 0.5819407 , 0.5192341 ],
        [1.3449945 , 2.2321172 , 2.6923473 ],
        [1.2537909 , 0.5769968 , 1.0803643 ],
        ...,
        [0.42922992, 0.21464302, 0.09375885],
        [0.264464  , 0.44762066, 0.30929443],
        [0.43546322, 0.318725  , 0.02832583]],

       ...,

       [[0.48142698, 0.5762003 , 0.46871984],
        [1.0420237 , 1.461058  , 1.3888391 ],
        [1.1360555 , 0

MODEL 3

In [None]:
# Base model 3: model_type=2 (GRU-LSTM-LSTM stack in build_model).
model_long = build_model(model_type=2, seq_len=130, pred_len=102)
model_long.load_weights('model_3.h5')
private_preds = model_long.predict(private_inputs)
# Keep output channels 0, 1 and 3 of the 5 produced by the final Dense layer.
y_pred_3 = private_preds[:,:,[0,1,3]]




MODEL 4

In [None]:
# Base model 4: model_type=3 (LSTM-GRU-GRU stack in build_model).
model_long = build_model(model_type=3, seq_len=130, pred_len=102)
model_long.load_weights('model_4.h5')
private_preds = model_long.predict(private_inputs)
# Keep output channels 0, 1 and 3 of the 5 produced by the final Dense layer.
y_pred_4 = private_preds[:,:,[0,1,3]]



MODEL 5

In [None]:
# Base model 5: model_type=4 (LSTM-GRU-LSTM stack in build_model).
model_long = build_model(model_type=4, seq_len=130, pred_len=102)
model_long.load_weights('model_5.h5')
private_preds = model_long.predict(private_inputs)
# Keep output channels 0, 1 and 3 of the 5 produced by the final Dense layer.
y_pred_5 = private_preds[:,:,[0,1,3]]



META MODELLING

In [None]:
from tensorflow.keras import Model
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.losses import MeanSquaredLogarithmicError

# Widths of the three hidden Dense layers created by build_model_using_sequential.
hidden_units1 = 160
hidden_units2 = 480
hidden_units3 = 256
# Adam learning rate read by the reactivity training cell; the other training
# cells hard-code their own learning rates.
learning_rate = 0.01
# Creating model using the Sequential in tensorflow
def build_model_using_sequential():
    """Assemble the uncompiled meta-model.

    Layers, in order: Dense(hidden_units1) -> Dropout(0.2) ->
    Dense(hidden_units2) -> Dropout(0.2) -> Dense(hidden_units3) -> Dense(1).
    Hidden layers use ReLU, the output layer is linear, and every Dense layer
    uses the 'normal' kernel initializer. The widths are read from the
    module-level hidden_units* variables when the function is called.
    """
    model = Sequential()
    # The first two hidden layers are each followed by dropout.
    for units in (hidden_units1, hidden_units2):
        model.add(Dense(units, kernel_initializer='normal', activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense(hidden_units3, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='linear'))
    return model
# build the model
# The network is only instantiated here; it is compiled in the training cells.

model = build_model_using_sequential()


REACTIVITY

In [None]:
# Meta-features for reactivity (target index 0): one column per base model and
# one row per (sequence, position) pair, flattened in row-major order.
TARGET_IDX = 0
base_preds = (y_pred_1, y_pred_2, y_pred_3, y_pred_4, y_pred_5)
df = pd.DataFrame({
    f'model_{number}': preds[:, :, TARGET_IDX].reshape(-1)
    for number, preds in enumerate(base_preds, start=1)
})
y = Y_act[:, :, TARGET_IDX].reshape(-1)
# Every feature row needs exactly one label.
assert len(df) == len(y), "feature/label length mismatch"

In [None]:
# Hold out 33% of the flattened (sequence, position) rows, with a fixed seed.
# NOTE(review): rows are split individually, so positions of one sequence can
# land in both the training and the held-out part.
X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.33, random_state=42)

In [None]:
# Start from a freshly initialised network so that re-running this cell never
# continues training from previously fitted weights.
model = build_model_using_sequential()

# loss function
msle = MeanSquaredLogarithmicError()
model.compile(
    loss=msle,
    optimizer=Adam(learning_rate=learning_rate),
    metrics=[msle]
)
# NOTE(review): cp_callback is created but not passed to fit(), so no checkpoint
# file is written; pass callbacks=[cp_callback] to fit() if checkpoints are wanted.
cp_callback = tf.keras.callbacks.ModelCheckpoint('meta_weights.h5')
# train the model (the last 20% of X_train is used for validation)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Meta-model predictions for the held-out rows.
Y_pred = model.predict(X_test)



In [None]:
# metrics_p forwards its first argument to scikit-learn as the ground truth, so
# pass the held-out labels first and the meta-model predictions second.
RMSE, MSE, MAE = metrics_p(y_test, Y_pred)
print(RMSE, MSE, MAE)

0.29258371943588696 0.08560523287893783 0.1782156749415098



deg_Mg_50C


In [None]:
# Meta-features for deg_Mg_50C (target index 2): one column per base model and
# one row per (sequence, position) pair, flattened in row-major order.
TARGET_IDX = 2
base_preds = (y_pred_1, y_pred_2, y_pred_3, y_pred_4, y_pred_5)
df = pd.DataFrame({
    f'model_{number}': preds[:, :, TARGET_IDX].reshape(-1)
    for number, preds in enumerate(base_preds, start=1)
})
y = Y_act[:, :, TARGET_IDX].reshape(-1)
# Every feature row needs exactly one label.
assert len(df) == len(y), "feature/label length mismatch"

In [None]:
# Hold out 33% of the flattened (sequence, position) rows, with a fixed seed.
# NOTE(review): rows are split individually, so positions of one sequence can
# land in both the training and the held-out part.
X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.33, random_state=42)

In [None]:
# Each target gets its own meta-model: rebuild the network so training does not
# continue from weights that were fitted on a different target.
model = build_model_using_sequential()

# loss function
msle = MeanSquaredLogarithmicError()
model.compile(
    loss=msle,
    optimizer=Adam(learning_rate=0.003),
    metrics=[msle]
)
# NOTE(review): cp_callback is created but not passed to fit(), so no checkpoint
# file is written; pass callbacks=[cp_callback] to fit() if checkpoints are wanted.
cp_callback = tf.keras.callbacks.ModelCheckpoint('meta_weights.h5')
# train the model (the last 20% of X_train is used for validation)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Meta-model predictions for the held-out rows.
Y_pred = model.predict(X_test)



In [None]:
# metrics_p forwards its first argument to scikit-learn as the ground truth, so
# pass the held-out labels first and the meta-model predictions second.
RMSE, MSE, MAE = metrics_p(y_test, Y_pred)
print(RMSE, MSE, MAE)

0.3885818653019586 0.1509958660415495 0.2261105350023508


deg_Mg_pH10

In [None]:
# Meta-features for deg_Mg_pH10 (target index 1): one column per base model and
# one row per (sequence, position) pair, flattened in row-major order.
TARGET_IDX = 1
base_preds = (y_pred_1, y_pred_2, y_pred_3, y_pred_4, y_pred_5)
df = pd.DataFrame({
    f'model_{number}': preds[:, :, TARGET_IDX].reshape(-1)
    for number, preds in enumerate(base_preds, start=1)
})
y = Y_act[:, :, TARGET_IDX].reshape(-1)
# Every feature row needs exactly one label.
assert len(df) == len(y), "feature/label length mismatch"

In [None]:
# Hold out 33% of the flattened (sequence, position) rows, with a fixed seed.
# NOTE(review): rows are split individually, so positions of one sequence can
# land in both the training and the held-out part.
X_train, X_test, y_train, y_test = train_test_split(df.values, y, test_size=0.33, random_state=42)

In [None]:
# Each target gets its own meta-model: rebuild the network so training does not
# continue from weights that were fitted on a different target.
model = build_model_using_sequential()

# loss function
# MeanSquaredError is not among the names imported from tensorflow.keras.losses
# in this notebook, so reference it through the already imported `tf` namespace
# instead of relying on leftover kernel state.
mse_loss = tf.keras.losses.MeanSquaredError()
model.compile(
    loss=mse_loss,
    optimizer=Adam(learning_rate=0.0005),
    metrics=[mse_loss]
)
# NOTE(review): cp_callback is created but not passed to fit(), so no checkpoint
# file is written; pass callbacks=[cp_callback] to fit() if checkpoints are wanted.
cp_callback = tf.keras.callbacks.ModelCheckpoint('meta_weights.h5')
# train the model (the last 20% of X_train is used for validation)
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Meta-model predictions for the held-out rows.
Y_pred = model.predict(X_test)



In [None]:
# metrics_p forwards its first argument to scikit-learn as the ground truth, so
# pass the held-out labels first and the meta-model predictions second.
RMSE, MSE, MAE = metrics_p(y_test, Y_pred)
print(RMSE, MSE, MAE)

0.4851018286969444 0.23532378420511957 0.2718517671518934


In [None]:
def plot_history(history, key):
    """Plot a recorded training metric and its validation counterpart per epoch.

    ``history`` must expose a ``history`` mapping that contains both ``key`` and
    ``'val_' + key``; the two series are drawn on the current pyplot figure.
    """
    val_key = 'val_' + key
    for series_name in (key, val_key):
        plt.plot(history.history[series_name])
    plt.xlabel("Epochs")
    plt.ylabel(key)
    plt.legend([key, val_key])
    plt.show()
# Plot the history
# `history` is whatever the most recent model.fit call returned, and the key must
# match a metric name recorded during that run.
plot_history(history, 'mean_squared_error')