In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras

In [2]:
# Target columns to predict; this order is also the order of the last axis
# of the label array built from them.
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']

In [3]:
# String columns that are one-hot encoded, character by character, as model inputs.
input_cols = ['sequence', 'structure', 'predicted_loop_type']

In [4]:
# Load the train and test sets; both files are line-delimited JSON (one record per line).
train = pd.read_json('train.json', lines=True)
test = pd.read_json('test.json', lines=True)

In [5]:
# Keep only the training rows whose signal_to_noise is at least 1.
train = train.query("signal_to_noise >= 1")

In [6]:
# Further restrict the training rows to those where SN_filter equals 1.
train = train.query("SN_filter == 1")

In [7]:
# Test rows whose seq_length is 130 (called "private" in this notebook).
test_private = test.query("seq_length == 130")

In [8]:
# Test rows whose seq_length is 107 (called "public" in this notebook).
test_public = test.query("seq_length == 107")

In [9]:
def preprocess_inputs(df, input_cols, vocabularies=None):
    """
    Converts inputs into one-hot

    Every column named in ``input_cols`` is read as a column of equal-length
    strings and encoded character by character (case-insensitively).

    The channel order of each encoding is deterministic: channels follow the
    sorted set of characters found in the column, not how often each character
    occurs in this particular frame. Two frames that contain the same alphabet
    (e.g. train and test) therefore get the same channel layout.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the string columns to encode.
    input_cols : sequence of str
        Names of the columns to encode, in output order.
    vocabularies : dict, optional
        Maps a column name to an iterable of characters (a plain string works)
        that fixes both the alphabet and the channel order for that column.
        Columns missing from the mapping fall back to the sorted characters
        observed in ``df``. Use this when a frame may not contain every symbol.

    Returns
    -------
    list of numpy.ndarray
        One float32 array per column, shaped (rows, sequence length, alphabet size).

    Raises
    ------
    ValueError
        If a column mixes strings of different lengths, or contains a
        character that is absent from the vocabulary supplied for it.
    """
    output = []
    for col in input_cols:
        # Lower-casing keeps 'A' and 'a' on the same channel.
        texts = [str(text).lower() for text in df[col]]

        # A rectangular tensor needs a single sequence length per column.
        lengths = {len(text) for text in texts}
        if len(lengths) > 1:
            raise ValueError(
                f"column {col!r} mixes sequence lengths {sorted(lengths)}; "
                "split the frame by length before encoding"
            )
        seq_len = lengths.pop() if lengths else 0

        if vocabularies is not None and col in vocabularies:
            # dict.fromkeys removes duplicates while keeping the caller's order.
            vocab = list(dict.fromkeys(str(ch).lower() for ch in vocabularies[col]))
        else:
            vocab = sorted(set("".join(texts)))
        char_to_idx = {ch: k for k, ch in enumerate(vocab)}

        unknown = sorted(set("".join(texts)) - set(char_to_idx))
        if unknown:
            raise ValueError(
                f"column {col!r} contains characters missing from its vocabulary: {unknown}"
            )

        indices = np.array(
            [[char_to_idx[ch] for ch in text] for text in texts], dtype=int
        ).reshape(len(texts), seq_len)
        # Row k of the identity matrix is the one-hot vector for character k.
        output.append(np.eye(len(vocab), dtype=np.float32)[indices])
    return output

In [10]:
# One-hot encode every input column of the filtered training frame (one array per column).
inputs = preprocess_inputs(train,input_cols)

In [11]:
# Number of training samples in the first encoded input.
len(inputs[0])

1587

In [12]:
def merge_inputs(inputs, length):
    """
    Merges the one-hot inputs by columns
    Also snips seq length's till desired amount

    Parameters
    ----------
    inputs : sequence of array-like
        Any number of encoded inputs, each shaped (samples, positions, channels).
        All of them must agree on the sample count and, after trimming, on the
        number of positions.
    length : int
        Number of leading positions to keep from every sequence. Values larger
        than the available sequence length keep the whole sequence.

    Returns
    -------
    numpy.ndarray
        Array shaped (samples, kept positions, total channels), with the
        channels of the inputs laid side by side in the order given.

    Raises
    ------
    ValueError
        If ``inputs`` is empty or the trimmed arrays cannot be concatenated.
    """
    blocks = [np.asarray(block) for block in inputs]
    if not blocks:
        raise ValueError("merge_inputs needs at least one encoded input array")
    # Trim along the position axis, then join along the channel axis in one
    # vectorised call instead of looping over samples.
    return np.concatenate([block[:, :length] for block in blocks], axis=2)

In [13]:
# Join the encoded columns channel-wise and keep only the first 68 positions of each sequence.
inputs_simple = merge_inputs(inputs, 68)

In [14]:
# Shape check: (samples, positions, channels).
inputs_simple.shape

(1587, 68, 14)

In [15]:
# Scratch array for checking numpy slicing; no other visible cell uses it.
test1 = np.zeros((2,5,5))

In [16]:
# Slicing the last axis with 0:3 returns its first three entries.
test1[1,1,0:3]

array([0., 0., 0.])

In [17]:
def preprocess_results(df, results):
    """
    Makes sure that the results are in the appropriate format:
        [layers,lines,columns] in an np array

    In other words the returned array is indexed as
    (sample, position, target), where the targets follow the order of
    ``results``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose target columns hold one list of numbers per row.
    results : sequence of str
        Names of the target columns to stack; any number of columns works.

    Returns
    -------
    numpy.ndarray
        float64 array shaped (rows, values per row, number of targets).
        An empty frame or an empty ``results`` yields zero positions.

    Raises
    ------
    ValueError
        If the lists inside a column have different lengths, or the columns
        disagree on the number of values per row.
    """
    columns = list(results)
    if len(df) == 0 or not columns:
        return np.zeros((len(df), 0, len(columns)))
    # Each column becomes a (rows, positions) matrix; stacking on a new last
    # axis puts the targets side by side for every position.
    per_target = [np.asarray(df[col].tolist(), dtype=float) for col in columns]
    return np.stack(per_target, axis=2)

In [18]:
# Stack the target columns into one (samples, positions, targets) label array.
expected_results = preprocess_results(train, pred_cols)

In [19]:
# Shape check: (samples, positions, targets).
expected_results.shape

(1587, 68, 5)

In [20]:
# All target values at the first position of the first sample.
expected_results[0,0]

array([0.3297, 0.7556, 0.3581, 2.3375, 0.6382])

In [21]:
# Baseline network: one Conv1D over the 14 one-hot channels, max-pooled over
# the position axis, followed by a small dense stack.
model_simple = keras.Sequential([
    keras.layers.Conv1D(filters=10, kernel_size=3, activation='relu',
                        input_shape=(None, 14), padding='same'),
    keras.layers.GlobalMaxPooling1D(),
    keras.layers.Dense(20, activation='relu'),
    keras.layers.Dense(16, activation='relu'),
    keras.layers.Dropout(rate=0.4),
    keras.layers.Dense(10, activation='relu'),
    # Open question: is the output 3 distinct values, or 1 value that exists in 3 channels?
    keras.layers.Dense(68, activation='linear'),
])

# mean_squared_error because this is a regression problem
model_simple.compile(loss='mean_squared_error', optimizer='adam',
                     metrics=['mean_squared_error'])
model_simple.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, None, 10)          430       
_________________________________________________________________
global_max_pooling1d (Global (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 20)                220       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                336       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                170       
_________________________________________________________________
dense_3 (Dense)              (None, 68)                7

In [22]:
# Train the baseline on the first target only (index 0 on the last label axis).
model_simple.fit(
    inputs_simple,
    expected_results[..., 0],
    batch_size=64,
    epochs=100,
    verbose=1,
    validation_split=0.2,
)  # validation loss keeps going down?

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100


Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x7ff45589e7c0>

In [23]:
# Predictions on the same inputs the model was trained on.
model_simple.predict(inputs_simple)

array([[0.4552154 , 1.2252952 , 0.97599006, ..., 0.31898028, 0.2965657 ,
        0.2526426 ],
       [0.44509864, 1.2295414 , 0.98912084, ..., 0.33077532, 0.2958997 ,
        0.26225537],
       [0.42045796, 1.1442679 , 0.9264906 , ..., 0.32537568, 0.29542977,
        0.26317954],
       ...,
       [0.49349552, 1.3570546 , 1.0625825 , ..., 0.32829818, 0.30296153,
        0.25346696],
       [0.2174711 , 0.66968983, 0.62520444, ..., 0.4198929 , 0.39894527,
        0.43809325],
       [0.4413795 , 1.1859127 , 0.92966163, ..., 0.3052482 , 0.28876698,
        0.25437045]], dtype=float32)

In [24]:
# Encode the public test rows with the same two steps used for train, but
# keep every position: the cut-off is the full encoded sequence length.
inputs_simple_test_public = preprocess_inputs(test_public, input_cols)
inputs_simple_test_public = merge_inputs(
    inputs_simple_test_public,
    inputs_simple_test_public[0].shape[1],
)

In [25]:
# Shape check: (samples, positions, channels).
inputs_simple_test_public.shape

(629, 107, 14)

In [26]:
# Predict on the public test inputs. The output has 68 values per sequence
# whatever the input length, because the final Dense layer has 68 units.
model_simple.predict(inputs_simple_test_public)

array([[0.45530832, 1.2186967 , 0.9633015 , ..., 0.31342936, 0.29419202,
        0.2520268 ],
       [0.46435457, 1.2428504 , 0.98336935, ..., 0.31771863, 0.29673517,
        0.25074688],
       [0.4322242 , 1.1426079 , 0.8885317 , ..., 0.29964814, 0.2946388 ,
        0.261455  ],
       ...,
       [0.5035448 , 1.4193052 , 1.0555708 , ..., 0.30375487, 0.29619753,
        0.2677962 ],
       [0.51145166, 1.4294422 , 1.0507143 , ..., 0.29762572, 0.2961886 ,
        0.26806185],
       [0.4339155 , 1.169026  , 0.9501978 , ..., 0.32281208, 0.29807135,
        0.2560317 ]], dtype=float32)

In [27]:
# Ground-truth values of the first target for every training sample.
expected_results[:,:,0]

array([[ 0.3297,  1.5693,  1.1227, ...,  0.2937,  0.2362,  0.5731],
       [ 0.4482,  1.4822,  1.1819, ...,  0.6449,  0.04  ,  0.5446],
       [ 0.7642,  1.6641,  1.0622, ...,  0.1107,  0.2261,  0.3238],
       ...,
       [ 0.6957,  1.251 ,  1.3236, ..., -0.0043,  0.0521,  0.0874],
       [ 0.2891,  0.4496,  0.7165, ...,  0.8738,  0.2816,  0.554 ],
       [ 1.0102,  1.7928,  1.9228, ...,  0.0381, -0.0066,  0.0706]])

In [28]:
# Full (positions x targets) label matrix of the second training sample.
expected_results[1]

array([[0.4482, 0.2504, 0.5163, 2.243 , 0.9501],
       [1.4822, 1.4021, 1.6823, 2.9361, 1.7975],
       [1.1819, 0.9804, 1.0426, 1.0553, 1.4991],
       [0.7434, 0.4967, 0.7902, 0.721 , 0.8686],
       [0.7148, 0.3653, 0.7477, 0.6396, 0.6893],
       [0.6529, 0.8973, 0.9697, 1.1473, 1.2293],
       [0.2239, 0.296 , 0.2527, 0.3768, 0.3176],
       [0.1927, 0.5758, 0.3151, 0.4777, 0.5927],
       [0.1969, 0.7249, 0.8943, 0.9614, 0.8827],
       [0.3033, 0.1514, 0.1396, 0.1522, 0.1861],
       [0.6176, 0.2885, 0.5329, 0.3904, 0.4771],
       [0.3858, 0.2239, 0.244 , 0.1389, 0.5338],
       [1.0418, 0.9248, 0.796 , 0.4465, 0.7546],
       [0.6581, 2.2196, 2.0522, 1.3107, 1.2277],
       [1.1053, 0.7686, 0.8767, 0.7178, 0.7779],
       [0.6224, 0.3331, 0.4773, 0.2121, 0.4945],
       [0.4591, 0.32  , 0.1956, 0.1804, 0.4322],
       [0.1989, 0.3516, 0.1778, 0.2019, 0.2082],
       [0.1632, 1.3419, 1.0562, 0.5721, 0.6173],
       [0.5538, 0.3321, 0.3052, 0.2512, 0.4662],
       [0.6342, 0.63

In [29]:
# Multi-input / multi-output model: one branch per encoded column, one
# regression head per target.
# TODO: revisit the input shapes.
seq_input, pair_input, loop_input = [
    keras.layers.Input(shape=(None, channels), name=branch)
    for branch, channels in (("seq", 4), ("pair", 3), ("loop", 7))
]

# Each branch: Conv1D -> max over positions -> Dense(20). The layers are
# created stage by stage, in seq / pair / loop order.
seq_features, pair_features, loop_features = [
    keras.layers.Conv1D(filters=10, kernel_size=3, activation='relu',
                        padding='same')(branch_input)
    for branch_input in (seq_input, pair_input, loop_input)
]
seq_features, pair_features, loop_features = [
    keras.layers.GlobalMaxPooling1D()(features)
    for features in (seq_features, pair_features, loop_features)
]
seq_features, pair_features, loop_features = [
    keras.layers.Dense(20)(features)
    for features in (seq_features, pair_features, loop_features)
]

# Merge the features of the three branches
x = keras.layers.concatenate([seq_features, pair_features, loop_features])

flat = keras.layers.Flatten()(x)

# One 68-unit regression head per target, named after the target.
first_pred, second_pred, third_pred, fourth_pred, fifth_pred = [
    keras.layers.Dense(68, name=target)(flat)
    for target in ("reactivity", "deg_Mg_pH10", "deg_Mg_50C", "deg_pH10", "deg_50C")
]

model = keras.Model(
    inputs=[seq_input, pair_input, loop_input],
    outputs=[first_pred, second_pred, third_pred, fourth_pred, fifth_pred],
)
model.compile(optimizer='adam', loss='mse', metrics=['mse'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
seq (InputLayer)                [(None, None, 4)]    0                                            
__________________________________________________________________________________________________
pair (InputLayer)               [(None, None, 3)]    0                                            
__________________________________________________________________________________________________
loop (InputLayer)               [(None, None, 7)]    0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, None, 10)     130         seq[0][0]                        
______________________________________________________________________________________________

In [30]:
# Feed each branch its own channel group of the merged one-hot tensor and
# give every head its own target slice of the label array.
model.fit(
    [inputs_simple[:, :, start:stop] for start, stop in ((0, 4), (4, 7), (7, 14))],
    [expected_results[:, :, target] for target in range(5)],
    batch_size=25,
    epochs=30,
    verbose=1,
    validation_split=0.2,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30


Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7ff45ba10fd0>

In [None]:
# First four channels (the slice fed to the "seq" input) at the first position of the first sample.
inputs_simple[0,0,0:4]


**Revoir les données à prédire**