In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras as K
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tensorflow.keras.layers import (
                                BatchNormalization, LeakyReLU,
                                Input, Dense, Conv2D,
                                MaxPooling2D, Flatten, Dropout)
from tensorflow.keras.optimizers import Adam

In [2]:
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv

--2022-04-24 17:22:50--  https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/main/Datasets/train_set_1_10_CLASH2013_paper.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12518906 (12M) [text/plain]
Saving to: ‘train_set_1_10_CLASH2013_paper.tsv’


2022-04-24 17:22:51 (130 MB/s) - ‘train_set_1_10_CLASH2013_paper.tsv’ saved [12518906/12518906]



In [3]:
def binding_encoding(df, tensor_dim=(50,20,1)):
    """
    Encode sequence pairs as 2D Watson-Crick complementarity matrices.

    For every row of df, cell [i, j, 0] of that row's matrix is 1.0 when
    nucleotide i of "gene" and nucleotide j of "noncodingRNA" form a
    Watson-Crick pair (A-T or G-C, in either order) and 0.0 otherwise.
    Sequences are upper-cased before comparison. Sequences longer than the
    matrix are truncated; shorter ones leave the remaining cells at zero.

    parameters:
    df = Pandas df with col names "noncodingRNA", "gene", "label".
         It is only read; the caller's frame (including its index) is
         left untouched.
    tensor_dim = per-sample tensor shape
                 (max gene length, max noncodingRNA length, channels);
                 only channel 0 is written.

    output:
    tuple of
      - float32 np array of shape (len(df), *tensor_dim)
      - labels as np array, in the same row order as df
    """
    # alphabet for watson-crick interactions (gene nt + ncRNA nt).
    alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}

    max_gene_len, max_ncrna_len = tensor_dim[0], tensor_dim[1]

    # labels are passed through unchanged (no one-hot encoding is applied)
    labels = df["label"].to_numpy()

    # initialize dot matrices with zeros: one tensor_dim block per sample
    N = df.shape[0]  # number of samples in df
    ohe_matrix_2d = np.zeros((N, *tensor_dim), dtype="float32")

    # Iterate positionally over the two sequence columns so the result does
    # not depend on df's index and df never needs to be re-indexed in place.
    sequence_pairs = zip(df["gene"], df["noncodingRNA"])
    for sample_index, (gene, ncrna) in enumerate(sequence_pairs):
        # Truncate BOTH axes to the matrix size; without the gene-side cut a
        # gene longer than tensor_dim[0] would index past the array.
        gene = gene.upper()[:max_gene_len]
        ncrna = ncrna.upper()[:max_ncrna_len]

        for bind_index, bind_nt in enumerate(gene):
            for ncrna_index, ncrna_nt in enumerate(ncrna):
                base_pairs = bind_nt + ncrna_nt
                ohe_matrix_2d[sample_index, bind_index, ncrna_index, 0] = alphabet.get(base_pairs, 0)

    return ohe_matrix_2d, labels

In [4]:
def make_architecture(input_shape=(50, 20, 1), cnn_num=6, kernel_size=5,
                      pool_size=2, dropout_rate=0.3, dense_num=2):
    """
    build model architecture

    The network is a stack of `cnn_num` convolutional stages
    (Conv2D -> LeakyReLU -> BatchNormalization -> MaxPooling2D -> Dropout),
    followed by Flatten, `dense_num` dense stages
    (Dense -> LeakyReLU -> BatchNormalization -> Dropout) and a single
    sigmoid output unit.

    parameters (all optional; the defaults reproduce the original network):
    input_shape = shape of one sample, channels last
    cnn_num = number of convolutional stages; stage i (1-based) uses
              32 * i filters
    kernel_size = side length of the square convolution kernel
    pool_size = side length of the square max-pooling window
    dropout_rate = dropout rate used after every stage
    dense_num = number of dense stages; stage i (0-based) uses
                32 * (cnn_num - i) units, so dense_num must not exceed
                cnn_num

    raises:
    ValueError if dense_num > cnn_num (a dense stage would get <= 0 units)

    return a model object (uncompiled, named 'arch_00')
    """
    if dense_num > cnn_num:
        raise ValueError(
            "dense_num must not exceed cnn_num: dense stage widths are "
            "32 * (cnn_num - i) and must stay positive")

    x = Input(shape=input_shape,
                       dtype='float32', name='main_input'
                       )
    main_input = x

    for cnn_i in range(cnn_num):
        x = Conv2D(
            filters=32 * (cnn_i + 1),
            kernel_size=(kernel_size, kernel_size),
            padding="same",
            data_format="channels_last",
            name="conv_" + str(cnn_i + 1))(x)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        # padding='same' on the pooling layer keeps odd-sized feature maps
        # from collapsing to zero as the stages halve them repeatedly
        x = MaxPooling2D(pool_size=(pool_size, pool_size), padding='same', name='Max_' + str(cnn_i + 1))(x)
        x = Dropout(rate=dropout_rate)(x)

    x = Flatten(name='2d_matrix')(x)

    for dense_i in range(dense_num):
        # dense widths shrink by 32 units per stage, starting at 32 * cnn_num
        neurons = 32 * (cnn_num - dense_i)
        x = Dense(neurons)(x)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = Dropout(rate=dropout_rate)(x)

    # single sigmoid unit: output is a value in (0, 1) for the binary label
    main_output = Dense(1, activation='sigmoid', name='main_output')(x)

    model = K.Model(inputs=[main_input], outputs=[main_output], name='arch_00')

    return model

In [5]:
def compile_model():
    """
    build a fresh network with make_architecture() and compile it

    The model is compiled with an Adam optimizer, binary cross-entropy loss
    and accuracy as the tracked metric.

    return the compiled model object
    """
    # reset Keras' global state before a new model is built
    K.backend.clear_session()

    adam_settings = dict(
        learning_rate=0.00152,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name="Adam",
    )

    network = make_architecture()
    network.compile(
        optimizer=Adam(**adam_settings),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [6]:
def plot_history(history):
    """
    plot history of the training of the model,
    accuracy and loss of the training and validation set

    Two figures of identical size are drawn, one for accuracy and one for
    loss, each with the epoch number on the x axis.

    parameters:
    history = object whose `.history` attribute is a dict holding per-epoch
              lists under 'accuracy' and 'loss', and optionally under
              'val_accuracy' and 'val_loss'. The validation curves are
              skipped when those keys are absent (e.g. the model was fitted
              without validation data).

    raises:
    KeyError if 'accuracy' or 'loss' is missing from history.history
    """
    metrics = history.history

    # look both required series up front so a missing key fails before
    # any figure is created
    acc = metrics['accuracy']
    loss = metrics['loss']
    epochs = range(1, len(acc) + 1)

    # (figure title / y label, training series, validation series, legend suffix)
    panels = (
        ('Accuracy', acc, metrics.get('val_accuracy'), 'acc'),
        ('Loss', loss, metrics.get('val_loss'), 'loss'),
    )

    for title, train_values, val_values, short_name in panels:
        # same size for both figures so the two plots are directly comparable
        plt.figure(figsize=(8, 6), dpi=80)
        plt.plot(epochs, train_values, 'bo', label='Training ' + short_name)
        if val_values is not None:
            plt.plot(epochs, val_values, 'b', label='Validation ' + short_name)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.ylabel(title)
        plt.legend()

    plt.show()

In [8]:
# header=0 together with names= discards the file's own header row and uses
# the column names given here instead.
train_df = pd.read_csv('train_set_1_10_CLASH2013_paper.tsv', sep='\t', names=['noncodingRNA', 'gene', 'label'], header=0)
# set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# Seed TensorFlow as well: weight initialisation, dropout masks and batch
# shuffling draw from TensorFlow's generator, which the NumPy seed does not
# touch. NOTE(review): some GPU kernels may still be nondeterministic.
tf.random.set_seed(RANDOM_STATE)
# shuffle the rows (frac=1 keeps every sample)
train_df = train_df.sample(frac=1, random_state=RANDOM_STATE)
print(train_df.head())
# encode every (gene, noncodingRNA) pair as a 2D complementarity matrix
ohe_data = binding_encoding(train_df)
train_ohe, labels = ohe_data
print("Number of training samples: ", train_df.shape[0])

                noncodingRNA  \
45236   ACTGCATTATGAGCACTTAA   
168824  TATTGCACTTGTCCCGGCCT   
2591    AAAAGCTGGGTTGAGAGGGC   
76746   TCTCACACAGAAATCGCACC   
63277   TGAGGTAGTAGTTTGTGCTG   

                                                     gene  label  
45236   GAGAAGAAATCTGGCTGGTTTGAGGGTTTCCTTTAGTTCACCCTCA...      0  
168824  GTAAATGTCTGTTTTTCATAATTGCTCTTTATATTGTGTGTTATCT...      0  
2591    GTACCCAGTAAAAACCAGAATGACCCATTGCCAGGACGCATCAAAG...      1  
76746   ACGTCGGCGCCATGCTCCAGGTACAGAGCCACATGTTGCTCCAGGC...      0  
63277   ACCAATGCCAGAGGAGCAACAGCGGCAACCTTTGGCACTGCATCCA...      0  
Number of training samples:  169312


In [9]:
# Build and compile a fresh network, then print its layer-by-layer summary
# (output shapes and parameter counts).
model = compile_model()
model.summary()

Model: "arch_00"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 main_input (InputLayer)     [(None, 50, 20, 1)]       0         
                                                                 
 conv_1 (Conv2D)             (None, 50, 20, 32)        832       
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 50, 20, 32)        0         
                                                                 
 batch_normalization (BatchN  (None, 50, 20, 32)       128       
 ormalization)                                                   
                                                                 
 Max_1 (MaxPooling2D)        (None, 25, 10, 32)        0         
                                                                 
 dropout (Dropout)           (None, 25, 10, 32)        0         
                                                           

In [None]:
# Train the network and keep the returned history object for plotting.
model_history = model.fit(
    train_ohe, labels,
    # 5% of the samples are held out for validation; 10 passes over the data
    validation_split=0.05, epochs=10,
    batch_size=32,
    # label 1 is weighted 10x label 0 in the loss
    # NOTE(review): presumably mirrors the 1:10 class ratio suggested by the
    # data file name ("train_set_1_10") - confirm
    class_weight={0 : 1, 1 : 10}
    )

In [None]:
# Draw the accuracy and loss curves recorded during training.
plot_history(model_history)

In [None]:
# Persist the trained model to "model.h5" (path is relative to the current
# working directory).
model.save("model.h5")