<a href="https://colab.research.google.com/github.com/ML-Bioinfo-CEITEC/mirna_binding/blob/master/notebook/model_normal_train_lock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# General Python Modules

In [1]:
import pandas as pd
import numpy as np
import os
import time
%load_ext autoreload
%autoreload 2

# Data Preprocessing

## Common function

A set of functions that generate the one-hot encoded version of the sequences and the 2D dot matrix.

## convert input nucleotide sequences to arrays

the code below takes as input a table of three columns:

- genomic binding site ( 50nt length);
- microRNA sequence (20nt length);
- label: class [ positive or negative ];

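it outputs a tuple of arrays:

- 2d matrix of binding site vs microRNA with 2 channels. The first channel is the Watson-Crick score, the second channel is the relative position of the microRNA (or zero if not required). Note: with the default `tensor_dim=(50,20,1)` only the Watson-Crick channel is present, and `one_hot_encoding` fills only that channel;
- binding site sequence as a tensor of shape 50 x 4, where each channel is a nucleotide (returned only when `aux=True`);
- microRNA sequence as a tensor of shape 20 x 4, where each channel is a nucleotide (returned only when `aux=True`);
- labels: numpy array [0 or 1, as negative or positive];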


In [2]:
def one_hot_encoding(df, tensor_dim=(50,20,1), aux=False, log=False):
    """
    Transform a table of binding-site / miRNA pairs into one-hot encoded arrays.

    For every row a 2D interaction matrix is filled: channel 0 of cell
    (binding position, miRNA position) is 1.0 when the two nucleotides
    form a Watson-Crick pair (A-T or G-C) and 0.0 otherwise.
    Sequences are upper-cased before encoding.

    Parameters:
    df = input table as pandas dataframe with the columns
         `binding_sequence`, `mirna_binding_sequence` and `label`.
         The caller's dataframe is left untouched.
    tensor_dim = shape of a single sample (binding length, miRNA length,
         channels). A sequence longer than the matching dimension raises
         IndexError; shorter sequences leave the remaining cells at zero.
    aux = when truthy, also build the per-nucleotide one-hot encoding of
         both sequences (conv1d + LSTM input). Letters other than
         A, T, C, G, N raise KeyError in this mode.
    log = when truthy, print the elapsed time every 1000 rows.

    Returns:
    (ohe_matrix_2d, labels) when aux is falsy, otherwise
    (ohe_matrix_2d, bind_matrix, mirna_matrix, labels).
    `labels` is 1.0 where `label == 'positive'` and 0.0 elsewhere.
    """

    # Re-index a copy instead of the caller's frame: positions 0..N-1 are
    # needed to address the output array (chunks produced by a split keep
    # their original index labels), but the input must not be mutated.
    df = df.reset_index(drop=True)

    # alphabet for watson-crick interactions.
    watson_crick = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}
    # one hot encoding of nt sequences for conv1d + LSTM input
    nt_pos_voc = {
        "A": np.array([1., 0., 0., 0.]),
        "T": np.array([0., 1., 0., 0.]),
        "C": np.array([0., 0., 1., 0.]),
        "G": np.array([0., 0., 0., 1.]),
        "N": np.array([0.25, 0.25, 0.25, 0.25]),
    }

    # labels to one hot encoding
    labels = np.where(df.label == 'positive', 1., 0.)

    # only filled when aux is requested
    bind_matrix_l = []
    mirna_matrix_l = []

    # create empty main 2d matrix array: one tensor_dim block per sample
    n_samples = df.shape[0]
    ohe_matrix_2d = np.zeros((n_samples, *tensor_dim), dtype="float32")

    start = time.time()

    sequence_pairs = zip(df.binding_sequence, df.mirna_binding_sequence)
    for index, (bind_raw, mirna_raw) in enumerate(sequence_pairs):
        bind_seq = bind_raw.upper()
        mirna_seq = mirna_raw.upper()

        if aux:
            bind_matrix_l.append([nt_pos_voc[letter] for letter in bind_seq])
            mirna_matrix_l.append([nt_pos_voc[letter] for letter in mirna_seq])

        for bind_index, bind_nt in enumerate(bind_seq):
            for mirna_index, mirna_nt in enumerate(mirna_seq):
                # unknown pairs (mismatches, N, ...) score 0
                ohe_matrix_2d[index, bind_index, mirna_index, 0] = (
                    watson_crick.get(bind_nt + mirna_nt, 0)
                )

        if log and index % 1000 == 0:  # write something
            end = time.time()
            print(
                "rows:\t%s" % (index),
                "elapsed (sec):\t%s" % (end - start),
                sep=" | ",
            )

    if aux:
        bind_matrix = np.array(bind_matrix_l)
        mirna_matrix = np.array(mirna_matrix_l)
        return (ohe_matrix_2d, bind_matrix, mirna_matrix, labels)
    return (ohe_matrix_2d, labels)

##  general function to manage files

In [3]:
import joblib

def save_joblib(object_, filepath):
    """Dump `object_` to `filepath` with joblib and hand the path back."""
    joblib.dump(value=object_, filename=filepath)
    return filepath

def load_joblib(filepath):
    """Read back and return the object stored in the joblib file `filepath`.

    NOTE(review): joblib files are pickle-based; only load files from a
    trusted source.
    """
    loaded = joblib.load(filepath)
    return loaded

## Parallelized conversion of an array/dataframe to 2D matrix

The function below takes as input a Pandas df or numpy array and splits it into batches for parallelization.

Usage:

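`data = multithread(df, one_hot_encoding, aux=False, log=False, n_cores=24)`  
`multithread` already calls `join_cores_results(..., aux=aux)` on the per-core outputs, so `data` is the joined result and no separate join step is needed.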

In [4]:
def join_cores_results(multithread_output, aux=False):
    """Concatenate the per-process result tuples field by field.

    Each element of `multithread_output` is one worker's result tuple.
    With `aux` the first four fields are joined (2d matrix, binding
    sequences, miRNA sequences, labels); without it only the first two
    (2d matrix, labels). Returns a tuple with one concatenated array
    per field, in that order.
    """
    fields_to_join = 4 if aux else 2
    return tuple(
        np.concatenate([chunk[position] for chunk in multithread_output])
        for position in range(fields_to_join)
    )

In [5]:
from multiprocessing import Pool
from functools import partial

def multithread(df, func, aux=False, log=False, n_cores=4):
    """Apply `func` to `n_cores` chunks of `df` in parallel and join the results.

    Parameters:
    df = pandas dataframe or numpy array; it is split along its first
         axis into `n_cores` nearly equal chunks.
    func = callable invoked as `func(chunk, aux=aux, log=log)` in a worker
         process; it must be picklable (a module-level function).
    aux, log = forwarded to `func`; `aux` is also forwarded to
         `join_cores_results` to select which fields are joined.
    n_cores = number of chunks and of worker processes.

    Returns the tuple produced by `join_cores_results`.
    """
    if hasattr(df, "iloc"):
        # Split row positions rather than the frame itself: np.array_split
        # on a DataFrame goes through the deprecated DataFrame.swapaxes.
        positions = np.array_split(np.arange(len(df)), n_cores)
        iterable = [df.iloc[chunk] for chunk in positions]
    else:
        iterable = np.array_split(df, n_cores)

    lock_func = partial(func, aux=aux, log=log)
    # The context manager releases the workers even when a task raises.
    with Pool(n_cores) as pool:
        df_update = pool.map(lock_func, iterable)
    return join_cores_results(df_update, aux=aux)

## shuffle positive to create negative

The function generates the negative class by pairing each binding site with all miRNAs (except the real one). If the argument `mirna_dict` is provided as a dictionary of miRNA sequences, this dictionary will be used to create the negative class. Otherwise, all unique miRNA sequences of the input df will be used to generate samples for the negative class.

In [6]:
def negative_class_shuffle(df, mirna_dict=None, neg_ratio=None, random_state=None):
    """Build negative samples by pairing binding sites with other miRNAs.

    Every row of `df` is combined with every candidate miRNA; the pairs
    whose miRNA equals the row's own `mirna_binding_sequence` are removed
    and the remaining ones are labelled 'negative'.

    Parameters:
    df = dataframe with at least the columns `binding_sequence` and
         `mirna_binding_sequence`; any other column is ignored.
    mirna_dict = optional collection of candidate miRNA sequences. It is
         passed straight to `pd.DataFrame` and must yield a single column
         (presumably a list/array of sequences -- confirm with callers).
         When None or empty, the unique miRNA sequences of `df` are used.
    neg_ratio = when given, return a random sample of
         `int(len(df) * neg_ratio)` negatives instead of all of them.
    random_state = seed forwarded to `DataFrame.sample` so the
         sub-sampling can be reproduced; None keeps it unseeded.

    Returns a dataframe with the columns `mirna_binding_sequence`,
    `binding_sequence` and `label`.

    Raises ValueError (from `DataFrame.sample`) when more negatives are
    requested than were generated.
    """
    if mirna_dict is None or len(mirna_dict) == 0:
        # generate mirna db of unique sequences
        mirnadb = pd.DataFrame(
            df.mirna_binding_sequence.unique(), columns=['mirnaid']
        )
    else:
        mirnadb = pd.DataFrame(mirna_dict)
        mirnadb.columns = ['mirnaid']

    # Cross join through a constant key: every candidate miRNA x every row.
    # Columns are selected by name so extra columns cannot break the result.
    pairs = df[['binding_sequence', 'mirna_binding_sequence']]
    connections = mirnadb.assign(key=1).merge(pairs.assign(key=1), on='key')

    # a pair is a positive connection when it reproduces the row's own miRNA
    is_positive = connections.mirnaid == connections.mirna_binding_sequence

    # NOTE(review): a site listed with several miRNAs in `df` can still get
    # one of its other true partners here -- confirm this is acceptable.
    negative_df = (
        connections.loc[~is_positive, ['mirnaid', 'binding_sequence']]
        .rename(columns={'mirnaid': 'mirna_binding_sequence'})
        .reset_index(drop=True)
    )
    # add negative labels
    negative_df['label'] = 'negative'

    if neg_ratio is None:
        return negative_df
    neg_samples = int(df.shape[0] * neg_ratio)
    return negative_df.sample(n=neg_samples, random_state=random_state)

## Create Hand-Picked Mini-Batches

Create minibatches keeping the original positive-negative ratio. It returns a list of minibatches (pos + neg together).

In [7]:
def _split_rows(frame, sections):
    """Split the rows of a pandas object like `np.array_split` would."""
    # Split row positions instead of the frame: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes.
    positions = np.array_split(np.arange(len(frame)), sections)
    return [frame.iloc[chunk] for chunk in positions]


def make_minibatches(pos_samples, neg_samples, split_batch ):
    """Pair up chunks of positives and negatives into minibatches.

    Parameters:
    pos_samples, neg_samples = pandas dataframes holding the two classes.
    split_batch = NUMBER of minibatches to create (the sections argument
         of `np.array_split`), not the size of each minibatch. Both
         classes are cut into this many nearly equal consecutive chunks,
         so each minibatch keeps the overall positive/negative ratio.

    Returns a list of dataframes; element i is the i-th positive chunk
    followed by the i-th negative chunk. The chunk sizes are printed as
    the list is built.
    """
    batches_list = []
    chunk_pairs = zip(
        _split_rows(pos_samples, split_batch),
        _split_rows(neg_samples, split_batch),
    )
    for mini_index, (pos_chunk, neg_chunk) in enumerate(chunk_pairs):
        print('### minibatch pair id is:', mini_index,
              'pos shape is:', pos_chunk.shape[0],
              'neg_shape_is:', neg_chunk.shape[0],
              sep='\t'
              )
        # positives first, negatives after; original index labels are kept
        batches_list.append(pd.concat((pos_chunk, neg_chunk)))
    return batches_list

## Model

### Model Architecture

In [8]:
from tensorflow import keras as keras 
from tensorflow.keras.layers import BatchNormalization, LeakyReLU, Input, Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras import Model
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping


In [9]:
def make_arch_00b():
    """Assemble the (uncompiled) `arch_00b` convolutional classifier.

    Input `main_input` has shape (50, 20, 1). Three convolutional stages
    with 32, 64 and 128 filters follow, each made of a 3x3 same-padded
    Conv2D, LeakyReLU, BatchNormalization, 2x2 MaxPooling2D and
    Dropout(0.25). The flattened features go through three Dense stages
    of 128, 64 and 32 units (each with LeakyReLU, BatchNormalization and
    Dropout(0.25)) and end in a single sigmoid unit, `main_output`.
    """
    main_input = Input(shape=(50,20,1), dtype='float32', name='main_input')

    # Layers are created in the same order as in a hand-unrolled version,
    # so the automatically generated layer names come out the same.
    features = main_input
    for stage, n_filters in enumerate((32, 64, 128), start=1):
        features = Conv2D(
            filters=n_filters,
            kernel_size=(3, 3),
            padding="same",
            data_format="channels_last",
            name=f"conv_{stage}")(features)
        features = LeakyReLU()(features)
        features = BatchNormalization()(features)
        features = MaxPooling2D(pool_size=(2, 2), name=f'Max_{stage}')(features)
        features = Dropout(rate = 0.25)(features)

    hidden = Flatten(name='2d_matrix')(features)

    for n_units in (128, 64, 32):
        hidden = Dense(n_units)(hidden)
        hidden = LeakyReLU()(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(rate = 0.25)(hidden)

    main_output = Dense(1, activation='sigmoid', name='main_output')(hidden)

    return Model(inputs=[main_input], outputs=[main_output], name='arch_00b')

### Train Function

In [10]:
def train(
    model, minibatch,
    sample_weight=None,
    class_weight=None,
    reset_metrics=True,
    aux=False
    ):
    """Run one `train_on_batch` step of `model` on an encoded minibatch.

    `minibatch` is a pair (inputs, targets) fed to the layers named
    `main_input` and `main_output`. `sample_weight`, `class_weight` and
    `reset_metrics` are forwarded to `model.train_on_batch` unchanged.
    Returns `(model, loss)`, where `loss` is whatever `train_on_batch`
    returned. `aux=True` is not supported and raises NotImplementedError.
    """
    if aux:
        raise NotImplementedError

    # split the minibatch into samples and labels
    features, targets = minibatch

    batch_loss = model.train_on_batch(
        {"main_input": features},
        {"main_output": targets},
        sample_weight=sample_weight,
        class_weight=class_weight,
        reset_metrics=reset_metrics,
    )
    return model, batch_loss

# Run Pipeline

## Create Dataset

In [31]:
# Run configuration shared by the cells below.
DATASET_NAME = "enc09.joblib"  # dataset label written to every row of the training log
DATASET_PATH = "/home/grioni_andrea/media/disk_1/grioni_andrea/data/sets/train/enc09.joblib"  # joblib file read by load_joblib
POSITIVE_SAMPLES = 250000  # number of rows sampled as the positive class
NEG_RATIO = 10  # negatives kept per positive sample
RANDOM_STATE = 1789  # seed used when sampling the positive class
MINIBATCH_SPLIT = 500  # number of minibatches each class is split into (not the batch size)
AUX = False  # forwarded as `aux` to the encoding helpers; train() only supports False
CORES = 8  # worker processes used by multithread
EXPORT = False # set True if you want to save the train dataset
WORK_DIR = "/home/grioni_andrea/media/disk_1/grioni_andrea/paper/train/normal/"  # receives model checkpoints and the CSV log
STRATEGY = "normal"  # label written to the training log
MODEL_ID = "model_02"  # used in checkpoint and log file names

### Load Dataset

In [12]:
# Load the stored table and keep a copy of only the three columns the
# pipeline uses (the two sequences and the class label).
df = load_joblib(DATASET_PATH)
dataframe = df[['binding_sequence', 'mirna_binding_sequence', 'label']].copy()
print('dataframe shape is:', dataframe.shape)

dataframe shape is: (255588, 3)


### Select Samples for Positive Class

In [13]:
# Draw POSITIVE_SAMPLES rows at random (seeded, so the draw is repeatable).
# NOTE(review): rows are drawn without filtering on `label`; presumably the
# dataset contains only positive pairs -- confirm.
positive_samples = dataframe.sample(n = POSITIVE_SAMPLES, random_state=RANDOM_STATE)
print('positive samples are:', positive_samples.shape)
print('unique mirna sequences number is:', positive_samples.mirna_binding_sequence.nunique())

positive samples are: (250000, 3)
unique mirna sequences number is: 406


### Generate Samples for Negative Class

In [27]:
# Pair every sampled binding site with the other miRNAs and keep
# NEG_RATIO negatives per positive.
# NOTE: no seed is passed, so the selected negatives differ between runs.
negative_samples = negative_class_shuffle(positive_samples, neg_ratio = NEG_RATIO)
print('negative samples are:', negative_samples.shape)

negative samples are: (2500000, 3)


## EXPORT: Convert Negative and Positive Class to one hot encoding

This section can be used to export the 2D matrices for both the positive and negative classes to file for future use. If you do not need it, you can skip it and go to the "Create hand-made mini-batches" section.

In [15]:
# Optional: encode both complete classes in one go (only needed for export;
# training below encodes each minibatch on the fly instead).
if EXPORT:
    negative_ohe = multithread(negative_samples, one_hot_encoding, aux=AUX, log=False, n_cores=CORES)
    positive_ohe = multithread(positive_samples, one_hot_encoding, aux=AUX, log=False, n_cores=CORES)

### export to files

In [16]:
# Optional: write the encoded classes to joblib files in the current
# working directory (the *_dir variables hold file paths, not directories).
if EXPORT:
    output_pos_dir = "./positive_set.joblib"
    output_neg_dir = "./negative_set.joblib"

    save_joblib(positive_ohe, output_pos_dir)
    save_joblib(negative_ohe, output_neg_dir)

## Create hand-made mini-batches

In [29]:
# Split both classes into MINIBATCH_SPLIT chunks and pair them up, so each
# minibatch holds positives and negatives in the overall class ratio.
minibatches_list = make_minibatches(positive_samples, negative_samples, MINIBATCH_SPLIT )

### minibatch pair id is:	0	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	1	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	2	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	3	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	4	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	5	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	6	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	7	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	8	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	9	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	10	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	11	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	12	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	13	pos shape is:	500	neg_shape_is:	5000
### minibatch pair id is:	14	pos shape is:	500	neg_shape_is:	5000
### minibatch pair i

## Create Model

In [32]:
from tensorflow.keras import backend as K
# reset Keras' global state so re-running this cell starts from a clean slate
K.clear_session()

# build a fresh network; it is compiled without extra metrics, only the
# binary cross-entropy loss is tracked
model = make_arch_00b()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    )

## Train as Generator

This code fits the model on each mini-batch in turn. The conversion from mini-batch dataframe to one-hot encoding happens on the fly to save memory.

### Normal Training

In [None]:
# keep the last minibatch as validation set
validation_batch, minibatches_train = minibatches_list[-1], minibatches_list[ : -1]
val_ohe = multithread(validation_batch, one_hot_encoding, aux=AUX, log=False, n_cores=CORES)
# set parameters
EPOCHS = 10
TRAIN_LOSSES = np.zeros((EPOCHS, 1))  # mean training loss per epoch
VAL_LOSSES = np.zeros((EPOCHS, 1))    # mean validation loss per epoch
LOG = []                              # one row per training step

# Create the output directory up front: otherwise the first model.save
# would only fail after a whole epoch of training.
os.makedirs(WORK_DIR, exist_ok=True)

cycle = 0  # global step counter across epochs
for epoch in range(EPOCHS):
    epoch_train_losses = np.zeros( (len(minibatches_train), 1) )
    epoch_val_losses = np.zeros( (len(minibatches_train), 1) )

    print(f'#\tSTART EPOCH:\t{epoch}')
    for mini_index, minibatch in enumerate(minibatches_train):
        sample_size = minibatch.shape[0]
        # encode the minibatch on the fly to keep memory usage low
        data_ohe = multithread(minibatch, one_hot_encoding, aux=AUX, log=False, n_cores=CORES)
        model, model_loss = train(model, data_ohe, reset_metrics=False)
        epoch_train_losses[mini_index] = model_loss

        # evaluate on the held-out minibatch after every training step
        val_los = model.test_on_batch(
            { "main_input" : val_ohe[0]},
            { "main_output" : val_ohe[1]}
            )
        epoch_val_losses[mini_index] = val_los
        print(f'\tminibatch:{mini_index}|\ttrain on samples:{sample_size}|\t' + 
              f'train loss:{model_loss}|\tval loss:{val_los}', '\n')

        LOG.append([DATASET_NAME, STRATEGY, EPOCHS, cycle, POSITIVE_SAMPLES, NEG_RATIO,
                    epoch, mini_index, sample_size, model_loss, val_los])
        cycle += 1
    # checkpoint after every epoch
    model.save(os.path.join(WORK_DIR, f'{epoch}.{MODEL_ID}.h5'))
    TRAIN_LOSSES[epoch] = epoch_train_losses.mean()
    VAL_LOSSES[epoch] = epoch_val_losses.mean()

final_train_loss = TRAIN_LOSSES.mean()
final_val_loss = VAL_LOSSES.mean()

print(final_train_loss, final_val_loss)

# save results
col_names = ("dataset,strategy,total_epochs,cycle,positive_samples," + 
             "neg_ratio,epoch,mini_index,tot_samples,train_loss,val_loss").split(',')
log_df = pd.DataFrame(LOG, columns=col_names)
log_df.to_csv(os.path.join(WORK_DIR, f'train.{MODEL_ID}.log.csv'), index=False, sep=',')

model.save(os.path.join(WORK_DIR, f'{MODEL_ID}.h5'))

#	START EPOCH:	0
	minibatch:0|	train on samples:5500|	train loss:1.0105211734771729|	val loss:0.7426842451095581 

	minibatch:1|	train on samples:5500|	train loss:0.9618327617645264|	val loss:0.7358652949333191 

	minibatch:2|	train on samples:5500|	train loss:0.9272983074188232|	val loss:0.7274988293647766 

	minibatch:3|	train on samples:5500|	train loss:0.9096933603286743|	val loss:0.723459005355835 

	minibatch:4|	train on samples:5500|	train loss:0.889648973941803|	val loss:0.721407413482666 

	minibatch:5|	train on samples:5500|	train loss:0.8524019718170166|	val loss:0.7157011032104492 

	minibatch:6|	train on samples:5500|	train loss:0.84372478723526|	val loss:0.7119900584220886 

	minibatch:7|	train on samples:5500|	train loss:0.8150548934936523|	val loss:0.7073625922203064 

	minibatch:8|	train on samples:5500|	train loss:0.8319933414459229|	val loss:0.7012004852294922 

	minibatch:9|	train on samples:5500|	train loss:0.8046963810920715|	val loss:0.6944630146026611 

	minibat