The code below was copied out of the `training.py` script so that it can be modified easily here.

# Setup

In [4]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras as K
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.metrics import precision_recall_curve
from tensorflow.keras.layers import (
                                BatchNormalization, LeakyReLU,
                                Input, Dense, Conv2D,
                                MaxPooling2D, Flatten, Dropout)
from tensorflow.keras.optimizers import Adam

# Data preprocessing

In [5]:
def one_hot_encoding(df, tensor_dim=(50, 20, 1)):
    """
    fun transform input database to numpy array.

    Every sample becomes a dot matrix in which cell (i, j) of channel 0 is 1
    when nucleotide i of "gene" and nucleotide j of "miRNA" form a
    Watson-Crick pair (A-T, T-A, G-C, C-G) and 0 otherwise. Sequences are
    upper-cased before comparison. Nucleotides beyond the tensor size are
    ignored, shorter sequences leave the remaining cells at zero.

    The input data frame is not modified.

    parameters:
    df = Pandas df with col names "gene", "label", "miRNA"
    tensor_dim= shape of one sample: (gene positions, miRNA positions, channels)

    output:
    float32 array of shape (number of samples, *tensor_dim), labels as np array
    """
    # for every miRNA nucleotide, the gene nucleotide it pairs with
    complement = {"A": "T", "T": "A", "G": "C", "C": "G"}

    label = df["label"].to_numpy()

    n_samples = df.shape[0]  # number of samples in df
    max_gene_len, max_mirna_len = tensor_dim[0], tensor_dim[1]
    # initialize dot matrix with zeros
    ohe_matrix_2d = np.zeros((n_samples, *tensor_dim), dtype="float32")

    # Iterate by position instead of by index label, so the caller's index
    # does not have to be reset (and is therefore left untouched).
    for sample_i, (gene, mirna) in enumerate(zip(df["gene"], df["miRNA"])):
        # clip to the tensor size so over-long sequences cannot overflow it
        gene_nts = np.array(list(gene.upper()[:max_gene_len]), dtype="<U1")
        # Unknown symbols map to "", which never equals a gene nucleotide.
        partner_nts = np.array(
            [complement.get(nt, "") for nt in mirna.upper()[:max_mirna_len]],
            dtype="<U1")

        # (gene, miRNA) boolean grid: True where the two nucleotides pair up
        pairs = gene_nts[:, None] == partner_nts[None, :]
        ohe_matrix_2d[sample_i, :len(gene_nts), :len(partner_nts), 0] = pairs

    return ohe_matrix_2d, label

In [6]:
# set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# The model's weight initialisation and dropout masks are drawn by
# TensorFlow, not NumPy, so TensorFlow needs its own seed as well.
tf.random.set_seed(RANDOM_STATE)

In [8]:
TRAINING_RATIO = 1
# the ratio is part of the dataset file name ("train_set_1_<ratio>_...")
train_path = f"../Datasets/train_set_1_{TRAINING_RATIO}_CLASH2013_paper.tsv"
# read the tab-separated training set and shuffle all of its rows
train_df = (
    pd.read_csv(train_path, sep='\t')
    .sample(frac=1, random_state=RANDOM_STATE)
)
train_df.head()

Unnamed: 0,miRNA,gene,label
15287,TCTGGCTCCGTGTCTTCACT,GGTGAGGGAGACGGAGGCCGTCATCCACAAGCACCGCTCGGCCACC...,1
27358,TGAGGTAGTAGTTTGTGCTG,GGACAGGCACAGAGACTTGGAAGAGAGAAATAGACGCTCTAGTGGG...,0
9838,TACCCTGTAGATCCGAATTT,ACTTCTTGGACTACATGGGGATCAAAGGCCCCAGGATGCCTCTGGG...,1
896,CGTCAACACTTGCTGGTTTC,GTGTCTCAAAGCAAAGGAAACCTCCACAAGTGCTGCAACAGTGCAT...,1
19287,TTCACCACCTTCTCCACCCA,CTTTGACACTACACAATTTTCTAATATGTGTTAATGCTATGTGACA...,0


In [9]:
# encode the training set once; keep the (matrix, labels) pair and its parts
train_ohe, labels = ohe_data = one_hot_encoding(train_df)
n_training_samples = train_df.shape[0]
print("Number of training samples: ", n_training_samples)

Number of training samples:  30784


# Model + Keras Tuner setup

This section follows the tutorial at https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html to set up hyperparameter tuning.

In [14]:
def make_architecture(hp):
    """
    build model architecture for one hyperparameter configuration

    parameters:
    hp = Keras Tuner hyperparameters object; the values "conv_blocks",
         "dropout" and "dense_blocks" are requested from it

    return an uncompiled model object ("arch_00") that maps a (50, 20, 1)
    input to a single sigmoid output
    """
    main_input = Input(shape=(50, 20, 1),
                       dtype='float32', name='main_input'
                       )

    cnn_num = hp.Int('conv_blocks', 2, 6, default=3)
    dropout_rate = hp.Float('dropout', 0, 0.6, step=0.1, default=0.25)
    # max the same number of dense layers as is the number of cnn layers.
    # "dense_blocks" gets a fixed range and is capped afterwards: an upper
    # bound of cnn_num made the default (3) invalid whenever conv_blocks == 2,
    # and more dense blocks than conv blocks would end in a 0-neuron layer.
    dense_num = min(hp.Int('dense_blocks', 2, 6, default=3), cnn_num)

    # Each block consumes the output of the previous one; the first block
    # starts from the input tensor.
    x = main_input
    for cnn_i in range(cnn_num):

        x = Conv2D(
            # we increase number of filters by 32 in each layer
            filters=32*(cnn_i + 1),
            kernel_size=(3, 3),
            padding="same",
            data_format="channels_last",
            name="conv_" + str(cnn_i + 1))(x)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        # "same" padding rounds odd sizes up, so the 50x20 map never shrinks
        # to zero even with the maximum of 6 stacked pooling layers
        x = MaxPooling2D(pool_size=(2, 2), padding="same",
                         name='Max_' + str(cnn_i + 1))(x)
        x = Dropout(rate=dropout_rate)(x)

    x = Flatten(name='2d_matrix')(x)

    for dense_i in range(dense_num):

        # layer width decreases by 32 neurons per block, never below 32
        neurons = 32 * (cnn_num - dense_i)
        x = Dense(neurons)(x)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = Dropout(rate=dropout_rate)(x)

    main_output = Dense(1, activation='sigmoid', name='main_output')(x)

    m = K.Model(inputs=[main_input], outputs=[main_output], name='arch_00')

    return m

In [15]:
def compile_model(hp):
    """
    build and compile the model for one hyperparameter configuration

    parameters:
    hp = Keras Tuner hyperparameters object, passed on to make_architecture
         and used to request "learning_rate"

    return the compiled model object
    """
    # drop the state Keras kept from previously built models
    K.backend.clear_session()
    model = make_architecture(hp)

    # learning rate is sampled on a log scale between 1e-4 and 1e-2
    learning_rate = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')
    optimizer = Adam(learning_rate=learning_rate,
                     beta_1=0.9,
                     beta_2=0.999,
                     epsilon=1e-07,
                     amsgrad=False,
                     name="Adam")

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

In [16]:
import kerastuner as kt

# Hyperband search over the models built by compile_model, ranked by
# validation accuracy. The tuner draws its hyperparameter samples from its
# own random generator, so it is seeded explicitly to make the search
# repeatable.
tuner = kt.Hyperband(
    compile_model,
    objective='val_accuracy',
    max_epochs=30,
    hyperband_iterations=2,
    seed=RANDOM_STATE)

In [None]:
# Keras early-stopping callback with a patience of one epoch
early_stopping = tf.keras.callbacks.EarlyStopping(patience=1)

# run the hyperparameter search; 5 % of the samples are held out for validation
tuner.search(
    train_ohe,
    labels,
    validation_split=0.05,
    epochs=10,
    callbacks=[early_stopping],
)