I took things out of `training.py` script so I can easily modify them here.

# Setup

In [25]:
!pip install keras-tuner
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.3 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.16.1


In [26]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras as K
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics import roc_curve, roc_auc_score, auc
from sklearn.metrics import precision_recall_curve
from tensorflow.keras.layers import (
                                BatchNormalization, LeakyReLU,
                                Input, Dense, Conv2D,
                                MaxPooling2D, Flatten, Dropout)
from tensorflow.keras.optimizers import Adam
import tensorflow_addons as tfa
import keras_tuner as kt

In [13]:
# set random state for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Data preprocessing

In [14]:
def one_hot_encoding(df, tensor_dim=(50, 20, 1)):
    """
    fun transform input database to numpy array.
    
    parameters:
    df = Pandas df with col names "gene", "label", "miRNA"
    tensor_dim= 2d matrix shape
    
    output:
    2d dot matrix, labels as np array
    """
    df.reset_index(inplace=True, drop=True)

    # alphabet for watson-crick interactions.
    alphabet = {"AT": 1., "TA": 1., "GC": 1., "CG": 1.}

    # labels to one hot encoding
    label = df["label"].to_numpy()

    # create empty main 2d matrix array
    N = df.shape[0]  # number of samples in df
    shape_matrix_2d = (N, *tensor_dim)  # 2d matrix shape
    # initialize dot matrix with zeros
    ohe_matrix_2d = np.zeros(shape_matrix_2d, dtype="float32")

    # compile matrix with watson-crick interactions.
    for index, row in df.iterrows():
        for bind_index, bind_nt in enumerate(row.gene.upper()):

            for mirna_index, mirna_nt in enumerate(row.miRNA.upper()):

                base_pairs = bind_nt + mirna_nt
                ohe_matrix_2d[index, bind_index, mirna_index, 0] = alphabet.get(base_pairs, 0)

    return ohe_matrix_2d, label

In [21]:
def load_data(training_ratio, train=True):
  if train == True:
    part = 'train'
  else:
    part = 'evaluation'
  df = pd.read_csv("../Datasets/" + part + "_set_1_" + str(training_ratio) + "_CLASH2013_paper.tsv", sep='\t')
  df = df.sample(frac=1, random_state=RANDOM_STATE)
  ohe_data = one_hot_encoding(df)
  ohe, labels = ohe_data
  return ohe, labels

In [16]:
# run this cell in Google Colab to get the data into right place
!mkdir ../Datasets

!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_1_CLASH2013_paper.tsv -P ../Datasets/
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_10_CLASH2013_paper.tsv -P ../Datasets/
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_100_CLASH2013_paper.tsv -P ../Datasets/
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/evaluation_set_1_1_CLASH2013_paper.tsv -P ../Datasets/
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/evaluation_set_1_10_CLASH2013_paper.tsv -P ../Datasets/
!wget https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/evaluation_set_1_100_CLASH2013_paper.tsv -P ../Datasets/

mkdir: cannot create directory ‘../Datasets’: File exists
--2022-02-24 12:28:27--  https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_1_CLASH2013_paper.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2276853 (2.2M) [text/plain]
Saving to: ‘../Datasets/train_set_1_1_CLASH2013_paper.tsv.1’


2022-02-24 12:28:27 (29.2 MB/s) - ‘../Datasets/train_set_1_1_CLASH2013_paper.tsv.1’ saved [2276853/2276853]

--2022-02-24 12:28:27--  https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/miRBind/tuning/Datasets/train_set_1_10_CLASH2013_paper.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|1

# Model + Keras Tuner setup

Following this tutorial https://blog.tensorflow.org/2020/01/hyperparameter-tuning-with-keras-tuner.html to setup hyperparameter tuning.

In [17]:
def make_architecture(hp):
    """
    build model architecture

    return a model object
    """
    main_input = Input(shape=(50, 20, 1),
                       dtype='float32', name='main_input'
                       )

    cnn_num = hp.Int('conv_blocks', 2, 6, default=3)
    kernel_size = hp.Int('kernel_size', 3, 6, default=3)
    pool_size = hp.Int('pool_size', 2, 5, default=2)
    dropout_rate = hp.Float('dropout', 0, 0.6, step=0.05, default=0.25)
    # max the same number of dense layers as is the number of cnn layers
    dense_num = hp.Int('dense_blocks', 2, cnn_num, default=3)


    for cnn_i in range(cnn_num):

        x = Conv2D(
            # we increase number of filters by 32 in each layer
            filters=32*(cnn_i + 1),
            kernel_size=(kernel_size, kernel_size),
            padding="same",
            data_format="channels_last",
            name="conv_" + str(cnn_i + 1))(main_input)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = MaxPooling2D(pool_size=(pool_size, pool_size), name='Max_' + str(cnn_i + 1))(x)
        x = Dropout(rate=dropout_rate)(x)

    conv_flat = Flatten(name='2d_matrix')(x)

    for dense_i in range(dense_num):

        neurons = 32 * (cnn_num - dense_i)
        x = Dense(neurons)(conv_flat)
        x = LeakyReLU()(x)
        x = BatchNormalization()(x)
        x = Dropout(rate=dropout_rate)(x)

    main_output = Dense(1, activation='sigmoid', name='main_output')(x)

    m = K.Model(inputs=[main_input], outputs=[main_output], name='arch_00')

    return m

In [27]:
def compile_model(hp):
    K.backend.clear_session()
    m = make_architecture(hp)

    opt = Adam(
        learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='log'),
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name="Adam")
    
    binary_f1_score = tfa.metrics.F1Score(num_classes=1, threshold=0.5, average="micro")
    m.compile(
        optimizer=opt,
        loss='binary_crossentropy',
        metrics=['accuracy', binary_f1_score, K.metrics.AUC(name='prc', curve='PR')]
        )
    return m

# Tune

In [None]:
for ratio in [1, 10, 100]:

  tuner = kt.Hyperband(
    compile_model,
    objective=kt.Objective('val_prc', direction='max'),
    max_trials=50,
    hyperband_iterations=2,
    project_name='miRBind_' + str(ratio)
  )

  train_ohe, train_labels = load_data(ratio, train=True)
  val_ohe, val_labels = load_data(ratio, train=False)

  tuner.search(train_ohe, train_labels,
             validation_data=(val_ohe, val_labels),
             epochs=10,
             callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)],
             batch_size=32,
            class_weight={0: 1, 1: ratio}
  )
  
  print("Best params for ratio", ratio, " are: ", tuner.get_best_hyperparameters(1)[0])

  best_model = tuner.get_best_models(1)[0]
  best_model.save("model_1_" + str(ratio) + ".h5")