# Iterative Training with Hand Picked Mini-Batches
This notebook contains functions to train a
dense neural network with iterative training.

Iterative training is a strategy that loops over
several user-defined training phases. At each loop,
the abundance of the negative class samples is increased by
a user-defined ratio. This system allows us to gradually
imbalance a training set, while still keeping the learner
able to recognize the positive class samples.

Increasing the negative class samples raises the
probability of generating imbalanced mini-batches by random
choice; for example, in an imbalanced dataset of 100 positives and
10000 negatives, there is a chance that Keras will create mini-batches
of only negatives, thus moving the weights of the neurons in favor of the negative class samples only.
The notebook contains a helper function that allows hand-picking mini-batches so that each mini-batch keeps at least one positive sample.


In [1]:
import random
import time
import pandas as pd
import numpy as np

from tensorflow import keras as keras 
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, Dropout
from tensorflow.keras import Model
from tensorflow.keras import backend as K


## create random dataset
This function is used only for the purpose
of demonstration.

In [0]:
def random_seq(length_a, length_b, label=None, alphabet=None):
  """
  Build one random sample made of two sequences and a label.

  parameters:
  length_a=number of symbols of the first sequence
  length_b=number of symbols of the second sequence
  label=value stored as sample label, returned untouched [None]
  alphabet=sequence of one-character strings to draw from; falls back
           to the four DNA nucleotides 'ACGT' when omitted [None]

  returns:
  list [seq_a, seq_b, label], where both sequences are strings.
  """
  if alphabet is None:
    # random.choice(None) raises TypeError, so provide a usable default
    alphabet = 'ACGT'
  # seq_a is drawn completely before seq_b, which keeps the output
  # reproducible for a given random.seed()
  seq_a = ''.join(random.choice(alphabet) for _ in range(length_a))
  seq_b = ''.join(random.choice(alphabet) for _ in range(length_b))
  return [seq_a, seq_b, label]


## Create simple CNN + FC
Here we define a simple convnet + fully connected layers
that can be used for the demonstration of the iterative
training.

In [0]:
def make_arch_00b(input_tensor):
  """
  Assemble the 'arch_00b' network on top of `input_tensor`.

  The network is one convolution + max-pooling stage, flattened and
  followed by three dense layers (128, 64, 32 units) interleaved with
  dropout, and a single sigmoid unit named 'main_output'.

  parameters:
  input_tensor=keras tensor used as the only model input

  returns:
  keras Model named 'arch_00b' (not compiled).
  """
  # (layer class, constructor options) in the order they are stacked
  layer_specs = (
      (Conv2D, dict(
          filters=32,
          kernel_size=(3, 3),
          padding="same",
          data_format="channels_last",
          activation='relu',
          name="conv_1")),
      (MaxPooling2D, dict(pool_size=(2, 2), name='max_1')),
      (Dropout, dict(rate=0.2)),
      (Flatten, dict(name='2d_matrix')),
      (Dense, dict(units=128, activation='relu')),
      (Dropout, dict(rate=0.2)),
      (Dense, dict(units=64, activation='relu')),
      (Dropout, dict(rate=0.4)),
      (Dense, dict(units=32, activation='relu')),
      (Dense, dict(units=1, activation='sigmoid', name='main_output')),
  )

  # each layer is created and applied right away, one after the other
  tensor = input_tensor
  for layer_cls, options in layer_specs:
    tensor = layer_cls(**options)(tensor)

  return Model(inputs=[input_tensor], outputs=[tensor], name='arch_00b')

In [0]:
def save_log(record_training, log_path):
  """
  Write the training records to a tab-separated file with a header row.

  parameters:
  record_training=list of rows, one per iteration, each with nine values
                  in the order of the column names below
  log_path=destination file; it is overwritten on every call

  returns:
  None
  """
  col_names = [
      "MAX_ITER",
      "model_name",
      "OPTIMIZER",
      "model.name",
      "PICK_MINIBATCHES",
      "iteration",
      "POS_SAMPLES",
      "neg_ratio",
      "neg_samples",
  ]
  pd.DataFrame(record_training, columns=col_names).to_csv(
      log_path, sep='\t', header=True, index=False)

## create dot matrix from pandas df

This function converts a pandas dataframe with columns named ['seq_a', 'seq_b'] (plus a 'label' column)
into an array of dot matrices. Each Watson-Crick pair gets a score of 1, anything else 0.

In [0]:
def one_hot_encoding(df, tensor_dim, log=False):
  """
  Convert a dataframe of sequence pairs into an array of dot matrices.

  Cell [i, a, b, 0] of the output is 1 when nucleotide `a` of seq_a and
  nucleotide `b` of seq_b of sample `i` form a watson-crick pair
  (A-T, T-A, G-C, C-G) and 0 otherwise. Sequences are converted to
  capital letters first; cells past the end of a sequence stay 0.

  parameters:
  df=input dataset with columns 'seq_a', 'seq_b' and 'label'
  tensor_dim=tensor dimension of one sample as tuple
             (len_a, len_b, channels); only channel 0 is filled
  log=print progress and elapsed time every 1000 samples [bool(false)]

  returns:
  list [dot_matrix, labels]; dot_matrix is a float32 array of shape
  (number of samples, *tensor_dim), labels is the 'label' column as array.

  raises:
  IndexError if a sequence is longer than the matching tensor dimension.
  """
  def logs(index, start): # logs
    end = time.time()
    print("processed rows are,", index, sep='\t')
    print("elapsed time(sec) is:", end - start, sep='\t')

  # Integer codes make the pair test a plain equality: a nucleotide of
  # seq_a is stored as the code of its watson-crick partner, a nucleotide
  # of seq_b as its own code. Unknown symbols get two different negative
  # codes, so they never match anything.
  own_code = {"A": 0, "C": 1, "G": 2, "T": 3}
  partner_code = {"A": 3, "T": 0, "G": 1, "C": 2}

  X = df.reset_index(drop=True)
  # convert sample's labels to array
  y_ohe = X.label.to_numpy()
  # create empty dot matrix
  dot_matrix_ohe = np.zeros((X.shape[0], *tensor_dim), dtype="float32")
  dim_a, dim_b = tensor_dim[0], tensor_dim[1]
  # some time logs
  start = time.time()

  for index, (seq_a, seq_b) in enumerate(zip(X["seq_a"], X["seq_b"])):
    a_codes = np.array(
        [partner_code.get(nt, -1) for nt in seq_a.upper()], dtype=np.int8)
    b_codes = np.array(
        [own_code.get(nt, -2) for nt in seq_b.upper()], dtype=np.int8)
    # an empty sequence has no pair to score, the sample stays all zeros
    if a_codes.size and b_codes.size:
      if a_codes.size > dim_a or b_codes.size > dim_b:
        raise IndexError(
            f"sample {index}: sequence lengths ({a_codes.size}, {b_codes.size}) "
            f"exceed tensor dimensions ({dim_a}, {dim_b})")
      # compare every nucleotide of seq_a with every nucleotide of seq_b
      dot_matrix_ohe[index, :a_codes.size, :b_codes.size, 0] = (
          a_codes[:, None] == b_codes[None, :])
    # log once per sample, not once per nucleotide pair
    if log and index % 1000 == 0:
      logs(index, start)
  return [dot_matrix_ohe, y_ohe]

## Dataset generator for iterative training
`generate_subset` is a function that draws N random samples
from the original dataset, and returns the new subset + the
original dataset (with the subset samples removed).

In [0]:
### create training set:
def generate_subset(df, N, drop=True, random_state=None):
  """
  Draw N random rows from df and return them with the remaining rows.

  The input dataframe is never modified.

  parameters:
  df=input dataset
  N=number of rows to draw (without replacement)
  drop=remove the drawn rows from the returned dataset [bool(true)]
  random_state=seed forwarded to DataFrame.sample, set it to get a
               reproducible draw [None]

  returns:
  tuple (subset, new_X); subset holds the drawn rows with a fresh
  0..N-1 index, new_X is df without the drawn rows (original index
  labels kept) or a copy of df when drop is false.

  raises:
  ValueError (from DataFrame.sample) when N is larger than the number
  of rows in df.
  """
  # pick random N samples; sample() returns a new object
  subset = df.sample(n=N, random_state=random_state)

  if drop:
    # rows are removed by index label; drop() returns a new object
    new_X = df.drop(subset.index, axis=0)
  else:
    # hand back a copy so the caller cannot corrupt the original df
    new_X = df.copy()

  # reset indexes of the drawn rows only
  subset = subset.reset_index(drop=True)

  return subset, new_X
  

## train script
This function takes as input a list [batches_list] containing hand-picked
minibatches, and trains the model [model] by fitting each mini-batch. Additionally,
you can provide an array with sample weights as well as class weights.

In [0]:
def train(
    model, 
    pick_minibatch=True,
    batches_list=None,
    sample_weight=None,
    class_weight=None,
    reset_metrics=True,
  ):
  """
  Train `model` on a list of hand-picked mini-batches, one after the other.

  parameters:
  model=object exposing train_on_batch, with an input named 'main_input'
        and an output named 'main_output'
  pick_minibatch=only the hand-picked strategy exists; anything false
                 raises NotImplementedError [bool(true)]
  batches_list=iterable of (samples, labels) pairs, one per mini-batch
  sample_weight=forwarded unchanged to every train_on_batch call [None]
  class_weight=forwarded unchanged to every train_on_batch call [None]
  reset_metrics=forwarded unchanged to every train_on_batch call [bool(true)]

  returns:
  the same model object that was passed in.
  """
  if not pick_minibatch:
    raise NotImplementedError

  # one weight update per mini-batch; parameters are described at
  # https://keras.io/models/model/
  for samples, labels in batches_list:
    model.train_on_batch(
        {"main_input": samples},
        {"main_output": labels},
        sample_weight=sample_weight,
        class_weight=class_weight,
        reset_metrics=reset_metrics,
    )
  return model

# create a fake dataset
This cell uses the random_seq() function defined
above to create a fake dataset for the positive
sample class and one fake dataset for the negative
sample class. As you can see, it is compulsory to
keep the two datasets separated. When we hand-pick
mini-batches, we will first split the two datasets
into subsets. Then, each positive class sample
subset will be merged with one negative class sample
subset to create a minibatch.

In [0]:
## create fake positive and negative class datasets
# every sample is [seq_a (50 symbols), seq_b (20 symbols), label];
# positives are drawn from 'ACGT', negatives contain only 'N'
seq_pos_list = [
    random_seq(50, 20, label=1, alphabet=list('ACGT'))
    for _ in range(5000)
]
seq_neg_list = [
    random_seq(50, 20, label=0, alphabet=list('N'))
    for _ in range(10000)
]

# the two classes are kept in two separate dataframes
df_pos_orig = pd.DataFrame(seq_pos_list, columns=['seq_a', 'seq_b', 'label'])
df_neg_orig = pd.DataFrame(seq_neg_list, columns=['seq_a', 'seq_b', 'label'])

# iterative train
**Iterative** **training** is a sequence of steps that re-trains the same model while increasing the abundance of the negative class. For this purpose, and to avoid overfitting, samples from both the positive and the negative class are replaced after each iteration. Thus, the training dataset is updated at each iteration.

The function that takes care of updating the training dataset is `generate_subset`. This function takes as input the original dataset `df` and an integer `N`. `N` corresponds to the number of samples that we want to subsample. The function draws `N` samples from the original dataset `df`, thus creating a new subset `df_sub`. The function returns the newly created subset `df_sub` + the original dataset. The original dataset **will** **not** contain the subsampled samples, and thus will have a size equal to: original dataset size - N. This means
that after several iterations the original dataset will shrink to size zero.

## config static variables

In [9]:
from collections import defaultdict


## tensor dim
# one sample is a dot matrix of dimension 50, 20, 1: seq_a has length 50,
# seq_b has length 20, and 1 dimension stores the watson-crick value
tensor_dim = (50, 20, 1)

## input keras tensor
input_tensor = Input(
    shape=tensor_dim,   # the tensor dimensions (see above)
    dtype='float32',    # float32 is ok for our purposes and saves memory
    name='main_input',  # name used to feed the samples during training
)

# create the model
architecture = make_arch_00b(input_tensor)
OPTIMIZER = 'RMSprop'

# settings for the train strategy with hand picked minibatches
PICK_MINIBATCHES = True  # bool
# number of minibatches: the positive and the negative class subsets are
# each divided into SPLIT_BATCH parts
SPLIT_BATCH = 10

# DATASET STATIC VARIABLES
POS_SAMPLES = 100          # positive class samples drawn at each iteration
NEG_RATIO_START = 1        # initial negative class sample ratio
NEG_RATIO_MINISTEPS = 0.5  # mini-step added to the negative class
                           # sample ratio after every iteration
MAX_ITER = 25

# set folders and logs
RECORD_TRAINING = []
save_ = True  # if true save models and training parameters
if save_:
  RECORD_FILENAME = 'iterative_training_paramenters.csv'
  LOG_DIR = './'
  MODEL_PATH = './'
  LOG_PATH = f'{LOG_DIR}{RECORD_FILENAME}'

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


### run iterative training


In [10]:
# iterative training
# Every iteration draws fresh positive and negative class samples from the
# main datasets, builds hand-picked mini-batches out of them and keeps
# training the same model. From the second iteration on, the negative class
# ratio is raised by NEG_RATIO_MINISTEPS before sampling.
for iteration in range(MAX_ITER):
  try:
    if iteration == 0:
      neg_ratio = NEG_RATIO_START
      print('# model initialization..')
      model = architecture # initialize model
      model.compile(optimizer=OPTIMIZER,
                loss='binary_crossentropy',
                )
    else:
      # increase negative class sample ratio for this iteration
      neg_ratio += NEG_RATIO_MINISTEPS
    # assign model name for iteration n
    model_name = f'model_{model.name}_{iteration}'
    # define number of negative class samples
    # that will be picked up.
    neg_samples = int(neg_ratio * POS_SAMPLES)

    print("# model name is:", model_name, sep='\t')
    print("# iteration is:", iteration, sep='\t')
    print("## pos.class main dataset size is:", df_pos_orig.shape, sep='\t')
    print("## neg.class main dataset size is:", df_neg_orig.shape, sep='\t')

    print("## neg.class ratio is:", neg_ratio, sep='\t')
    print("## pos.class samples size is:", POS_SAMPLES, sep='\t')
    print("## neg.class samples size is:", neg_samples, sep='\t')

    # GENERATE TRAINING DATASET
    ## subsample pos.class samples from the original pos.class dataset;
    ## the drawn samples are removed from the main dataset.
    subset_pos, df_pos_orig = generate_subset(
        df_pos_orig, N=POS_SAMPLES
        )
    ## subsample neg.class samples from the original neg.class dataset
    subset_neg, df_neg_orig = generate_subset(
          df_neg_orig,
          N=neg_samples,
      )

    ## generate hand-picked imbalanced mini-batches
    if PICK_MINIBATCHES:
      print('### pick minibatches')
      batches_list = []
      ## split both subsets into SPLIT_BATCH minibatches.
      ## The row positions are split, not the dataframes themselves:
      ## handing a DataFrame to np.array_split goes through
      ## DataFrame.swapaxes, which pandas has deprecated. The chunks
      ## hold the same rows either way. See numpy doc for more details:
      # https://docs.scipy.org/doc/numpy/reference/generated/numpy.split.html
      pos_rows = np.array_split(np.arange(len(subset_pos)), SPLIT_BATCH)
      neg_rows = np.array_split(np.arange(len(subset_neg)), SPLIT_BATCH)
      batch_pos = [subset_pos.iloc[rows] for rows in pos_rows]
      batch_neg = [subset_neg.iloc[rows] for rows in neg_rows]
      ## zip together pos and neg subsets to create minibatches
      for mini_index, minibatch_pairs in enumerate(zip(batch_pos, batch_neg)):
        print('### minibatch pair id is:', mini_index,
              'pos shape is:', minibatch_pairs[0].shape[0],
              'neg_shape_is:', minibatch_pairs[1].shape[0],
              sep='\t'
              )

        batch_train = pd.concat(minibatch_pairs)
        # converts to dot matrix | input for model
        minibatch = one_hot_encoding(batch_train, tensor_dim)
        # append each minibatch to minibatch list
        batches_list.append(minibatch)
    else:
      raise NotImplementedError

    # train model
    model = train(
      model,
      pick_minibatch=PICK_MINIBATCHES,
      batches_list=batches_list,
      )

    # create log_parameters; the order matches the columns in save_log
    log_parameters = [
        MAX_ITER,
        model_name,
        OPTIMIZER,
        model.name,
        PICK_MINIBATCHES,
        iteration,
        POS_SAMPLES,
        neg_ratio,
        neg_samples,
    ]
    RECORD_TRAINING.append(log_parameters)
    if save_:
      print('## save model as:', f'{MODEL_PATH}/{model_name}.h5')
      # save model at current iteration stage
      model.save(f'{MODEL_PATH}/{model_name}.h5')
      print('## save log as:', LOG_PATH)
      # save log (the whole file is rewritten with all records so far)
      save_log(RECORD_TRAINING, LOG_PATH)
  except ValueError as err:
    # DataFrame.sample raises ValueError when more samples are requested
    # than the main dataset still holds; stop iterating in that case.
    print('raised ValueError:', err)
    print('it may be possible that you run out of samples from main datasets')
    break


# model initialization..
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
# model name is:	model_arch_00b_0
# iteration is:	0
## pos.class main dataset size is:	(5000, 3)
## neg.class main dataset size is:	(10000, 3)
## neg.class ratio is:	1
## pos.class samples size is:	100
## neg.class samples size is:	100
### pick minibatches
### minibatch pair id is:	0	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	1	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	2	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	3	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	4	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	5	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	6	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	7	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	8	pos shape is:	10	neg_shape_is:	10
### minibatch pair id is:	9	pos shape is:	10	neg_shape_is:	10
## s