# Implementing ULMFit with tensorflow - Part 2

This notebook implements the ULMFiT model. It combines ideas from several papers: the first is Merity et al. 2017 (https://arxiv.org/pdf/1708.02182.pdf), which inspired the ULMFiT language-model phase; the second is Yang et al. 2017 (https://arxiv.org/pdf/1711.03953.pdf), which provides an optimization of the softmax layer that greatly boosts language-model performance; and, last but not least, the ULMFiT paper itself by Howard and Ruder 2018 (https://arxiv.org/pdf/1801.06146.pdf).
In this notebook we implement the fine-tuning phase.

In [1]:
import tensorflow as tf

import numpy as np
from math import ceil

from sklearn.metrics import accuracy_score

from LMNets.models import LanguageModelAWD, LanguageModelMoS
from LMNets.losses import lm_loss_sparse, log_scalar, cross_entropy_w_softmax
from LMNets.optimize import minimize_w_clipping
from LMNets.data import penn_treebank, wikitext, iterator

# Switch TensorFlow 1.x to eager mode: the training loops below record
# gradients with tf.GradientTape and read tensor values directly.
tf.enable_eager_execution()

  from ._conv import register_converters as _register_converters


In [2]:
# Sanity check: confirm that eager mode is active.
tf.executing_eagerly()

True

In [3]:
def batch_score(model, x, y, seq_len):
    """Score one batch as 2 raised to its sparse softmax cross-entropy.

    Args:
        model: object whose ``forward(x, seq_len)`` returns a pair; the first
            element is used as the logits, the second is ignored.
        x: batch inputs handed to ``model.forward``.
        y: integer targets handed to the cross-entropy loss.
        seq_len: sequence length(s) handed to ``model.forward``.

    Returns:
        A scalar tensor, ``2 ** cross_entropy``.
    """
    logits, _ignored = model.forward(x, seq_len)
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(y, logits=logits)
    # NOTE(review): TensorFlow's cross-entropy is measured in nats; if this
    # score is meant to be a perplexity the base would be e, not 2 -- confirm.
    return tf.pow(2, cross_entropy)

def compute_token_score(model, tokens, bs, bptt):
    """Average ``batch_score`` over every BPTT batch drawn from ``tokens``.

    Args:
        model: language model forwarded to ``batch_score``.
        tokens: token sequence handed to ``iterator.get_bptt_batch_iterator``.
        bs: batch size handed to the iterator.
        bptt: back-propagation-through-time length handed to the iterator.

    Returns:
        The mean of the per-batch scores, as computed by ``np.mean``.
    """
    batches = iterator.get_bptt_batch_iterator(tokens, bs, bptt)
    per_batch = [batch_score(model, x, y, seq_len)
                 for x, y, seq_len in batches]
    return np.mean(per_batch)

def evaluate_using_weighted_f1(model, X_test, y_test, X_val, y_val,
                               batch_size):
    """Return the weighted F1 on the test set at a validation-tuned threshold.

    The model predicts on the test set first and then on the validation set;
    ``find_f1_threshold`` picks the threshold on the validation predictions
    and reports the resulting F1 on the test predictions.

    Args:
        model: object exposing ``predict(X, batch_size=...)``.
        X_test, y_test: test inputs and labels.
        X_val, y_val: validation inputs and labels.
        batch_size: batch size forwarded to ``model.predict``.

    Returns:
        The test F1 score; the selected threshold is discarded.
    """
    test_predictions = np.array(model.predict(X_test, batch_size=batch_size))
    val_predictions = np.array(model.predict(X_val, batch_size=batch_size))

    test_f1, _best_threshold = find_f1_threshold(
        y_val, val_predictions, y_test, test_predictions, average='weighted')
    return test_f1

def find_f1_threshold(y_val, y_pred_val, y_test, y_pred_test,
                      average='binary'):
    """Tune a decision threshold on validation data and score the test data.

    Every threshold in ``0.01, 0.02, ..., 0.49`` is tried on the validation
    predictions; the first one that yields the highest validation F1 is then
    applied to the test predictions.

    Args:
        y_val: validation labels.
        y_pred_val: validation predictions, compared element-wise with ``>``.
        y_test: test labels.
        y_pred_test: test predictions, compared element-wise with ``>``.
        average: ``average`` argument forwarded to ``f1_score``.

    Returns:
        Tuple ``(f1_test, best_t)``: the test F1 at the chosen threshold and
        the threshold itself.
    """
    # The notebook's import cell only brings in accuracy_score, so f1_score
    # is imported here to keep this function self-contained.
    from sklearn.metrics import f1_score

    thresholds = np.arange(0.01, 0.5, step=0.01)
    f1_scores = [f1_score(y_val, y_pred_val > t, average=average)
                 for t in thresholds]

    # np.argmax returns the first maximum, i.e. the smallest best threshold.
    best_t = thresholds[np.argmax(f1_scores)]
    f1_test = f1_score(y_test, y_pred_test > best_t, average=average)
    return f1_test, best_t

def my_score_fun(y_, y):
    """Accuracy of the arg-max predictions against one-hot targets.

    Args:
        y_: per-class scores, one row per example.
        y: target matrix; the predictions are one-hot encoded to its shape.

    Returns:
        ``accuracy_score`` between ``y`` and the one-hot encoded predictions.
    """
    winners = np.argmax(y_, axis=1)
    one_hot = np.zeros(y.shape)
    # Put a 1 in the winning column of every row.
    one_hot[np.arange(one_hot.shape[0]), winners] = 1
    return accuracy_score(y, one_hot)

def compute_score(model, x_val, y_true, n_classes, bs, score_fun):
    """Average ``score_fun`` over all batches of a labelled dataset.

    Args:
        model: object exposing ``forward(x, seq_len)``.
        x_val: inputs handed to ``iterator.get_batch_iterator``.
        y_true: targets handed to ``iterator.get_batch_iterator``.
        n_classes: number of classes handed to the iterator.
        bs: batch size handed to the iterator.
        score_fun: callable invoked as ``score_fun(predictions, targets)``.

    Returns:
        The mean of the per-batch scores, as computed by ``np.mean``.
    """
    batches = iterator.get_batch_iterator(x_val, y_true, n_classes, bs)
    per_batch = [score_fun(model.forward(x, seq_len), y)
                 for x, y, seq_len in batches]
    return np.mean(per_batch)

In [4]:
class DenseDropBN:
    """Bias-free dense layer with weight dropout, batch normalization and ReLU.

    Dropout is applied to the weight matrix itself (not to the activations)
    before the matrix multiplication.
    """

    def __init__(self, shape, dropout):
        """Create the layer.

        Args:
            shape: ``(input_dim, output_dim)`` shape of the weight matrix.
            dropout: dropout rate applied to the weight matrix.
        """
        self.trainable = True
        self.shape = shape
        self.dropout_rate = dropout
        self.build()

    def build(self):
        """Allocate the weight matrix, initialised from a standard normal."""
        self.dense = tf.Variable(tf.random_normal((self.shape)))
        self._trainable_weights = self.dense

    def forward(self, input_, training=True):
        """Apply the layer to ``input_``.

        Args:
            input_: 2-D input multiplied by the weight matrix.
            training: when true, dropout is active and batch normalization
                uses the statistics of the current batch.

        Returns:
            ``relu(batch_norm(input_ @ dropout(weights)))``.
        """
        # tf.layers.dropout defaults to training=False, which makes it a
        # no-op; the flag has to be forwarded for dropout to take effect.
        dropout_dense = tf.layers.dropout(self.dense, self.dropout_rate,
                                          training=training)
        # NOTE(review): the functional batch_normalization call builds a new
        # layer on every invocation, so its scale/offset and moving statistics
        # are neither trained nor kept between calls -- confirm this is intended.
        batch_norm_activ = tf.layers.batch_normalization(tf.matmul(input_, dropout_dense),
                                                         training=training)
        return tf.nn.relu(batch_norm_activ)

    def get_trainable_weights(self):
        """Return the weight matrix in a list, or ``[]`` when frozen."""
        return [self._trainable_weights] if self.trainable else []
    
class TransferModel:
    """Two-layer classification head made of ``DenseDropBN`` blocks.

    The hidden width (256) and the dropout rate (0.4) are fixed.
    """

    def __init__(self, input_dim, output_dim, training=True):
        """Record the dimensions and build both layers.

        Args:
            input_dim: width of the incoming features.
            output_dim: width of the output.
            training: flag forwarded to every layer's ``forward`` call.
        """
        self.hidden_dim = 256
        self.dropout_rate = 0.4
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.training = training
        self.build()

    def build(self):
        """Instantiate the two layers and expose them through ``self.layers``."""
        first_shape = (self.input_dim, self.hidden_dim)
        second_shape = (self.hidden_dim, self.output_dim)

        self.dense1 = DenseDropBN(first_shape, self.dropout_rate)
        self.dense2 = DenseDropBN(second_shape, self.dropout_rate)

        self.layers = [self.dense1, self.dense2]

    def forward(self, input_):
        """Run ``input_`` through the first layer, then through the second."""
        hidden = self.dense1.forward(input_, training=self.training)
        return self.dense2.forward(hidden, training=self.training)


class UniversalLMClassifier:
    """Classifier that stacks a transfer head on top of a language model.

    Training follows the ULMFiT recipe: a slanted triangular learning-rate
    schedule, discriminative (per-layer) learning rates and, in ``train``,
    gradual unfreezing of one more layer per epoch.
    """

    def __init__(self, language_model, transfer_model):
        """Store both sub-models and build the combined layer list.

        Args:
            language_model: object exposing ``layers``, ``forward_last``,
                ``get_trainable_weights``, ``backward`` and ``save_model``.
            transfer_model: object exposing ``layers`` and ``forward``.
        """
        self.lm = language_model
        self.tf_model = transfer_model
        self.build()

    def build(self):
        """Assemble the layer stack and the checkpoint object.

        The stack is every language-model layer except the last one, followed
        by the transfer-model layers. Each trainable weight is registered in
        the checkpoint under its position index (as a string).
        """
        self.layers = self.lm.layers[:-1] + self.tf_model.layers
        self.ckp = tf.train.Checkpoint(
            **{str(i): var for i, var in enumerate(self.get_trainable_weights())})

    def forward(self, X, seq_len):
        """Feed ``lm.forward_last(X, seq_len)`` into the transfer model."""
        output = self.lm.forward_last(X, seq_len)
        return self.tf_model.forward(output)

    def slanted_t_lr(self, t, T, cut_frac, ratio, n_max):
        """Slanted triangular learning rate at iteration ``t``.

        The rate rises linearly from ``n_max / ratio`` to ``n_max`` during the
        first ``cut_frac`` fraction of the ``T`` iterations, then decays
        linearly back to ``n_max / ratio``.

        Args:
            t: current iteration, starting at 0.
            T: total (or estimated total) number of iterations.
            cut_frac: fraction of iterations spent increasing the rate.
            ratio: how much smaller the lowest rate is than ``n_max``.
            n_max: peak learning rate.

        Returns:
            The learning rate for iteration ``t``.
        """
        cut = T * cut_frac
        if t < cut:
            p = t / cut
        else:
            # T - cut equals cut * (1 / cut_frac - 1), the length of the decay
            # phase, and stays well defined when cut_frac is 0.
            decay_span = T - cut
            p = 1 - (t - cut) / decay_span if decay_span > 0 else 0.0
            # T may be only an estimate; never go below the floor rate.
            p = max(p, 0.0)
        return n_max * (1 + p * (ratio - 1)) / ratio

    def discr_finetuning(self, gradients, layers):
        """Apply discriminative fine-tuning to ``gradients`` in place.

        ``gradients`` must be ordered like the trainable weights of ``layers``.
        The gradients of the last layer are left untouched; each layer further
        from the output has its gradients divided by another factor of 2.6.

        Args:
            gradients: mutable list of gradients, modified in place.
            layers: layers (objects exposing ``get_trainable_weights``) that
                the gradients belong to, ordered from input to output.
        """
        position = len(gradients) - 1
        for depth, layer in enumerate(reversed(layers)):
            divisor = 2.6 ** depth
            for _ in layer.get_trainable_weights():
                if depth > 0:
                    gradients[position] /= divisor
                position -= 1

    def _scale_and_clip(self, gradients, layers, lr):
        """Turn raw gradients into weight updates.

        Multiplies by the learning rate, applies discriminative fine-tuning
        over ``layers`` and clips each update to a norm of 0.25.
        """
        updates = [grad * lr for grad in gradients]
        self.discr_finetuning(updates, layers)
        # NOTE(review): clipping happens after the learning-rate scaling, so
        # it bounds the update size rather than the raw gradient -- confirm.
        return [tf.clip_by_norm(update, clip_norm=0.25) for update in updates]

    def fine_tune(self,
                  train_tokens,
                  val_tokens,
                  epochs,
                  loss,
                  logging=False,
                  log_dir="./finetune_log/",
                  ckpt_dir="./finetune_ckpts",
                  batch_size=32,
                  val_bs=32,
                  cut_frac=0.1,
                  ratio=32,
                  n_max=0.01,
                  bptt=10):
        """Fine-tune the language model on ``train_tokens``.

        After every epoch the model is scored on ``val_tokens`` with
        ``compute_token_score`` (lower is better) and saved through
        ``lm.save_model`` whenever it beats the best score seen so far.

        Args:
            train_tokens: training token sequence.
            val_tokens: validation token sequence.
            epochs: number of passes over ``train_tokens``.
            loss: callable invoked as ``loss(model, x, y, seq_len,
                logging=..., iteration=...)``.
            logging: when true, write summaries to ``log_dir``.
            log_dir: summary directory.
            ckpt_dir: checkpoint location handed to ``lm.save_model``.
            batch_size: training batch size.
            val_bs: validation batch size.
            cut_frac, ratio, n_max: slanted triangular schedule parameters,
                see ``slanted_t_lr``.
            bptt: back-propagation-through-time length.
        """
        if logging:
            # Keep a reference to the writer for the whole run.
            summary_writer = tf.contrib.summary.create_file_writer(log_dir, flush_millis=10000)
            summary_writer.set_as_default()
            global_step = tf.train.get_or_create_global_step()

        iteration = 0
        T_estimate = epochs * int(len(train_tokens) / (bptt * 1.5 * batch_size))
        best_val_score = compute_token_score(self.lm, val_tokens, val_bs, bptt)

        for epoch in range(epochs):
            for x_batch, y_batch, seq_len in iterator.get_bptt_batch_iterator(
                    train_tokens, batch_size, bptt):

                if logging:
                    global_step.assign_add(1)

                with tf.GradientTape() as tape:
                    loss_ = loss(self.lm,
                                 x_batch,
                                 y_batch,
                                 seq_len,
                                 logging=logging,
                                 iteration=iteration)
                trainable_weights = self.lm.get_trainable_weights()
                gradients = tape.gradient(loss_, trainable_weights)
                # Slanted triangular rate, discriminative scaling, clipping
                lr = self.slanted_t_lr(iteration, T_estimate, cut_frac, ratio, n_max)
                gradients = self._scale_and_clip(gradients, self.lm.layers, lr)
                # Update weights
                self.lm.backward(trainable_weights, gradients)
                # Advance the schedule; without this the rate never changes.
                iteration += 1

            val_score = compute_token_score(self.lm, val_tokens, val_bs, bptt)

            if val_score < best_val_score:
                best_val_score = val_score
                self.lm.save_model(ckpt=ckpt_dir)

    def train(self,
              x_train,
              y_train,
              x_val,
              y_val,
              loss,
              epochs,
              score_fun,
              logging=False,
              log_dir="./ulm_log/",
              ckpt_dir="./ulm_ckpt/",
              batch_size=32,
              val_bs=32,
              cut_frac=0.1,
              ratio=32,
              n_max=0.01):
        """Train the classifier with gradual unfreezing.

        In epoch ``e`` only the last ``e + 1`` layers are updated. After every
        epoch the model is scored on the validation data with ``score_fun``
        (higher is better) and saved whenever it beats the best score so far.

        Args:
            x_train: training inputs; ``x_train.shape[0]`` is the sample count.
            y_train: training targets; ``y_train.shape[1]`` is the class count.
            x_val: validation inputs.
            y_val: validation targets.
            loss: callable invoked as ``loss(model, x, y, seq_len,
                logging=..., iteration=...)``.
            epochs: number of passes over the training data.
            score_fun: callable invoked as ``score_fun(predictions, targets)``.
            logging: when true, write summaries to ``log_dir``.
            log_dir: summary directory.
            ckpt_dir: checkpoint location handed to ``save_model``.
            batch_size: training batch size.
            val_bs: validation batch size.
            cut_frac, ratio, n_max: slanted triangular schedule parameters,
                see ``slanted_t_lr``.
        """
        if logging:
            # Keep a reference to the writer for the whole run.
            summary_writer = tf.contrib.summary.create_file_writer(log_dir, flush_millis=10000)
            summary_writer.set_as_default()
            global_step = tf.train.get_or_create_global_step()

        iteration = 0
        n_classes = y_train.shape[1]
        T = epochs * ceil(x_train.shape[0] / batch_size)
        best_val_score = compute_score(self, x_val, y_val, n_classes, val_bs, score_fun)

        for epoch in range(epochs):
            # Gradual unfreezing: one more layer, counted from the output,
            # becomes trainable at every epoch.
            unfrozen_layers = self.layers[-epoch - 1:]

            # The batch variables must not reuse the names x_train / y_train,
            # otherwise later epochs would iterate over the last batch only.
            for x_batch, y_batch, seq_len in iterator.get_batch_iterator(
                    x_train, y_train, n_classes, batch_size):

                if logging:
                    global_step.assign_add(1)

                with tf.GradientTape() as tape:
                    loss_ = loss(self,
                                 x_batch,
                                 y_batch,
                                 seq_len,
                                 logging=logging,
                                 iteration=iteration)
                # Get unfrozen weights
                trainable_weights = [weight
                                     for layer in unfrozen_layers
                                     for weight in layer.get_trainable_weights()]
                gradients = tape.gradient(loss_, trainable_weights)
                # Slanted triangular rate, discriminative scaling on the
                # unfrozen layers, clipping
                lr = self.slanted_t_lr(iteration, T, cut_frac, ratio, n_max)
                gradients = self._scale_and_clip(gradients, unfrozen_layers, lr)
                # Update weights
                self.backward(trainable_weights, gradients)
                # Advance the schedule; without this the rate never changes.
                iteration += 1

            val_score = compute_score(self, x_val, y_val, n_classes, val_bs, score_fun)

            if val_score > best_val_score:
                best_val_score = val_score
                self.save_model(ckpt=ckpt_dir)

    def backward(self, weights, gradients):
        """Subtract each (already scaled) gradient from its weight in place."""
        for weight, grad in zip(weights, gradients):
            weight.assign_sub(grad)

    def get_trainable_weights(self):
        """Return the trainable weights of every layer, input to output."""
        return [weight for layer in self.layers for weight in layer.get_trainable_weights()]

    def save_model(self, ckpt="./ulm_ckpt/"):
        """Save the checkpoint using ``ckpt`` as the file prefix."""
        self.ckp.save(ckpt)

    def restore_model(self, ckpt="./ulm_ckpt/"):
        """Restore the most recent checkpoint found in directory ``ckpt``."""
        self.ckp.restore(tf.train.latest_checkpoint(ckpt))
        
        

# Dummy finetuning and training of the language model

In [5]:
# Passed to penn_treebank.load_data below; presumably an upper bound on the
# vocabulary size -- confirm against the loader.
max_voc = 60000

In [6]:
# Unpack the train / validation / test tokens and the vocabulary returned by
# the Penn Treebank loader.
p_tr_tokens, p_val_tokens, p_test_tokens, p_vocab = penn_treebank.load_data(max_voc)
# Number of vocabulary entries; used to build the language model.
voc_size = len(p_vocab)

## Finetuning

In [7]:
# Build the AWD language model and restore it (restore_model is called with
# its default arguments; presumably this loads previously saved weights).
awd_asgd = LanguageModelAWD(voc_size)
awd_asgd.restore_model()

# Classification head: its input width is the `units` attribute of the LM's
# second-to-last layer; its 26 outputs match the 26 lowercase-letter labels
# built in the Training section.
tf_model = TransferModel(awd_asgd.layers[-2].units, 26)

In [16]:
# Combine the language model and the transfer head into a single classifier.
ulm_clf = UniversalLMClassifier(awd_asgd, tf_model)

In [9]:
# Fine-tune the language model for 1 epoch on the Penn Treebank training
# tokens, validating on the validation tokens, with lm_loss_sparse as loss.
ulm_clf.fine_tune(p_tr_tokens, p_val_tokens, 1, lm_loss_sparse)

## Training

In [6]:
# Create the dataset
import string 
from sklearn.preprocessing import LabelBinarizer

# Ids of the two special tokens; the 26 letters occupy ids 0-25.
start_token = 26
end_token = 27
# Toy dataset dimensions: number of sequences and characters per sequence.
input_size = 32
timesteps = 10


def get_next_char(char):
    """Return the character whose id follows that of `char`.

    Ids above 25 wrap around to id 0.
    """
    successor = char2id[char] + 1
    if successor > 25:
        successor = 0
    return id2char[successor]


def get_next_chars_from_nparray(array):
    """Return a list holding the successor of every character in `array`."""
    return list(map(get_next_char, array))


chars = list(string.ascii_lowercase)

# Lookup tables in both directions, seeded with the special tokens.
id2char = {start_token: "\t", end_token: "\n"}
char2id = {token: key for key, token in id2char.items()}

for i, char in enumerate(chars):
    char2id[char] = i
    id2char[i] = char

# Random lowercase inputs; each target is the input shifted by one letter.
enc_inputs = np.random.choice(chars, (input_size, timesteps))
dec_inputs = np.array(list(map(get_next_chars_from_nparray, enc_inputs)))

In [7]:
# Fitted on the 26 lowercase letters in a later cell and used there to encode
# the target characters.
lb = LabelBinarizer()

In [8]:
# Integer-encode every input character, one row per sequence.
flat_ids = [char2id[c] for c in enc_inputs.ravel()]
my_x = np.array(flat_ids).reshape((input_size, timesteps))
# The label space is the 26 lowercase letters.
lb.fit(list(string.ascii_letters[0:26]))
# The target of each sequence is the successor of its last character.
my_y = lb.transform(dec_inputs[:, -1])
# Every sequence has the same length.
seq_len = [timesteps for _ in range(input_size)]

In [11]:
# Train the classifier for 100 epochs on the toy dataset; the training arrays
# are reused as validation data and my_score_fun is the validation metric.
ulm_clf.train(my_x, my_y, my_x, my_y, cross_entropy_w_softmax, 100, my_score_fun)

In [None]:
# Score the language model on the Penn Treebank validation tokens
# (batch size 256, bptt 10).
compute_token_score(awd_asgd, p_val_tokens, 256, 10)