# Shakespeare Translator Colab Training Notebook

This is an interactive version of the training script. It makes it easier to tweak, analyze, and view the model in real time.

### Run below if in colab

In [None]:
# %%capture
# !pip install tensorflow_text

## Imports

In [45]:
import os
import json
import time
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
import matplotlib.pyplot as plt
from data_utils import DataManager, load_sentencepiece_model
from utils import print_bar, predict, create_masks
from model import Transformer

## Train utility functions. Custom loss, accuracy, and schedule

The custom learning rate schedule will increase linearly to about 1e-4, then decay ~1/sqrt(step) thereafter.

The model doesn't do well if the learning rate gets as high as 1e-3.

In [46]:
### Custom learning rate schedulers
class RootDecaySchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Linear warmup to ``max_lr`` followed by an exponential decay.

    The returned rate is the minimum of two curves:

    * a linear ramp ``max_lr * step / warmup_steps``;
    * ``max_lr * decay_rate ** ((step - warmup_steps) / decay_steps)``.

    Despite the class name, the decay implemented here is exponential
    (a factor of ``decay_rate`` every ``decay_steps`` steps), not 1/sqrt(step).

    Args:
        max_lr: Peak learning rate, reached at ``warmup_steps``.
        warmup_steps: Number of steps over which the rate ramps up linearly.
        decay_rate: Multiplicative decay applied every ``decay_steps`` steps
            after warmup. Defaults to 0.1.
        decay_steps: Number of post-warmup steps per ``decay_rate`` factor.
            Defaults to 5000.
        **kwargs: Forwarded to the base class constructor.
    """
    def __init__(self, max_lr, warmup_steps, decay_rate=0.1, decay_steps=5000., **kwargs):
        super().__init__(**kwargs)
        self.max_lr = max_lr
        self.warmup_steps = warmup_steps
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps
    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "max_lr": self.max_lr,
            "warmup_steps": self.warmup_steps,
            "decay_rate": self.decay_rate,
            "decay_steps": self.decay_steps,
        }
    def __call__(self, step):
        """Return the learning rate for ``step`` as a float32 tensor."""
        # Optimizers may pass an integer step counter; the float math below
        # (in particular the maximum against 1e-7) needs a float tensor.
        step = tf.cast(step, tf.float32)
        linear = self.max_lr*(step/self.warmup_steps)
        # Clamp so the exponent is ~0 during warmup, i.e. the decay curve
        # stays at max_lr until the ramp reaches it.
        elapsed = tf.math.maximum(step-self.warmup_steps, 1e-7)
        fall = self.max_lr*self.decay_rate**(elapsed/self.decay_steps)
        return tf.math.minimum(linear, fall)
    
class TransformerSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warmup-then-inverse-square-root learning rate schedule.

    The rate is ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5) / 10``:
    it grows linearly for ``warmup_steps`` steps and then decays as
    ``1/sqrt(step)``. The trailing ``/ 10`` scales the whole curve down by
    a factor of ten.

    Args:
        d_model: Model width; the rate is scaled by ``1/sqrt(d_model)``.
        warmup_steps: Number of steps of linear warmup.
        **kwargs: Forwarded to the base class constructor.
    """
    def __init__(self, d_model, warmup_steps, **kwargs):
        super().__init__(**kwargs)
        # Keep the value as passed in for get_config(); the float32 tensor
        # form is what __call__ uses.
        self._d_model_value = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        # The base LearningRateSchedule.get_config() raises NotImplementedError,
        # so the config is built here rather than extended from super().
        return {"d_model": self._d_model_value, "warmup_steps": self.warmup_steps}
    def __call__(self, step):
        """Return the learning rate for ``step`` as a float32 tensor."""
        # rsqrt is only defined for floating-point tensors, while optimizers
        # may pass their integer iteration counter.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)/ 10.
    
def loss_fn(labels, logits):
    """Mean sparse softmax cross-entropy over non-padding positions.

    Positions whose label equals 0 are treated as padding and excluded from
    both the sum and the count.

    Args:
        labels: Integer class ids; 0 marks positions to ignore.
        logits: Unnormalized scores with one extra trailing class axis
            relative to ``labels``.

    Returns:
        Scalar tensor: summed loss over kept positions divided by the number
        of kept positions, or 0 when every position is masked out (instead
        of the NaN a plain 0/0 division would give).
    """
    per_token = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
    weights = tf.cast(tf.not_equal(labels, 0), per_token.dtype)
    # divide_no_nan returns 0 for a zero denominator, so a batch made only of
    # padding cannot inject NaN into the gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(per_token * weights), tf.reduce_sum(weights))

def accuracy_fn(labels, logits):
    """Fraction of non-padding positions whose arg-max prediction is correct.

    Positions whose label equals 0 are treated as padding and ignored.

    Args:
        labels: Integer class ids; 0 marks positions to ignore.
        logits: Scores whose last axis indexes the classes.

    Returns:
        Scalar float32 tensor in [0, 1]; 0 when every position is masked out
        (instead of the NaN produced by averaging an empty tensor).
    """
    keep = tf.cast(tf.not_equal(labels, 0), tf.float32)
    preds = tf.cast(tf.argmax(logits, axis=-1), labels.dtype)
    # Zero out matches at padded positions before counting.
    hits = tf.cast(tf.equal(labels, preds), tf.float32) * keep
    return tf.math.divide_no_nan(tf.reduce_sum(hits), tf.reduce_sum(keep))

## Tokenizer configurations

SentencePiece has the option to randomize the tokenization. This means we can ask it to randomly choose from the top "nbest_size" tokenization sequences. This helps both to regularize the model and to make it more robust by exposing it to a variety of subword segmentations. It is a form of data augmentation and greatly enhances the model.

In [48]:
# Parallel corpora: modern English (input) and the original text (target).
input_text = 'data_clean/modern.txt'
target_text = 'data_clean/original.txt'

# SentencePiece model files for each side.
inp_sp_model_file = 'tokenizers/modern2k.model'
tar_sp_model_file = 'tokenizers/original2k.model'

# Number of top tokenization candidates to sample from on each side.
inp_nbest_size, tar_nbest_size = 5, 5

## Load text data

SentencePiece is blazingly fast, so we train directly on the text data (which also lets us do the tokenization sampling). The caveat is that the input data needs to have been lightly cleaned, which it is.

In [49]:
# Bundle the tokenizer/data settings into the dict handed to the DataManager.
data_config = dict(
    input_text=input_text,
    target_text=target_text,
    inp_sp_model_file=inp_sp_model_file,
    tar_sp_model_file=tar_sp_model_file,
    inp_nbest_size=inp_nbest_size,
    tar_nbest_size=tar_nbest_size,
)
dm = DataManager.directly_from_text(data_config)

# Vocabulary sizes as plain Python ints, read from the two tokenizers.
input_vocab_size, target_vocab_size = (
    int(tokenizer.vocab_size()) for tokenizer in (dm.inp_tokenizer, dm.tar_tokenizer)
)

## Model initialization

In [58]:
# Transformer hyperparameters
num_layers, num_heads = 4, 4
d_model, d_ffn = 512, 1024
max_position = 512
dropout_rate = 0.1

# Reset Keras' global state before (re)building the model in this cell.
tf.keras.backend.clear_session()

# Constructor arguments; kept in a dict so they can be saved alongside the weights.
config = dict(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    d_ffn=d_ffn,
    input_vocab_size=input_vocab_size,
    target_vocab_size=target_vocab_size,
    pe_input=max_position,
    pe_target=max_position,
    p_drop=dropout_rate,
)

model = Transformer(**config)




Initializing model...
Model parameters:
{'num_layers': 4, 'd_model': 512, 'num_heads': 4, 'd_ffn': 1024, 'input_vocab_size': 2048, 'target_vocab_size': 2048, 'pe_input': 512, 'pe_target': 512, 'p_drop': 0.1}


## Set optimizer and learning rate

In [None]:
# Adam driven by the warmup / inverse-square-root schedule defined above.
warmup_steps = 1000
learning_rate = TransformerSchedule(d_model, warmup_steps)
optimizer = tf.keras.optimizers.Adam(learning_rate)

## Define metrics and training and evaluation steps

In [None]:
# Running means of the per-batch loss and accuracy for the train and
# validation splits; the training loop resets them at the start of each epoch.
train_loss, valid_loss, train_acc, valid_acc = (
    tf.keras.metrics.Mean() for _ in range(4)
)

@tf.function
def train_step(inp, tar):
    """Run one optimization step on a batch and update the training metrics.

    The target is split for teacher forcing: the decoder is fed ``tar`` without
    its last position and is scored against ``tar`` without its first position.

    Args:
        inp: Batch of input token ids.
        tar: Batch of target token ids, sliced along axis 1.

    Side effects:
        Applies gradients through the global ``optimizer`` and accumulates
        the batch loss/accuracy into ``train_loss`` / ``train_acc``.
    """
    tar_in = tar[:, :-1]
    tar_out = tar[:, 1:]
    enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar_in)
    with tf.GradientTape() as tape:
        logits, _ = model(inp, tar_in, training=True, enc_padding_mask=enc_padding_mask,
                       look_ahead_mask=look_ahead_mask, dec_padding_mask=dec_padding_mask)
        loss = loss_fn(tar_out, logits)
    # Accuracy is only reported, so it is computed outside the tape.
    accuracy = accuracy_fn(tar_out, logits)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    train_loss(loss)
    train_acc(accuracy)

@tf.function
def evaluation_step(inp, tar):
    """Score one validation batch and update the validation metrics.

    Uses the same teacher-forcing split as ``train_step`` but calls the model
    with ``training=False`` and applies no gradients.

    Args:
        inp: Batch of input token ids.
        tar: Batch of target token ids, sliced along axis 1.

    Side effects:
        Accumulates the batch loss/accuracy into ``valid_loss`` / ``valid_acc``.
    """
    tar_in = tar[:, :-1]
    tar_out = tar[:, 1:]
    enc_padding_mask, look_ahead_mask, dec_padding_mask = create_masks(inp, tar_in)
    logits, _ = model(inp, tar_in, training=False, enc_padding_mask=enc_padding_mask,
                   look_ahead_mask=look_ahead_mask, dec_padding_mask=dec_padding_mask)
    loss = loss_fn(tar_out, logits)
    accuracy = accuracy_fn(tar_out, logits)
    valid_loss(loss)
    valid_acc(accuracy)

## Initialize TensorBoard logging

In [60]:
# Start from a clean log directory. On a fresh runtime `logs` does not exist
# yet, so this command reports an error; the rest of the cell still runs.
!rm -r logs
# NOTE: the concatenations below evaluate to './logs//train' and './logs//test'
# (doubled slash).
train_log_dir = './logs/' + '/train'
test_log_dir = './logs/' + '/test'
# Separate writers: the training loop logs per-step training scalars to the
# first and per-epoch validation scalars to the second.
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

# Show TensorBoard inline, reading from the `logs` directory.
%load_ext tensorboard
%tensorboard --logdir logs

## Set max input sequences length, initialize global step, initialize train/test datasets

In [None]:
# Maximum sequence length handed to the DataManager when building the datasets.
max_length = 100

# Global step counter shared by all TensorBoard summaries.
# It must be int64: tf.summary breaks if an int32 step is used.
glob_step = tf.Variable(initial_value=0, dtype=tf.int64)

# Unbatched train/validation datasets; batching is configured in a later cell.
train_ds, valid_ds = dm.get_train_valid_datasets(max_length)

## [optional] Print out a model summary

We need to input some data into the model so that it can initialize its weights. Afterwards we can print out the model summary. This makes the code awkward, as it's right in the middle of the dataset setup. This is because we want to do this before caching the dataset, but we can't do it until we have initialized the dataset.

In [None]:
# Push a single example through the model so its weights get created,
# then print the layer/parameter summary.
temp_inp, temp_tar = next(iter(train_ds.batch(1)))
model(
    temp_inp,
    temp_tar[:, :-1],
    training=False,
    enc_padding_mask=None,
    look_ahead_mask=None,
    dec_padding_mask=None,
)
model.summary()

## Configure dataset for training: shuffle, batch, cache, prefetch.

In [63]:
batch_size = 32

# NOTE: this cell rebinds train_ds / valid_ds to their batched versions, so
# running it twice batches an already batched dataset. Re-run the cell that
# creates the datasets first.

# Training pipeline: cache -> shuffle -> fixed-size batches -> prefetch.
train_ds = (
    train_ds.cache()
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation pipeline: same, without shuffling.
valid_ds = (
    valid_ds.cache()
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Number of batches used for the progress bar and the iteration counter.
DATASET_SIZE = int(dm.ds_size // batch_size)
iterator = iter(train_ds)

## Training loop

For each loop we run a training step and print out the loss and accuracy to a log or stdout. If run in colab, the print bar doesn't show up for some reason, so we manually print every 100 steps. On epoch ends, we evaluate the validation set and log the information. 

Every 500 steps we also print out the current model's prediction of a simple sentence "Where are you?". Given our limited high-school English, we expect this to be something along the lines of "Wherefore art thou?"

The log information should hopefully appear in the tensorboard app a few cells back. We must always re-run that tensorboard cell before we run the training loop cell. If you have to restart the training loop cell, make sure you remove the ```!rm -r logs``` line in the tensorboard cell.

In [None]:
epochs = 20

absolute_start = time.time()
print("\n\n~~~~~~~~~~ Beginning training ~~~~~~~~~~")
for epoch in range(epochs):

    print('\n'+'-'*10+f' Epoch {epoch+1}/{epochs} '+'-'*10)
    start = time.time()
    # Metrics are per-epoch running means, so clear them before each epoch.
    for metric in [train_loss, valid_loss, train_acc, valid_acc]:
        metric.reset_states()

    for step, (inp, tar) in enumerate(train_ds):

        train_step(inp, tar)

        # Average wall-clock seconds per step so far in this epoch.
        diff = (time.time()-start)/(step+1)
        print_bar(step, DATASET_SIZE, diff, train_loss.result().numpy())

        # Plain-text progress line every 100 global steps. The global step is
        # held in its own name so the per-epoch `step` index from enumerate()
        # is not overwritten.
        current_step = int(glob_step)
        if (current_step+1)%100==0:
            iter_message = f"Iteration {current_step+1:02d}/{DATASET_SIZE*epochs}:"
            time_message = f" {1/diff:.2f} it/s."
            loss_message = f" Loss: {float(train_loss.result()):.3f}"
            acc_message = f" Accuracy: {float(train_acc.result()):.3f}"
            print(iter_message+time_message+loss_message+acc_message)

        with train_summary_writer.as_default():
            tf.summary.scalar('loss', train_loss.result(), step=glob_step)
            tf.summary.scalar('accuracy', train_acc.result(), step=glob_step)
            # The schedule is evaluated on a float step; glob_step itself is int64.
            tf.summary.scalar('lr', learning_rate(tf.cast(glob_step, tf.float32)), step=glob_step)
        glob_step.assign_add(1)

        # Every 500 global steps, translate a fixed probe sentence as a sanity check.
        if int(glob_step)%500==0:
            sentence = 'where are you?'
            pred_sentence = dm.tar_tokenizer.detokenize(predict(sentence, dm.inp_tokenizer, model)[0])
            print(f"Input sentence: {sentence}")
            print(f"Output sentence: {pred_sentence.numpy()[0].decode()}")

    # Fallback for an unknown dataset size: use the number of steps taken so far.
    if DATASET_SIZE is None:
        DATASET_SIZE = int(glob_step)

    # End of epoch: evaluate on the validation set and log the results.
    for inp, tar in valid_ds:
        evaluation_step(inp, tar)

    with test_summary_writer.as_default():
        tf.summary.scalar('loss', valid_loss.result(), step=glob_step)
        tf.summary.scalar('accuracy', valid_acc.result(), step=glob_step)

    # Write buffered events to disk so the log files are complete even if
    # training is interrupted or the logs are moved/zipped right afterwards.
    train_summary_writer.flush()
    test_summary_writer.flush()

tot_time = time.time()-absolute_start
minutes = int(tot_time)//60
seconds = int(tot_time)%60
print('*'*100+"\n\nTRAINING COMPLETE.\n\n"+'*'*100)
print(f"\n\nTotal time: {minutes:02d}min. {seconds:02d}sec.\n\n")

## Saving the model

Since we are running this in a Jupyter notebook, we don't have any checkpoint files. This assumes your browser won't crash and that you can stop training and save at any time.

If you want the checkpoints, you can copy them over from the script code. Or just run the script!

We need to save 1) the tensorboard logs 2) the model configuration (so we can reload it from the weights) 3) the model weights

Trying to directly save the model will fail, as it doesn't like that model takes multiple inputs (inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask). I haven't figured out how to deal with that, other than to just save the weights and make a new model used only for inference, that doesn't take any tar or mask input. 

In [None]:
from google.colab import files

!mv logs translate_logs
!zip -r translate_logs.zip translate_logs
files.download('translate_logs.zip')

with open('model_config.json', 'w') as file:
    file.write(json.dumps(config))

In [None]:
# Save the weights under the 'translator_weights' prefix, then download the
# two files listed below from /content.
model.save_weights('translator_weights')
for checkpoint_file in ('/content/translator_weights.index',
                        '/content/translator_weights.data-00000-of-00001'):
    files.download(checkpoint_file)

## Loading a saved model

In [80]:
import os
import json
from model import Transformer
from utils import predict

In [81]:
# Locate the saved model configuration. 'config.json' is tried first (the name
# this cell has always used); 'model_config.json' is the name the saving cell
# above actually writes, so it is accepted as well.
config_path = next(
    (path for path in ('config.json', 'model_config.json') if os.path.isfile(path)),
    None,
)
if config_path is None:
    # A missing file is FileNotFoundError; FileExistsError means the opposite.
    raise FileNotFoundError('Could not find configuration file.')

with open(config_path, 'r') as file:
    config = json.load(file)

# Rebuild the architecture from the saved constructor arguments.
model = Transformer(**config)

In [82]:
# Call the model once on a small dummy batch of token ids before restoring
# the saved weights into it.
random_inp = tf.random.uniform(shape=(1, 4), minval=0, maxval=10, dtype=tf.int32)
_ = model(
    random_inp,
    random_inp,
    training=False,
    enc_padding_mask=None,
    look_ahead_mask=None,
    dec_padding_mask=None,
)

model.load_weights('translator_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fefd3c1d070>

In [124]:
# Translate a sample sentence with the restored model.
# NOTE(review): `inp_tokenizer` and `tar_tokenizer` are not defined in any cell
# of this notebook (the tokenizers are otherwise reached as `dm.inp_tokenizer`
# / `dm.tar_tokenizer`), and `predict` is called here with four arguments but
# with three in the training loop. Confirm the signature of `utils.predict`.
predict('The king fought hard, but in the end he died.', inp_tokenizer, tar_tokenizer, model)[0]

"tf.Tensor(b'the king hath funed, but in his mind died.', shape=(), dtype=string)"

In [None]:
# S3
# DEEP LEARNING AMI
# p2 xlarge is the smallest machine learning
# have to be in the .ssh folder when ssh'ing. use ssh -i enter_thing_here