# Description

In this notebook, we will build a **Machine Translation** model that translates English into Vietnamese.

After the training process finishes, we apply **Post-Training Quantization** to compress the trained model.

**NOTE**: To choose the quantization type (`QUANTIZED_TYPE`: int8, uint8 or float16) or the quantization technique (`QUANTIZED_TECHNIQUE`: symmetric or asymmetric), see the file `utils/model_utils_quant.py`.

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import random
random.seed(42)
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Allocate GPU memory on demand instead of reserving it all up front.
        # The memory-growth setting must be the same on every visible GPU, so it is
        # applied to each device rather than only to the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be set before the GPUs are initialized; report and continue.
        print(e)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import string
import nltk
from sklearn.model_selection import train_test_split

from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset
import tensorflow_text as tf_text

from utils.read_file_utils import *
from utils.model_utils_quant import *

In [None]:
# English / Vietnamese sentence files used for training (read with `read_text_file`).
PATH_EN_FILE_TRAIN = r"data/processed_data/en_sent_train.txt"
PATH_VI_FILE_TRAIN = r"data/processed_data/vi_sent_train.txt"

# English / Vietnamese "test" files; they are loaded below as the validation split.
PATH_EN_FILE_TEST = r"data/processed_data/en_sent_test.txt"
PATH_VI_FILE_TEST = r"data/processed_data/vi_sent_test.txt"

# Directory of the exported tokenizer, loaded later with `tf.saved_model.load`.
PATH_TOKENIZER = r"data/tokeninzer_en_vi_converter"

# Maximum number of tokens kept per sentence in `prepare_batch`;
# also the default decoding length of `Translator.__call__`.
MAX_TOKENS = 128

# Optional number of sentence pairs to sub-sample from each split; `None` keeps everything.
N_samples = None

# 1. Data handling

- First, we load the pre-processed English and Vietnamese sentence files from disk.

In [None]:
# Load the sentence files as NumPy arrays so they can be indexed with an index array below.
list_en_sentence_train = np.array(read_text_file(PATH_EN_FILE_TRAIN))
list_vi_sentence_train = np.array(read_text_file(PATH_VI_FILE_TRAIN))

list_en_sentence_val = np.array(read_text_file(PATH_EN_FILE_TEST))
list_vi_sentence_val = np.array(read_text_file(PATH_VI_FILE_TEST))

# Check that both languages are aligned BEFORE sub-sampling: after sub-sampling both
# sides always have the sampled length, so the check could no longer detect a mismatch.
assert len(list_en_sentence_train) == len(list_vi_sentence_train)
assert len(list_en_sentence_val) == len(list_vi_sentence_val)

if N_samples is not None:
    # A dedicated seeded generator makes the sub-sample reproducible across kernel
    # restarts (`random.seed` in the import cell does not seed NumPy).
    rng = np.random.default_rng(42)

    # Cap the request at the split size: sampling without replacement raises
    # ValueError when more items are requested than are available.
    n_train = min(N_samples, len(list_en_sentence_train))
    random_indices = rng.choice(len(list_en_sentence_train), size=n_train, replace=False)
    # The same indices are applied to both languages to keep the pairs aligned.
    list_en_sentence_train = list_en_sentence_train[random_indices]
    list_vi_sentence_train = list_vi_sentence_train[random_indices]

    n_val = min(N_samples, len(list_en_sentence_val))
    random_indices = rng.choice(len(list_en_sentence_val), size=n_val, replace=False)
    list_en_sentence_val = list_en_sentence_val[random_indices]
    list_vi_sentence_val = list_vi_sentence_val[random_indices]

print(f"Number of training sample: {len(list_en_sentence_train)}")
print(f"Number of validation sample: {len(list_en_sentence_val)}")

In [None]:
# Each dataset element is an (English sentence, Vietnamese sentence) pair taken
# from the same position of the two arrays.
train_examples = tf.data.Dataset.from_tensor_slices((list_en_sentence_train, list_vi_sentence_train))
val_examples = tf.data.Dataset.from_tensor_slices((list_en_sentence_val, list_vi_sentence_val))

In [None]:
# Peek at the first three sentence pairs of the training set.
# `en_examples` / `vi_examples` stay defined after the loop and are reused by later cells.
for en_examples, vi_examples in train_examples.batch(3).take(1):
    print('> Examples in English:')
    print('\n'.join(raw.decode('utf-8') for raw in en_examples.numpy()))
    print()

    print('> Examples in Vietnamese:')
    print('\n'.join(raw.decode('utf-8') for raw in vi_examples.numpy()))

## 1.1. Load tokenizer

- We load the pre-trained tokenizer and test it.

In [None]:
# Load the exported tokenizer model; the notebook uses its `.en` and `.vi` sub-tokenizers.
tokenizers = tf.saved_model.load(PATH_TOKENIZER)

# Tokenize the Vietnamese sample sentences collected in the previous cell.
encoded = tokenizers.vi.tokenize(vi_examples)

# NOTE(review): the message says "padded", but the result is presumably a ragged
# batch (it is converted with `to_list()` here and `to_tensor()` elsewhere) — confirm.
print('> This is a padded-batch of token IDs:')
# One list of token IDs is printed per sentence.
for row in encoded.to_list():
  print(row)

- The `detokenize` method converts these token IDs back to the original text.

In [None]:
# Round-trip check: convert the token IDs from the previous cell back into text.
list_original_sentence = tokenizers.vi.detokenize(encoded)

# Each element is a byte string, so decode it as UTF-8 before printing.
for original_sentence in list_original_sentence.numpy():
    print(original_sentence.decode('utf-8'))

## 1.2. Set up data pipeline with `tf.data`

The following function takes batches of text as input, and converts them to a format suitable for training.

- It tokenizes them into ragged batches.
- It trims each to be no longer than MAX_TOKENS.
- It splits the target (Vietnamese) tokens into inputs and labels. These are shifted by one step so that at each input location the label is the id of the next token.
- It converts the RaggedTensors to padded dense Tensors.
- It returns an (inputs, labels) pair.

In [None]:
def prepare_batch(en, vi):
    """
    Turn a batch of raw (English, Vietnamese) sentences into model-ready tensors.

    Args:
        en: batch of English sentences.
        vi: batch of the corresponding Vietnamese sentences.

    Returns:
        `((en_ids, vi_inputs), vi_labels)` where `en_ids` are the English token IDs
        trimmed to MAX_TOKENS, and `vi_inputs` / `vi_labels` are the Vietnamese token
        IDs shifted against each other by one position. All three are dense tensors
        produced with `to_tensor()`.
    """
    # Tokenize (ragged result), keep at most MAX_TOKENS tokens per row, then densify.
    source_ids = tokenizers.en.tokenize(en)[:, :MAX_TOKENS].to_tensor()

    # Keep one extra token so that inputs and labels both hold up to MAX_TOKENS tokens.
    target_ids = tokenizers.vi.tokenize(vi)[:, :(MAX_TOKENS + 1)]
    decoder_inputs = target_ids[:, :-1].to_tensor()  # every row without its last token
    decoder_labels = target_ids[:, 1:].to_tensor()   # every row without its first token

    return (source_ids, decoder_inputs), decoder_labels

In [None]:
# Shuffle-buffer size and number of sentence pairs per batch used by `make_batches`.
BUFFER_SIZE = 10_000
BATCH_SIZE = 200

def make_batches(ds):
  """Shuffle, batch, tokenize and prefetch a dataset of (en, vi) sentence pairs.

  Args:
    ds: `tf.data.Dataset` yielding raw sentence pairs.

  Returns:
    A dataset yielding the `((en, vi_inputs), vi_labels)` batches built by `prepare_batch`.
  """
  shuffled = ds.shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  # Tokenization runs on whole batches, with the parallelism chosen by tf.data.
  tokenized = batched.map(prepare_batch, tf.data.AUTOTUNE)
  return tokenized.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Create the training and validation batch pipelines.
# Both go through `make_batches`, so the validation set is shuffled as well.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

In [None]:
# Grab a single training batch to inspect the tensor shapes.
# NOTE(review): the variable names are misleading. Following `prepare_batch`,
# `pt` holds the English encoder input, `en` the Vietnamese decoder input and
# `en_labels` the Vietnamese labels. Later cells reuse `pt` and `en`.
for (pt, en), en_labels in train_batches.take(1):
  break

print(pt.shape)
print(en.shape)
print(en_labels.shape)

# 2. Define component

In [None]:
# Model hyperparameters passed to `Transformer` below.
# NOTE(review): their exact meaning is defined by `Transformer` in
# `utils.model_utils_quant` — confirm there.
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

# Reset Keras' global state before building a fresh model (useful when re-running this cell).
tf.keras.backend.clear_session()
# English is the input vocabulary and Vietnamese the target vocabulary.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.en.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.vi.get_vocab_size().numpy(),
    dropout_rate=dropout_rate)

- We can test the output shape of Transformer model.
- Then, we can test the output shape of the attention score, which has shape `(batch, heads, target_seq, input_seq)`

In [None]:
# Run one forward pass on the sample batch; this also builds the model's weights.
# `pt` is the encoder input and `en` the decoder input taken from the sample batch.
output = transformer((pt, en))

print(en.shape)
print(pt.shape)
print(f"Output shape: {output.shape}")

In [None]:
# Print the layer and parameter overview of the model before training.
transformer.summary()

In [None]:
# Print the parameter count of every quantizable attention layer found in the encoder.
for layer in transformer.encoder.submodules:
    if isinstance(layer, Custom_Quantization_MultiHeadAttention):
        print(layer.count_params())

# 3. Training

## 3.1. Custom optimizer

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with a linear warm-up followed by inverse-square-root decay.

  lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

  Args:
    d_model: model dimension used to scale the learning rate.
    warmup_steps: number of steps over which the learning rate increases linearly.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the value as passed in for `get_config`; `__call__` uses the float32 copy.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for the given optimizer step."""
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)                  # decay branch
    arg2 = step * (self.warmup_steps ** -1.5)   # warm-up branch

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    """Return the constructor arguments so the schedule (and its optimizer) can be serialized."""
    return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}
  

# The schedule is scaled by the model dimension and uses the default warm-up of 4000 steps.
learning_rate = CustomSchedule(d_model)

# Adam optimizer driven by the custom schedule instead of a fixed learning rate.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

## 3.2. Loss and metrics

- Since the target sequences are padded, it is important to apply a padding mask when calculating the loss. 
- In other words, positions whose label is the padding ID 0 are masked out when computing the loss, so padding does not affect the loss value.

In [None]:
def masked_loss(label, pred):
  """Sparse categorical cross-entropy averaged over non-padding positions only.

  Args:
    label: integer token IDs; the value 0 marks padding.
    pred: logits predicted by the model for every position.

  Returns:
    Scalar loss: the sum of the per-token losses at non-padding positions divided
    by the number of non-padding positions.
  """
  # `reduction='none'` keeps one loss value per token so padding can be zeroed out.
  per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction='none')(label, pred)

  keep = tf.cast(label != 0, dtype=per_token_loss.dtype)
  return tf.reduce_sum(per_token_loss * keep) / tf.reduce_sum(keep)


def masked_accuracy(label, pred):
  """Token-level accuracy computed over non-padding positions only.

  Args:
    label: integer token IDs; the value 0 marks padding.
    pred: model output with the vocabulary on axis 2.

  Returns:
    Scalar float32: correctly predicted non-padding tokens divided by the number
    of non-padding tokens.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  # Bring the labels to the dtype of the predicted IDs so they can be compared.
  label = tf.cast(label, predicted_ids.dtype)

  not_padding = label != 0
  correct = tf.logical_and(label == predicted_ids, not_padding)

  n_correct = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
  n_tokens = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return n_correct / n_tokens

## 3.3. Custom Quantization Callback

In [None]:
class Post_Quantization_Callback(tf.keras.callbacks.Callback):
    """Keras callback that triggers post-training quantization once training ends.

    The model is expected to expose `encoder` and `decoder` attributes; the actual
    quantization is delegated to `post_training_quantization()` of each
    `Custom_Quantization_MultiHeadAttention` layer.
    """

    def __init__(self):
        super().__init__()

    def on_train_end(self, logs=None):
        """Quantize the attention layers after the last epoch has finished."""
        self.quantization_layer()

    def quantization_layer(self):
        """Call `post_training_quantization()` on every quantizable attention layer."""
        # The encoder is processed first, then the decoder.
        for part_name in ("encoder", "decoder"):
            part = getattr(self.model, part_name)
            for layer in part.submodules:
                if not isinstance(layer, Custom_Quantization_MultiHeadAttention):
                    continue
                layer.post_training_quantization()

## 3.4. Training

In [None]:
# Compile with the padding-aware loss and accuracy defined earlier.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])


# The quantization callback runs once, when training ends.
callbacks = [
    Post_Quantization_Callback()
]

In [None]:
# Number of passes over the training batches.
EPOCHS = 10
# Train with validation after every epoch; `Post_Quantization_Callback` calls the
# attention layers' `post_training_quantization()` when `fit` finishes.
transformer.fit(train_batches,
                epochs=EPOCHS,
                validation_data=val_batches,
                callbacks=callbacks
                )

In [None]:
# Evaluate on the validation batches. This runs after `fit`, i.e. after the
# post-training quantization callback has been applied to the model.
transformer.evaluate(val_batches)
print()

# Print the model overview again, now for the model after training.
transformer.summary()

# 4. Inference

The following steps are used for inference:
- Encode the input sentence using the English tokenizer (`tokenizers.en`). This is the encoder input.
- The decoder input is initialized to the [START] token.
- Calculate the padding masks and the look ahead masks.
- The decoder then outputs the predictions by looking at the encoder output and its own output (self-attention).
- Concatenate the predicted token to the decoder input and pass it to the decoder **AGAIN**.
- The process repeats until the model predicts the [END] token or the maximum number of tokens is reached.

In [None]:
class Translator(tf.Module):
  """Greedy decoder that translates one English sentence into Vietnamese.

  Args:
    tokenizers: object exposing `.en` and `.vi` tokenizers.
    transformer: trained model called as `transformer([encoder_input, decoder_input])`.
  """

  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):
    """Translate `sentence` token by token.

    Args:
      sentence: `tf.Tensor` holding one English sentence (scalar or one-element batch).
      max_length: maximum number of decoding steps.

    Returns:
      `(text, tokens)`: the detokenized Vietnamese text and the corresponding tokens.
    """
    # The input sentence is English; a scalar string is promoted to a batch of one.
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.en.tokenize(sentence).to_tensor()
    encoder_input = sentence

    # Tokenizing an empty string yields just the `[START]` and `[END]` IDs of the
    # output language (Vietnamese).
    start_end = self.tokenizers.vi.tokenize([''])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # Define the output_array with the [START] token.
    # `tf.TensorArray` is required here, so that the dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Append the `predicted_id` to the output, which is fed back to the decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0])

      # Stop as soon as the model emits the `[END]` token.
      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack()) # Shape (1, tokens)

    # Use the tokenizers injected into this instance (not the notebook-level global),
    # so the translator always decodes with the tokenizer it encoded with.
    text = self.tokenizers.vi.detokenize(output)[0]  # Shape: `()`.
    tokens = self.tokenizers.vi.lookup(output)[0]

    return text, tokens

In [None]:
# Wrap the trained model and the tokenizers in the greedy-decoding translator.
translator = Translator(tokenizers, transformer)

In [None]:
# Example English sentence to translate.
sentence = 'Ha Noi is the beautiful country.'

# The translator expects a `tf.Tensor`, so the Python string is wrapped in `tf.constant`.
translated_text, translated_tokens = translator(tf.constant(sentence))
# The result is a scalar byte-string tensor; decode it as UTF-8 for display.
print(translated_text.numpy().decode('utf-8'))