In [2]:
# Fetch the project sources with an IPython shell escape; git clones into ./transformers-from-scratch.
# NOTE(review): later cells do `from utils import ...` and `from model import Transformer`;
# presumably those modules live in the cloned directory, which must then be on the
# import path (e.g. via a `%cd`) before the imports run. No visible cell does this; confirm.
!git clone https://github.com/Haris-Ali007/transformers-from-scratch.git

Cloning into 'transformers-from-scratch'...
remote: Enumerating objects: 23, done.[K
remote: Counting objects: 100% (23/23), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 23 (delta 8), reused 17 (delta 5), pack-reused 0[K
Receiving objects: 100% (23/23), 6.44 KiB | 6.44 MiB/s, done.
Resolving deltas: 100% (8/8), done.


# Train

In [6]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text
from utils import masked_accuracy, masked_loss, CustomScheduler
from model import Transformer
import argparse

def prepare_batch(pt, en):
    """Tokenize one batch of sentence pairs and arrange it for teacher forcing.

    Reads the module-level ``tokenizers`` and ``MAX_TOKENS``.

    Args:
        pt: batch of Portuguese sentences, passed to ``tokenizers.pt.tokenize``.
        en: batch of English sentences, passed to ``tokenizers.en.tokenize``.

    Returns:
        ``((pt_ids, en_inputs), en_labels)`` where every element is the result
        of ``.to_tensor()``. ``en_inputs`` and ``en_labels`` are the same
        token rows offset by one position from each other.
    """
    # Source side: keep at most MAX_TOKENS ids per row, then convert with to_tensor().
    source_ids = tokenizers.pt.tokenize(pt)[:, :MAX_TOKENS].to_tensor()

    # Target side: keep one extra id so that both shifted views below can
    # still hold up to MAX_TOKENS ids each.
    target_ids = tokenizers.en.tokenize(en)[:, :(MAX_TOKENS + 1)]

    # Decoder input omits the last id of each row; the labels omit the first.
    # (Presumably these are the [END] and [START] markers when the row was not
    # truncated; verify against the tokenizer.)
    decoder_inputs = target_ids[:, :-1].to_tensor()
    decoder_labels = target_ids[:, 1:].to_tensor()

    return (source_ids, decoder_inputs), decoder_labels


def make_batches(ds):
    """Turn a dataset of raw sentence pairs into tokenized training batches.

    Applies, in order: shuffle (buffer of ``BUFFER_SIZE``), batch
    (``BATCH_SIZE`` elements), ``prepare_batch`` mapped with
    ``tf.data.AUTOTUNE``, and prefetch with ``tf.data.AUTOTUNE``.

    Args:
        ds: dataset whose elements are accepted by ``prepare_batch`` once batched.

    Returns:
        The transformed dataset.
    """
    shuffled = ds.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    # Tokenization runs on whole batches, not on individual sentences.
    tokenized = batched.map(prepare_batch, tf.data.AUTOTUNE)
    return tokenized.prefetch(buffer_size=tf.data.AUTOTUNE)

if __name__=="__main__":
    # Training script: load the pt->en dataset, load the pre-built tokenizers,
    # build the input pipelines, then construct, compile and fit the Transformer.
    # Every name bound here is module-level, so prepare_batch / make_batches
    # (and the Translator class defined later) read them as globals.

    ### CONFIGURATIONS
    MAX_TOKENS=128          # per-sentence token cap applied in prepare_batch
    BUFFER_SIZE = 20000     # shuffle buffer size used by make_batches
    # Model hyper-parameters, forwarded unchanged to Transformer(...) below.
    num_layers=4
    d_model=128
    dff=512
    num_heads=8
    dropout_rate=0.1
    BATCH_SIZE = 8          # batch size used by make_batches
    TRAINING_EPOCHS = 10    # number of epochs passed to fit()
    # NOTE(review): MODEL_SAVE_PATH is not read by any active line in this block.
    MODEL_SAVE_PATH = 'transformer_model'

    #### Downloading dataset
    examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en',
                                with_info=True,
                                as_supervised=True)
    train_examples, val_examples = examples['train'], examples['validation']
    # Print the first three sentence pairs as a sanity check.
    for pt_examples, en_examples in train_examples.batch(3).take(1):
        print('Examples in Portuguese')
        for pt in pt_examples.numpy():
            print(pt.decode('utf-8'))
        print()

        print('Examples in english')
        for en in en_examples.numpy():
            print(en.decode('utf-8'))

    #### Pre processing
    # Download and extract the tokenizer archive into the current directory,
    # then load it as a SavedModel.
    model_name = 'ted_hrlr_translate_pt_en_converter'
    tf.keras.utils.get_file(
                    f'{model_name}.zip',
                    f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',
                    cache_dir='.', cache_subdir='', extract=True
                )
    tokenizers = tf.saved_model.load(model_name)
    # `en_examples` is the variable left bound by the preview loop above,
    # i.e. the three English sentences that were just printed.
    encoded = tokenizers.en.tokenize(en_examples)

    print('> Token ID batch')
    for row in encoded.to_list():
        print(row)

    # NOTE(review): round_trip is computed but never printed or used below.
    round_trip = tokenizers.en.detokenize(encoded)
    print(f"English Vocab size {tokenizers.en.get_vocab_size()}")
    print(f"Portuguese Vocab size {tokenizers.pt.get_vocab_size()}")

    # Collect token counts per sentence. With batch(1).take(1) this inspects a
    # single example only, and `lengths` is not read again in this block.
    lengths = []
    for pt_examples, en_examples in train_examples.batch(1).take(1):
        pt_tokens = tokenizers.pt.tokenize(pt_examples)
        lengths.append(pt_tokens.row_lengths())

        en_tokens = tokenizers.en.tokenize(en_examples)
        lengths.append(en_tokens.row_lengths())
        print('.', end='', flush=True)

    # Shuffled, batched and tokenized pipelines for training and validation.
    train_batches = make_batches(train_examples)
    val_batches = make_batches(val_examples)

    # Vocabulary sizes come from the loaded tokenizers: Portuguese is the
    # input side, English the target side.
    transformer = Transformer(num_layers=num_layers, d_model=d_model,
                            dff=dff, num_heads=num_heads,
                            dropout_rate=dropout_rate,
                            input_vocab_size=tokenizers.pt.get_vocab_size(),
                            target_vocab_size=tokenizers.en.get_vocab_size())

    # Adam with the project's CustomScheduler as learning rate; loss and metric
    # are the masked variants imported from utils.
    optimizer = tf.keras.optimizers.Adam(learning_rate=CustomScheduler(d_model))
    transformer.compile(optimizer=optimizer,
                        loss=masked_loss,
                        metrics=[masked_accuracy])
    transformer.fit(train_batches, epochs=TRAINING_EPOCHS, validation_data=val_batches)
    # Saving is disabled. NOTE(review): the line below refers to
    # `model_save_path`, but the constant defined above is MODEL_SAVE_PATH.
    # transformer.save(model_save_path)

# Test

In [71]:
class Translator(tf.Module):
  """Greedy, token-by-token Portuguese-to-English translation with a trained model.

  Holds a tokenizer bundle (with `.pt` and `.en` members) and a transformer
  that is called as `transformer([encoder_input, decoder_input], training=False)`.
  """

  def __init__(self, tokenizers, transformer):
    """Store the tokenizer bundle and the model used by `__call__`."""
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):
    """Translate one Portuguese sentence.

    Args:
      sentence: a `tf.Tensor` holding the sentence; a scalar tensor is
        expanded to a batch of one.
      max_length: maximum number of decoding steps.

    Returns:
      A tuple `(text, tokens, attention_weights)`: the detokenized output,
      the looked-up tokens of the output, and the decoder's
      `last_attn_scores` from a final forward pass.

    Raises:
      AssertionError: if `sentence` is not a `tf.Tensor`.
    """
    # The input sentence is Portuguese; the Portuguese tokenizer handles it.
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence

    # As the output language is English, initialize the output with the
    # English `[START]` token. Tokenizing an empty string yields the two
    # marker ids that are used here as start and end.
    start_end = self.tokenizers.en.tokenize([''])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # `tf.TensorArray` is used here (instead of a Python list), so that the
    # dynamic loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.
      predicted_id = tf.argmax(predictions, axis=-1)
      # Append the `predicted_id` to the output which is given to the
      # decoder as its input on the next step.
      output_array = output_array.write(i+1, predicted_id[0])

      # Stop as soon as the model emits the end marker.
      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    # Use the tokenizers given to this instance rather than a global of the
    # same name, so the class works with whatever bundle was injected.
    text = self.tokenizers.en.detokenize(output)[0]  # Shape: `()`.
    tokens = self.tokenizers.en.lookup(output)[0]

    # Run the model once more on the final output (minus its last token) so
    # that the stored attention scores correspond to the complete translation.
    self.transformer([encoder_input, output[:,:-1]], training=False)
    # Prefer the correctly spelled `decoder` attribute; fall back to the
    # misspelled `deocder` for a model that exposes only that name.
    decoder = getattr(self.transformer, 'decoder', None)
    if decoder is None:
      decoder = self.transformer.deocder
    attention_weights = decoder.last_attn_scores

    return text, tokens, attention_weights

def print_translation(sentence, tokens, ground_truth):
  """Print the input, the model's prediction and the reference as aligned rows.

  Args:
    sentence: the source sentence, printed as given.
    tokens: the predicted text; must expose `.numpy()` returning UTF-8
      encoded bytes (e.g. a scalar string tensor).
    ground_truth: the reference translation, printed as given.
  """
  # Each label is padded to 15 characters and followed by one colon. The
  # label itself carries no colon, so all three rows look alike.
  print(f'{"Input":15s}: {sentence}')
  print(f'{"Prediction":15s}: {tokens.numpy().decode("utf-8")}')
  print(f'{"Ground truth":15s}: {ground_truth}')

In [66]:
# Wrap the loaded tokenizers and the trained model for inference.
# Requires `tokenizers` and `transformer` from the training cell to be defined.
translator = Translator(tokenizers, transformer)

In [68]:
# One Portuguese example together with its reference English translation.
sentence = 'este é um problema que temos que resolver.'
ground_truth = 'this is a problem we have to solve .'

# The translator asserts a tf.Tensor input, hence the tf.constant wrapper.
# `translated_tokens` and `attention_weights` are returned but not used in this cell.
translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

Input:         : este é um problema que temos que resolver.
Prediction     : and i ' m going to think about the world .
Ground truth   : this is a problem we have to solve .
