Application of the *transformer tutorial* from https://www.tensorflow.org/text/tutorials/transformer with another dataset: french and english sentences.

In [1]:
import collections
import logging
import os
import pathlib
import re
import string
import sys
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

In [2]:
logging.getLogger('tensorflow').setLevel(logging.ERROR)  # suppress warnings

# Prepare dataset

In [3]:
examples= tfds.load('ted_multi_translate')
train_examples, val_examples = examples['train'], examples['validation']

In [6]:
type(train_examples)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [7]:
for item in train_examples.take(3):
    #print(item)
    print(item["translations"]["language"])
    print(len(tf.where(item["translations"]["language"]==b"fr").numpy()))
    print("************************************************")

tf.Tensor(
[b'ar' b'bg' b'de' b'en' b'es' b'fa' b'fr' b'gl' b'he' b'hr' b'hu' b'id'
 b'it' b'ja' b'ko' b'nl' b'pl' b'pt-br' b'ro' b'ru' b'th' b'tr' b'vi'
 b'zh-cn' b'zh-tw'], shape=(25,), dtype=string)
1
************************************************
tf.Tensor([b'en'], shape=(1,), dtype=string)
0
************************************************
tf.Tensor(
[b'ar' b'bg' b'cs' b'de' b'el' b'en' b'es' b'eu' b'fa' b'fr' b'he' b'hr'
 b'hu' b'it' b'ja' b'ko' b'nl' b'pl' b'pt-br' b'ro' b'ru' b'sk' b'sq'
 b'sr' b'sv' b'th' b'tr' b'uk' b'vi' b'zh-cn' b'zh-tw'], shape=(31,), dtype=string)
1
************************************************


In [8]:
import pdb
def get_sentence(df):
    if len(tf.where(df["translations"]["language"]==b"fr").numpy()):
        lang_idx_fr = tf.where(df["translations"]["language"]==b"fr").numpy()[0][0]
        if len(tf.where(df["translations"]["language"]==b"en").numpy()):
            lang_idx_en = tf.where(df["translations"]["language"]==b"en").numpy()[0][0]
            return df['translations']['translation'][lang_idx_fr], df['translations']['translation'][lang_idx_en]
        else:
            pass
    else:
        pass
    
    

In [9]:
train_set = {}
train_set["en"]=[]
train_set["fr"]=[]
for item in train_examples:
    get_sentence(item)
    if get_sentence(item) is not None:
        train_set["fr"].append(get_sentence(item)[0].numpy())
        train_set["en"].append(get_sentence(item)[1].numpy())

In [10]:
len(train_set["en"])

192304

In [11]:
train_fr = tf.data.Dataset.from_tensor_slices(train_set["fr"])
train_en = tf.data.Dataset.from_tensor_slices(train_set["en"])
train_set = tf.data.Dataset.zip((train_fr,train_en))

In [12]:
for item in train_set.take(3):
    print(item)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Et je ne sais pas si \xc3\xa7a vous arrive , mais quand je ferme les yeux parfois pour m&apos; endormir , Je n&apos; arr\xc3\xaate pas de penser \xc3\xa0 mes yeux .'>, <tf.Tensor: shape=(), dtype=string, numpy=b'And so , I don &apos;t know if you &apos;ve ever had this , but when I close my eyes sometimes and try to sleep , I can &apos;t stop thinking about my own eyes .'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Et ainsi donc , le temps \xc3\xa9tant compt\xc3\xa9 , Si je vous joue litt\xc3\xa9ralement juste les 2 premi\xc3\xa8res lignes . C&apos; est tr\xc3\xa8s simple .'>, <tf.Tensor: shape=(), dtype=string, numpy=b'And so therefore , because time is short , if I just play you literally the first maybe two lines or so . It &apos;s very straightforward .'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'M\xc3\xaame dans les pays du monde qui ont les meilleures ressources , cet \xc3\xa9cart d&apos; esp\xc3\xa9rance de vie est de 20 ans .'

# Create french, english tokenizers

In [13]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [14]:
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[START]", "[END]"]

bert_vocab_args = dict(
    # The target vocabulary size
    vocab_size = 8000,
    # Reserved tokens that must be included in the vocabulary
    reserved_tokens=reserved_tokens,
    # Arguments for `text.BertTokenizer`
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)


In [15]:
%%time
fr_vocab = bert_vocab.bert_vocab_from_dataset(
    train_fr.batch(1000).prefetch(2),
    **bert_vocab_args
)


CPU times: user 1min 37s, sys: 115 ms, total: 1min 37s
Wall time: 1min 35s


In [16]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)


CPU times: user 1min 18s, sys: 98.6 ms, total: 1min 18s
Wall time: 1min 16s


In [17]:
en_vocab

['[PAD]',
 '[START]',
 '[END]',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '=',
 '?',
 '@',
 '\\',
 '^',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '£',
 '¿',
 'æ',
 'ø',
 'τ',
 'ย',
 'ร',
 'อ',
 '–',
 '—',
 '’',
 '•',
 '∇',
 '♪',
 '♫',
 'the',
 'and',
 'to',
 'apos',
 'of',
 'that',
 'in',
 'it',
 'you',
 'we',
 'is',
 'quot',
 'this',
 'so',
 'they',
 'was',
 'for',
 '##s',
 'are',
 'have',
 'what',
 'but',
 'on',
 'with',
 'can',
 'there',
 'about',
 'be',
 'as',
 'at',
 'all',
 'not',
 'do',
 'one',
 'my',
 're',
 'people',
 'like',
 'from',
 'if',
 'now',
 'our',
 'just',
 'these',
 'an',
 'he',
 'or',
 'when',
 'very',
 'because',
 'out',
 'me',
 'by',
 'going',
 'how',
 'know',
 'up',
 'them',
 'more',
 'had',
 'see',
 'think',
 'were',
 'which',
 '

In [18]:
def write_vocab_file(filepath, vocab):
    with open(filepath, 'w') as f:
        for token in vocab:
            print(token, file=f)


In [19]:
write_vocab_file('fr_vocab.txt', fr_vocab)
write_vocab_file('en_vocab.txt', en_vocab)

In [20]:
fr_tokenizer = text.BertTokenizer('fr_vocab.txt', **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer('en_vocab.txt', **bert_tokenizer_params)


In [21]:
[item for item in dir(en_tokenizer) if not item.startswith('_')]

['detokenize',
 'name',
 'name_scope',
 'non_trainable_variables',
 'split',
 'split_with_offsets',
 'submodules',
 'tokenize',
 'tokenize_with_offsets',
 'trainable_variables',
 'variables',
 'with_name_scope']

In [22]:
for fr_examples, en_examples in train_set.batch(3).take(1):
    for fr in fr_examples.numpy():
        print(fr.decode('utf-8'))

    print()

    for en in en_examples.numpy():
        print(en.decode('utf-8'))
    
    encoded = en_tokenizer.tokenize(en_examples).merge_dims(-2,-1)
    for row in encoded.to_list():
        print(row)

Et je ne sais pas si ça vous arrive , mais quand je ferme les yeux parfois pour m&apos; endormir , Je n&apos; arrête pas de penser à mes yeux .
Et ainsi donc , le temps étant compté , Si je vous joue littéralement juste les 2 premières lignes . C&apos; est très simple .
Même dans les pays du monde qui ont les meilleures ressources , cet écart d&apos; espérance de vie est de 20 ans .

And so , I don &apos;t know if you &apos;ve ever had this , but when I close my eyes sometimes and try to sleep , I can &apos;t stop thinking about my own eyes .
And so therefore , because time is short , if I just play you literally the first maybe two lines or so . It &apos;s very straightforward .
Even in the best-resourced countries in the world , this life expectancy gap is as much as 20 years .
[77, 89, 13, 43, 156, 8, 79, 28, 54, 131, 115, 84, 8, 79, 28, 148, 317, 135, 88, 13, 97, 123, 43, 667, 110, 692, 498, 77, 330, 78, 1637, 13, 43, 100, 8, 79, 28, 54, 544, 337, 102, 110, 243, 692, 15]
[77, 89, 1

In [24]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
    count = ragged.bounding_shape()[0]
    starts = tf.fill([count,1], START)
    ends = tf.fill([count,1], END)
    return tf.concat([starts, ragged, ends], axis=1)


In [25]:
def cleanup_text(reserved_tokens, token_txt):
    # Drop the reserved tokens, except for "[UNK]".
    bad_tokens = [re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"]
    bad_token_re = "|".join(bad_tokens)

    bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
    result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

    # Join them into strings.
    result = tf.strings.reduce_join(result, separator=' ', axis=-1)

    return result

In [26]:
class CustomTokenizer(tf.Module):
    def __init__(self, reserved_tokens, vocab_path):
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        vocab = pathlib.Path(vocab_path).read_text().splitlines()
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:   

        # Include a tokenize signature for a batch of strings. 
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))

        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shapes [tokens] and [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
              tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
              tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2,-1)
        enc = add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        return tf.constant(self._reserved_tokens)


In [27]:
reserved_tokens

['[PAD]', '[START]', '[END]']

In [28]:
tokenizers = tf.Module()
tokenizers.fr = CustomTokenizer(reserved_tokens, 'fr_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'en_vocab.txt')


In [29]:
model_name = 'ted_hrlr_translate_fr_en_converter'
tf.saved_model.save(tokenizers, model_name)


In [30]:
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()


7836

In [31]:
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()


array([[   1, 3320, 2258,  694,  940, 2560,    2,    2]])

In [32]:
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'grass', b'risks', b'reality', b'sounds', b'##ill', b'[END]', b'[END]']]>

In [33]:
def tokenize_pairs(fr, en):
    fr = tokenizers.fr.tokenize(fr)
    print(fr)
    # Convert from ragged to dense, padding with zeros.
    fr = fr.to_tensor()
    print(fr)
    en = tokenizers.en.tokenize(en)
    # Convert from ragged to dense, padding with zeros.
    en = en.to_tensor()
    return fr, en

In [34]:
for fr_examples, en_examples in train_set.batch(3).take(1):
    tokenize_pairs(fr_examples, en_examples)


<tf.RaggedTensor [[1, 103, 117, 129, 363, 122, 138, 137, 114, 334, 12, 128, 159, 117, 1544, 107, 586, 526, 121, 46, 7, 101, 27, 112, 3258, 161, 2569, 161, 12, 117, 47, 7, 101, 27, 1195, 122, 102, 424, 34, 235, 586, 14, 2], [1, 103, 289, 152, 12, 106, 203, 854, 422, 12, 138, 117, 114, 1162, 1149, 225, 107, 18, 1222, 1592, 14, 36, 7, 101, 27, 105, 154, 406, 14, 2], [1, 158, 120, 107, 234, 125, 160, 118, 145, 107, 1504, 896, 12, 261, 4123, 37, 7, 101, 27, 3951, 102, 190, 105, 102, 432, 184, 14, 2]]>
tf.Tensor(
[[   1  103  117  129  363  122  138  137  114  334   12  128  159  117
  1544  107  586  526  121   46    7  101   27  112 3258  161 2569  161
    12  117   47    7  101   27 1195  122  102  424   34  235  586   14
     2]
 [   1  103  289  152   12  106  203  854  422   12  138  117  114 1162
  1149  225  107   18 1222 1592   14   36    7  101   27  105  154  406
    14    2    0    0    0    0    0    0    0    0    0    0    0    0
     0]
 [   1  158  120  107  234  125  160  1

In [35]:
BUFFER_SIZE = 192304
BATCH_SIZE = 64

In [36]:
def make_batches(ds):
    return (
      ds
      .cache()
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
      .prefetch(tf.data.AUTOTUNE))


train_batches = make_batches(train_set)

tf.RaggedTensor(values=Tensor("StatefulPartitionedCall:0", shape=(None,), dtype=int64), row_splits=Tensor("StatefulPartitionedCall:1", shape=(None,), dtype=int64))
Tensor("RaggedToTensor/RaggedTensorToTensor:0", shape=(None, None), dtype=int64)


In [37]:
for item in train_set.take(3):
    print(item)

(<tf.Tensor: shape=(), dtype=string, numpy=b'Et je ne sais pas si \xc3\xa7a vous arrive , mais quand je ferme les yeux parfois pour m&apos; endormir , Je n&apos; arr\xc3\xaate pas de penser \xc3\xa0 mes yeux .'>, <tf.Tensor: shape=(), dtype=string, numpy=b'And so , I don &apos;t know if you &apos;ve ever had this , but when I close my eyes sometimes and try to sleep , I can &apos;t stop thinking about my own eyes .'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'Et ainsi donc , le temps \xc3\xa9tant compt\xc3\xa9 , Si je vous joue litt\xc3\xa9ralement juste les 2 premi\xc3\xa8res lignes . C&apos; est tr\xc3\xa8s simple .'>, <tf.Tensor: shape=(), dtype=string, numpy=b'And so therefore , because time is short , if I just play you literally the first maybe two lines or so . It &apos;s very straightforward .'>)
(<tf.Tensor: shape=(), dtype=string, numpy=b'M\xc3\xaame dans les pays du monde qui ont les meilleures ressources , cet \xc3\xa9cart d&apos; esp\xc3\xa9rance de vie est de 20 ans .'

In [38]:
for (batch, (inp, tar)) in enumerate(train_batches.take(3)):
    print(inp)

tf.Tensor(
[[  1 103 202 ...   0   0   0]
 [  1 106 620 ...   0   0   0]
 [  1 112 457 ...   0   0   0]
 ...
 [  1 167 548 ...   0   0   0]
 [  1  36   7 ...   0   0   0]
 [  1 103 176 ...   0   0   0]], shape=(64, 98), dtype=int64)
tf.Tensor(
[[   1   36    7 ...    0    0    0]
 [   1  103  137 ...    0    0    0]
 [   1  667   18 ...    0    0    0]
 ...
 [   1  116   58 ...    0    0    0]
 [   1  103  116 ...    0    0    0]
 [   1  111 6141 ...    0    0    0]], shape=(64, 139), dtype=int64)
tf.Tensor(
[[   1  106 6762 ...    0    0    0]
 [   1  117 3625 ...    0    0    0]
 [   1  128  285 ...    0    0    0]
 ...
 [   1  132  116 ...    0    0    0]
 [   1  349  137 ...    0    0    0]
 [   1  156  128 ...    0    0    0]], shape=(64, 147), dtype=int64)


# Building transformer model

## Positional encoding

In [39]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

In [40]:
def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

## scaled_dot_product_attention

In [41]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [42]:
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
create_padding_mask(x)

<tf.Tensor: shape=(3, 1, 1, 5), dtype=float32, numpy=
array([[[[0., 0., 1., 1., 0.]]],


       [[[0., 0., 0., 1., 1.]]],


       [[[1., 1., 1., 0., 0.]]]], dtype=float32)>

In [43]:
x = tf.constant([[1.8, 2.2],[1.8, 2.2], [2.2, 1.8]], dtype=tf.float32)
tf.cast(tf.shape(x)[-1], tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=2.0>

In [44]:
softmax = tf.nn.softmax(x, axis=1)
softmax

<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[0.4013123 , 0.59868765],
       [0.4013123 , 0.59868765],
       [0.59868765, 0.4013123 ]], dtype=float32)>

In [45]:
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0) #keep only the entire lower triangle 
    return mask  # (seq_len, seq_len)

# https://www.geeksforgeeks.org/tensorflow-js-tf-linalg-bandpart-function/

In [86]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead)
    but it must be broadcastable for addition.

    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k). Defaults to None.

    Returns:
    output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        #print("mask", mask)
        scaled_attention_logits += (mask * -1e9)
        #print("scaled_attention_logits", scaled_attention_logits)
    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [48]:
def print_out(q, k, v):
    temp_out, temp_attn = scaled_dot_product_attention(
      q, k, v, None)
    print('Attention weights are:')
    print(temp_attn)
    print('Output is:')
    print(temp_out)

In [49]:
np.set_printoptions(suppress=True)

temp_k = tf.constant([[10, 0, 0],
                      [0, 10, 0],
                      [0, 0, 10],
                      [0, 0, 10]], dtype=tf.float32)  # (4, 3)

temp_v = tf.constant([[1, 0],
                      [10, 0],
                      [100, 5],
                      [1000, 6]], dtype=tf.float32)  # (4, 2)

# This `query` aligns with the second `key`,
# so the second `value` is returned.
temp_q = tf.constant([[0, 10, 0]], dtype=tf.float32)  # (1, 3)
print_out(temp_q, temp_k, temp_v)

Attention weights are:
tf.Tensor([[0. 1. 0. 0.]], shape=(1, 4), dtype=float32)
Output is:
tf.Tensor([[10.  0.]], shape=(1, 2), dtype=float32)


## Transformer

In [67]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3]) 
        #For each batch, we have self.num_heads attentions, and for each attention a subspace of representation 

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)

        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(scaled_attention,
                                      (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

        return output, attention_weights

In [50]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    ])

In [51]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)

        ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)

        return out2

In [52]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        self.mha_1 = MultiHeadAttention(d_model, num_heads)
        self.mha_2 = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)
        self.encoder = EncoderLayer(d_model, num_heads, dff, rate)
        
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
        
    def call(self,x, training, mask_1, mask_enc):
        output_mha_1 = self.mha_1(x, x,x, mask_1)
        output_mha_1 = self.dropout1(output_mha_1, training=training)
        output_norm_1 = self.layernorm1(x + output_mha_1) 
        
        output_encoder = self.encoder(x, training, mask_enc)
        
        output_mha_2 = self.mha_1(output_encoder, output_encoder,output_norm_1, mask_2)
        output_mha_2 = self.dropout1(output_mha_2, training=training)
        output_norm_2 = self.layernorm1(output_mha_2 + output_norm_1) 
        

        
        ffn_output = self.ffn(output_norm_2)  # (batch_size, input_seq_len, d_model)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(output_norm_2 + ffn_output)  # (batch_size, input_seq_len, d_model)
        
        return out2

In [53]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
        # enc_output.shape == (batch_size, input_seq_len, d_model)

        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(
            enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)

        ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

In [87]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding,
                                                self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):

        seq_len = tf.shape(x)[1]

        # adding embedding and position encoding.
        x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        #    print(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        #print(tf.math.sqrt(tf.cast(self.d_model, tf.float32)))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x  # (batch_size, input_seq_len, d_model)

In [55]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):

        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                                 look_ahead_mask, padding_mask)

            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2

        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [91]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               target_vocab_size, pe_input, pe_target, rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                                 input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                               target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training):
        # Keras models prefer if you pass all your inputs in the first argument
        inp, tar = inputs

        enc_padding_mask, look_ahead_mask, dec_padding_mask = self.create_masks(inp, tar)

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights

    def create_masks(self, inp, tar):
        # Encoder padding mask
        enc_padding_mask = create_padding_mask(inp)

        # Used in the 2nd attention block in the decoder.
        # This padding mask is used to mask the encoder outputs.
        dec_padding_mask = create_padding_mask(inp)

        # Used in the 1st attention block in the decoder.
        # It is used to pad and mask future tokens in the input received by
        # the decoder.
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
        #print(dec_padding_mask)
        return enc_padding_mask, look_ahead_mask, dec_padding_mask

# Training

In [92]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [93]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [94]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [95]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [96]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)


def accuracy_function(real, pred):
    accuracies = tf.equal(real, tf.argmax(pred, axis=2))

    mask = tf.math.logical_not(tf.math.equal(real, 0))
    accuracies = tf.math.logical_and(mask, accuracies)

    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(accuracies)/tf.reduce_sum(mask)

In [98]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [99]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.fr.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.en.get_vocab_size().numpy(),
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate)

In [100]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [102]:
EPOCHS = 20

In [104]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]


@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    with tf.GradientTape() as tape:
        predictions, _ = transformer([inp, tar_inp],
                                     training = True)
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(accuracy_function(tar_real, predictions))

In [105]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

  # inp -> portuguese, tar -> english
    for (batch, (inp, tar)) in enumerate(train_batches):
        train_step(inp, tar)

        if batch % 50 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')

Epoch 1 Batch 0 Loss 1.3823 Accuracy 0.7062
Epoch 1 Batch 50 Loss 1.4987 Accuracy 0.6864
Epoch 1 Batch 100 Loss 1.4989 Accuracy 0.6850
Epoch 1 Batch 150 Loss 1.4974 Accuracy 0.6857
Epoch 1 Batch 200 Loss 1.4970 Accuracy 0.6854
Epoch 1 Batch 250 Loss 1.4966 Accuracy 0.6854
Epoch 1 Batch 300 Loss 1.4984 Accuracy 0.6855
Epoch 1 Batch 350 Loss 1.5004 Accuracy 0.6850
Epoch 1 Batch 400 Loss 1.5032 Accuracy 0.6848
Epoch 1 Batch 450 Loss 1.5021 Accuracy 0.6848
Epoch 1 Batch 500 Loss 1.5034 Accuracy 0.6848
Epoch 1 Batch 550 Loss 1.5072 Accuracy 0.6841
Epoch 1 Batch 600 Loss 1.5076 Accuracy 0.6841
Epoch 1 Batch 650 Loss 1.5089 Accuracy 0.6839
Epoch 1 Batch 700 Loss 1.5106 Accuracy 0.6837
Epoch 1 Batch 750 Loss 1.5112 Accuracy 0.6837
Epoch 1 Batch 800 Loss 1.5137 Accuracy 0.6833
Epoch 1 Batch 850 Loss 1.5146 Accuracy 0.6830
Epoch 1 Batch 900 Loss 1.5155 Accuracy 0.6828
Epoch 1 Batch 950 Loss 1.5161 Accuracy 0.6826
Epoch 1 Batch 1000 Loss 1.5166 Accuracy 0.6825
Epoch 1 Batch 1050 Loss 1.5169 Accur

Epoch 3 Batch 2550 Loss 1.5193 Accuracy 0.6828
Epoch 3 Batch 2600 Loss 1.5196 Accuracy 0.6828
Epoch 3 Batch 2650 Loss 1.5204 Accuracy 0.6827
Epoch 3 Batch 2700 Loss 1.5207 Accuracy 0.6826
Epoch 3 Batch 2750 Loss 1.5213 Accuracy 0.6826
Epoch 3 Batch 2800 Loss 1.5221 Accuracy 0.6825
Epoch 3 Batch 2850 Loss 1.5226 Accuracy 0.6824
Epoch 3 Batch 2900 Loss 1.5229 Accuracy 0.6824
Epoch 3 Batch 2950 Loss 1.5235 Accuracy 0.6823
Epoch 3 Batch 3000 Loss 1.5236 Accuracy 0.6823
Epoch 3 Loss 1.5238 Accuracy 0.6823
Time taken for 1 epoch: 237.70 secs

Epoch 4 Batch 0 Loss 1.5437 Accuracy 0.6754
Epoch 4 Batch 50 Loss 1.4609 Accuracy 0.6908
Epoch 4 Batch 100 Loss 1.4715 Accuracy 0.6889
Epoch 4 Batch 150 Loss 1.4663 Accuracy 0.6900
Epoch 4 Batch 200 Loss 1.4668 Accuracy 0.6899
Epoch 4 Batch 250 Loss 1.4695 Accuracy 0.6891
Epoch 4 Batch 300 Loss 1.4692 Accuracy 0.6893
Epoch 4 Batch 350 Loss 1.4735 Accuracy 0.6889
Epoch 4 Batch 400 Loss 1.4741 Accuracy 0.6888
Epoch 4 Batch 450 Loss 1.4743 Accuracy 0.6888


Epoch 6 Batch 1900 Loss 1.4906 Accuracy 0.6871
Epoch 6 Batch 1950 Loss 1.4909 Accuracy 0.6871
Epoch 6 Batch 2000 Loss 1.4910 Accuracy 0.6870
Epoch 6 Batch 2050 Loss 1.4919 Accuracy 0.6869
Epoch 6 Batch 2100 Loss 1.4919 Accuracy 0.6870
Epoch 6 Batch 2150 Loss 1.4926 Accuracy 0.6868
Epoch 6 Batch 2200 Loss 1.4930 Accuracy 0.6867
Epoch 6 Batch 2250 Loss 1.4936 Accuracy 0.6867
Epoch 6 Batch 2300 Loss 1.4938 Accuracy 0.6866
Epoch 6 Batch 2350 Loss 1.4948 Accuracy 0.6865
Epoch 6 Batch 2400 Loss 1.4955 Accuracy 0.6864
Epoch 6 Batch 2450 Loss 1.4962 Accuracy 0.6863
Epoch 6 Batch 2500 Loss 1.4969 Accuracy 0.6862
Epoch 6 Batch 2550 Loss 1.4977 Accuracy 0.6861
Epoch 6 Batch 2600 Loss 1.4983 Accuracy 0.6860
Epoch 6 Batch 2650 Loss 1.4984 Accuracy 0.6861
Epoch 6 Batch 2700 Loss 1.4988 Accuracy 0.6861
Epoch 6 Batch 2750 Loss 1.4989 Accuracy 0.6861
Epoch 6 Batch 2800 Loss 1.4991 Accuracy 0.6861
Epoch 6 Batch 2850 Loss 1.4995 Accuracy 0.6860
Epoch 6 Batch 2900 Loss 1.5000 Accuracy 0.6859
Epoch 6 Batch

Epoch 9 Batch 1350 Loss 1.4660 Accuracy 0.6909
Epoch 9 Batch 1400 Loss 1.4670 Accuracy 0.6907
Epoch 9 Batch 1450 Loss 1.4668 Accuracy 0.6908
Epoch 9 Batch 1500 Loss 1.4677 Accuracy 0.6907
Epoch 9 Batch 1550 Loss 1.4679 Accuracy 0.6907
Epoch 9 Batch 1600 Loss 1.4689 Accuracy 0.6905
Epoch 9 Batch 1650 Loss 1.4699 Accuracy 0.6903
Epoch 9 Batch 1700 Loss 1.4700 Accuracy 0.6903
Epoch 9 Batch 1750 Loss 1.4704 Accuracy 0.6902
Epoch 9 Batch 1800 Loss 1.4715 Accuracy 0.6901
Epoch 9 Batch 1850 Loss 1.4719 Accuracy 0.6900
Epoch 9 Batch 1900 Loss 1.4733 Accuracy 0.6898
Epoch 9 Batch 1950 Loss 1.4739 Accuracy 0.6897
Epoch 9 Batch 2000 Loss 1.4744 Accuracy 0.6897
Epoch 9 Batch 2050 Loss 1.4745 Accuracy 0.6896
Epoch 9 Batch 2100 Loss 1.4749 Accuracy 0.6896
Epoch 9 Batch 2150 Loss 1.4752 Accuracy 0.6896
Epoch 9 Batch 2200 Loss 1.4757 Accuracy 0.6895
Epoch 9 Batch 2250 Loss 1.4761 Accuracy 0.6895
Epoch 9 Batch 2300 Loss 1.4762 Accuracy 0.6895
Epoch 9 Batch 2350 Loss 1.4764 Accuracy 0.6894
Epoch 9 Batch

Epoch 12 Batch 550 Loss 1.4292 Accuracy 0.6960
Epoch 12 Batch 600 Loss 1.4312 Accuracy 0.6956
Epoch 12 Batch 650 Loss 1.4306 Accuracy 0.6957
Epoch 12 Batch 700 Loss 1.4321 Accuracy 0.6956
Epoch 12 Batch 750 Loss 1.4334 Accuracy 0.6955
Epoch 12 Batch 800 Loss 1.4349 Accuracy 0.6954
Epoch 12 Batch 850 Loss 1.4368 Accuracy 0.6951
Epoch 12 Batch 900 Loss 1.4368 Accuracy 0.6952
Epoch 12 Batch 950 Loss 1.4383 Accuracy 0.6950
Epoch 12 Batch 1000 Loss 1.4392 Accuracy 0.6949
Epoch 12 Batch 1050 Loss 1.4400 Accuracy 0.6949
Epoch 12 Batch 1100 Loss 1.4406 Accuracy 0.6949
Epoch 12 Batch 1150 Loss 1.4417 Accuracy 0.6947
Epoch 12 Batch 1200 Loss 1.4418 Accuracy 0.6947
Epoch 12 Batch 1250 Loss 1.4425 Accuracy 0.6946
Epoch 12 Batch 1300 Loss 1.4434 Accuracy 0.6945
Epoch 12 Batch 1350 Loss 1.4439 Accuracy 0.6944
Epoch 12 Batch 1400 Loss 1.4450 Accuracy 0.6943
Epoch 12 Batch 1450 Loss 1.4456 Accuracy 0.6942
Epoch 12 Batch 1500 Loss 1.4463 Accuracy 0.6941
Epoch 12 Batch 1550 Loss 1.4473 Accuracy 0.6939
E

Epoch 14 Batch 2900 Loss 1.4555 Accuracy 0.6931
Epoch 14 Batch 2950 Loss 1.4559 Accuracy 0.6930
Epoch 14 Batch 3000 Loss 1.4556 Accuracy 0.6931
Epoch 14 Loss 1.4556 Accuracy 0.6931
Time taken for 1 epoch: 242.06 secs

Epoch 15 Batch 0 Loss 1.3721 Accuracy 0.7129
Epoch 15 Batch 50 Loss 1.3996 Accuracy 0.7012
Epoch 15 Batch 100 Loss 1.3995 Accuracy 0.7006
Epoch 15 Batch 150 Loss 1.4064 Accuracy 0.6991
Epoch 15 Batch 200 Loss 1.4034 Accuracy 0.7000
Epoch 15 Batch 250 Loss 1.4038 Accuracy 0.7002
Epoch 15 Batch 300 Loss 1.4075 Accuracy 0.6996
Epoch 15 Batch 350 Loss 1.4094 Accuracy 0.6994
Epoch 15 Batch 400 Loss 1.4105 Accuracy 0.6992
Epoch 15 Batch 450 Loss 1.4100 Accuracy 0.6997
Epoch 15 Batch 500 Loss 1.4105 Accuracy 0.6995
Epoch 15 Batch 550 Loss 1.4101 Accuracy 0.6996
Epoch 15 Batch 600 Loss 1.4115 Accuracy 0.6995
Epoch 15 Batch 650 Loss 1.4138 Accuracy 0.6991
Epoch 15 Batch 700 Loss 1.4173 Accuracy 0.6986
Epoch 15 Batch 750 Loss 1.4185 Accuracy 0.6984
Epoch 15 Batch 800 Loss 1.4188 Ac

Epoch 17 Batch 2100 Loss 1.4351 Accuracy 0.6958
Epoch 17 Batch 2150 Loss 1.4353 Accuracy 0.6958
Epoch 17 Batch 2200 Loss 1.4353 Accuracy 0.6958
Epoch 17 Batch 2250 Loss 1.4359 Accuracy 0.6957
Epoch 17 Batch 2300 Loss 1.4363 Accuracy 0.6957
Epoch 17 Batch 2350 Loss 1.4364 Accuracy 0.6957
Epoch 17 Batch 2400 Loss 1.4368 Accuracy 0.6956
Epoch 17 Batch 2450 Loss 1.4371 Accuracy 0.6956
Epoch 17 Batch 2500 Loss 1.4373 Accuracy 0.6956
Epoch 17 Batch 2550 Loss 1.4378 Accuracy 0.6955
Epoch 17 Batch 2600 Loss 1.4382 Accuracy 0.6954
Epoch 17 Batch 2650 Loss 1.4387 Accuracy 0.6954
Epoch 17 Batch 2700 Loss 1.4392 Accuracy 0.6953
Epoch 17 Batch 2750 Loss 1.4398 Accuracy 0.6953
Epoch 17 Batch 2800 Loss 1.4399 Accuracy 0.6953
Epoch 17 Batch 2850 Loss 1.4402 Accuracy 0.6952
Epoch 17 Batch 2900 Loss 1.4404 Accuracy 0.6952
Epoch 17 Batch 2950 Loss 1.4408 Accuracy 0.6952
Epoch 17 Batch 3000 Loss 1.4413 Accuracy 0.6951
Epoch 17 Loss 1.4414 Accuracy 0.6951
Time taken for 1 epoch: 242.75 secs

Epoch 18 Batch

Epoch 20 Batch 1350 Loss 1.4153 Accuracy 0.6989
Epoch 20 Batch 1400 Loss 1.4155 Accuracy 0.6989
Epoch 20 Batch 1450 Loss 1.4158 Accuracy 0.6989
Epoch 20 Batch 1500 Loss 1.4158 Accuracy 0.6989
Epoch 20 Batch 1550 Loss 1.4162 Accuracy 0.6988
Epoch 20 Batch 1600 Loss 1.4175 Accuracy 0.6986
Epoch 20 Batch 1650 Loss 1.4173 Accuracy 0.6987
Epoch 20 Batch 1700 Loss 1.4182 Accuracy 0.6986
Epoch 20 Batch 1750 Loss 1.4188 Accuracy 0.6986
Epoch 20 Batch 1800 Loss 1.4197 Accuracy 0.6984
Epoch 20 Batch 1850 Loss 1.4204 Accuracy 0.6983
Epoch 20 Batch 1900 Loss 1.4211 Accuracy 0.6982
Epoch 20 Batch 1950 Loss 1.4206 Accuracy 0.6983
Epoch 20 Batch 2000 Loss 1.4211 Accuracy 0.6983
Epoch 20 Batch 2050 Loss 1.4218 Accuracy 0.6982
Epoch 20 Batch 2100 Loss 1.4224 Accuracy 0.6981
Epoch 20 Batch 2150 Loss 1.4225 Accuracy 0.6981
Epoch 20 Batch 2200 Loss 1.4226 Accuracy 0.6980
Epoch 20 Batch 2250 Loss 1.4237 Accuracy 0.6979
Epoch 20 Batch 2300 Loss 1.4239 Accuracy 0.6980
Epoch 20 Batch 2350 Loss 1.4240 Accuracy

# Inference

In [109]:
class Translator(tf.Module):
    def __init__(self, tokenizers, transformer):
        self.tokenizers = tokenizers
        self.transformer = transformer

    def __call__(self, sentence, max_length=20):
        # input sentence is french, hence adding the start and end token
        assert isinstance(sentence, tf.Tensor)
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]

        sentence = self.tokenizers.fr.tokenize(sentence).to_tensor()

        encoder_input = sentence

        # as the target is english, the first token to the transformer should be the
        # english start token.
        start_end = self.tokenizers.en.tokenize([''])[0]
        start = start_end[0][tf.newaxis]
        end = start_end[1][tf.newaxis]

        # `tf.TensorArray` is required here (instead of a python list) so that the
        # dynamic-loop can be traced by `tf.function`.
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)

        for i in tf.range(max_length):
            output = tf.transpose(output_array.stack())
            predictions, _ = self.transformer([encoder_input, output], training=False)

            # select the last token from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.argmax(predictions, axis=-1)

            # concatentate the predicted_id to the output which is given to the decoder
            # as its input.
            output_array = output_array.write(i+1, predicted_id[0])

            if predicted_id == end:
                break

        output = tf.transpose(output_array.stack())
        # output.shape (1, tokens)
        text = tokenizers.en.detokenize(output)[0]  # shape: ()

        tokens = tokenizers.en.lookup(output)[0]

        # `tf.function` prevents us from using the attention_weights that were
        # calculated on the last iteration of the loop. So recalculate them outside
        # the loop.
        _, attention_weights = self.transformer([encoder_input, output[:,:-1]], training=False)

        return text, tokens, attention_weights

In [110]:
translator = Translator(tokenizers, transformer)

In [111]:
def print_translation(sentence, tokens, ground_truth):
    print(f'{"Input:":15s}: {sentence}')
    print(f'{"Prediction":15s}: {tokens.numpy().decode("utf-8")}')
    print(f'{"Ground truth":15s}: {ground_truth}')

In [115]:
sentence = "ceci est un problème que nous devons résoudre."
ground_truth = "this is a problem we have to solve ."

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

Input:         : ceci est un problème que nous devons résoudre.
Prediction     : this is a problem we have to solve .
Ground truth   : this is a problem we have to solve .
