In [None]:
%pip install tensorflow_datasets
%pip install tensorflow-text

In [None]:
import tensorflow as tf
import tensorflow_text as text
import tensorflow_datasets as tfds
import numpy as np

In [3]:
# Load the TED pt->en translation dataset together with its metadata.
# `as_supervised=True` makes every element a 2-tuple of scalar strings.
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True,
                               as_supervised=True)
# Keep handles on the training and validation splits.
train_examples, val_examples = examples['train'], examples['validation']
# Display both datasets so their element specs show up as the cell output.
[train_examples, val_examples]

[<_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>,
 <_PrefetchDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>]

In [None]:
# Download the pre-built tokenizer archive into the current directory,
# extract it, and load it as a SavedModel. The loaded object is used
# throughout the notebook as `tokenizers.pt` and `tokenizers.en`.
model_name = "ted_hrlr_translate_pt_en_converter"
tf.keras.utils.get_file(
    f"{model_name}.zip",
    f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
    cache_dir='.', cache_subdir='', extract=True
)
# NOTE(review): this assumes the archive is extracted to `./{model_name}`;
# the extraction directory may differ between Keras versions - confirm.
tokenizers = tf.saved_model.load(model_name)

In [None]:
# Size of the shuffle buffer used by `make_batches`.
BUFFER_SIZE = 20000
# Number of sentence pairs per batch produced by `make_batches`.
BATCH_SIZE = 64

def tokenize_pairs(pt, en):
    """Tokenize a batch of sentence pairs into dense token-id tensors.

    Args:
        pt: Batch of sentences passed to `tokenizers.pt`.
        en: Batch of sentences passed to `tokenizers.en`.

    Returns:
        A `(pt_ids, en_ids)` tuple of dense tensors.
    """
    # Each tokenizer yields a ragged tensor; `to_tensor()` densifies it,
    # padding the shorter rows with zeros.
    pt_ids = tokenizers.pt.tokenize(pt).to_tensor()
    en_ids = tokenizers.en.tokenize(en).to_tensor()
    return pt_ids, en_ids

def make_batches(ds):
  """Turn a dataset of raw sentence pairs into tokenized training batches.

  The dataset is cached, shuffled with a `BUFFER_SIZE` buffer, grouped into
  batches of `BATCH_SIZE`, tokenized batch-wise with `tokenize_pairs`, and
  finally prefetched.

  Args:
    ds: Dataset whose elements `tokenize_pairs` can consume once batched.

  Returns:
    The resulting batched, tokenized and prefetched dataset.
  """
  shuffled = ds.cache().shuffle(BUFFER_SIZE)
  batched = shuffled.batch(BATCH_SIZE)
  # Tokenizing after batching lets the tokenizers work on whole batches.
  tokenized = batched.map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
  return tokenized.prefetch(tf.data.AUTOTUNE)


# Build the tokenized, batched input pipelines for both splits.
train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

$$\Large{PE_{(pos, 2i)} = \sin(pos / 10000^{2i / d_{model}})}$$

$$\Large{PE_{(pos, 2i+1)} = \cos(pos / 10000^{2i / d_{model}})}$$

In [None]:
def get_angles(position, i, dim_model):
    """Compute the positional-encoding angles ``pos / 10000^(2*(i//2) / dim_model)``.

    Args:
        position: Position index or array of position indices.
        i: Embedding-dimension index or array of indices; ``i // 2`` makes each
            even/odd pair of dimensions share the same frequency.
        dim_model: Width of the model embedding.

    Returns:
        The angles in radians, broadcast over `position` and `i`.
    """
    exponent = 2 * (i // 2) / np.float32(dim_model)
    # The base is 10000, matching the positional-encoding formula above.
    return position / np.power(10000, exponent)

def position_encoding(position, dim_model, get_angle=None):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions to encode (rows of the table).
        dim_model: Width of the model embedding (columns of the table).
        get_angle: Callable ``(positions, dims, dim_model) -> angles`` used to
            compute the raw angles. Defaults to `get_angles`, so callers may
            pass only `position` and `dim_model`.

    Returns:
        A float32 tensor of shape ``(1, position, dim_model)``.
    """
    if get_angle is None:
        # Fall back to the notebook's standard angle function.
        get_angle = get_angles

    angle_rads = get_angle(np.arange(position)[:, np.newaxis],
                           np.arange(dim_model)[np.newaxis, :],
                           dim_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    # Add a leading axis so the table broadcasts over the batch dimension.
    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
def create_padding_mask(seq):
  """Build an attention mask that flags the zero entries of `seq`.

  Args:
    seq: Batch of token ids, indexed as (batch, position).

  Returns:
    A float32 tensor holding 1.0 where `seq` equals 0 and 0.0 elsewhere,
    with two singleton axes inserted after the batch axis.
  """
  is_padding = tf.math.equal(seq, 0)
  padding_mask = tf.cast(is_padding, tf.float32)

  # The two extra axes let the mask be added to the attention logits.
  return padding_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

# Build the look-ahead (causal) mask for a sequence of length `size`.
# `tf.linalg.band_part(..., -1, 0)` keeps the lower triangle (diagonal included) of an
# all-ones matrix; subtracting it from 1 leaves ones strictly above the main diagonal.
# Entry (i, j) is therefore 1 exactly when j > i, i.e. for the positions that come after
# position i, which are the ones the attention must not look at.
def create_look_ahead_mask(size):
  """Return a (size, size) mask with ones strictly above the main diagonal.

  Args:
    size: Side length of the square mask.

  Returns:
    A float tensor that is 1 where the column index exceeds the row index
    and 0 on and below the diagonal.
  """
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

In [4]:
def scalar_dot_product_attention(query, key, value, mask):
    """Compute scaled dot-product attention.

    Args:
        query: Query tensor; matrix-multiplied with the transposed `key`.
        key: Key tensor; its last dimension sets the scaling factor.
        value: Value tensor, combined using the attention weights.
        mask: Optional tensor broadcastable to the logits; entries equal to 1
            push the matching logit towards -1e9. `None` disables masking.

    Returns:
        A tuple `(output, attention_weights)`.
    """
    depth = tf.cast(tf.shape(key)[-1], tf.float32)
    # Scale the raw scores by sqrt(depth of the keys).
    logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(depth)

    if mask is not None:
        # Masked positions get a large negative logit, hence ~0 after softmax.
        logits = logits + (mask * -1e9)

    attention_weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(attention_weights, value), attention_weights

In [8]:
class MultiHead_Attention(tf.keras.layers.Layer):
    """Multi-head attention built around a pluggable attention function.

    Queries, keys and values are each projected by a dense layer, split into
    `num_heads` heads of width `dim_model // num_heads`, passed to `attention`,
    merged back together and projected by a final dense layer.
    """

    def __init__(self, dim_model, num_heads, attention):
        """Create the projection layers.

        Args:
            dim_model: Width of the projections; must be divisible by `num_heads`.
            num_heads: Number of attention heads.
            attention: Callable `(q, k, v, mask) -> (output, weights)`.
        """
        super().__init__()
        # The model width has to split evenly across the heads.
        assert dim_model % num_heads == 0

        self.dim_model = dim_model
        self.num_heads = num_heads
        self.attention = attention
        self.depth = dim_model // num_heads

        self.weigths_query = tf.keras.layers.Dense(dim_model)
        self.weigths_key = tf.keras.layers.Dense(dim_model)
        self.weigths_value = tf.keras.layers.Dense(dim_model)

        self.dense = tf.keras.layers.Dense(dim_model)

    def split_heads(self, inputs, batch_size):
        """Reshape `inputs` to (batch_size, num_heads, seq_len, depth)."""
        per_head = tf.reshape(inputs, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, query, key, value, mask):
        """Apply multi-head attention.

        Returns:
            A tuple `(output, attention_weights)`, where `attention_weights`
            is whatever the configured attention function returns.
        """
        batch_size = tf.shape(query)[0]

        # Project each input, then split the projection into heads.
        q_heads = self.split_heads(self.weigths_query(query), batch_size)
        k_heads = self.split_heads(self.weigths_key(key), batch_size)
        v_heads = self.split_heads(self.weigths_value(value), batch_size)

        head_output, attention_weights = self.attention(q_heads, k_heads, v_heads, mask)

        # (batch_size, num_heads, seq_len_q, depth) -> (batch_size, seq_len_q, num_heads, depth)
        head_output = tf.transpose(head_output, perm=[0, 2, 1, 3])

        # Merge the heads back: (batch_size, seq_len_q, d_model)
        merged = tf.reshape(head_output, (batch_size, -1, self.dim_model))

        return self.dense(merged), attention_weights  # output: (batch_size, seq_len_q, d_model)

In [12]:
# Smoke test: run self-attention on a random input and inspect the shapes
# of the output and of the attention weights.
temp_mha = MultiHead_Attention(dim_model=512, num_heads=8, attention=scalar_dot_product_attention)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(value=y, key=y, query=y, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

In [None]:
def point_wise_feed_forward_network(dim_model, dense_units):
  """Build the two-layer position-wise feed-forward sub-network.

  Args:
    dim_model: Number of units of the second (output) dense layer.
    dense_units: Number of units of the first, ReLU-activated dense layer.

  Returns:
    A `tf.keras.Sequential` of the two dense layers.
  """
  hidden = tf.keras.layers.Dense(dense_units, activation='relu')  # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(dim_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([hidden, projection])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and its own
    layer normalization.
    """

    def __init__(self, dim_model, num_heads, dense_units, attention, drop_rate):
        """Create the sub-layers.

        Args:
            dim_model: Model width.
            num_heads: Number of attention heads.
            dense_units: Hidden width of the feed-forward network.
            attention: Attention callable handed to `MultiHead_Attention`.
            drop_rate: Dropout rate of both dropout layers.
        """
        super(EncoderLayer, self).__init__()
        self.dim_model = dim_model
        self.multihead_attention = MultiHead_Attention(dim_model, num_heads, attention)
        self.feed_forward = point_wise_feed_forward_network(dim_model, dense_units)

        self.layer_normalizer_one = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_normalizer_two = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout_one = tf.keras.layers.Dropout(drop_rate)
        self.dropout_two = tf.keras.layers.Dropout(drop_rate)

    def call(self, inputs, training, mask):
        """Run the block on `inputs`.

        Args:
            inputs: Input tensor, used as query, key and value.
            training: Whether dropout is active.
            mask: Mask forwarded to the attention function.

        Returns:
            The block output, with the same shape as `inputs`.
        """
        # The attention layer returns (output, weights); only the output is needed.
        attention_output, _ = self.multihead_attention(inputs, inputs, inputs, mask)
        attention_output = self.dropout_one(attention_output, training=training)
        output_one = self.layer_normalizer_one(inputs + attention_output)

        output_two = self.feed_forward(output_one)
        output_two = self.dropout_two(output_two, training=training)

        # The second sub-layer uses its own normalization layer.
        return self.layer_normalizer_two(output_two + output_one)

class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by dropout, a residual connection and its own
    layer normalization.
    """

    def __init__(self, dim_model, num_heads, dense_units, attention, drop_rate):
        """Create the sub-layers.

        Args:
            dim_model: Model width.
            num_heads: Number of attention heads.
            dense_units: Hidden width of the feed-forward network.
            attention: Attention callable handed to `MultiHead_Attention`.
            drop_rate: Dropout rate of the three dropout layers.
        """
        super(DecoderLayer, self).__init__()
        self.multihead_one = MultiHead_Attention(dim_model, num_heads, attention)
        self.multihead_two = MultiHead_Attention(dim_model, num_heads, attention)

        self.feed_forward = point_wise_feed_forward_network(dim_model, dense_units)

        self.layer_normalizer_one = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_normalizer_two = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_normalizer_three = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout_one = tf.keras.layers.Dropout(drop_rate)
        self.dropout_two = tf.keras.layers.Dropout(drop_rate)
        self.dropout_three = tf.keras.layers.Dropout(drop_rate)

    def call(self, inputs, training, encoder_output, look_ahead_mask, padding_mask):
        """Run the block.

        Args:
            inputs: Decoder-side input tensor.
            training: Whether dropout is active.
            encoder_output: Encoder output attended to by the second attention.
            look_ahead_mask: Mask for the self-attention.
            padding_mask: Mask for the cross-attention.

        Returns:
            A tuple `(output, self_attention_weights, cross_attention_weights)`.
        """
        # Masked self-attention over the decoder input.
        attn1, attn_w1 = self.multihead_one(inputs, inputs, inputs, look_ahead_mask)
        attn1 = self.dropout_one(attn1, training=training)
        output_one = self.layer_normalizer_one(attn1 + inputs)

        # Cross-attention: queries come from the decoder state, keys and
        # values from the encoder output (argument order is query, key, value).
        attn2, attn_w2 = self.multihead_two(output_one, encoder_output, encoder_output, padding_mask)
        attn2 = self.dropout_two(attn2, training=training)
        output_two = self.layer_normalizer_two(attn2 + output_one)

        output_three = self.feed_forward(output_two)
        output_three = self.dropout_three(output_three, training=training)

        # The residual connection wraps the feed-forward input, i.e. output_two.
        return self.layer_normalizer_three(output_three + output_two), attn_w1, attn_w2

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Token embedding plus positional encoding followed by `units` encoder layers."""

    def __init__(self, units, dim_model, num_heads, dense_units, vocab_size, maximum_position_encoding, rate=0.1):
        """Create the embedding, the positional table and the encoder layers.

        Args:
            units: Number of stacked `EncoderLayer` blocks.
            dim_model: Model width.
            num_heads: Number of attention heads per layer.
            dense_units: Hidden width of each feed-forward network.
            vocab_size: Size of the input vocabulary.
            maximum_position_encoding: Number of positions in the encoding table.
            rate: Dropout rate.
        """
        super(Encoder, self).__init__()
        self.dim_model = dim_model
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, self.dim_model)
        # Pass the angle function explicitly; `position_encoding` expects it.
        self.pos_encoding = position_encoding(maximum_position_encoding, self.dim_model, get_angles)
        self.enc_layers = [EncoderLayer(attention=scalar_dot_product_attention,
                                        dense_units=dense_units,
                                        dim_model=dim_model,
                                        drop_rate=rate,
                                        num_heads=num_heads) for _ in range(units)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training, mask):
        """Encode a batch of token ids.

        Args:
            inputs: Token ids, indexed as (batch, position).
            training: Whether dropout is active.
            mask: Padding mask forwarded to every encoder layer.

        Returns:
            The encoded sequence, (batch_size, input_seq_len, d_model).
        """
        seq_len = tf.shape(inputs)[1]
        embedded = self.embedding(inputs)
        # Scale the embeddings by sqrt(d_model); the cast is required because
        # tf.math.sqrt does not accept an integer input.
        embedded *= tf.math.sqrt(tf.cast(self.dim_model, tf.float32))
        embedded += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(embedded, training=training)

        for layer in self.enc_layers:
            x = layer(x, training=training, mask=mask)

        return x  # (batch_size, input_seq_len, d_model)

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Token embedding plus positional encoding followed by `units` decoder layers."""

    def __init__(self, units, dim_model, num_heads, dense_units, target_vocab_size, maximum_position_encoding, rate=0.1):
        """Create the embedding, the positional table and the decoder layers.

        Args:
            units: Number of stacked `DecoderLayer` blocks.
            dim_model: Model width.
            num_heads: Number of attention heads per layer.
            dense_units: Hidden width of each feed-forward network.
            target_vocab_size: Size of the target vocabulary.
            maximum_position_encoding: Number of positions in the encoding table.
            rate: Dropout rate.
        """
        super(Decoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, dim_model)
        self.dim_model = dim_model
        self.units = units

        # Pass the angle function explicitly; `position_encoding` expects it.
        self.pos_encoding = position_encoding(maximum_position_encoding, dim_model, get_angles)

        self.dec_layers = [DecoderLayer(dim_model=dim_model,
                                        attention=scalar_dot_product_attention,
                                        dense_units=dense_units,
                                        drop_rate=rate,
                                        num_heads=num_heads) for _ in range(units)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, inputs, enc_output, training, look_ahead_mask, padding_mask):
        """Decode a batch of target token ids against the encoder output.

        Args:
            inputs: Target token ids, indexed as (batch, position).
            enc_output: Output of the encoder.
            training: Whether dropout is active.
            look_ahead_mask: Mask for the decoder self-attention.
            padding_mask: Mask for the encoder-decoder attention.

        Returns:
            A tuple `(x, attention_weights)`; `attention_weights` maps
            'decoder_layer{n}_block1' / 'decoder_layer{n}_block2' to the
            attention weights of layer n.
        """
        seq_len = tf.shape(inputs)[1]
        attention_weights = {}
        x = self.embedding(inputs)
        # Scale the embeddings by sqrt(d_model); the cast is required because
        # tf.math.sqrt does not accept an integer input.
        x *= tf.math.sqrt(tf.cast(self.dim_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.units):
            # Keyword arguments keep this call independent of the parameter
            # order of DecoderLayer.call (inputs, training, encoder_output, ...).
            x, block1, block2 = self.dec_layers[i](x,
                                                   training=training,
                                                   encoder_output=enc_output,
                                                   look_ahead_mask=look_ahead_mask,
                                                   padding_mask=padding_mask)
            attention_weights[f'decoder_layer{i+1}_block1'] = block1
            attention_weights[f'decoder_layer{i+1}_block2'] = block2
        # x.shape == (batch_size, target_seq_len, d_model)
        return x, attention_weights

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer with a final projection onto the target vocabulary."""

    def __init__(self, units, dim_model, num_heads, dense_units, input_vocab_size,
                target_vocab_size, pe_input, pe_target, rate=0.1):
        """Create the encoder, the decoder and the output projection.

        Args:
            units: Number of layers in both the encoder and the decoder.
            dim_model: Model width.
            num_heads: Number of attention heads per layer.
            dense_units: Hidden width of the feed-forward networks.
            input_vocab_size: Size of the source vocabulary.
            target_vocab_size: Size of the target vocabulary.
            pe_input: Number of positions in the encoder's encoding table.
            pe_target: Number of positions in the decoder's encoding table.
            rate: Dropout rate.
        """
        super(Transformer, self).__init__()
        self.encoder = Encoder(units, dim_model, num_heads, dense_units, input_vocab_size, pe_input, rate)
        # Stored as `decoder`, the name `call` reads.
        self.decoder = Decoder(units, dim_model, num_heads, dense_units, target_vocab_size, pe_target, rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def create_masks(self, inp, tar):
        """Build the three attention masks for a source/target batch.

        Returns:
            A tuple `(enc_padding_mask, look_ahead_mask, dec_padding_mask)`.
        """
        # Encoder padding mask
        enc_padding_mask = create_padding_mask(inp)

        # Used in the 2nd attention block in the decoder.
        # This padding mask is used to mask the encoder outputs.
        dec_padding_mask = create_padding_mask(inp)

        # Used in the 1st attention block in the decoder.
        # It is used to pad and mask future tokens in the input received by
        # the decoder.
        look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
        dec_target_padding_mask = create_padding_mask(tar)
        # A position is masked when it is either padding or in the future.
        look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

        return enc_padding_mask, look_ahead_mask, dec_padding_mask

    def call(self, inputs, training):
        """Run the full model.

        Args:
            inputs: Pair `(inp, tar)` of source and target token-id batches.
            training: Whether dropout is active.

        Returns:
            A tuple `(final_output, attention_weights)`; `final_output` has
            shape (batch_size, tar_seq_len, target_vocab_size).
        """
        # Keras models prefer if you pass all your inputs in the first argument
        inp, tar = inputs
        enc_padding_mask, look_ahead_mask, dec_padding_mask = self.create_masks(inp, tar)
        # Only the source ids go through the encoder, not the (inp, tar) pair.
        enc_output = self.encoder(inp, training, enc_padding_mask)
        dec_output, attention_weights = self.decoder(tar,
                                                     enc_output,
                                                     training,
                                                     look_ahead_mask,
                                                     dec_padding_mask)
        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        return final_output, attention_weights

In [None]:
# Model hyperparameters, consumed below when the Transformer, the learning
# rate schedule and the optimizer are created.
num_layers = 4  # layer count handed to the Transformer
d_model = 128  # model width; also drives the learning-rate schedule
dff = 512  # feed-forward width handed to the Transformer
num_heads = 8  # attention heads per layer
dropout_rate = 0.1  # dropout rate handed to the Transformer

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule with warmup followed by inverse-sqrt decay.

  The rate is ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``.
  """

  def __init__(self, d_model, warmup_steps=4000):
    """Store the schedule parameters.

    Args:
      d_model: Model width; stored as a float32 tensor.
      warmup_steps: Step at which the two branches of the minimum meet.
    """
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    # The optimizer may pass an integer step counter; rsqrt needs a float.
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# Learning-rate schedule parameterized by the model width.
learning_rate = CustomSchedule(d_model)

# Adam optimizer driven by the schedule above.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [None]:
# Cross-entropy on raw logits. `reduction='none'` keeps the unreduced losses
# so that `loss_function` can apply its mask before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Average `loss_object` over the positions where `real` is non-zero.

  Args:
    real: Target token ids; entries equal to 0 are excluded from the loss.
    pred: Predictions passed to `loss_object` together with `real`.

  Returns:
    The sum of the unmasked losses divided by the number of unmasked positions.
  """
  per_token_loss = loss_object(real, pred)

  # 1 where `real` is non-zero and 0 elsewhere, in the dtype of the loss.
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)

  return tf.reduce_sum(per_token_loss * keep)/tf.reduce_sum(keep)


def accuracy_function(real, pred):
  """Compute the accuracy over the positions where `real` is non-zero.

  Args:
    real: Target token ids; entries equal to 0 are ignored.
    pred: Predictions whose argmax along axis 2 is compared with `real`.

  Returns:
    The number of correct unmasked predictions divided by the number of
    unmasked positions.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  not_padding = tf.math.not_equal(real, 0)

  # A position counts only when it is unmasked and predicted correctly.
  correct = tf.math.logical_and(not_padding, tf.equal(real, predicted_ids))

  hits = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
  total = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return hits/total

In [None]:
# Instantiate the model. The keyword names must match the parameters of
# `Transformer.__init__` (units, dim_model, dense_units, ...).
transformer = Transformer(
    units=num_layers,
    dim_model=d_model,
    num_heads=num_heads,
    dense_units=dff,
    input_vocab_size=tokenizers.pt.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.en.get_vocab_size().numpy(),
    pe_input=1000,
    pe_target=1000,
    rate=dropout_rate)

In [None]:
# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

# Running means of the loss and accuracy values fed in by `train_step`.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

# Both arguments of `train_step` are rank-2 int64 tensors whose two
# dimensions are left unspecified.
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]


@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimization step on a batch and update the running metrics.

  Args:
    inp: Source token ids.
    tar: Target token ids.
  """
  # The decoder is fed the target without its last token and is trained to
  # predict the target shifted left by one position.
  decoder_input, expected = tar[:, :-1], tar[:, 1:]

  with tf.GradientTape() as tape:
    predictions, _ = transformer([inp, decoder_input],
                                 training = True)
    loss = loss_function(expected, predictions)

  variables = transformer.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  train_loss(loss)
  train_accuracy(accuracy_function(expected, predictions))

# Number of full passes over the training batches.
EPOCHS = 20

for epoch in range(EPOCHS):
  # Start every epoch with fresh running means. `reset_state()` replaces the
  # deprecated `reset_states()`, which newer Keras releases no longer provide.
  train_loss.reset_state()
  train_accuracy.reset_state()

  # inp -> portuguese, tar -> english
  for (batch, (inp, tar)) in enumerate(train_batches):
    train_step(inp, tar)

    # Report the running metrics every 50 batches.
    if batch % 50 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

In [None]:
class Translator(tf.Module):
  """Greedy decoder that translates one sentence with a trained transformer."""

  def __init__(self, tokenizers, transformer):
    """Keep references to the tokenizers and the model.

    Args:
      tokenizers: Object exposing `pt` and `en` tokenizers.
      transformer: Model called as `transformer([encoder_input, output], training=False)`.
    """
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=20):
    """Translate `sentence`.

    Args:
      sentence: A `tf.Tensor` holding the sentence to translate; a scalar is
        expanded to a batch of one.
      max_length: Maximum number of decoding steps.

    Returns:
      A tuple `(text, tokens, attention_weights)`.
    """
    # input sentence is portuguese, hence adding the start and end token
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence

    # as the target is english, the first token to the transformer should be the
    # english start token.
    start_end = self.tokenizers.en.tokenize([''])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # `tf.TensorArray` is required here (instead of a python list) so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions, _ = self.transformer([encoder_input, output], training=False)

      # select the last token from the seq_len dimension
      predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

      predicted_id = tf.argmax(predictions, axis=-1)

      # concatenate the predicted_id to the output which is given to the decoder
      # as its input.
      output_array = output_array.write(i+1, predicted_id[0])

      # Stop as soon as the end token is produced.
      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # output.shape (1, tokens)
    # Use the tokenizers this instance was built with, not the notebook global.
    translated_text = self.tokenizers.en.detokenize(output)[0]  # shape: ()

    tokens = self.tokenizers.en.lookup(output)[0]

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop. So recalculate them outside
    # the loop.
    _, attention_weights = self.transformer([encoder_input, output[:,:-1]], training=False)

    return translated_text, tokens, attention_weights

In [None]:
# Wrap the tokenizers and the transformer in a Translator instance.
translator = Translator(tokenizers, transformer)