In [1]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
def positional_encoding(length, depth):
  """Returns a (length, depth) position encoding matrix."""
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]
  depths = np.arange(depth)[np.newaxis, :]/depth
  
  angle_rates = 1 / (10000**depths)   
  angle_rads = positions * angle_rates

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
  """Positional embedding layer."""
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
  """Cross-attention layer between context and target."""
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)
   
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

class GlobalSelfAttention(BaseAttention):
  """Self-attention layer with global attention mask."""
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

class CausalSelfAttention(BaseAttention):
  """Self-attention layer with causal attention mask."""
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

class FeedForward(tf.keras.layers.Layer):
  """Feed-forward layer."""
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x


class EncoderLayer(tf.keras.layers.Layer):
  """Encoder layer."""
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

class Encoder(tf.keras.layers.Layer):
  """Transformer encoder."""
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    x = self.pos_embedding(x)
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x

class DecoderLayer(tf.keras.layers.Layer):
  """Decoder layer."""
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)
    
    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  
    return x

class Decoder(tf.keras.layers.Layer):
  """Transformer decoder."""
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    x = self.pos_embedding(x)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores
    return x

class Transformer(tf.keras.Model):
  """Transformer model."""
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    context, x  = inputs

    context = self.encoder(context)

    x = self.decoder(x, context)

    logits = self.final_layer(x)

    return logits

### Let's Load Our Data

In [3]:
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en',
                               with_info=True,
                               as_supervised=True)

train_examples, val_examples = examples['train'], examples['validation']

Let's take a sample of our data

In [4]:
for pt_examples, en_examples in train_examples.batch(3).take(1):
  print('> Examples in Portuguese:')
  for pt in pt_examples.numpy():
    print(pt.decode('utf-8'))
  print()

  #English Translation
  print('> Examples in English:')
  for en in en_examples.numpy():
    print(en.decode('utf-8'))

> Examples in Portuguese:
e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
mas e se estes fatores fossem ativos ?
mas eles não tinham a curiosidade de me testar .

> Examples in English:
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
but what if it were active ?
but they did n't test for curiosity .


In [5]:
model_name = 'ted_hrlr_translate_pt_en_converter'
tf.keras.utils.get_file(
    f'{model_name}.zip',
    f'https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip',
    cache_dir='.', cache_subdir='', extract=True
)

'.\\ted_hrlr_translate_pt_en_converter.zip'

In [6]:
l = list(train_examples)
en = [x[1] for x in l]
pt = [x[0] for x in l]
print(en)



In [7]:
en = [x.numpy().decode("utf-8") for x in en]
pt = [x.numpy().decode("utf-8") for x in pt]

In [8]:
en

['and when you improve searchability , you actually take away the one advantage of print , which is serendipity .',
 'but what if it were active ?',
 "but they did n't test for curiosity .",
 'and this conscious defiance is why i , as an agnostic , can still have faith .',
 'you can use everything on the table on me .',
 "`` i write a lot about `` '' security theater , '' '' which are products that make people feel secure , but do n't actually do anything . ''",
 "and they 've put it deep down in an iron mine in minnesota , ok , deep under the ground , and in fact , in the last couple of days announced the most sensitive results so far .",
 'see , some people might fear girls not liking them back .',
 "no , what happened to us , chris , is that power , it 's priced off the margin .",
 'back to my question : why did i stay ?',
 "`` when i found out that there are a whole bunch of industries that did n't have copyright protection , i thought , `` '' what exactly is the underlying logic ?

In [9]:
oov = '<OOV>'

en_tokenizer, pt_tokenizer = Tokenizer(oov_token=oov, filters="!\"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n"), Tokenizer(oov_token=oov, filters="!\"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n")
en_tokenizer.fit_on_texts(['<START>', '<END>']+en)
pt_tokenizer.fit_on_texts(['<START>', '<END>']+pt)

In [10]:
def tokenize(x, tokenizer):
    sequences = tokenizer.texts_to_sequences(x)
    return [[[tokenizer.word_index["<start>"]] + seq + [tokenizer.word_index["<end>"]] for seq in sequences]][0]

def detokenize(x, tokenizer):
    for i in range(len(x)):
        while tokenizer.word_index["<start>"] in x[i]:
            x[i].remove(tokenizer.word_index["<start>"])
        while tokenizer.word_index["<end>"] in x[i]:
            x[i].remove(tokenizer.word_index["<end>"])
        
    return tokenizer.sequences_to_texts(x)

def pad(x, MAX_LEN=300):
    return tf.convert_to_tensor(tf.keras.preprocessing.sequence.pad_sequences(x, padding='post', truncating='post', maxlen=MAX_LEN))

In [11]:
MAX_TOKENS=128
MAX_LEN = 200
def prepare_batch(first, second):
    first = first[:, :MAX_TOKENS]
    #first = tf.convert_to_tensor(first)
    
    second = second[:, :(MAX_TOKENS+1)]
    second_inputs = second[:, :-1]  # Drop the [END] tokseconds
    second_labels = second[:, 1:]   # Drop the [START] tokseconds
    
    return (first, second_inputs), second_labels

BUFFER_SIZE = 20000
BATCH_SIZE = 64
def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

In [12]:
ten = pad(tokenize(en, en_tokenizer), MAX_LEN)
tpt = pad(tokenize(pt, pt_tokenizer), MAX_LEN)
ten

<tf.Tensor: shape=(51785, 200), dtype=int32, numpy=
array([[16169,     3,    53, ...,     0,     0,     0],
       [16169,    22,    24, ...,     0,     0,     0],
       [16169,    22,    18, ...,     0,     0,     0],
       ...,
       [16169,    10,     2, ...,     0,     0,     0],
       [16169,    46,     2, ...,     0,     0,     0],
       [16169,    34,   393, ...,     0,     0,     0]])>

In [13]:
pten= [(a,b) for a,b in zip(tpt, ten)]
def gen():
    for x in pten:
        yield x

In [14]:
dataset = tf.data.Dataset.from_generator(gen, output_signature=(
    tf.TensorSpec(shape=(200,), dtype=tf.int32),
    tf.TensorSpec(shape=(200,), dtype=tf.int32)
)).prefetch(buffer_size=tf.data.AUTOTUNE)
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(200,), dtype=tf.int32, name=None), TensorSpec(shape=(200,), dtype=tf.int32, name=None))>

In [15]:
ds = make_batches(dataset)

In [16]:
detokenize([list(list(ds.take(1))[0][1][0].numpy())], en_tokenizer)

["i think that 's what all this is pointing at <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV> <OOV>"]

In [17]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1
input_vocabs = len(pt_tokenizer.word_index)+1
output_vocabs = len(en_tokenizer.word_index)+1

transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=input_vocabs,
    target_vocab_size=output_vocabs,
    dropout_rate=dropout_rate)

In [18]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [19]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [20]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

In [21]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [None]:
transformer.fit(ds, epochs=20)

Epoch 1/20
Epoch 2/20
  5/810 [..............................] - ETA: 2:37:25 - loss: 5.7682 - masked_accuracy: 0.1673