In [None]:
import tensorflow.compat.v2 as tf 
import tensorflow_datasets as tfds

## 0. Dataset

In [None]:
# Load the training split as (Portuguese, English) sentence pairs and show
# the first pair as decoded text.
pt2en_train = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    split='train',
    as_supervised=True,
)
for pt, en in pt2en_train.take(1):
    for sentence in (pt, en):
        print(sentence.numpy().decode('utf-8'))


# e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
# and when you improve searchability , you actually take away the one advantage of print , which is serendipity .

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .


In [None]:
# Prints the iterator object itself (its repr), not the examples it yields.
numpy_examples = pt2en_train.as_numpy_iterator()
print(numpy_examples)

<tensorflow.python.data.ops.dataset_ops._NumpyIterator object at 0x7f3ea8edce50>


In [None]:
class Dataset():
    """Loads and preps the TED pt->en dataset for machine translation."""

    def __init__(self):
        """
        Creates the instance attributes:
            data_train - the ted_hrlr_translate/pt_to_en tf.data.Dataset
                train split, loaded as_supervised
            data_valid - the ted_hrlr_translate/pt_to_en tf.data.Dataset
                validation split, loaded as_supervised
            tokenizer_pt - Portuguese tokenizer created from the training set
            tokenizer_en - English tokenizer created from the training set
        """
        self.data_train = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='train', as_supervised=True)
        self.data_valid = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='validation', as_supervised=True)
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            self.data_train)

    def tokenize_dataset(self, data):
        """
        Creates sub-word tokenizers for the dataset.

        data is a tf.data.Dataset whose examples are formatted as a
        tuple (pt, en):
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence

        The target vocab size is 2**15.

        Returns: tokenizer_pt, tokenizer_en
            tokenizer_pt is the Portuguese tokenizer
            tokenizer_en is the English tokenizer
        """
        build = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
        # Iterate over every example in `data`: building the vocabulary from
        # only a handful of sentences leaves the encoder without real
        # sub-words, so it falls back to byte-level ids.
        en_tok = build((en.numpy() for _, en in data),
                       target_vocab_size=2**15)
        pt_tok = build((pt.numpy() for pt, _ in data),
                       target_vocab_size=2**15)
        return pt_tok, en_tok

In [None]:
# Build the dataset, then show the first decoded sentence pair of each split
# and the types of the two tokenizers.
data = Dataset()
for split in (data.data_train, data.data_valid):
    for pt, en in split.take(1):
        for sentence in (pt, en):
            print(sentence.numpy().decode('utf-8'))
for tokenizer in (data.tokenizer_pt, data.tokenizer_en):
    print(type(tokenizer))


# e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
# and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
# tinham comido peixe com batatas fritas ?
# did they eat fish and chips ?
# <class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>
# <class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>

e quando melhoramos a procura , tiramos a única vantagem da impressão , que é a serendipidade .
and when you improve searchability , you actually take away the one advantage of print , which is serendipity .
tinham comido peixe com batatas fritas ?
did they eat fish and chips ?
<class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>
<class 'tensorflow_datasets.core.deprecated.text.subword_text_encoder.SubwordTextEncoder'>


## 1. Encode Tokens

In [None]:
class Dataset():
    """Loads and preps the TED pt->en dataset for machine translation."""

    def __init__(self):
        """
        Creates the instance attributes:
            data_train - the ted_hrlr_translate/pt_to_en tf.data.Dataset
                train split, loaded as_supervised
            data_valid - the ted_hrlr_translate/pt_to_en tf.data.Dataset
                validation split, loaded as_supervised
            tokenizer_pt - Portuguese tokenizer created from the training set
            tokenizer_en - English tokenizer created from the training set
        """
        self.data_train = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='train', as_supervised=True)
        self.data_valid = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='validation', as_supervised=True)
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            self.data_train)

    def tokenize_dataset(self, data):
        """
        Creates sub-word tokenizers for the dataset.

        data is a tf.data.Dataset whose examples are formatted as a
        tuple (pt, en):
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence

        The target vocab size is 2**15.

        Returns: tokenizer_pt, tokenizer_en
            tokenizer_pt is the Portuguese tokenizer
            tokenizer_en is the English tokenizer
        """
        build = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
        # Iterate over every example in `data`: building the vocabulary from
        # only a handful of sentences leaves the encoder without real
        # sub-words, so it falls back to byte-level ids.
        en_tok = build((en.numpy() for _, en in data),
                       target_vocab_size=2**15)
        pt_tok = build((pt.numpy() for pt, _ in data),
                       target_vocab_size=2**15)
        return pt_tok, en_tok

    def encode(self, pt, en):
        """
        Encodes a translation pair into tokens.

        pt is the tf.Tensor containing the Portuguese sentence
        en is the tf.Tensor containing the corresponding English sentence

        Each tokenized sentence is wrapped in start and end of sentence
        tokens, using that language's own tokenizer:
            the start token is indexed as vocab_size
            the end token is indexed as vocab_size + 1

        Returns: pt_tokens, en_tokens
            pt_tokens is the list of Portuguese token ids
            en_tokens is the list of English token ids
        """
        pt_start = self.tokenizer_pt.vocab_size
        en_start = self.tokenizer_en.vocab_size
        pt_tokens = ([pt_start]
                     + list(self.tokenizer_pt.encode(pt.numpy()))
                     + [pt_start + 1])
        en_tokens = ([en_start]
                     + list(self.tokenizer_en.encode(en.numpy()))
                     + [en_start + 1])
        # Portuguese first, English second, matching the (pt, en) inputs.
        return pt_tokens, en_tokens


In [None]:
# Encode the first sentence pair of each split and print the token lists.
data = Dataset()
for split in (data.data_train, data.data_valid):
    for pt, en in split.take(1):
        print(data.encode(pt, en))

# ([30138, 6, 36, 17925, 13, 3, 3037, 1, 4880, 3, 387, 2832, 18, 18444, 1, 5, 8, 3, 16679, 19460, 739, 2, 30139], 
# [28543, 4, 56, 15, 1266, 20397, 10721, 1, 15, 100, 125, 352, 3, 45, 3066, 6, 8004, 1, 88, 13, 14859, 2, 28544])
# ([30138, 289, 15409, 2591, 19, 20318, 26024, 29997, 28, 30139], [28543, 93, 25, 907, 1366, 4, 5742, 33, 28544])

([98, 111, 101, 33, 120, 105, 102, 111, 33, 122, 112, 118, 33, 106, 110, 113, 115, 112, 119, 102, 33, 116, 102, 98, 115, 100, 105, 98, 99, 106, 109, 106, 117, 122, 33, 45, 33, 122, 112, 118, 33, 98, 100, 117, 118, 98, 109, 109, 122, 33, 117, 98, 108, 102, 33, 98, 120, 98, 122, 33, 117, 105, 102, 33, 112, 111, 102, 33, 98, 101, 119, 98, 111, 117, 98, 104, 102, 33, 112, 103, 33, 113, 115, 106, 111, 117, 33, 45, 33, 120, 105, 106, 100, 105, 33, 106, 116, 33, 116, 102, 115, 102, 111, 101, 106, 113, 106, 117, 122, 33, 47], [105, 36, 117, 121, 101, 114, 104, 115, 36, 113, 105, 112, 108, 115, 118, 101, 113, 115, 119, 36, 101, 36, 116, 118, 115, 103, 121, 118, 101, 36, 48, 36, 120, 109, 118, 101, 113, 115, 119, 36, 101, 36, 1, 114, 109, 103, 101, 36, 122, 101, 114, 120, 101, 107, 105, 113, 36, 104, 101, 36, 109, 113, 116, 118, 105, 119, 119, 3, 115, 36, 48, 36, 117, 121, 105, 36, 2, 36, 101, 36, 119, 105, 118, 105, 114, 104, 109, 116, 109, 104, 101, 104, 105, 36, 50])
([101, 106, 101, 33, 117,

## 2. TF Encode

In [None]:
class Dataset():
    """Loads and preps the TED pt->en dataset for machine translation."""

    def __init__(self):
        """
        Creates the instance attributes:
            data_train - the ted_hrlr_translate/pt_to_en train split,
                loaded as_supervised and then tokenized with tf_encode
            data_valid - the ted_hrlr_translate/pt_to_en validation split,
                loaded as_supervised and then tokenized with tf_encode
            tokenizer_pt - Portuguese tokenizer created from the training set
            tokenizer_en - English tokenizer created from the training set
        """
        self.data_train = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='train', as_supervised=True)
        self.data_valid = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='validation', as_supervised=True)
        # The tokenizers must be built from the raw text before the splits
        # are replaced by their tokenized versions.
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            self.data_train)
        self.data_train = self.data_train.map(self.tf_encode)
        self.data_valid = self.data_valid.map(self.tf_encode)

    def tokenize_dataset(self, data):
        """
        Creates sub-word tokenizers for the dataset.

        data is a tf.data.Dataset whose examples are formatted as a
        tuple (pt, en):
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence

        The target vocab size is 2**15.

        Returns: tokenizer_pt, tokenizer_en
            tokenizer_pt is the Portuguese tokenizer
            tokenizer_en is the English tokenizer
        """
        build = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
        # Iterate over every example in `data`: building the vocabulary from
        # only a handful of sentences leaves the encoder without real
        # sub-words, so it falls back to byte-level ids.
        en_tok = build((en.numpy() for _, en in data),
                       target_vocab_size=2**15)
        pt_tok = build((pt.numpy() for pt, _ in data),
                       target_vocab_size=2**15)
        return pt_tok, en_tok

    def encode(self, pt, en):
        """
        Encodes a translation pair into tokens.

        pt is the tf.Tensor containing the Portuguese sentence
        en is the tf.Tensor containing the corresponding English sentence

        Each tokenized sentence is wrapped in start and end of sentence
        tokens, using that language's own tokenizer:
            the start token is indexed as vocab_size
            the end token is indexed as vocab_size + 1

        Returns: pt_tokens, en_tokens
            pt_tokens is the list of Portuguese token ids
            en_tokens is the list of English token ids
        """
        pt_start = self.tokenizer_pt.vocab_size
        en_start = self.tokenizer_en.vocab_size
        pt_tokens = ([pt_start]
                     + list(self.tokenizer_pt.encode(pt.numpy()))
                     + [pt_start + 1])
        en_tokens = ([en_start]
                     + list(self.tokenizer_en.encode(en.numpy()))
                     + [en_start + 1])
        # Portuguese first, English second, matching the (pt, en) inputs.
        return pt_tokens, en_tokens

    def tf_encode(self, pt, en):
        """
        Acts as a TensorFlow wrapper for the encode instance method so it
        can be used inside Dataset.map.

        pt is the tf.Tensor containing the Portuguese sentence
        en is the tf.Tensor containing the corresponding English sentence

        Returns: pt_tokens, en_tokens
            two int64 tf.Tensors whose static shape is set to [None]
        """
        pt_tokens, en_tokens = tf.py_function(
            func=self.encode, inp=[pt, en], Tout=[tf.int64, tf.int64])
        # py_function results carry no static shape; declare them as
        # variable-length vectors.
        pt_tokens.set_shape([None])
        en_tokens.set_shape([None])
        return pt_tokens, en_tokens


In [None]:
# Build the tokenized dataset and print the first example of each split.
data = Dataset()
print('got here')
for split in (data.data_train, data.data_valid):
    for pt, en in split.take(1):
        print(pt, en)

# tf.Tensor([30138     6    36 17925    13     3  3037     1  4880     3   387  2832   18    18444     1     5     8     3 16679 19460   739     2 30139], shape=(23,), dtype=int64) tf.Tensor([28543     4    56    15  1266 20397 10721     1    15   100   125   352  3    45  3066     6  8004     1    88    13 14859     2 28544], shape=(23,), dtype=int64)
# tf.Tensor([30138   289 15409  2591    19 20318 26024 29997    28 30139], shape=(10,), dtype=int64) tf.Tensor([28543    93    25   907  1366     4  5742    33 28544], shape=(9,), dtype=int64)

got here
tf.Tensor(
[ 98 111 101  33 120 105 102 111  33 122 112 118  33 106 110 113 115 112
 119 102  33 116 102  98 115 100 105  98  99 106 109 106 117 122  33  45
  33 122 112 118  33  98 100 117 118  98 109 109 122  33 117  98 108 102
  33  98 120  98 122  33 117 105 102  33 112 111 102  33  98 101 119  98
 111 117  98 104 102  33 112 103  33 113 115 106 111 117  33  45  33 120
 105 106 100 105  33 106 116  33 116 102 115 102 111 101 106 113 106 117
 122  33  47], shape=(111,), dtype=int32) tf.Tensor(
[105  36 117 121 101 114 104 115  36 113 105 112 108 115 118 101 113 115
 119  36 101  36 116 118 115 103 121 118 101  36  48  36 120 109 118 101
 113 115 119  36 101  36   1 114 109 103 101  36 122 101 114 120 101 107
 105 113  36 104 101  36 109 113 116 118 105 119 119   3 115  36  48  36
 117 121 105  36   2  36 101  36 119 105 118 105 114 104 109 116 109 104
 101 104 105  36  50], shape=(95,), dtype=int32)
tf.Tensor(
[101 106 101  33 117 105 102 122  33 102  98 117  33 103 106 116

## 3. Pipeline

In [None]:
class Dataset():
    """Loads and preps the TED pt->en dataset for machine translation."""

    def __init__(self, batch_size, max_len):
        """
        batch_size is the batch size for training/validation
        max_len is the maximum number of tokens allowed per example sentence

        Creates the instance attributes:
            data_train - the tokenized ted_hrlr_translate/pt_to_en train
                split, loaded as_supervised
            data_valid - the tokenized ted_hrlr_translate/pt_to_en validation
                split, loaded as_supervised
            tokenizer_pt - Portuguese tokenizer created from the training set
            tokenizer_en - English tokenizer created from the training set
            max - max_len, read by check_len
            bs - batch_size

        data_train is then updated by:
            filtering out all examples that have either sentence with more
                than max_len tokens
            caching the dataset to increase performance
            shuffling the entire dataset
            splitting the dataset into padded batches of size batch_size
            prefetching with tf.data.experimental.AUTOTUNE
        data_valid is then updated by:
            filtering out all examples that have either sentence with more
                than max_len tokens
            splitting the dataset into padded batches of size batch_size
        """
        self.max = max_len
        self.bs = batch_size
        # with_info exposes the split size, which is used below as the
        # shuffle buffer size.
        self.data_train, info = tfds.load('ted_hrlr_translate/pt_to_en',
                                          split='train', as_supervised=True,
                                          with_info=True)
        self.data_valid = tfds.load('ted_hrlr_translate/pt_to_en',
                                    split='validation', as_supervised=True)
        # The tokenizers must be built from the raw text before the splits
        # are replaced by their tokenized versions.
        self.tokenizer_pt, self.tokenizer_en = self.tokenize_dataset(
            self.data_train)

        # A buffer at least as large as the (pre-filter) split makes the
        # shuffle cover the whole dataset.
        shuffle_buffer = info.splits['train'].num_examples
        # Each example is a pair of variable-length vectors; pad each one to
        # the longest sequence in its batch.
        padded_shapes = ([None], [None])

        self.data_train = self.data_train.map(self.tf_encode)
        self.data_train = self.data_train.filter(self.check_len)
        self.data_train = self.data_train.cache()
        self.data_train = self.data_train.shuffle(shuffle_buffer)
        self.data_train = self.data_train.padded_batch(
            batch_size, padded_shapes=padded_shapes)
        self.data_train = self.data_train.prefetch(
            tf.data.experimental.AUTOTUNE)

        self.data_valid = self.data_valid.map(self.tf_encode)
        self.data_valid = self.data_valid.filter(self.check_len)
        self.data_valid = self.data_valid.padded_batch(
            batch_size, padded_shapes=padded_shapes)

    def check_len(self, a, b):
        """
        Filter predicate: true when both token sequences a and b have at
        most max_len elements.
        """
        return tf.logical_and(tf.size(a) <= self.max,
                              tf.size(b) <= self.max)

    def tokenize_dataset(self, data):
        """
        Creates sub-word tokenizers for the dataset.

        data is a tf.data.Dataset whose examples are formatted as a
        tuple (pt, en):
            pt is the tf.Tensor containing the Portuguese sentence
            en is the tf.Tensor containing the corresponding English sentence

        The target vocab size is 2**15.

        Returns: tokenizer_pt, tokenizer_en
            tokenizer_pt is the Portuguese tokenizer
            tokenizer_en is the English tokenizer
        """
        build = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
        # Iterate over every example in `data`: building the vocabulary from
        # only a handful of sentences leaves the encoder without real
        # sub-words, so it falls back to byte-level ids.
        en_tok = build((en.numpy() for _, en in data),
                       target_vocab_size=2**15)
        pt_tok = build((pt.numpy() for pt, _ in data),
                       target_vocab_size=2**15)
        return pt_tok, en_tok

    def encode(self, pt, en):
        """
        Encodes a translation pair into tokens.

        pt is the tf.Tensor containing the Portuguese sentence
        en is the tf.Tensor containing the corresponding English sentence

        Each tokenized sentence is wrapped in start and end of sentence
        tokens, using that language's own tokenizer:
            the start token is indexed as vocab_size
            the end token is indexed as vocab_size + 1

        Returns: pt_tokens, en_tokens
            pt_tokens is the list of Portuguese token ids
            en_tokens is the list of English token ids
        """
        pt_start = self.tokenizer_pt.vocab_size
        en_start = self.tokenizer_en.vocab_size
        pt_tokens = ([pt_start]
                     + list(self.tokenizer_pt.encode(pt.numpy()))
                     + [pt_start + 1])
        en_tokens = ([en_start]
                     + list(self.tokenizer_en.encode(en.numpy()))
                     + [en_start + 1])
        # Portuguese first, English second, matching the (pt, en) inputs.
        return pt_tokens, en_tokens

    def tf_encode(self, pt, en):
        """
        Acts as a TensorFlow wrapper for the encode instance method so it
        can be used inside Dataset.map.

        pt is the tf.Tensor containing the Portuguese sentence
        en is the tf.Tensor containing the corresponding English sentence

        Returns: pt_tokens, en_tokens
            two int64 tf.Tensors whose static shape is set to [None]
        """
        pt_tokens, en_tokens = tf.py_function(
            func=self.encode, inp=[pt, en], Tout=[tf.int64, tf.int64])
        # py_function results carry no static shape; padded_batch needs to
        # know these are variable-length vectors.
        pt_tokens.set_shape([None])
        en_tokens.set_shape([None])
        return pt_tokens, en_tokens

In [None]:
# Seed TensorFlow, build the batched pipeline (batch size 32, at most 40
# tokens per sentence) and print the first batch of each split.
tf.compat.v1.set_random_seed(0)
data = Dataset(32, 40)
for split in (data.data_train, data.data_valid):
    for pt, en in split.take(1):
        print(pt, en)

# tf.Tensor(
# [[30138  1029   104 ...     0     0     0]
#  [30138    40     8 ...     0     0     0]
#  [30138    12    14 ...     0     0     0]
#  ...
#  [30138    72 23483 ...     0     0     0]
#  [30138  2381   420 ...     0     0     0]
#  [30138     7 14093 ...     0     0     0]], shape=(32, 39), dtype=int64) tf.Tensor(
# [[28543   831   142 ...     0     0     0]
#  [28543    16    13 ...     0     0     0]
#  [28543    19     8 ...     0     0     0]
#  ...
#  [28543    18    27 ...     0     0     0]
#  [28543  2648   114 ... 28544     0     0]
#  [28543  9100 19214 ...     0     0     0]], shape=(32, 37), dtype=int64)
# tf.Tensor(
# [[30138   289 15409 ...     0     0     0]
#  [30138    86   168 ...     0     0     0]
#  [30138  5036     9 ...     0     0     0]
#  ...
#  [30138  1157 29927 ...     0     0     0]
#  [30138    33   837 ...     0     0     0]
#  [30138   126  3308 ...     0     0     0]], shape=(32, 32), dtype=int64) tf.Tensor(
# [[28543    93    25 ...     0     0     0]
#  [28543    11    20 ...     0     0     0]
#  [28543    11  2850 ...     0     0     0]
#  ...
#  [28543    11   406 ...     0     0     0]
#  [28543     9   152 ...     0     0     0]
#  [28543     4   272 ...     0     0     0]], shape=(32, 35), dtype=int64)

(<tf.Tensor: shape=(22,), dtype=int32, numpy=
array([232, 220, 217, 231, 217, 148, 215, 220, 221, 224, 216, 230, 217,
       226, 148, 225, 213, 232, 232, 217, 230,   2], dtype=int32)>, <tf.Tensor: shape=(25,), dtype=int32, numpy=
array([215, 229, 229, 101, 213, 228, 219, 211, 224, 309, 281, 101,  32,
       219, 223, 226, 225, 228, 230, 211, 224, 230, 215, 229,   4],
      dtype=int32)>)


ValueError: ignored

## 4. Create Masks

In [None]:
def create_masks(inputs, target):
    """
    Builds every mask needed for a training/validation step.

    inputs is a tf.Tensor of shape (batch_size, seq_len_in) that contains
        the input sentence
    target is a tf.Tensor of shape (batch_size, seq_len_out) that contains
        the target sentence

    Only tensorflow operations are used so the function works inside the
    training step.

    Returns: encoder_mask, combined_mask, decoder_mask
        encoder_mask is the padding mask of shape
            (batch_size, 1, 1, seq_len_in) to be applied in the encoder
        combined_mask is the tf.Tensor of shape
            (batch_size, 1, seq_len_out, seq_len_out) used in the 1st
            attention block of the decoder; it is the element-wise maximum
            of the target padding mask and the look ahead mask, so both
            padding and future tokens are masked
        decoder_mask is the padding mask of shape
            (batch_size, 1, 1, seq_len_in) used in the 2nd attention block
            of the decoder
    """
    # Both input-side masks are padding masks over the input sentence.
    encoder_mask = padding_mask(inputs)
    decoder_mask = padding_mask(inputs)

    target_len = tf.shape(target)[1]
    future_mask = look_ahead_mask(target_len)
    target_padding = padding_mask(target)

    return (encoder_mask,
            tf.maximum(target_padding, future_mask),
            decoder_mask)

def padding_mask(seq):
    """
    Creates a padding mask: 1.0 where seq equals 0, 0.0 elsewhere.

    Two singleton axes are inserted after the first axis of the result.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]


def look_ahead_mask(size):
    """
    Creates a look ahead mask of shape (1, size, size).

    Entries strictly above the main diagonal are 1 and all others are 0.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    future_positions = 1 - lower_triangle
    return future_positions[tf.newaxis, ...]


## 5. Train