# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Preprocessing

Following the Extract-Transform-Load (ETL) pattern:
1. Extract the data into a `tf.data.Dataset`.
2. Perform some preprocessing/transformation on the data.
3. Load the data into the model.

Create the two tokenizers, one for English (en) and one for Spanish (sp):
1. Because the data is small and fits in memory, loop through the file and build two lists, en and sp.
2. Adapt the English vectorizer on the en data and the Spanish vectorizer on the sp data.

A class reads, cleans and tokenizes the text data and returns train_dataset, val_dataset, and the fitted input and target vectorizers.

In [2]:
import string
import re
import unicodedata
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization


from sklearn.model_selection import train_test_split

class MTM:
  '''Read, clean and tokenize a tab-separated parallel corpus.

  Every usable line of the corpus holds the target sentence in the first
  column and the input sentence in the second column; further columns are
  ignored.
  '''

  def __init__(self, name):
    self.name = name

  def unicode_to_ascii(self, s):
    '''Return ``s`` with combining accent marks removed (e.g. "é" -> "e").'''
    # NFD splits an accented letter into base letter + combining mark ("Mn").
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

  def preprocess_sentence(self, s):
    '''Normalise one sentence and wrap it in the special tokens.

    The sentence is lower-cased and stripped, accents are removed, every
    character other than a-z, space and . ? ! , ¿ is dropped, the kept
    punctuation is surrounded by spaces, and runs of spaces are collapsed.

    Returns the cleaned sentence as "[SOS] <sentence> [EOS]".
    '''
    s = self.unicode_to_ascii(s.lower().strip())

    s = re.sub(r'[^ a-z.?!,¿]', '', s)
    s = re.sub(r'([?.!,¿])', r' \1 ', s)
    # Collapse runs of spaces and trim the ends, so the special tokens are
    # separated from the text by exactly one space.
    s = re.sub(r' +', ' ', s).strip()

    return '[SOS] ' + s + ' [EOS]'

  def tokenize(self, data, seq_len):
    '''Return a TextVectorization layer adapted on ``data``.

    ``standardize=None`` keeps the text exactly as ``preprocess_sentence``
    produced it; every sequence is padded/truncated to ``seq_len`` tokens.
    '''
    vect = TextVectorization(standardize=None, output_sequence_length=seq_len)
    vect.adapt(data)

    return vect

  def create_dataset(self, texts, in_targets, ot_targets):
    '''Build a dataset of ``((text, decoder_input), decoder_output)`` examples.'''
    return tf.data.Dataset.from_tensor_slices(((texts, in_targets), ot_targets))

  def read_data(self, path):
    '''Load and preprocess the sentence pairs stored at ``path``.

    Lines that do not contain at least two tab-separated columns (blank
    lines, truncated lines) are skipped.

    Returns ``(inputs, targets, max_inputs_length, max_targets_length)``
    where the lengths are counted in whitespace-separated tokens.

    Raises ValueError if the file contains no usable sentence pair.
    '''
    inputs = []
    targets = []
    # Decode explicitly: the corpus contains accented characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(path, encoding='utf-8') as f:
      for line in f:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
          continue
        inputs.append(self.preprocess_sentence(fields[1]))
        targets.append(self.preprocess_sentence(fields[0]))

    if not inputs:
      raise ValueError('no sentence pairs found in ' + repr(path))

    max_inputs_length = max(len(x.split()) for x in inputs)
    max_targets_length = max(len(x.split()) for x in targets)

    return inputs, targets, max_inputs_length, max_targets_length

  def call(self, path, test_size=None, random_state=None):
    '''Build the train/validation datasets and the two vectorizers.

    ``test_size`` and ``random_state`` are forwarded to ``train_test_split``;
    leaving them at ``None`` keeps scikit-learn's defaults. Pass a
    ``random_state`` to make the split reproducible.

    Returns ``(train_dataset, val_dataset, input_vect, target_vect)``. The
    datasets still contain raw strings; the vectorizers are applied later.
    '''
    inputs, targets, max_inputs_length, max_targets_length = self.read_data(path)

    input_vect = self.tokenize(inputs, max_inputs_length)
    target_vect = self.tokenize(targets, max_targets_length)

    x_train, x_val, y_train, y_val = train_test_split(
        inputs, targets, test_size=test_size, random_state=random_state)

    # Teacher forcing: the decoder input drops the last token ([EOS]) and the
    # decoder output drops the first token ([SOS]).
    train_sp = x_train
    train_en_inputs = [' '.join(seq.split()[:-1]) for seq in y_train]
    train_en_outputs = [' '.join(seq.split()[1:]) for seq in y_train]

    val_sp = x_val
    val_en_inputs = [' '.join(seq.split()[:-1]) for seq in y_val]
    val_en_outputs = [' '.join(seq.split()[1:]) for seq in y_val]

    train_dataset = self.create_dataset(train_sp, train_en_inputs, train_en_outputs)
    val_dataset = self.create_dataset(val_sp, val_en_inputs, val_en_outputs)

    return train_dataset, val_dataset, input_vect, target_vect


In [3]:
##
BATCH_SIZE = 16
##

mtm = MTM('spa-en')
train_dataset, val_dataset, input_vect, target_vect = mtm.call('./data/spa.txt')

def vectorize(x, y):
    '''Convert one ((input_text, decoder_input_text), label_text) example to token ids.'''
    a, b = x
    return (input_vect(a), target_vect(b)), target_vect(y)

def build_pipeline(dataset):
    '''Vectorize, batch, cache and prefetch a dataset of raw text examples.

    cache() sits before prefetch() so that prefetch stays the last stage and
    can overlap input preparation with training; the prefetch buffer is
    counted in batches, so it is left to AUTOTUNE rather than tied to
    BATCH_SIZE.
    '''
    return (dataset.map(vectorize)
                   .batch(BATCH_SIZE)
                   .cache()
                   .prefetch(tf.data.AUTOTUNE))

train_dataset = build_pipeline(train_dataset)
val_dataset = build_pipeline(val_dataset)

In [4]:
# Grab a single batch; take(1) yields at most one element, so the loop body
# only needs to bind the names.
first_batch = train_dataset.take(1)
for (sp, en_inputs), en_labels in first_batch:
  pass

# Show the first example of the batch: input ids, decoder-input ids, label ids.
print(sp[0])
print(en_inputs[0])
print(en_labels[0])

tf.Tensor(
[    2   113   445    14    43  2744    31 24704     4     3     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0], shape=(80,), dtype=int64)
tf.Tensor(
[   2    6  550   12  104 1008    7 1058    4    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0], shape=(82,), dtype=int64)
tf.Tensor(
[   6  5

# Transformer

Transformers are known for processing all positions of a sequence in parallel, unlike RNNs, but this means the model has no built-in notion of word order.
To solve this problem, the authors of the paper "Attention Is All You Need" add a fixed, absolute position signal called positional encoding.
The idea is to generate a fixed tensor with the same shape as the embedded input sentence, (batch, seq_length, dim), and add it to the sentence embedding to encode the position information.

So
1. make a configuration class
2. import the model
3. prepare the data
4. train the model

In [5]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Config:
    '''Hyper-parameters passed to the Transformer constructor.

    The two vocabulary sizes have no sensible default; they stay ``None``
    until they are set from the adapted vectorizers.
    '''
    n_layers: int = 6
    input_vocab_size: Optional[int] = None
    target_vocab_size: Optional[int] = None
    d_model: int = 300
    num_heads: int = 8
    ffd_units: int = 256
    dropout: float = .3

In [6]:
from model import Transformer

# Vocabulary sizes are read from the adapted vectorizers.
input_vocab_size = len(input_vect.get_vocabulary())
target_vocab_size = len(target_vect.get_vocabulary())

config = Config(input_vocab_size=input_vocab_size,
                target_vocab_size=target_vocab_size)

model = Transformer(config)

# Run the sample batch through the model and display the output shape.
model((sp, en_inputs)).shape

TensorShape([16, 82, 14163])

In [7]:
# Print the per-layer output shapes and parameter counts of the model.
model.summary()

Model: "transformer"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder (Encoder)           multiple                  26259036  
                                                                 
 decoder (Decoder)           multiple                  39834636  
                                                                 
 dense_24 (Dense)            multiple                  4263063   
                                                                 
Total params: 70,356,735
Trainable params: 70,356,735
Non-trainable params: 0
_________________________________________________________________


In [8]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  '''Learning-rate schedule with linear warm-up followed by 1/sqrt(step) decay.

  lr(step) = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
  '''

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the plain Python values so that get_config() returns data that can
    # be serialised; the cast to a float tensor happens in __call__.
    self.d_model = d_model
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    '''Return the learning rate for optimizer step ``step``.'''
    step = tf.cast(step, dtype=tf.float32)
    d_model = tf.cast(self.d_model, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    '''Return the constructor arguments as a plain dict.'''
    return {
        'd_model': self.d_model,
        'warmup_steps': self.warmup_steps
    }

In [9]:
def masked_loss(label, pred):
  '''Sparse categorical cross-entropy averaged over positions whose label is not 0.

  ``pred`` holds logits. Positions with label id 0 contribute nothing to the
  sum and are excluded from the denominator.
  '''
  cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  per_token = cross_entropy(label, pred)

  # 1.0 where the label is a real token, 0.0 where it is id 0.
  keep = tf.cast(tf.not_equal(label, 0), dtype=per_token.dtype)

  return tf.reduce_sum(per_token * keep) / tf.reduce_sum(keep)


def masked_accuracy(label, pred):
  '''Fraction of positions with a non-zero label whose arg-max prediction matches the label.'''
  predicted_ids = tf.argmax(pred, axis=2)
  label = tf.cast(label, predicted_ids.dtype)

  not_padding = tf.not_equal(label, 0)
  # A position counts as correct only if it matches and its label is not 0.
  correct = tf.logical_and(tf.equal(label, predicted_ids), not_padding)

  n_correct = tf.reduce_sum(tf.cast(correct, dtype=tf.float32))
  n_tokens = tf.reduce_sum(tf.cast(not_padding, dtype=tf.float32))
  return n_correct / n_tokens

In [10]:
# Adam with the warm-up schedule, scaled by the model width.
learning_rate = CustomSchedule(d_model=config.d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# Loss and metric both ignore positions whose label id is 0.
model.compile(optimizer=optimizer,
              loss=masked_loss,
              metrics=[masked_accuracy])

In [11]:
# Only early stopping is active; the other callbacks are kept as disabled
# alternatives.
# path = './checkpoints'
# checkpoints = tf.keras.callbacks.ModelCheckpoint(path, save_best_only=True)
# tensorboard = tf.keras.callbacks.TensorBoard('./logs')
# lrONplateau = tf.keras.callbacks.ReduceLROnPlateau(
#                                             monitor="val_loss",
#                                             factor=0.5,
#                                             patience=1,
#                                             min_lr=0.000001
#                                          )

# monitor='val_loss' is the callback's default, spelled out for the reader.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)

In [12]:
# Input vocabulary size, then target vocabulary size, one per line.
print(input_vocab_size, target_vocab_size, sep='\n')

26673
14163


In [13]:
# Train for a single epoch, validating on the held-out split.
model.fit(x=train_dataset,
          validation_data=val_dataset,
          epochs=1,
          callbacks=[earlystopping])



<keras.callbacks.History at 0x1f1744a6d00>

In [14]:
### END OF THE MODEL

# Export the trained model in SavedModel format.
tf.saved_model.save(obj=model, export_dir='./saved_model')



INFO:tensorflow:Assets written to: ./saved_model\assets


INFO:tensorflow:Assets written to: ./saved_model\assets


In [16]:
# Also store the weights on their own as an HDF5 file.
model.save_weights(filepath='./weights/model_weights.h5')

In [20]:
from tensorflow.keras.models import Sequential


def as_saveable_model(vect):
    '''Wrap ``vect`` in a Sequential model unless it already is one.

    The guard makes this cell idempotent: re-running it no longer nests a
    Sequential inside another Sequential each time.
    '''
    return vect if isinstance(vect, Sequential) else Sequential([vect])


# The vectorizers are wrapped in a model so they can be written with
# Model.save in the TensorFlow SavedModel format.
input_vect = as_saveable_model(input_vect)
input_vect.save('./text vectorizer/input_vectorizer', save_format='tf')

target_vect = as_saveable_model(target_vect)
target_vect.save('./text vectorizer/target_vectorizer', save_format='tf')





INFO:tensorflow:Assets written to: ./text vectorizer/input_vectorizer\assets


INFO:tensorflow:Assets written to: ./text vectorizer/input_vectorizer\assets






INFO:tensorflow:Assets written to: ./text vectorizer/target_vectorizer\assets


INFO:tensorflow:Assets written to: ./text vectorizer/target_vectorizer\assets


In [36]:
# input_vect may be the bare vectorization layer or that layer wrapped in one
# or more Sequential models, depending on how often the saving cell was run.
# Descend until the object that actually owns the vocabulary is reached
# instead of hard-coding the nesting depth.
vect_layer = input_vect
while not hasattr(vect_layer, 'get_vocabulary'):
    vect_layer = vect_layer.layers[0]
vocab = vect_layer.get_vocabulary()

# One vocabulary entry per line, in vocabulary order.
with open('./vocabulary/vocab.txt', 'w', encoding='utf-8') as f:
    for v in vocab:
        f.write(v + '\n')