In [0]:
import pandas as pd
import numpy as np
import math
import re
import time
from google.colab import drive

In [0]:
try:
  %tensorflow_version 2.x
except:
  pass
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [4]:
# Mount Google Drive under /content/drive (Colab helper imported at the top).
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
def _read_text(path):
  """Return the whole content of the UTF-8 text file at `path` as one string."""
  with open(path, mode='r', encoding='utf-8') as handle:
    return handle.read()


# Parallel Europarl corpora (English and French sides).
europarl_en = _read_text('./europarl-v7.fr-en.en')
europarl_fr = _read_text('./europarl-v7.fr-en.fr')

# Raw non-breaking prefix lists, one file per language.
nonbreaking_prefix_en = _read_text('./nonbreaking_prefix.en')
nonbreaking_prefix_fr = _read_text('./nonbreaking_prefix.fr')

## Preprocessing

In [6]:
# Size, in characters, of the raw French and English corpora (each is one string).
len(europarl_fr),len(europarl_en)

(335706962, 301210536)

In [0]:
# One prefix per line, turned into the pattern ' <prefix>.' used for matching.
# Empty lines (e.g. produced by a trailing newline in the file) are skipped:
# they would otherwise become the bogus prefix ' .' and match every " ." in
# the corpus.
nonbreaking_prefix_en = [' ' + pref + '.'
                         for pref in nonbreaking_prefix_en.split('\n')
                         if pref]

In [0]:
# One prefix per line, turned into the pattern ' <prefix>.' used for matching.
# Empty lines (e.g. produced by a trailing newline in the file) are skipped:
# they would otherwise become the bogus prefix ' .' and match every " ." in
# the corpus.
nonbreaking_prefix_fr = [' ' + pref + '.'
                         for pref in nonbreaking_prefix_fr.split('\n')
                         if pref]

In [0]:
corpus_en = europarl_en
# Tag periods that do not end a sentence with a temporary '###' marker:
# first the ones that follow a known non-breaking prefix ...
for prefix in nonbreaking_prefix_en:
  corpus_en = corpus_en.replace(prefix, prefix + '###')
# ... then the ones directly followed by an ASCII letter or digit.
corpus_en = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '.###', corpus_en)
# Replace every tagged period (and its marker) with a space. The dot is
# escaped so that only a literal '.' in front of the marker is consumed,
# never an arbitrary character.
corpus_en = re.sub(r'\.###', ' ', corpus_en)
# Collapse runs of spaces, then split into one sentence per line.
corpus_en = re.sub(r'  +', ' ', corpus_en)
corpus_en = corpus_en.split('\n')

In [0]:
corpus_fr = europarl_fr
# Tag periods that do not end a sentence with a temporary '###' marker:
# first the ones that follow a known non-breaking prefix ...
for prefix in nonbreaking_prefix_fr:
  corpus_fr = corpus_fr.replace(prefix, prefix + '###')
# ... then the ones directly followed by an ASCII letter or digit.
corpus_fr = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '.###', corpus_fr)
# Replace every tagged period (and its marker) with a space. The dot is
# escaped so that only a literal '.' in front of the marker is consumed,
# never an arbitrary character.
corpus_fr = re.sub(r'\.###', ' ', corpus_fr)
# Collapse runs of spaces, then split into one sentence per line.
corpus_fr = re.sub(r'  +', ' ', corpus_fr)
corpus_fr = corpus_fr.split('\n')

In [0]:
# Number of lines in each cleaned corpus. The later filtering deletes entries
# from both lists by the same index, so the two counts are presumably expected
# to be equal.
len(corpus_en),len(corpus_fr)

In [0]:
# Newer tensorflow_datasets releases expose the subword encoder under
# `tfds.deprecated.text`; older releases only have `tfds.features.text`.
# Resolve whichever is available so the cell runs on both.
try:
  _SubwordTextEncoder = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
  _SubwordTextEncoder = tfds.features.text.SubwordTextEncoder

# Build one subword vocabulary per language (target size 2**13 entries).
tokenizer_en = _SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=2**13
)
tokenizer_fr = _SubwordTextEncoder.build_from_corpus(
    corpus_fr, target_vocab_size=2**13
)

In [0]:
# Reserve two extra ids on top of each tokenizer vocabulary: (size - 2) is
# prepended and (size - 1) is appended to every encoded sentence further down.
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2 
VOCAB_SIZE_FR = tokenizer_fr.vocab_size + 2

In [0]:
# Encode every sentence and wrap it with the two reserved ids:
# (vocab size - 2) in front and (vocab size - 1) at the end.
inputs = [
    [VOCAB_SIZE_EN - 2] + tokenizer_en.encode(line_en) + [VOCAB_SIZE_EN - 1]
    for line_en in corpus_en
]
outputs = [
    [VOCAB_SIZE_FR - 2] + tokenizer_fr.encode(line_fr) + [VOCAB_SIZE_FR - 1]
    for line_fr in corpus_fr
]

In [0]:
# Number of encoded source and target sequences.
len(inputs),len(outputs)

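When deleting list elements by index inside a loop, iterate over the indices in reverse order. Each deletion shifts every later element down by one position, so going from the start would make the remaining indices point at the wrong elements; starting from the end keeps every index that is still to be processed valid.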



In [0]:
MAX_LEN = 20
print('input length: ',len(inputs))
print('output length: ',len(outputs))

# Pairs are matched by position, so both lists must have the same length.
assert len(inputs) == len(outputs), 'inputs and outputs are not aligned'

# Keep only the sentence pairs in which BOTH sides fit in MAX_LEN tokens.
# A single pass over the zipped lists replaces the repeated `del` calls
# (each of which shifts the rest of the list) and keeps the pairs aligned.
kept_pairs = [(source, target)
              for source, target in zip(inputs, outputs)
              if len(source) <= MAX_LEN and len(target) <= MAX_LEN]
inputs = [source for source, _ in kept_pairs]
outputs = [target for _, target in kept_pairs]

print('input length: ',len(inputs))
print('output length: ',len(outputs))

In [0]:
# Peek at the first encoded sentence only: its length, its token ids, and the
# text decoded after dropping the first and last ids.
for first_sentence in inputs[:1]:
  print(len(first_sentence))
  print(first_sentence)
  print(tokenizer_en.decode(first_sentence[1:-1]))

In [0]:
# Token ids of the first encoded source sentence.
inputs[0]

In [0]:
# Pad every sequence with trailing zeros up to MAX_LEN. The length comes from
# the MAX_LEN constant used for filtering rather than a second hard-coded 20,
# so the two cannot drift apart. Id 0 is what the padding masks and the loss
# treat as "no token".
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LEN)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LEN)

In [0]:
# Length of the first padded source row.
len(inputs[0])

In [0]:
# Number of sentence pairs per training batch.
BATCH_SIZE = 64
# Size of the buffer handed to Dataset.shuffle when building the pipeline.
BUFFER_SIZE = 20000

In [0]:
# Training pipeline: slice the padded arrays into (source, target) pairs,
# cache them, shuffle within a BUFFER_SIZE window, batch, and prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [0]:
class PositionalEncoding(layers.Layer):
  """Adds fixed sinusoidal positional encodings to its input.

  For position `pos` and embedding index `i`:
    PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
    PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
  """

  def __init__(self):
    super(PositionalEncoding, self).__init__()

  def get_angles(self,pos,i,d_model):
    """Return pos / 10000^(2*(i//2) / d_model), broadcast over `pos` and `i`.

    Args:
      pos: array of positions (called below with a column vector).
      i: array of embedding indices (called below with a row vector).
      d_model: size of the embedding dimension.
    """
    # d_model must divide the EXPONENT; dividing the whole expression by
    # d_model (as before) makes the power overflow and the angles collapse.
    angles = 1 / np.power(10000., (2 * (i // 2)) / np.float32(d_model))
    return pos * angles

  def call(self,inputs):
    """Return `inputs` plus the encoding; the last two axes are (position, feature)."""
    seq_length = inputs.shape.as_list()[-2]
    d_model = inputs.shape.as_list()[-1]

    angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                             np.arange(d_model)[np.newaxis, :],
                             d_model)
    # Even feature indices get the sine, odd ones the cosine. The cosine must
    # read every odd column (1::2), not just column 1 (1:2).
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])
    # Leading axis of size 1 so the encoding broadcasts over the batch.
    pos_encoding = angles[np.newaxis, ...]

    return inputs + tf.cast(pos_encoding, tf.float32)


## Scaled Dot-Product Attention

$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V$$

In [0]:
def scaled_dot_product(queries, keys, values, mask):
  """Compute softmax(Q K^T / sqrt(d_k)) V.

  Args:
    queries, keys, values: tensors whose last two axes are matrix-multiplied.
    mask: optional tensor; positions where it is 1 receive a -1e9 offset
      before the softmax, which drives their weight towards zero. May be None.

  Returns:
    The attention-weighted combination of `values`.
  """
  # Size of the key vectors, as a float so it can go through sqrt.
  depth = tf.cast(tf.shape(keys)[-1], tf.float32)
  logits = tf.matmul(queries, keys, transpose_b=True) / tf.math.sqrt(depth)

  if mask is not None:
    logits = logits + mask * -1e9

  weights = tf.nn.softmax(logits, axis=-1)
  return tf.matmul(weights, values)


## Multi-head attention

In [0]:
class MultiHeadAttention(layers.Layer):
  """Multi-head attention: project, split into heads, attend, merge, project.

  Args:
    nb_proj: number of heads (projections); must divide the model dimension.
  """

  def __init__(self, nb_proj):
    super(MultiHeadAttention, self).__init__()
    self.nb_proj = nb_proj

  def build(self, input_shape):
    """Create the projection layers once the input shape is known."""
    # The model dimension is read from the last axis of the first input.
    self.d_model = input_shape[-1]
    assert self.d_model % self.nb_proj == 0

    # Width of each individual head.
    self.d_proj = self.d_model // self.nb_proj

    self.query_lin = layers.Dense(units=self.d_model)
    self.key_lin = layers.Dense(units=self.d_model)
    self.value_lin = layers.Dense(units=self.d_model)

    self.final_linear = layers.Dense(units=self.d_model)

  def split_proj(self, inputs, batch_size):
    """Reshape (batch, seq, d_model) into (batch, nb_proj, seq, d_proj)."""
    per_head = tf.reshape(
        inputs, shape=(batch_size, -1, self.nb_proj, self.d_proj))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, queries, keys, values, mask):
    """Attend from `queries` to `keys`/`values` under `mask`."""
    batch_size = tf.shape(queries)[0]

    # Linear projection followed by the split into heads.
    query_heads = self.split_proj(self.query_lin(queries), batch_size)
    key_heads = self.split_proj(self.key_lin(keys), batch_size)
    value_heads = self.split_proj(self.value_lin(values), batch_size)

    head_attention = scaled_dot_product(
        query_heads, key_heads, value_heads, mask)

    # Move the head axis back next to the feature axis and merge the two.
    merged = tf.reshape(
        tf.transpose(head_attention, perm=[0, 2, 1, 3]),
        shape=(batch_size, -1, self.d_model))

    return self.final_linear(merged)




## Encoder

In [0]:
class EncoderLayer(layers.Layer):
  """One encoder block: self-attention and a feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual connection
  and layer normalization.

  Args:
    FNN_units: width of the hidden feed-forward layer.
    nb_proj: number of attention heads.
    dropout: dropout rate used after each sub-layer.
  """

  def __init__(self, FNN_units, nb_proj, dropout):
    super(EncoderLayer, self).__init__()
    self.FNN_units = FNN_units
    self.nb_proj = nb_proj
    self.dropout = dropout

  def build(self, input_shape):
    """Create the sub-layers; the model dimension is the input's last axis."""
    self.d_model = input_shape[-1]

    self.multi_head_attention = MultiHeadAttention(self.nb_proj)
    self.dropout_1 = layers.Dropout(rate=self.dropout)
    self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

    self.dense_1 = layers.Dense(units=self.FNN_units, activation='relu')
    self.dense_2 = layers.Dense(units=self.d_model)
    self.dropout_2 = layers.Dropout(rate=self.dropout)
    self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs, mask, training):
    """Run the block on `inputs`; `training` toggles dropout."""
    # Self-attention: queries, keys and values are all the layer input.
    attention = self.multi_head_attention(inputs,
                                          inputs,
                                          inputs,
                                          mask)
    attention = self.dropout_1(attention, training=training)
    attention = self.norm_1(attention + inputs)

    outputs = self.dense_1(attention)
    outputs = self.dense_2(outputs)
    # Pass `training` explicitly, like every other dropout call in this file,
    # instead of relying on Keras to infer it from the calling context.
    outputs = self.dropout_2(outputs, training=training)
    outputs = self.norm_2(outputs + attention)

    return outputs
  


In [0]:
class Encoder(layers.Layer):
  """Embedding, positional encoding and a stack of `nb_layers` EncoderLayers.

  Args:
    nb_layers: number of stacked encoder blocks.
    FNN_units: hidden width of each block's feed-forward network.
    nb_proj: number of attention heads per block.
    dropout: dropout rate (after the embedding and inside the blocks).
    vocab_size: size of the input vocabulary.
    d_model: embedding / model dimension.
    name: Keras layer name.
  """

  def __init__(self,
               nb_layers,
               FNN_units,
               nb_proj,
               dropout,
               vocab_size,
               d_model,
               name='encoder'):
    super(Encoder, self).__init__(name=name)
    self.nb_layers = nb_layers
    self.d_model = d_model

    self.embedding = layers.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding()
    self.dropout = layers.Dropout(rate=dropout)
    self.enc_layers = [
        EncoderLayer(FNN_units, nb_proj, dropout)
        for _ in range(self.nb_layers)
    ]

  def call(self, inputs, mask, training):
    """Encode token ids `inputs` under the attention `mask`."""
    # Embeddings are scaled by sqrt(d_model) before adding the positions.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    hidden = self.embedding(inputs) * scale
    hidden = self.pos_encoding(hidden)
    hidden = self.dropout(hidden, training)

    for enc_layer in self.enc_layers:
      hidden = enc_layer(hidden, mask, training)

    return hidden




## Decoder

In [0]:
class DecoderLayer(layers.Layer):
  """One decoder block: self-attention, encoder attention, feed-forward.

  Every sub-layer is followed by dropout, a residual connection and layer
  normalization.

  Args:
    FNN_units: width of the hidden feed-forward layer.
    nb_proj: number of attention heads.
    dropout: dropout rate used after each sub-layer.
  """

  def __init__(self, FNN_units, nb_proj, dropout):
    super(DecoderLayer, self).__init__()
    self.FNN_units = FNN_units
    self.nb_proj = nb_proj
    self.dropout = dropout

  def build(self, input_shape):
    """Create the sub-layers; the model dimension is the input's last axis."""
    self.d_model = input_shape[-1]

    # Sub-layer 1: self-attention over the decoder input.
    self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
    self.dropout_1 = layers.Dropout(rate=self.dropout)
    self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

    # Sub-layer 2: attention over the encoder output.
    self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
    self.dropout_2 = layers.Dropout(rate=self.dropout)
    self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    # Sub-layer 3: position-wise feed-forward network.
    self.dense_1 = layers.Dense(units=self.FNN_units, activation='relu')
    self.dense_2 = layers.Dense(units=self.d_model)
    self.dropout_3 = layers.Dropout(rate=self.dropout)
    self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

  def call(self, inputs, enc_outputs, mask_1, mask_2, training):
    """Run the block.

    Args:
      inputs: decoder-side representations.
      enc_outputs: encoder output attended to by the second sub-layer.
      mask_1: mask for the self-attention.
      mask_2: mask for the encoder attention.
      training: toggles dropout.
    """
    self_attended = self.multi_head_attention_1(inputs, inputs, inputs, mask_1)
    self_attended = self.dropout_1(self_attended, training)
    self_attended = self.norm_1(self_attended + inputs)

    # Queries come from the decoder, keys and values from the encoder.
    cross_attended = self.multi_head_attention_2(
        self_attended, enc_outputs, enc_outputs, mask_2)
    cross_attended = self.dropout_2(cross_attended, training)
    cross_attended = self.norm_2(cross_attended + self_attended)

    feed_forward = self.dense_2(self.dense_1(cross_attended))
    feed_forward = self.dropout_3(feed_forward, training)
    return self.norm_3(feed_forward + cross_attended)
  



In [0]:
class Decoder(layers.Layer):
  """Embedding, positional encoding and a stack of `nb_layers` DecoderLayers.

  Args:
    nb_layers: number of stacked decoder blocks.
    FNN_units: hidden width of each block's feed-forward network.
    nb_proj: number of attention heads per block.
    dropout: dropout rate (after the embedding and inside the blocks).
    vocab_size: size of the output vocabulary.
    d_model: embedding / model dimension.
    name: Keras layer name.
  """

  def __init__(self,
               nb_layers,
               FNN_units,
               nb_proj,
               dropout,
               vocab_size,
               d_model,
               name='decoder'):
    super(Decoder, self).__init__(name=name)
    self.nb_layers = nb_layers
    self.d_model = d_model

    self.embedding = layers.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding()
    self.dropout = layers.Dropout(rate=dropout)
    self.dec_layers = [
        DecoderLayer(FNN_units, nb_proj, dropout)
        for _ in range(self.nb_layers)
    ]

  def call(self, inputs, enc_outputs, mask_1, mask_2, training):
    """Decode token ids `inputs` while attending to `enc_outputs`."""
    # Embeddings are scaled by sqrt(d_model) before adding the positions.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    hidden = self.embedding(inputs) * scale
    hidden = self.pos_encoding(hidden)
    hidden = self.dropout(hidden, training)

    for dec_layer in self.dec_layers:
      hidden = dec_layer(hidden, enc_outputs, mask_1, mask_2, training)

    return hidden



## Transformer

In [0]:
class Transformer(tf.keras.Model):
  """Encoder-decoder Transformer producing logits over the decoder vocabulary.

  Args:
    vocab_size_enc: size of the encoder (source) vocabulary.
    vocab_size_dec: size of the decoder (target) vocabulary.
    d_model: embedding / model dimension.
    nb_layers: number of blocks in both the encoder and the decoder.
    FNN_units: hidden width of the feed-forward networks.
    nb_proj: number of attention heads.
    dropout: dropout rate.
    name: Keras model name.
  """

  def __init__(self,
               vocab_size_enc,
               vocab_size_dec,
               d_model,
               nb_layers,
               FNN_units,
               nb_proj,
               dropout,
               name = 'transformer'
               ):
    # Forward `name` to Keras; it used to be accepted but silently dropped.
    super(Transformer, self).__init__(name=name)
    self.encoder = Encoder(nb_layers,
                           FNN_units,
                           nb_proj,
                           dropout,
                           vocab_size_enc,
                           d_model)
    self.decoder = Decoder(nb_layers,
                           FNN_units,
                           nb_proj,
                           dropout,
                           vocab_size_dec,
                           d_model)
    self.last_linear = layers.Dense(units = vocab_size_dec)

  def create_padding_mask(self, seq): # seq: (batch_size, seq_length)
    """Return a float mask that is 1 where `seq` holds the padding id 0.

    Two singleton axes are inserted so the result broadcasts against the
    attention logits.
    """
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, None, None, :]

  def create_look_ahead_mask(self, seq):
    """Return a (seq_len, seq_len) mask that is 1 strictly above the diagonal.

    band_part(..., -1, 0) keeps the lower triangle (diagonal included);
    subtracting it from 1 marks the positions located after each query.
    """
    seq_len = tf.shape(seq)[1]
    look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
    return look_ahead_mask

  def call(self, enc_inputs, dec_inputs, training):
    """Return logits for every decoder position.

    Args:
      enc_inputs: source token ids.
      dec_inputs: target token ids fed to the decoder.
      training: toggles dropout.
    """
    enc_mask = self.create_padding_mask(enc_inputs)
    # Decoder self-attention must ignore both padding and later positions.
    dec_mask_1 = tf.maximum(
        self.create_padding_mask(dec_inputs),
        self.create_look_ahead_mask(dec_inputs)
    )
    # Encoder-decoder attention only has to ignore source padding.
    dec_mask_2 = self.create_padding_mask(enc_inputs)

    enc_outputs = self.encoder(enc_inputs, enc_mask, training)
    dec_outputs = self.decoder(dec_inputs,
                               enc_outputs,
                               dec_mask_1,
                               dec_mask_2,
                               training)

    outputs = self.last_linear(dec_outputs)
    return outputs



## Training

In [0]:
# Reset the Keras global state before building a new model.
tf.keras.backend.clear_session()

# Hyper-parameters
# NOTE(review): the trailing number on each line looks like a larger reference
# value for the same setting — confirm where it comes from.
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1

# English is the encoder (source) side, French the decoder (target) side.
transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                          vocab_size_dec=VOCAB_SIZE_FR,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FNN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout=DROPOUT_RATE)

In [0]:
# Per-token loss (no reduction) so padded positions can be masked out by hand.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

def loss_function(target, pred):
  """Cross-entropy averaged over the non-padding target tokens.

  Args:
    target: integer token ids; id 0 marks padding.
    pred: logits over the vocabulary for every target position.

  Returns:
    Scalar loss: sum of the per-token losses at real tokens divided by the
    number of real tokens.
  """
  mask = tf.math.logical_not(tf.math.equal(target, 0))
  loss_ = loss_object(target, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  # Normalise by the number of real tokens, not by every position: a plain
  # reduce_mean would shrink the loss according to how much padding a batch
  # happens to contain. The max() guards an all-padding batch.
  return tf.reduce_sum(loss_) / tf.maximum(tf.reduce_sum(mask), 1.0)

# Running metrics, reset at the start of every epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [0]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up then inverse-square-root decay learning-rate schedule.

  lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  Args:
    d_model: model dimension used to scale the rate.
    warmup_steps: number of steps over which the rate grows linearly.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step `step`."""
    # Optimizers may pass the step as an integer tensor, which rsqrt rejects.
    step = tf.cast(step, tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps**-1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                    beta_1=0.9,
                                    beta_2=0.98,
                                    epsilon=1e-9)

In [0]:
# Checkpoints live on the Drive mounted at /content/drive earlier in the
# notebook. The previous value ('./drive/ My DRive/...') contained a stray
# space and wrong capitalisation, so it did not point inside the Drive folder.
checkpoint_path = '/content/drive/My Drive/projects/transformer/ckpt'
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

# Keep only the five most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the latest checkpoint when one exists. The method is `restore`;
# tf.train.Checkpoint has no `restart`.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Latest checkpoint restored!!')

In [0]:
EPOCHS = 2
for epoch in range(EPOCHS):
  print(f'Start of epoch{epoch+1}')
  start = time.time()

  # Metrics are accumulated per epoch.
  train_loss.reset_states()
  train_accuracy.reset_states()

  for (batch, (enc_inputs, targets)) in enumerate(dataset):
    # Teacher forcing: the decoder sees the target without its last token and
    # is trained to predict the target shifted one position to the left.
    dec_inputs = targets[:, :-1]
    dec_outputs_real = targets[:, 1:]
    with tf.GradientTape() as tape:
      predictions = transformer(enc_inputs, dec_inputs, True)
      loss = loss_function(dec_outputs_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)
    train_accuracy(dec_outputs_real, predictions)
    if batch % 50 == 0:
      # '{:.4f%}' is not a valid format spec and raises ValueError; the
      # percent sign belongs outside the replacement field.
      print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}%".format(
          epoch+1, batch, train_loss.result(), 100.*train_accuracy.result()))

  # Save once per epoch.
  ckpt_save_path = ckpt_manager.save()
  print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                      ckpt_save_path))
  print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))

# Evaluation

In [0]:
def evaluate(inp_sentence):
  """Greedily decode a translation of `inp_sentence`.

  Args:
    inp_sentence: source sentence as a string.

  Returns:
    1-D tensor of generated token ids, starting with the start id
    (VOCAB_SIZE_FR - 2) and without the end id.
  """
  # Encode the sentence and wrap it with the two reserved marker ids.
  inp_sentence = [VOCAB_SIZE_EN-2] + tokenizer_en.encode(inp_sentence) + [VOCAB_SIZE_EN-1]
  enc_input = tf.expand_dims(inp_sentence, axis=0)  # add the batch dimension

  # The decoder output so far is also the decoder input of the next step.
  dec_output = tf.expand_dims([VOCAB_SIZE_FR-2], axis=0)

  # Generate at most MAX_LEN tokens, one per pass through the model.
  for _ in range(MAX_LEN):
    predictions = transformer(enc_input, dec_output, False)  # (1, seq_len, vocab_size_fr)

    # Only the logits of the last position matter for the next token.
    prediction = predictions[:, -1:, :]

    predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)

    if predicted_id == VOCAB_SIZE_FR-1:  # end id: translation is done
      return tf.squeeze(dec_output, axis=0)  # drop the batch dimension

    # Append the new token (this line used to reference the undefined name
    # `dec_ouput`, raising NameError on the first step).
    dec_output = tf.concat([dec_output, predicted_id], axis=-1)
  return tf.squeeze(dec_output, axis=0)

In [0]:
def translate(sentence):
  """Translate `sentence` with the trained model and print both texts."""
  generated_ids = evaluate(sentence).numpy()

  # Ids at or above VOCAB_SIZE_FR - 2 are the reserved marker ids; they are
  # not part of the tokenizer vocabulary and are dropped before decoding.
  word_ids = [token for token in generated_ids if token < VOCAB_SIZE_FR-2]
  predicted_sentence = tokenizer_fr.decode(word_ids)

  print(f'Input: {sentence}')
  print(f'Predicted translation: {predicted_sentence}')

In [0]:
# Try the model on one sample English sentence; translate() prints the result.
translate('It was so wonderful to build this application and I have learned a lot from this course and really enjoy it!!!')