# Import dependencies

In [1]:
import numpy as np 
import math 
import re 
import time
#from google.colab import drive

In [2]:
import tensorflow as tf 
from tensorflow.keras import layers
import tensorflow_datasets as tfdf

# Loading Files

In [3]:
with open("C:\\Users\\gaura\\Desktop\\Modern Natural Language Processing in Python\\Transformer\\europarl-v7.fr-en.en",
          mode='r',
          encoding='utf-8') as f:
    europarl_en = f.read()
with open("C:\\Users\\gaura\\Desktop\\Modern Natural Language Processing in Python\\Transformer\\europarl-v7.fr-en.fr",
          mode='r',
          encoding='utf-8') as f:
    europarl_fr = f.read()
with open("C:\\Users\\gaura\\Desktop\\Modern Natural Language Processing in Python\\Transformer\\nonbreaking_prefix.en",
          mode='r',
          encoding='utf-8') as f:
    non_breaking_prefix_en = f.read()
with open("C:\\Users\\gaura\\Desktop\\Modern Natural Language Processing in Python\\Transformer\\nonbreaking_prefix.fr",
          mode='r',
          encoding='utf-8') as f:
    non_breaking_prefix_fr = f.read()

In [4]:
europarl_en[:50]

'Resumption of the session\nI declare resumed the se'

# Cleaning

In [5]:
non_breaking_prefix_en = non_breaking_prefix_en.split("\n")
non_breaking_prefix_en = [' ' + pref + '.' for pref in non_breaking_prefix_en]
non_breaking_prefix_fr = non_breaking_prefix_fr.split("\n")
non_breaking_prefix_fr = [' ' + pref + '.' for pref in non_breaking_prefix_fr]

In [6]:
#We will need each word and other symbol that we want to keep to be in lower case and separated by spaces so
#we can "tokenize" them.
corpus_en = europarl_en
# Add $$$ after non ending sentence points
for prefix in non_breaking_prefix_en:
    corpus_en = corpus_en.replace(prefix, prefix + '$$$')
corpus_en = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", corpus_en)
# Remove $$$ markers
corpus_en = re.sub(r".\$\$\$", '', corpus_en)
# Clear multiple spaces
corpus_en = re.sub(r"  +", " ", corpus_en)
corpus_en = corpus_en.split('\n')

corpus_fr = europarl_fr
for prefix in non_breaking_prefix_fr:
    corpus_fr = corpus_fr.replace(prefix, prefix + '$$$')
corpus_fr = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".$$$", corpus_fr)
corpus_fr = re.sub(r".\$\$\$", '', corpus_fr)
corpus_fr = re.sub(r"  +", " ", corpus_fr)
corpus_fr = corpus_fr.split('\n')

## Tokenizing text

In [7]:
tokenizer_en = tfdf.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=2**13)
tokenizer_fr = tfdf.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_fr, target_vocab_size=2**13)


VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2
VOCAB_SIZE_FR = tokenizer_fr.vocab_size + 2


inputs = [[VOCAB_SIZE_EN-2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN-1]
          for sentence in corpus_en]
outputs = [[VOCAB_SIZE_FR-2] + tokenizer_fr.encode(sentence) + [VOCAB_SIZE_FR-1]
           for sentence in corpus_fr]

## Remove too long sentences 

In [8]:
MAX_LENGTH = 20
idx_to_remove = [count for count, sent in enumerate(inputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]
idx_to_remove = [count for count, sent in enumerate(outputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

In [9]:
#Input and Output Creation
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding='post',
                                                       maxlen=MAX_LENGTH)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding='post',
                                                        maxlen=MAX_LENGTH)

In [10]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

# Model Building 

## Embedding

In [11]:
class PositionalEncoding(layers.Layer):

    def __init__(self):
        super(PositionalEncoding, self).__init__()
    
    def get_angles(self, pos, i, d_model):
        angles = 1 / np.power(10000., (2*(i//2)) / np.float32(d_model))
        return pos * angles

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)

## Attention

In [12]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    
    if mask is not None:
        scaled_product += (mask * -1e9)
    
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)
    
    return attention

## Multi-head attention sublayer

In [13]:
class MultiHeadAttention(layers.Layer):
    
    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        self.nb_proj = nb_proj
        
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0
        
        self.d_proj = self.d_model // self.nb_proj
        
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)
        
        self.final_lin = layers.Dense(units=self.d_model)
        
    def split_proj(self, inputs, batch_size): # inputs: (batch_size, seq_length, d_model)
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)
        splited_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_length, nb_proj, d_proj)
        return tf.transpose(splited_inputs, perm=[0, 2, 1, 3]) # (batch_size, nb_proj, seq_length, d_proj)
    
    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)
        
        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)
        
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        
        concat_attention = tf.reshape(attention,
                                      shape=(batch_size, -1, self.d_model))
        
        outputs = self.final_lin(concat_attention)
        
        return outputs

## Encoder

In [14]:
class EncoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs,
                                              inputs,
                                              inputs,
                                              mask)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)
        
        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attention)
        
        return outputs

In [15]:
class Encoder(layers.Layer):
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.enc_layers = [EncoderLayer(FFN_units,
                                        nb_proj,
                                        dropout_rate) 
                           for _ in range(nb_layers)]
    
    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)

        return outputs

# Decoder 

In [16]:
class DecoderLayer(layers.Layer):
    
    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        # Self multi head attention
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        # Multi head attention combined with encoder output
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
        # Feed foward
        self.dense_1 = layers.Dense(units=self.FFN_units,
                                    activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
        
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        attention = self.multi_head_attention_1(inputs,
                                                inputs,
                                                inputs,
                                                mask_1)
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)
        
        attention_2 = self.multi_head_attention_2(attention,
                                                  enc_outputs,
                                                  enc_outputs,
                                                  mask_2)
        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)
        
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(outputs + attention_2)
        
        return outputs

In [17]:
class Decoder(layers.Layer):
    
    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        
        self.dec_layers = [DecoderLayer(FFN_units,
                                        nb_proj,
                                        dropout_rate) 
                           for i in range(nb_layers)]
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,
                                         enc_outputs,
                                         mask_1,
                                         mask_2,
                                         training)

        return outputs

# Transformer

In [18]:
class Transformer(tf.keras.Model):
    
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout_rate,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)
        
        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               vocab_size_enc,
                               d_model)
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout_rate,
                               vocab_size_dec,
                               d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec, name="lin_ouput")
    
    def create_padding_mask(self, seq):
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask
    
    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs)
        )
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2,
                                   training)
        
        outputs = self.last_linear(dec_outputs)
        
        return outputs

# Trainning

In [19]:
tf.keras.backend.clear_session()

# Hyper-parameters
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                          vocab_size_dec=VOCAB_SIZE_FR,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout_rate=DROPOUT_RATE)

In [20]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction="none")

def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

In [21]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)
        
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

leaning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(leaning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)
        

In [22]:
checkpoint_path = "C:\\Users\\gaura\\Desktop\\Modern Natural Language Processing in Python\\Transformer\\ckpt"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

In [23]:
EPOCHS = 10
for epoch in range(EPOCHS):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)
        
        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
                epoch+1, batch, train_loss.result(), train_accuracy.result()))
            
    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpoint for epoch {} at {}".format(epoch+1,
                                                        ckpt_save_path))
    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))

Start of epoch 1
Epoch 1 Batch 0 Loss 6.6340 Accuracy 0.0000
Epoch 1 Batch 50 Loss 6.2636 Accuracy 0.0101
Epoch 1 Batch 100 Loss 6.1788 Accuracy 0.0311
Epoch 1 Batch 150 Loss 6.1183 Accuracy 0.0383
Epoch 1 Batch 200 Loss 6.0342 Accuracy 0.0418
Epoch 1 Batch 250 Loss 5.9366 Accuracy 0.0440
Epoch 1 Batch 300 Loss 5.8097 Accuracy 0.0455
Epoch 1 Batch 350 Loss 5.6868 Accuracy 0.0500
Epoch 1 Batch 400 Loss 5.5521 Accuracy 0.0552
Epoch 1 Batch 450 Loss 5.4314 Accuracy 0.0606
Epoch 1 Batch 500 Loss 5.3187 Accuracy 0.0664
Epoch 1 Batch 550 Loss 5.2144 Accuracy 0.0726
Epoch 1 Batch 600 Loss 5.1134 Accuracy 0.0785
Epoch 1 Batch 650 Loss 5.0207 Accuracy 0.0841
Epoch 1 Batch 700 Loss 4.9302 Accuracy 0.0895
Epoch 1 Batch 750 Loss 4.8453 Accuracy 0.0946
Epoch 1 Batch 800 Loss 4.7643 Accuracy 0.0996
Epoch 1 Batch 850 Loss 4.6867 Accuracy 0.1046
Epoch 1 Batch 900 Loss 4.6147 Accuracy 0.1094
Epoch 1 Batch 950 Loss 4.5478 Accuracy 0.1141
Epoch 1 Batch 1000 Loss 4.4831 Accuracy 0.1186
Epoch 1 Batch 1050 

Epoch 2 Batch 2800 Loss 1.4608 Accuracy 0.4200
Epoch 2 Batch 2850 Loss 1.4563 Accuracy 0.4206
Epoch 2 Batch 2900 Loss 1.4525 Accuracy 0.4212
Epoch 2 Batch 2950 Loss 1.4487 Accuracy 0.4217
Epoch 2 Batch 3000 Loss 1.4446 Accuracy 0.4222
Epoch 2 Batch 3050 Loss 1.4412 Accuracy 0.4227
Epoch 2 Batch 3100 Loss 1.4376 Accuracy 0.4232
Epoch 2 Batch 3150 Loss 1.4337 Accuracy 0.4238
Epoch 2 Batch 3200 Loss 1.4296 Accuracy 0.4243
Epoch 2 Batch 3250 Loss 1.4258 Accuracy 0.4248
Epoch 2 Batch 3300 Loss 1.4222 Accuracy 0.4253
Epoch 2 Batch 3350 Loss 1.4184 Accuracy 0.4258
Epoch 2 Batch 3400 Loss 1.4150 Accuracy 0.4263
Epoch 2 Batch 3450 Loss 1.4115 Accuracy 0.4268
Epoch 2 Batch 3500 Loss 1.4077 Accuracy 0.4274
Epoch 2 Batch 3550 Loss 1.4041 Accuracy 0.4279
Epoch 2 Batch 3600 Loss 1.4006 Accuracy 0.4284
Epoch 2 Batch 3650 Loss 1.3976 Accuracy 0.4289
Epoch 2 Batch 3700 Loss 1.3947 Accuracy 0.4295
Epoch 2 Batch 3750 Loss 1.3917 Accuracy 0.4301
Epoch 2 Batch 3800 Loss 1.3886 Accuracy 0.4307
Epoch 2 Batch

Epoch 3 Batch 5600 Loss 1.1628 Accuracy 0.4649
Epoch 3 Batch 5650 Loss 1.1632 Accuracy 0.4648
Epoch 3 Batch 5700 Loss 1.1637 Accuracy 0.4646
Saving checkpoint for epoch 3 at C:\Users\gaura\Desktop\Modern Natural Language Processing in Python\Transformer\ckpt\ckpt-3
Time taken for 1 epoch: 6577.199537038803 secs

Start of epoch 4
Epoch 4 Batch 0 Loss 1.2336 Accuracy 0.4342
Epoch 4 Batch 50 Loss 1.1992 Accuracy 0.4528
Epoch 4 Batch 100 Loss 1.2025 Accuracy 0.4542
Epoch 4 Batch 150 Loss 1.2144 Accuracy 0.4547
Epoch 4 Batch 200 Loss 1.2165 Accuracy 0.4549
Epoch 4 Batch 250 Loss 1.2159 Accuracy 0.4557
Epoch 4 Batch 300 Loss 1.2160 Accuracy 0.4560
Epoch 4 Batch 350 Loss 1.2169 Accuracy 0.4556
Epoch 4 Batch 400 Loss 1.2139 Accuracy 0.4562
Epoch 4 Batch 450 Loss 1.2096 Accuracy 0.4563
Epoch 4 Batch 500 Loss 1.2062 Accuracy 0.4566
Epoch 4 Batch 550 Loss 1.2046 Accuracy 0.4567
Epoch 4 Batch 600 Loss 1.2028 Accuracy 0.4566
Epoch 4 Batch 650 Loss 1.2029 Accuracy 0.4565
Epoch 4 Batch 700 Loss 1.201

Epoch 5 Batch 2500 Loss 1.0603 Accuracy 0.4825
Epoch 5 Batch 2550 Loss 1.0573 Accuracy 0.4827
Epoch 5 Batch 2600 Loss 1.0544 Accuracy 0.4830
Epoch 5 Batch 2650 Loss 1.0516 Accuracy 0.4833
Epoch 5 Batch 2700 Loss 1.0488 Accuracy 0.4836
Epoch 5 Batch 2750 Loss 1.0462 Accuracy 0.4840
Epoch 5 Batch 2800 Loss 1.0437 Accuracy 0.4843
Epoch 5 Batch 2850 Loss 1.0418 Accuracy 0.4846
Epoch 5 Batch 2900 Loss 1.0395 Accuracy 0.4849
Epoch 5 Batch 2950 Loss 1.0373 Accuracy 0.4852
Epoch 5 Batch 3000 Loss 1.0353 Accuracy 0.4856
Epoch 5 Batch 3050 Loss 1.0332 Accuracy 0.4858
Epoch 5 Batch 3100 Loss 1.0308 Accuracy 0.4860
Epoch 5 Batch 3150 Loss 1.0288 Accuracy 0.4863
Epoch 5 Batch 3200 Loss 1.0269 Accuracy 0.4865
Epoch 5 Batch 3250 Loss 1.0252 Accuracy 0.4867
Epoch 5 Batch 3300 Loss 1.0233 Accuracy 0.4870
Epoch 5 Batch 3350 Loss 1.0214 Accuracy 0.4873
Epoch 5 Batch 3400 Loss 1.0191 Accuracy 0.4877
Epoch 5 Batch 3450 Loss 1.0172 Accuracy 0.4880
Epoch 5 Batch 3500 Loss 1.0154 Accuracy 0.4883
Epoch 5 Batch

Epoch 6 Batch 5300 Loss 0.9837 Accuracy 0.4944
Epoch 6 Batch 5350 Loss 0.9847 Accuracy 0.4941
Epoch 6 Batch 5400 Loss 0.9858 Accuracy 0.4939
Epoch 6 Batch 5450 Loss 0.9868 Accuracy 0.4936
Epoch 6 Batch 5500 Loss 0.9877 Accuracy 0.4933
Epoch 6 Batch 5550 Loss 0.9886 Accuracy 0.4931
Epoch 6 Batch 5600 Loss 0.9893 Accuracy 0.4928
Epoch 6 Batch 5650 Loss 0.9902 Accuracy 0.4926
Epoch 6 Batch 5700 Loss 0.9910 Accuracy 0.4924
Saving checkpoint for epoch 6 at C:\Users\gaura\Desktop\Modern Natural Language Processing in Python\Transformer\ckpt\ckpt-6
Time taken for 1 epoch: 8420.899336576462 secs

Start of epoch 7
Epoch 7 Batch 0 Loss 1.0138 Accuracy 0.4975
Epoch 7 Batch 50 Loss 1.1123 Accuracy 0.4771
Epoch 7 Batch 100 Loss 1.0889 Accuracy 0.4796
Epoch 7 Batch 150 Loss 1.0967 Accuracy 0.4765
Epoch 7 Batch 200 Loss 1.0890 Accuracy 0.4765
Epoch 7 Batch 250 Loss 1.0868 Accuracy 0.4769
Epoch 7 Batch 300 Loss 1.0801 Accuracy 0.4773
Epoch 7 Batch 350 Loss 1.0783 Accuracy 0.4765
Epoch 7 Batch 400 Loss

Epoch 8 Batch 2200 Loss 0.9857 Accuracy 0.4955
Epoch 8 Batch 2250 Loss 0.9832 Accuracy 0.4958
Epoch 8 Batch 2300 Loss 0.9803 Accuracy 0.4961
Epoch 8 Batch 2350 Loss 0.9777 Accuracy 0.4964
Epoch 8 Batch 2400 Loss 0.9752 Accuracy 0.4966
Epoch 8 Batch 2450 Loss 0.9722 Accuracy 0.4968
Epoch 8 Batch 2500 Loss 0.9690 Accuracy 0.4971
Epoch 8 Batch 2550 Loss 0.9661 Accuracy 0.4975
Epoch 8 Batch 2600 Loss 0.9631 Accuracy 0.4978
Epoch 8 Batch 2650 Loss 0.9602 Accuracy 0.4981
Epoch 8 Batch 2700 Loss 0.9577 Accuracy 0.4985
Epoch 8 Batch 2750 Loss 0.9552 Accuracy 0.4988
Epoch 8 Batch 2800 Loss 0.9531 Accuracy 0.4992
Epoch 8 Batch 2850 Loss 0.9513 Accuracy 0.4995
Epoch 8 Batch 2900 Loss 0.9496 Accuracy 0.4997
Epoch 8 Batch 2950 Loss 0.9474 Accuracy 0.5000
Epoch 8 Batch 3000 Loss 0.9459 Accuracy 0.5002
Epoch 8 Batch 3050 Loss 0.9438 Accuracy 0.5004
Epoch 8 Batch 3100 Loss 0.9417 Accuracy 0.5007
Epoch 8 Batch 3150 Loss 0.9401 Accuracy 0.5009
Epoch 8 Batch 3200 Loss 0.9383 Accuracy 0.5012
Epoch 8 Batch

Epoch 9 Batch 5000 Loss 0.9110 Accuracy 0.5067
Epoch 9 Batch 5050 Loss 0.9123 Accuracy 0.5064
Epoch 9 Batch 5100 Loss 0.9136 Accuracy 0.5062
Epoch 9 Batch 5150 Loss 0.9148 Accuracy 0.5059
Epoch 9 Batch 5200 Loss 0.9161 Accuracy 0.5056
Epoch 9 Batch 5250 Loss 0.9174 Accuracy 0.5054
Epoch 9 Batch 5300 Loss 0.9185 Accuracy 0.5051
Epoch 9 Batch 5350 Loss 0.9196 Accuracy 0.5048
Epoch 9 Batch 5400 Loss 0.9210 Accuracy 0.5046
Epoch 9 Batch 5450 Loss 0.9223 Accuracy 0.5043
Epoch 9 Batch 5500 Loss 0.9232 Accuracy 0.5040
Epoch 9 Batch 5550 Loss 0.9242 Accuracy 0.5038
Epoch 9 Batch 5600 Loss 0.9252 Accuracy 0.5035
Epoch 9 Batch 5650 Loss 0.9261 Accuracy 0.5033
Epoch 9 Batch 5700 Loss 0.9270 Accuracy 0.5030
Saving checkpoint for epoch 9 at C:\Users\gaura\Desktop\Modern Natural Language Processing in Python\Transformer\ckpt\ckpt-9
Time taken for 1 epoch: 6631.252534627914 secs

Start of epoch 10
Epoch 10 Batch 0 Loss 0.9803 Accuracy 0.4753
Epoch 10 Batch 50 Loss 1.0170 Accuracy 0.4885
Epoch 10 Batc

# Evaluating

In [24]:
def evaluate(inp_sentence):
    inp_sentence = \
        [VOCAB_SIZE_EN-2] + tokenizer_en.encode(inp_sentence) + [VOCAB_SIZE_EN-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)
    
    output = tf.expand_dims([VOCAB_SIZE_FR-2], axis=0)
    
    for _ in range(MAX_LENGTH):
        predictions = transformer(enc_input, output, False)
        
        prediction = predictions[:, -1:, :]
        
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)
        
        if predicted_id == VOCAB_SIZE_FR-1:
            return tf.squeeze(output, axis=0)
        
        output = tf.concat([output, predicted_id], axis=-1)
        
    return tf.squeeze(output, axis=0)

In [25]:
def translate(sentence):
    output = evaluate(sentence).numpy()
    
    predicted_sentence = tokenizer_fr.decode(
        [i for i in output if i < VOCAB_SIZE_FR-2]
    )
    
    print("Input: {}".format(sentence))
    print("Predicted translation: {}".format(predicted_sentence))

In [27]:
translate("i love you")

Input: i love you
Predicted translation: Je vous souhaite beaucoup d'attention
