In [1]:
import pathlib
import random
import re

import numpy as np
import tensorflow as tf
from tensorflow import keras

# Print the TensorFlow version that this notebook is running with
print(tf.__version__)

# Set some random seeds so that we should get the same answer! This method
# sets the keras, python and numpy random seeds
keras.utils.set_random_seed(42)

# Get the ANKI English to Spanish dataset. The archive is requested over
# HTTPS rather than plain HTTP so that the download is protected in transit
text_file = keras.utils.get_file(
    fname="spa-eng.zip",
    origin="https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
    extract=True,
)
# The sentence pairs are read from spa-eng/spa.txt, which is expected to sit
# in the same directory as the path returned by get_file
data_path = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

2.12.0
Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [2]:
# The text contains accented Spanish characters, so decode the file explicitly
# as UTF-8 instead of relying on the default encoding of the platform
with open(data_path, encoding="utf-8") as data_file:
    # [:-1] drops the final element of the split (presumably the empty string
    # that follows a trailing newline)
    lines = data_file.read().split("\n")[:-1]

# Now convert the lines into English to Spanish pairs
# Since this is a simple model, it makes sense to enforce lower case and
# remove any punctuation
# The regex below removes characters not in the list, don't forget tabs (\t)!
# The tab is what separates the two languages on each line, and we need to
# make sure that we keep the special characters from Spanish
unwanted_characters = re.compile(r'[^A-Za-z0-9 \táéíóúñ]+')
english_spanish_pairs = []
for line in lines:
    line = unwanted_characters.sub('', line.lower())
    english, spanish = line.split("\t")
    # As seen in the lecture, we need to add start and end tokens for Spanish
    spanish = "<sos> " + spanish + " <eos>"
    english_spanish_pairs.append((english, spanish))


In [3]:
# Randomise the order of the pairs (in place) so that they can be split into
# training and validation samples (80%/20%)
random.shuffle(english_spanish_pairs)

# Report the size of the dataset and show the first five shuffled examples
num_pairs = len(english_spanish_pairs)
print('We have', num_pairs, 'examples in the dataset')
for example_index in range(5):
    print('Example', example_index, ':', english_spanish_pairs[example_index])

# The first 80% of the shuffled pairs are for training, the rest for validation
num_training = int(0.8 * num_pairs)
training_pairs, validation_pairs = (
    english_spanish_pairs[:num_training],
    english_spanish_pairs[num_training:],
)

num_validation = len(validation_pairs)
print("Training sample size =", len(training_pairs),
      "and validation sample =", num_validation)

We have 118964 examples in the dataset
Example 0 : ('its not in my contract', '<sos> no está en mi contrato <eos>')
Example 1 : ('tom has the right to vote', '<sos> tom tiene el derecho a votar <eos>')
Example 2 : ('my car is covered with pigeon poop', '<sos> mi coche está cubierto de caca de paloma <eos>')
Example 3 : ('tom went to paris to study french', '<sos> tom fue a parís para estudiar francés <eos>')
Example 4 : ('you dont have to do anything you dont want to', '<sos> no tienes que hacer nada que no quieras <eos>')
Training sample size = 95171 and validation sample = 23793


In [4]:
# Upper limit on the vocabulary size for each language. If a language has
# more words than this, then only the most frequent words are kept in the
# vocabulary
vocab_size_english = 15000
vocab_size_spanish = 15000

# Number of tokens in a vectorised English sentence. Sentences with fewer
# words are padded with zeros during the vectorisation
sequence_length = 20

# These layers simply convert each word of a sentence into an integer.
# standardize=None because the text was already cleaned when it was loaded
english_vectorisation = keras.layers.TextVectorization(
    standardize=None,
    max_tokens=vocab_size_english,
    output_mode="int",
    output_sequence_length=sequence_length,
)
# The Spanish sequences are one token longer because of the start / end
# tokens. It is one and not two because the decoder output never contains
# the <sos> token and the decoder input never contains the <eos> token
spanish_vectorisation = keras.layers.TextVectorization(
    standardize=None,
    max_tokens=vocab_size_spanish,
    output_mode='int',
    output_sequence_length=sequence_length + 1,
)

# Each vocabulary is built from the sentences of a single language, so
# separate the pairs into one list per language
english_words = [pair[0] for pair in english_spanish_pairs]
spanish_words = [pair[1] for pair in english_spanish_pairs]

# Actually build the vocabulary
english_vectorisation.adapt(english_words)
spanish_vectorisation.adapt(spanish_words)


In [5]:
# The batch size is fixed here because the dataset pipeline needs it
batch_size = 64

# The sentences need a little more formatting. Every example is made up of
# three sentences:
# 1) "encoder_inputs": the English sentence
# 2) "decoder_inputs": the Spanish sentence with <sos> for teacher forcing
# 3) the target sentence to predict: the Spanish sentence with <eos> at the end
# A tf.data.Dataset object is a simple way to hold this data and batch it
def format_dataset(english, spanish_input, spanish_target):
    """Vectorise the raw sentences into the (inputs, target) pair used for training.

    Args:
        english: English sentences, passed to `english_vectorisation`.
        spanish_input: Spanish decoder-input sentences, passed to
            `spanish_vectorisation`.
        spanish_target: Spanish target sentences, passed to
            `spanish_vectorisation`.

    Returns:
        A tuple `(inputs, target)` where `inputs` is a dictionary with the keys
        "encoder_inputs" and "decoder_inputs", and `target` is the vectorised
        target.
    """
    encoder_tokens = english_vectorisation(english)
    decoder_tokens = spanish_vectorisation(spanish_input)
    target_tokens = spanish_vectorisation(spanish_target)
    model_inputs = {
        "encoder_inputs": encoder_tokens,
        "decoder_inputs": decoder_tokens,
    }
    return model_inputs, target_tokens

def make_dataset(pairs, batch_size):
    """Build a batched, vectorised tf.data.Dataset from sentence pairs.

    Args:
        pairs: sequence of (english, spanish) string pairs, where each Spanish
            sentence starts with "<sos> " and ends with " <eos>".
        batch_size: number of examples per batch.

    Returns:
        A dataset yielding `(inputs, target)` batches as produced by
        `format_dataset`. The batches are cached, shuffled and prefetched.
    """
    eng_texts, spa_texts = zip(*pairs)
    # Decoder input needs to remove the last word (<eos>)
    spanish_input = [sentence.rsplit(' ', 1)[0] for sentence in spa_texts]
    # Target needs to remove the first word (<sos>)
    spanish_target = [sentence.split(' ', 1)[1] for sentence in spa_texts]
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), spanish_input, spanish_target))
    # Batch before vectorising so that format_dataset works on whole batches
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    # The shuffle comes after the batching, so it reorders whole batches.
    # tf.data.AUTOTUNE replaces the deprecated tf.data.experimental alias
    return dataset.cache().shuffle(2048).prefetch(tf.data.AUTOTUNE)

training_dataset = make_dataset(training_pairs, batch_size)
validation_dataset = make_dataset(validation_pairs, batch_size)

In [6]:
# Look at a single batch: print the shape of each tensor and then the first
# example of the batch for each of them
for batch_inputs, batch_targets in training_dataset.take(1):
    encoder_batch = batch_inputs["encoder_inputs"]
    decoder_batch = batch_inputs["decoder_inputs"]
    print('inputs["encoder_inputs"] shape: ', encoder_batch.shape)
    print('inputs["decoder_inputs"] shape: ', decoder_batch.shape)
    print('targets shape: ', batch_targets.shape)
    for batch_tensor in (encoder_batch, decoder_batch, batch_targets):
        print(batch_tensor[0])

inputs["encoder_inputs"] shape:  (64, 20)
inputs["decoder_inputs"] shape:  (64, 21)
targets shape:  (64, 21)
tf.Tensor(
[   3 1194   49 2478  195  110   52    0    0    0    0    0    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)
tf.Tensor(
[   2 5393   18 1345    5  608   32  233  856    0    0    0    0    0
    0    0    0    0    0    0    0], shape=(21,), dtype=int64)
tf.Tensor(
[5393   18 1345    5  608   32  233  856    3    0    0    0    0    0
    0    0    0    0    0    0    0], shape=(21,), dtype=int64)


---

We have (finally) formatted the training and validation data into a format we can use. Now let's get on with defining all of the components that we need for our transformer.
1. Word embedding and position encoding
2. Multi-head attention
3. Feed-forward network
4. The encoder
5. The decoder
6. The transformer (brings together all of the above)

---

In [None]:
# Firstly, let's make a sinusoidal position encoding layer. We will actually
# use a learned position encoding, but I wanted to include this layer
# for completeness as it was used in the original transformer
class SinusoidalPositionEncoding(keras.layers.Layer):
    """Fixed sinusoidal position encoding.

    The encoding is a `(sequence_length, d_model)` array in which the even
    columns hold sines and the odd columns hold cosines of the position
    divided by 10000^(2i/d_model). It has no trainable weights and does not
    depend on the layer input, so it is computed once at construction time.

    Args:
        sequence_length: number of positions to encode.
        d_model: size of the encoding vector for each position. Both even and
            odd values are supported.
    """
    def __init__(self, sequence_length, d_model, **kwargs):
        super().__init__(**kwargs)
        self.l = sequence_length
        self.d_model = d_model
        # The table only depends on the two sizes, so build it a single time
        self.encoded_position = self._build_encoding(sequence_length, d_model)

    @staticmethod
    def _build_encoding(sequence_length, d_model):
        """Return the (sequence_length, d_model) array of sinusoidal encodings."""
        position = np.arange(sequence_length)[:, np.newaxis]
        # One denominator per even column: 1 / 10000^(2i/d_model)
        div_term = np.exp(-np.log(10000.0) * (np.arange(0, d_model, 2) / d_model))
        # Broadcasting gives one angle per (position, even column)
        angles = position * div_term
        encoded_position = np.zeros((sequence_length, d_model))
        encoded_position[:, 0::2] = np.sin(angles)
        # When d_model is odd there is one odd column fewer than even columns,
        # so only the first d_model // 2 angles are used for the cosines
        encoded_position[:, 1::2] = np.cos(angles[:, : d_model // 2])
        return encoded_position

    # Inputs doesn't do anything, but is needed to match the interface of the
    # learned position embedding
    def call(self, inputs):
        """Return the precomputed position encoding; `inputs` is ignored."""
        return self.encoded_position

In [None]:
# Layer that performs both the word embedding and the position encoding. The
# position encoding can be either learned or sinusoidal.
class EmbedAndEncode(keras.layers.Layer):
    """Sum of a word embedding and a position encoding.

    Args:
        sequence_length: number of positions that are encoded.
        d_model: size of the embedding vectors.
        vocab_size: number of entries in the word embedding.
        learned_encoding: if true the positions go through a learned
            `Embedding` layer, otherwise `SinusoidalPositionEncoding` is used.
    """
    def __init__(self, sequence_length, d_model, vocab_size, learned_encoding, **kwargs):
        super().__init__(**kwargs)
        self.l = sequence_length
        # Word embedding
        self.embedding = keras.layers.Embedding(vocab_size, d_model)
        # Position encoding: both options take (sequence_length, d_model)
        position_layer = keras.layers.Embedding if learned_encoding else SinusoidalPositionEncoding
        self.position = position_layer(sequence_length, d_model)

    def call(self, inputs):
        """Embed `inputs` and add the encoding of the positions 0 .. sequence_length - 1."""
        word_embedding = self.embedding(inputs)
        position_encoding = self.position(np.arange(0, self.l, 1))
        return position_encoding + word_embedding

---

In this next block we will build the attention mechanism. This is the key component of the transformer. To recap from the lectures, there are a few steps that we need to follow.


1.   We get three inputs passed into the attention heads, which then pass through the $W^Q$, $W^K$ and $W^V$ matrices, which all have dimensions $\left( d_\textrm{model}, d_k \right)$ to give us queries $Q$, keys $K$, and values $V$.
2.   Calculate $A = \textrm{softmax}\left( \frac{QK^T}{\sqrt{d_k}} \right)$
3.   The output from the attention head is given by the matrix product $AV$
4.   Repeat the above for each attention head and concatenate the output
5.   Pass the concatenated output through the dense layer representing matrix $W^0$ with shape $\left( \left(n_\textrm{heads}d_k \right) \times d_\textrm{model}\right)$

Implementation information:

*   We saw in the lectures how to represent the weight matrices as `Dense` layers, where the number of neurons is equal to the number of columns in the matrix. We need to ensure that we don't use a bias term and that we are using a linear (effectively the identity) activation function. This is the default in keras so we don't need to specify it in the code below.
* Use `tf.matmul(A,B)` to multiply matrix $A$ by matrix $B$. It can take optional boolean arguments to transpose the matrices before multiplying (`transpose_a` and `transpose_b`)


In [None]:
# Ordinarily we could just use keras.layers.Attention or MultiHeadAttention
# to build a transformer. However, in this case let's write our own layer
# as an exercise.
class CustomAttention(keras.layers.Layer):
    """Multi-head attention with dropout, a residual connection and layer normalisation.

    This is an exercise skeleton: every `None` placeholder pointed out by the
    comments has to be replaced before the layer can be used. The only `None`
    that must stay is the one in the `mask is not None` check.

    Args:
        d_model: size of the model vectors (not stored by the skeleton; it is
            available for sizing the dense layers).
        d_k: size of the query / key / value projections of each head.
        num_heads: number of attention heads.
    """
    def __init__(self, d_model, d_k, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Important values
        self.d_k = d_k
        self.num_heads = num_heads
        # Q, K and V weight matrices, one bias-free dense layer per head.
        # Fill in the number of nodes in the W^Q, W^K and W^V dense layers
        # by replacing the "None" values in the three lines below
        self.Wq = [keras.layers.Dense(None, use_bias=False) for _ in range(self.num_heads)]
        self.Wk = [keras.layers.Dense(None, use_bias=False) for _ in range(self.num_heads)]
        self.Wv = [keras.layers.Dense(None, use_bias=False) for _ in range(self.num_heads)]
        # Final weight matrix applied to concatenated output of all heads
        # (the number of nodes also needs to be filled in here)
        self.W0 = keras.layers.Dense(None, use_bias=False)
        # Softmax, intended for turning the scaled scores into the attention matrix
        self.softmax = keras.layers.Softmax()
        # Layer normalisation
        self.layernorm = keras.layers.LayerNormalization()
        # Dropout. The same layer (rate 0.1) is applied to the attention matrix
        # of every head and to the final output
        self.dropout = keras.layers.Dropout(0.1)
        # Addition layer, used both to add the mask and for the residual connection
        self.add = keras.layers.Add()

    # We take three inputs here, the values that will be projected into q, k and v
    # For self-attention these are all the same, but for cross-attention q comes
    # from the masked attention, and k and v from the encoder.
    def call(self, q, k, v, mask=None):
        """Apply the attention heads to q, k and v and return the normalised result.

        Args:
            q: input that is projected into the queries. It is also the input
                of the residual connection.
            k: input that is projected into the keys.
            v: input that is projected into the values.
            mask: optional mask that is added to QK^T before the scaling and
                the softmax.

        Returns:
            The layer-normalised sum of `q` and the attention output.
        """
        # Z will store the output from each head in a list
        Z = []
        # Loop over all heads
        for head in range(self.num_heads):
            # Q, K and V projections. Pass the correct inputs through the
            # corresponding dense layers by replacing "None"
            Q = self.Wq[head](None)
            K = self.Wk[head](None)
            V = self.Wv[head](None)
            # Multiply Q by the transpose of K (replace "None")
            QKT = None
            # The mask is added to QKT at this point. This None is actually
            # part of the code, and not something to change!
            if mask is not None: # DON'T CHANGE THIS NONE!
                QKT = self.add([QKT, mask])
            # Now we apply the normalisation: divide by the square root of d_k
            QKT = QKT / tf.math.sqrt(float(self.d_k))
            # Apply the softmax activation to get the attention matrix
            # (replace "None"), followed by some dropout
            A = None
            A = self.dropout(A)
            # Final output for each head. We need to multiply matrices A and V
            # (replace "None")
            output = None
            # Add the result for this head to the list of results
            Z.append(output)

        # Concatenate outputs from the heads along the last axis
        concZ = tf.concat(Z, axis=-1)
        # Multiply by the final weight matrix W0 to give us a shape equal to the
        # input (replace "None")
        output = None
        # Perform some dropout
        output = self.dropout(output)
        # Make the residual connection that connects input q to the output and
        # apply the layer normalisation to the sum
        output = self.layernorm(self.add([q,output]))
        return output




---

Since this is the most important part of the transformer, let's perform a quick test to see if we get the expected answer. If your attention layer is correct then you will see the following output:

A tensor of shape `(1, 5, 4)` with the following values:
```
[[[ 0.22355375  0.6402806  -1.6862876   0.822453  ]
  [ 0.7611287   1.0543339  -1.4768872  -0.33857557]
  [ 0.8607119   0.61222994 -1.68159     0.20864806]
  [ 0.34176865  1.0672501  -1.634869    0.2258502 ]
  [ 1.1459001   0.6712856  -1.4445571  -0.37262878]]]
```
As expected, the output from the layer has the same shape as the input.


In [5]:
# This is like a unit test of the layer: one random input is passed through a
# small model that only contains the attention layer, and the result is printed
fake_input = np.random.rand(1,5,4)
test_model_input = keras.layers.Input(shape=(5,4), dtype="float32")
# Self-attention without a mask: the same tensor is used for q, k and v
attention_layer = CustomAttention(d_model=4, d_k=3, num_heads=8)
test_model_output = attention_layer(
    test_model_input, test_model_input, test_model_input, mask=None
)
test_model = keras.Model(inputs=test_model_input, outputs=test_model_output)
pred = test_model.predict(fake_input)
print(pred.shape)
print(pred)

NameError: ignored

---

Now we create the feed forward network. This is a very simple network consisting of just two `Dense` layers. The first layer expands the dimensions of the input to `ff_dim` and the second contracts it back down to the size of the input, `d_model`.

In [None]:
# Feed forward network including residual connection. This is just a two layer
# neural network using dense layers
class FeedForward(keras.layers.Layer):
    """Two dense layers with dropout, a residual connection and layer normalisation.

    This is an exercise skeleton: the number of neurons of the two dense
    layers (the `None` placeholders) has to be filled in before use.

    Args:
        d_model: size of the input vectors, which is also the size of the output.
        ff_dim: size that the first dense layer expands the input to.
    """
    def __init__(self, d_model, ff_dim, **kwargs):
        super().__init__(**kwargs)
        # Dense layers - fill in the number of neurons by replacing "None".
        # Only the first layer has an activation function (ReLU)
        self.dense_1 = keras.layers.Dense(None, activation='relu')
        self.dense_2 = keras.layers.Dense(None)
        # Layer normalisation
        self.layernorm = keras.layers.LayerNormalization()
        # Dropout (rate 0.1), shared by both dense layers
        self.dropout = keras.layers.Dropout(0.1)
        # Addition layer for the residual connection
        self.add = keras.layers.Add()

    def call(self, inputs):
        """Return the layer-normalised sum of `inputs` and the network output."""
        # First dense layer followed by some dropout
        output = self.dense_1(inputs)
        output = self.dropout(output)
        # Second dense layer followed by some dropout
        output = self.dense_2(output)
        output = self.dropout(output)
        # Perform the residual connection and normalise
        output = self.layernorm(self.add([inputs,output]))
        return output



---


Let's add a second little test. We'll pass the output of the attention through the feed forward layer and see what we get.
A tensor with shape `(1, 5, 4)` and values:
```
[[[ 0.01011109  0.74887556 -1.634079    0.87509245]
  [ 0.48468572  1.1789591  -1.5345951  -0.12904972]
  [ 0.592366    0.7920997  -1.7069805   0.32251465]
  [ 0.10495934  1.130093   -1.6039987   0.36894622]
  [ 0.9480371   0.85631114 -1.5087731  -0.29557496]]]
  ```
   Again, the output size will match the original input size, and the elements should be as above


In [None]:
# Pass the output of the attention test through a feed-forward layer. The
# result is stored under its own name instead of overwriting test_model_output,
# so re-running this cell does not stack another feed-forward layer on top
feedforward_test_output = FeedForward(d_model=4, ff_dim=20)(test_model_output)
test_model = keras.Model(test_model_input, feedforward_test_output)
pred = test_model.predict(fake_input)
print(pred.shape)
print(pred)

---

Now we have our building blocks, we just need to put them together to make the encoder and decoder. The encoder is now very simple - we just need to perform two steps:
1. Self-attention: The same input is used for the  `𝑞` ,  `𝑘`  and  `𝑣`  inputs to the `CustomAttention(q, k, v, mask)` layer we defined earlier. We need to make sure to pass on the `mask` here too.
2. The output from the self-attention goes through the `FeedForward(d_model, ff_dim)` layer.

In [None]:
# Now we have the building blocks that we need, let's make our Encoder
class Encoder(keras.layers.Layer):
    """Encoder: self-attention followed by a feed-forward network.

    This is an exercise skeleton: the `None` placeholders have to be replaced
    with the attention and feed-forward layers and with the calls to them.

    Args:
        d_model: size of the model vectors.
        d_k: size of the query / key / value projections of each head.
        ff_dim: hidden size of the feed-forward network.
        num_heads: number of attention heads.
    """
    def __init__(self, d_model, d_k, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Attention (replace "None" with the attention layer)
        self.attention = None
        # Feed-forward network (replace "None" with the feed-forward layer)
        self.feedforward = None

    def call(self, inputs, mask=None):
        """Run the encoder on `inputs`, using `mask` in the attention."""
        # Call the attention function with the correct arguments
        z = None
        # Feed forward network
        output = None
        return output

---

The decoder is marginally more complex than the encoder, since we need to have two attention layers:
1. Masked self-attention: input to the decoder is used for the `𝑞`, `𝑘` and `𝑣` inputs to the `CustomAttention` layer we defined earlier. We need to make sure we are using the mask for the decoder input here.
2. Cross-attention: this uses the output of the encoder as the $k$ and $v$ inputs, but the output of the masked attention as $q$. We use the `CustomAttention(q, k, v, mask)` layer again here, making sure to use the mask that we created for the encoder input.
3. The output is passed into a feed-forward network to give the final output of the decoder, using the `FeedForward(d_model, ff_dim)` layer.


In [None]:
# Now let's make the decoder.
class Decoder(keras.layers.Layer):
    """Decoder: masked self-attention, cross-attention and a feed-forward network.

    This is an exercise skeleton: the `None` placeholders have to be replaced
    with the two attention layers, the feed-forward layer and the calls to them.

    Args:
        d_model: size of the model vectors.
        d_k: size of the query / key / value projections of each head.
        ff_dim: hidden size of the feed-forward network.
        num_heads: number of attention heads.
    """
    def __init__(self, d_model, d_k, ff_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Attention, both masked and cross (replace "None" with the layers)
        self.masked_attention = None
        self.cross_attention = None
        # Feed-forward network (replace "None" with the feed-forward layer)
        self.feedforward = None

    # We have the encoder output to include here
    def call(self, inputs, encoder_output, decoder_mask=None, encoder_mask=None):
        """Run the decoder on `inputs` and `encoder_output`.

        `decoder_mask` is meant for the masked self-attention and
        `encoder_mask` for the cross-attention.
        """
        # Call the masked attention function with the correct input and mask
        z = None
        # Call the cross-attention function with the correct input and mask
        c = None
        # Feed forward network
        output = None
        return output

---

Now all we need to do is put the building blocks together to create the full transformer model.
1. Create the three masks that we need to make use of:
> 1.   Padding mask for the encoder input
> 2.   Padding mask for the decoder input
> 3.   Causal mask for the decoder (prevents us looking into the future when predicting the output)
2. Perform the sentence embedding for encoder (English) and decoder (Spanish) inputs
3. Call the encoder with the English input and mask
4. Call the decoder with the Spanish input, the output from the encoder, the combination of the decoder masks and the encoder mask
5. Make the predictions from the output of the decoder
6. Sit back and enjoy some translations soon

In [None]:
# Finally we build the transformer
class Transformer(keras.layers.Layer):
    """Complete encoder-decoder transformer for English to Spanish translation.

    Args:
        sequence_length: length of the encoder input sequences. The decoder
            sequences are one token longer.
        d_model: size of the model vectors.
        d_k: size of the query / key / value projections of each head.
        ff_dim: hidden size of the feed-forward networks.
        num_heads: number of attention heads.
        vocab_size_english: size of the English vocabulary.
        vocab_size_spanish: size of the Spanish vocabulary, which is also the
            number of output classes.
        learned_encoding: use a learned (true) or sinusoidal (false) position
            encoding.
    """
    def __init__(self, sequence_length, d_model, d_k, ff_dim, num_heads, vocab_size_english, vocab_size_spanish, learned_encoding, **kwargs):
        super().__init__(**kwargs)
        # Sequence lengths of the encoder and decoder inputs
        self.l_e = sequence_length
        self.l_d = sequence_length + 1
        # Embedding and position encoding for each language
        self.english_embed = EmbedAndEncode(self.l_e, d_model, vocab_size_english, learned_encoding)
        self.spanish_embed = EmbedAndEncode(self.l_d, d_model, vocab_size_spanish, learned_encoding)
        # Encoder and decoder
        self.encoder = Encoder(d_model, d_k, ff_dim, num_heads)
        self.decoder = Decoder(d_model, d_k, ff_dim, num_heads)
        # Dropout shared by the embeddings and the decoder output
        self.dropout = keras.layers.Dropout(0.1)
        # Output layer that turns the decoder output into word probabilities
        self.classifier = keras.layers.Dense(vocab_size_spanish, activation='softmax')

    def call(self, enc_in, dec_in):
        """Return the softmax output over the Spanish vocabulary for each decoder position.

        Args:
            enc_in: vectorised English sentences.
            dec_in: vectorised Spanish decoder inputs.
        """
        # The masks are built from the integer tokens, before the embedding,
        # so that the attention can ignore the padded positions
        encoder_padding_mask = self.create_padding_mask(enc_in)
        # The decoder uses the element-wise minimum of its padding mask and
        # the causal mask, which hides a position if either mask hides it
        decoder_mask = tf.minimum(self.create_padding_mask(dec_in), self.create_causal_mask())
        # Encoder: embed, apply dropout and encode the English sentence
        encoder_embedding = self.dropout(self.english_embed(enc_in))
        encoder_output = self.encoder(encoder_embedding, encoder_padding_mask)
        # Decoder: embed, apply dropout and decode with the encoder output
        decoder_embedding = self.dropout(self.spanish_embed(dec_in))
        decoder_output = self.decoder(decoder_embedding, encoder_output, decoder_mask, encoder_padding_mask)
        # Predictions, after a last dropout
        return self.classifier(self.dropout(decoder_output))

    def create_causal_mask(self):
        """Return an (l_d, l_d) array with -1.0e20 above the diagonal and zero elsewhere."""
        return np.triu(np.full((self.l_d, self.l_d), -1.0e20), k=1)

    def create_padding_mask(self, inputs):
        """Return a mask holding -1.0e20 where `inputs` is zero (padding) and zero elsewhere.

        An axis of size one is inserted in the middle so that the mask can be
        broadcast over matrices of different sizes.
        """
        is_padding = tf.cast(tf.math.equal(inputs, 0), tf.float64)
        return is_padding[:, tf.newaxis, :] * -1.0e20

---

Now let's define the parameters of the model and build it. To keep things reasonably sized, we will use the values listed below. At some point you could try changing these if you have a GPU that you can use for training, but you will need to use these values to load my model weights later.

*   $d_\textrm{model} = 256$
*   $n_\textrm{heads} = 8$
*   $d_k = 32$
*   $\textrm{ff_dim} = 1024$

In this example I have set $d_k = d_v$, so we won't see $d_v$ in this code. This follows the method used in the original transformer, but you could modify the layers above to allow for a different value of $d_v$

Using these values, you should find that the network has `13385624` parameters


In [None]:
# Fill in the model parameter values by replacing "None". Use those given
# above if you want to be able to load the weights from my trained network later
d_model = None
num_heads = None
d_k = None
ff_dim = None

# Define the two input layers that we need for the model. Their names match
# the dictionary keys produced by format_dataset
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")

# Build the transformer with a learned position encoding, wrap it in a model
# and print a summary of the model
transformer_outputs = Transformer(sequence_length, d_model, d_k, ff_dim, num_heads, vocab_size_english, vocab_size_spanish, learned_encoding=True)(encoder_inputs,decoder_inputs)
transformer = keras.Model([encoder_inputs,decoder_inputs], transformer_outputs)
transformer.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer (Transformer)      (None, 21, 15000)    13385624    ['encoder_inputs[0][0]',         
                                                                  'decoder_inputs[0][0]']         
                                                                                                  
Total params: 13,385,624
Trainable params: 13,385,624
Non-trainable params: 0
________________

In [None]:
# Callback that can reduce the learning rate when the training reaches a
# plateau. It will take quite a few epochs for this to become useful, so it
# is not really necessary here, but I leave it in for completeness
lr_schedule = keras.callbacks.ReduceLROnPlateau(patience=3)

# Optimiser settings from the original paper (but not the lr mechanism)
optimiser = keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1.0e-9,
)

# This custom loss function helps us to converge better. It would still work ok
# just using the standard keras.losses.SparseCategoricalCrossentropy() loss
def masked_and_padded_scce_loss(y_true, y_pred, label_smoothing=0.1):
    """Categorical cross-entropy with label smoothing that leaves out the padding class.

    Args:
        y_true: integer class indices (sparse representation of the truth).
        y_pred: predicted class probabilities; the last axis is the class axis.
        label_smoothing: amount of label smoothing passed on to the loss.

    Returns:
        The cross-entropy computed after removing class 0 (the padding class)
        from both the one-hot truth and the predictions.
    """
    # Go from the sparse to a one-hot representation with one entry per class
    num_classes = tf.shape(y_pred)[-1]
    one_hot_truth = tf.one_hot(tf.cast(y_true, tf.int32), depth=num_classes)
    # The padding class is the first element of the class axis, so slice it
    # away from both the truth and the prediction
    return tf.keras.losses.categorical_crossentropy(
        one_hot_truth[:, :, 1:],
        y_pred[:, :, 1:],
        from_logits=False,
        label_smoothing=label_smoothing,
    )

# Compile the model ready for training
transformer.compile(
    optimizer=optimiser,
    loss=masked_and_padded_scce_loss,
    metrics=["accuracy"],
)


---

Now we can train the network! Unfortunately this will be rather slow so in the next block you can load some weights from a network I trained previously. On a V100 it trains very quickly - less than 1 minute per epoch! If you have a GPU available, go ahead and train it for 10 epochs or so, otherwise feel free to skip this block.


In [1]:
# Train the transformer on the training dataset, with the validation dataset
# as validation data and the learning rate callback defined earlier
num_epochs = 1
transformer.fit(
    training_dataset,
    epochs=num_epochs,
    validation_data=validation_dataset,
    callbacks=[lr_schedule],
)
# Write the trained weights to disk in case you want to keep them
transformer.save_weights("my_transformer_weights.keras")

NameError: ignored

---

Training is unfortunately very slow on a CPU, so if you don't have any GPU access then feel free to load some model weights that I produced earlier


In [3]:
# Instead of training, we can load my weights
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1MPdEgvcCSCvKrrzWON_L0ILVaRwa2G0N' -O lhw_weights.keras
transformer.load_weights("lhw_weights.keras")

--2023-09-05 08:41:02--  https://docs.google.com/uc?export=download&id=1MPdEgvcCSCvKrrzWON_L0ILVaRwa2G0N
Resolving docs.google.com (docs.google.com)... 173.194.74.113, 173.194.74.138, 173.194.74.139, ...
Connecting to docs.google.com (docs.google.com)|173.194.74.113|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-00-0o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/lhfijj16d4bpu9f6lds61rnlmedld1ou/1693903200000/15250864113763849298/*/1MPdEgvcCSCvKrrzWON_L0ILVaRwa2G0N?e=download&uuid=291d970f-aad9-4782-b176-0e41922a4e99 [following]
--2023-09-05 08:41:05--  https://doc-00-0o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/lhfijj16d4bpu9f6lds61rnlmedld1ou/1693903200000/15250864113763849298/*/1MPdEgvcCSCvKrrzWON_L0ILVaRwa2G0N?e=download&uuid=291d970f-aad9-4782-b176-0e41922a4e99
Resolving doc-00-0o-docs.googleusercontent.com (doc-00-0o-docs.googleusercontent.com)... 142.251.161.132, 2607:f8b0

NameError: ignored


---

Now all we need to do is look at how to perform inference. As I said in the lectures, this is done in an iterative way. We pass the English sentence into the encoder, and then pass the `<sos>` (start of sentence) token to the decoder. The decoder then predicts the output word, which we then append to the `<sos>` token and run things again with this new input.

In [None]:

# Get our vocabulary from the vectorisation layer we made right back at the top
# of the notebook
spanish_vocab = spanish_vectorisation.get_vocabulary()
# Lookup table from the integer index of a token to the Spanish word
spa_index_lookup = dict(zip(range(len(spanish_vocab)), spanish_vocab))

def run_inference(input_sentence):
    """Translate an English sentence into Spanish word by word and print the result.

    The sentence is passed to the vectorisation layer as it is, so it should
    already be in lower case and free of punctuation like the training text.

    Args:
        input_sentence: the English sentence to translate.

    Returns:
        None. The input and the translation are printed, separated by " :: ".
    """
    # Get the tokenised input sentence ready for embedding
    encoded_input = english_vectorisation([input_sentence])
    # The input to the decoder is initially just "<sos>". In the second
    # iteration the input becomes "<sos> <first_predicted_word>"
    decoded_words = ["<sos>"]
    for i in range(sequence_length):
        decoder_input = spanish_vectorisation([" ".join(decoded_words)])
        predictions = transformer([encoded_input, decoder_input])
        # Get the index of the word with the highest probability. The
        # index i here ensures we look at the row of the output that
        # corresponds to the next word to predict
        predicted_word_index = int(np.argmax(predictions[0, i, :]))
        predicted_word = spa_index_lookup[predicted_word_index]
        # Stop at the end token without storing it. Slicing a fixed number of
        # characters off the end instead would cut into the last real word
        # whenever the loop finishes without predicting <eos>
        if predicted_word == "<eos>":
            break
        decoded_words.append(predicted_word)
    # Print the input and output, leaving out the leading <sos> token
    print(input_sentence," :: "," ".join(decoded_words[1:]))

Now for the fun part! I've added a few sentences for the transformer to try to translate. The first sentence is, of course, the one from the lectures. Feel free to play around and add your own sentences and see how it does (if you can't speak Spanish, you could ask Lorena, Zahari, Google, and in a crisis, me.)

If you've loaded my weights, then the five sentences below are translated correctly. As a bit of a language technicality, in the lectures I said that `I have a big cat` should translate to `Yo tengo un gato grande`. You may notice that `Yo`, meaning `I`, is missing - pronouns such as `yo` are often dropped from Spanish since the conjugation of the verb (`tengo`) tells you that the pronoun is `yo` anyway.

In [4]:
# Translate a handful of example sentences, one after the other
example_sentences = [
    "i have a big cat",
    "i have a small dog",
    "are there horses here",
    "my car is broken",
    "im going to translate this sentence",
    "Please translate this sentence",
]
for example_sentence in example_sentences:
    run_inference(example_sentence)

NameError: ignored