We’ll demonstrate sequence-to-sequence modeling on a machine translation task.
Machine translation is precisely what the Transformer was developed for! We’ll start with a
recurrent sequence model, and we’ll follow up with the full Transformer architecture.

In [None]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
!unzip -q spa-eng.zip

--2024-01-01 05:08:48--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.125.207, 142.250.136.207, 142.250.148.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.125.207|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2638744 (2.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2024-01-01 05:08:48 (195 MB/s) - ‘spa-eng.zip’ saved [2638744/2638744]



In [None]:
text_file = "spa-eng/spa.txt"
# The corpus contains accented Spanish characters, so decode it explicitly as
# UTF-8 instead of relying on the platform's default encoding.
with open(text_file, encoding="utf-8") as f:
    # Drop the last element of the split (presumably the empty string left
    # after the file's final newline).
    lines = f.read().split("\n")[:-1]

text_pairs = []
for line in lines:
    # Each line contains an English phrase and its Spanish translation, tab-separated.
    english, spanish = line.split("\t")
    # Prepend "[start]" and append "[end]" to the Spanish sentence: these
    # markers tell the decoder where a target sentence begins and ends.
    spanish = "[start] " + spanish + " [end]"
    text_pairs.append((english, spanish))

In [None]:
import random
print(random.choice(text_pairs))

("Please don't bother.", '[start] Por favor, no se moleste. [end]')


Let’s shuffle the sentence pairs and split them into the usual training, validation, and test sets.

In [None]:
import random
# Shuffle in place, then slice into train / validation / test: validation and
# test each get 15% of the pairs, training gets whatever remains.
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples
train_pairs, val_pairs, test_pairs = (
    text_pairs[:num_train_samples],
    text_pairs[num_train_samples:num_train_samples + num_val_samples],
    text_pairs[num_train_samples + num_val_samples:],
)

Next, let’s prepare two separate TextVectorization layers: one for English and one
for Spanish. We’re going to need to customize the way strings are preprocessed:


1) We need to preserve the "[start]" and "[end]" tokens that we’ve inserted. By
default, the characters [ and ] would be stripped, but we want to keep them
around so we can tell apart the word “start” and the start token "[start]".


2) Punctuation is different from language to language! In the Spanish TextVectorization layer, if we’re going to strip punctuation characters, we need to
also strip the character ¿.

**Important**: Note that for a non-toy translation model, we would treat punctuation characters as separate tokens rather than stripping them, since we would want to be able to generate correctly punctuated sentences. In our case, for simplicity, we’ll get rid of all punctuation.

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf

In [None]:
import tensorflow as tf
import string
import re

# Characters removed by the custom standardization function used for the
# Spanish TextVectorization layer: everything in string.punctuation plus the
# Spanish inverted marks "¿" and "¡" (neither is part of string.punctuation).
# "[" and "]" are deliberately kept so the "[start]" / "[end]" tokens survive.
strip_chars = string.punctuation + "¿¡"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

In [None]:
def custom_standardization(input_string):
    """Lowercase a string tensor and remove every character in ``strip_chars``.

    Used as the ``standardize`` callable of the Spanish TextVectorization
    layer.

    Args:
        input_string: String tensor to standardize.

    Returns:
        The lowercased string tensor with all ``strip_chars`` characters removed.
    """
    # encoding="utf-8" makes the lowercasing Unicode-aware. With the default
    # (ASCII) setting, accented capitals such as "É" are left untouched, so
    # "Él" and "él" would end up as two different vocabulary entries.
    lowercase = tf.strings.lower(input_string, encoding="utf-8")
    # Build a regex character class out of the (escaped) characters to strip.
    return tf.strings.regex_replace(lowercase, f"[{re.escape(strip_chars)}]", "")

In [None]:
# To keep things simple, only the top 15,000 words of each language are kept,
# and sentences are restricted to 20 words.
vocab_size = 15000
sequence_length = 20

# English (source) layer: default standardization.
source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length
)

# Spanish (target) layer: custom standardization, and one extra token per
# sentence because the sentence is offset by one step during training.
target_vectorization = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
)

# Learn the vocabulary of each language from the training split only.
train_english_texts = [english for english, _ in train_pairs]
train_spanish_texts = [spanish for _, spanish in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

Finally, we can turn our data into a tf.data pipeline.

We want it to return a tuple
(inputs, target) where inputs is a dict with two keys, “english” (the English
sentence, fed to the encoder) and “spanish” (the Spanish sentence, fed to the decoder), and target is the Spanish
sentence offset by one step ahead.

In [None]:
# Preparing datasets for the translation task

batch_size = 64
def format_dataset(eng, spa):
    """Vectorize a batch of sentence pairs into ``(inputs, targets)``.

    Args:
        eng: Batch of English strings.
        spa: Batch of Spanish strings.

    Returns:
        A tuple ``(inputs, targets)``. ``inputs`` is a dict with keys
        ``"english"`` (vectorized source) and ``"spanish"`` (vectorized
        target without its last token); ``targets`` is the vectorized target
        without its first token, i.e. one step ahead of ``inputs["spanish"]``.
    """
    source_tokens = source_vectorization(eng)
    target_tokens = target_vectorization(spa)
    # Dropping the last token of the decoder input and the first token of the
    # target keeps both at the same length while shifting them by one step.
    model_inputs = {
        "english": source_tokens,
        "spanish": target_tokens[:, :-1],
    }
    next_step_targets = target_tokens[:, 1:]
    return model_inputs, next_step_targets


def make_dataset(pairs):
    """Build a batched, vectorized ``tf.data`` pipeline from sentence pairs.

    Args:
        pairs: Sequence of ``(english, spanish)`` string tuples.

    Returns:
        A ``tf.data.Dataset`` yielding ``format_dataset`` outputs for batches
        of ``batch_size`` pairs.
    """
    eng_texts, spa_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(spa_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Order matters: cache the vectorized batches first (in-memory caching
    # avoids redoing the preprocessing), then shuffle, then prefetch. Caching
    # *after* shuffle would replay one shuffled order on every later pass
    # instead of reshuffling the batches each epoch.
    return dataset.cache().shuffle(2048).prefetch(16)


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [None]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['spanish'].shape: {inputs['spanish'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 20)
inputs['spanish'].shape: (64, 20)
targets.shape: (64, 20)


The data is now ready—time to build some models. We’ll start with a recurrent
sequence-to-sequence model before moving on to a Transformer.

# Sequence-to-sequence learning with RNNs

The simplest, naive way to use RNNs to turn a sequence into another sequence is
to keep the output of the RNN at each time step.

 In a proper sequence-to-sequence setup (see figure 11.13), you would first use an
RNN (the encoder) to turn the entire source sequence into a single vector (or set of
vectors). This could be the last output of the RNN, or alternatively, its final internal
state vectors. Then you would use this vector (or vectors) as the initial state of another RNN (the decoder), which would look at elements 0…N in the target sequence, and
try to predict step N+1 in the target sequence.

 Let’s implement this in Keras with GRU-based encoders and decoders. The choice
of GRU rather than LSTM makes things a bit simpler, since GRU only has a single
state vector, whereas LSTM has multiple. Let’s start with the encoder.

In [None]:
embed_dim = 256
latent_dim = 1024

# The English source sentence goes here. Naming the input lets us fit() the
# model with a dict of inputs.
source = keras.Input(shape=(None,), dtype="int64", name="english")

# Don't forget masking (mask_zero=True): it's critical in this setup.
source_embedding = layers.Embedding(vocab_size, embed_dim, mask_zero=True)
x = source_embedding(source)

# The encoded source sentence is the last output of a bidirectional GRU, with
# the two directions summed.
source_encoder = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")
encoded_source = source_encoder(x)

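'sum': The outputs of the forward and backward passes are summed element-wise. Because the GRU here uses the default `return_sequences=False`, each direction returns only its last output vector, so `encoded_source` is the sum of those two vectors: a single vector of size `latent_dim` per sentence, not one vector per time step.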

[What is Merge Mode and its usage](https://chat.openai.com/share/d31fbc96-e223-4263-a887-d5972be62337)

Next, let’s add the decoder—a simple GRU layer that takes as its initial state the
encoded source sentence. On top of it, we add a Dense layer that produces for each
output step a probability distribution over the Spanish vocabulary.

In [None]:
# The Spanish target sentence goes here.
past_target = keras.Input(shape=(None,), dtype="int64", name="spanish")
target_embedding = layers.Embedding(vocab_size, embed_dim, mask_zero=True)
x = target_embedding(past_target)

# The encoded source sentence serves as the initial state of the decoder GRU.
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoded_source)
x = layers.Dropout(0.5)(x)

# Predicts the next token at every output step.
target_next_step = layers.Dense(vocab_size, activation="softmax")(x)

# End-to-end model: maps the source sentence and the target sentence to the
# target sentence one step in the future.
seq2seq_rnn = keras.Model(inputs=[source, past_target], outputs=target_next_step)

During training, the decoder takes as input the entire target sequence, but thanks to
the step-by-step nature of RNNs, it only looks at tokens 0…N in the input to predict token N in the output (which corresponds to the next token in the sequence, since
the output is intended to be offset by one step). This means we only use information
from the past to predict the future, as we should; otherwise we’d be cheating, and our
model would not work at inference time.

The term "offset by one step" means that the target sequence used during training is shifted by one position compared to the input sequence. In other words, the model is trained to predict the next token in the target sequence given the information from the input sequence up to the current step.

In the context of sequence-to-sequence models and recurrent neural networks (RNNs), the term "offset" refers to the one-step time shift between the decoder's input sequence and its target sequence during training. The idea is that when training a model to generate a sequence of tokens (e.g., translating a sentence from one language to another), you want the model to predict the next token of the output sequence from the target tokens it has seen so far, together with the encoded source sentence.

In [None]:
# Train the GRU seq2seq model; accuracy is tracked as a rough progress signal.
seq2seq_rnn.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)
seq2seq_rnn.fit(train_ds, validation_data=val_ds, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x78185826add0>

We picked accuracy as a crude way to monitor validation-set performance during
training. We get to 64% accuracy: on average, the model predicts the next word in the
Spanish sentence correctly 64% of the time. However, in practice, next-token accuracy
isn’t a great metric for machine translation models, in particular because it makes the
assumption that the correct target tokens from 0 to N are already known when predicting token N+1.

In reality, during inference, you’re generating the target sentence
from scratch, and you can’t rely on previously generated tokens being 100% correct.
If you work on a real-world machine translation system, you will likely use “BLEU
scores” to evaluate your models—a metric that looks at entire generated sequences
and that seems to correlate well with human perception of translation quality.

[What is BLEU and how to use it ](https://chat.openai.com/share/38f198b2-21b0-41fe-b1de-6a31f31a0602)

 At last, let’s use our model for inference. We’ll pick a few sentences in the test set
and check how our model translates them. We’ll start from the seed token, "[start]",
and feed it into the decoder model, together with the encoded English source sentence. We’ll retrieve a next-token prediction, and we’ll re-inject it into the decoder
repeatedly, sampling one new target token at each iteration, until we get to "[end]"
or reach the maximum sentence length.

In [None]:
import numpy as np
spa_vocab = target_vectorization.get_vocabulary()

# Map token index predictions back to string tokens.
spa_index_lookup = dict(enumerate(spa_vocab))
max_decoded_sentence_length = 20

In [None]:
def decode_sequence(input_sentence):
    """Greedily translate one English sentence with the GRU seq2seq model.

    Starts from the "[start]" seed token and repeatedly appends the most
    probable next token until "[end]" is produced or
    ``max_decoded_sentence_length`` tokens have been generated.

    Args:
        input_sentence: English sentence as a plain string.

    Returns:
        The generated sentence as a string, including the leading "[start]"
        and, when it was produced, the trailing "[end]".
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"  # Seed token
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        # Call the model directly rather than through predict(): predict() is
        # meant for batch processing of large inputs and adds per-call
        # overhead (and progress-bar output) for a single sample inside a loop.
        next_token_predictions = seq2seq_rnn(
            [tokenized_input_sentence, tokenized_target_sentence]
        )
        # Greedy choice: the most probable token at the current position.
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        # Convert the prediction to a string and append it to the sentence.
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        # Exit condition: either hit max length or sample the stop token.
        if sampled_token == "[end]":
            break
    return decoded_sentence

In [None]:
# Translate 20 randomly chosen English test sentences.
test_eng_texts = [english for english, _ in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-", input_sentence, sep="\n")
    print(decode_sequence(input_sentence))

-
I'm afraid I didn't explain it too well.
[start] me temo que no tenía tanto así [end]
-
Cheer up! Things are not as bad as you think.
[start] no te [UNK] tan bien como la gente [end]
-
My watch stopped, so I didn't know the time.
[start] mi reloj no se lo di cuenta pero no pudo encontrar [end]
-
I'm not saying that it's not possible.
[start] no estoy diciendo que eso no es posible [end]
-
I judged you too quickly.
[start] te [UNK] muy rápido [end]
-
Do you think I need to go?
[start] piensas que me tengo que ir [end]
-
Tom spent a few years in Boston when he was in college.
[start] tom se comió tres años en la semana que estaba en la cárcel [end]
-
Tom was hungry.
[start] tom estaba hambriento [end]
-
Can you help me wash these dishes?
[start] me puedes ayudar a estos animales [end]
-
I get goose bumps when I see a horror movie.
[start] me [UNK] la [UNK] cuando te [UNK] una película [end]
-
He is one of my neighbours.
[start] Él es uno de mis [UNK] [end]
-
I gave him a present in ret

Note that this inference setup, while very simple, is rather inefficient, since we reprocess the entire source sentence and the entire generated target sentence every time
we sample a new word. In a practical application, you’d factor the encoder and the
decoder as two separate models, and your decoder would only run a single step at
each token-sampling iteration, reusing its previous internal state.

# Sequence-to-sequence learning with Transformer

In [None]:
class TransformerEncoder(layers.Layer):
    """Transformer encoder block: self-attention followed by a dense projection.

    Both sub-blocks are wrapped in a residual connection and layer
    normalization.

    Args:
        embed_dim: Size of the input token vectors.
        dense_dim: Size of the inner dense layer.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply self-attention and the dense projection to ``inputs``."""
        # The mask generated by the Embedding layer is 2D, but the attention
        # layer expects a 3D or 4D mask, so its rank is expanded.
        attention_mask = None if mask is None else mask[:, tf.newaxis, :]
        attended = self.attention(inputs, inputs, attention_mask=attention_mask)
        normed = self.layernorm_1(inputs + attended)
        projected = self.dense_proj(normed)
        return self.layernorm_2(normed + projected)

    def get_config(self):
        """Return the constructor arguments so the model can be saved."""
        return {
            **super().get_config(),
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        }

In [None]:
# The TransformerDecoder
class TransformerDecoder(layers.Layer):
    """Transformer decoder block.

    Chains causal self-attention over the target sequence, attention over the
    encoder outputs, and a dense projection. Each sub-block is followed by a
    residual connection and layer normalization.

    Args:
        embed_dim: Size of the token vectors (also used as ``key_dim`` of both
            attention layers and as the output size of the projection).
        dense_dim: Size of the inner dense layer.
        num_heads: Number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        # Masking in Keras is explicitly opt-in: this attribute makes the layer
        # propagate its input mask to its outputs. Passing a mask to a layer
        # that neither implements compute_mask() nor exposes supports_masking
        # is an error.
        self.supports_masking = True

    def get_config(self):
        """Return the constructor arguments so the model can be saved."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build an int32 causal mask of shape (batch, seq_len, seq_len).

        Entry ``[b, i, j]`` is 1 when ``j <= i`` (position ``i`` may attend to
        position ``j``) and 0 otherwise. Batch size and sequence length are
        read from the first two dimensions of ``inputs``.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        # (seq_len, seq_len) matrix with 1s on and below the diagonal.
        mask = tf.cast(i >= j, dtype="int32")
        # Replicate it along the batch axis.
        mask = tf.reshape(mask, (1, sequence_length, sequence_length))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: Embedded target sequence.
            encoder_outputs: Output of the encoder for the source sequence.
            mask: Optional mask describing padding locations in the target
                sequence.

        Returns:
            The decoded sequence, same shape as ``inputs``.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        # Default to "no mask" so the layer also works when no input mask is
        # passed; previously padding_mask was left unbound in that case and
        # the second attention call raised UnboundLocalError.
        padding_mask = None
        if mask is not None:
            # Prepare the input mask (padding locations in the target
            # sequence) and merge it with the causal mask.
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)
        # Self-attention over the target sequence, restricted by the causal mask.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask,
        )
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        # Attention relating the target sequence to the encoded source.
        # NOTE(review): padding_mask is built from the *target* mask and the
        # causal mask, so it is (batch, target_len, target_len), while this
        # attention compares target positions against source positions. That
        # only lines up when both sequences have the same length -- confirm
        # this is the intended mask before changing sequence lengths.
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)


 Causal masking is absolutely critical to successfully training
a sequence-to-sequence Transformer. Unlike an RNN, which looks at its input one
step at a time, and thus will only have access to steps 0...N to generate output step N
(which is token N+1 in the target sequence), the TransformerDecoder is order-agnostic: it looks at the entire target sequence at once. If it were allowed to use its entire
input, it would simply learn to copy input step N+1 to location N in the output. The
model would thus achieve perfect training accuracy, but of course, when running
inference, it would be completely useless, since input steps beyond N aren’t available.

The fix is simple: we’ll mask the upper half of the pairwise attention matrix to prevent the model from paying any attention to information from the future—only information from tokens 0...N in the target sequence should be used when generating
target token N+1. To do this, we’ll add a get_causal_attention_mask(self, inputs)
method to our TransformerDecoder to retrieve an attention mask that we can pass to
our MultiHeadAttention layers.

In [None]:
class PositionalEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    A downside of position embeddings is that the sequence length needs to be
    known in advance.

    Args:
        sequence_length: Number of positions in the position embedding table.
        input_dim: Number of entries in the token embedding table.
        output_dim: Size of the embedding vectors.
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # One Embedding layer for the token indices...
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        # ...and another one for the token positions.
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Embed token indices and add the embedding of each position."""
        seq_len = tf.shape(inputs)[-1]
        # Positions 0, 1, ..., seq_len - 1 (step size 1).
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        return self.token_embeddings(inputs) + self.position_embeddings(position_ids)

    def compute_mask(self, inputs, mask=None):
        """Return a mask that is False wherever ``inputs`` is the padding index 0.

        Like the Embedding layer, this layer generates a mask so padding 0s in
        the inputs can be ignored. The framework calls this method
        automatically and propagates the mask to the next layer.
        """
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the constructor arguments so the model can be saved."""
        return {
            **super().get_config(),
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        }

The end-to-end Transformer is the model we’ll be training. It maps the source
sequence and the target sequence to the target sequence one step in the future. It
straightforwardly combines the pieces we’ve built so far: PositionalEmbedding layers,
the TransformerEncoder, and the TransformerDecoder. Note that both the TransformerEncoder and the TransformerDecoder are shape-invariant, so you could
stack many of them to create a more powerful encoder or decoder.

In [None]:
embed_dim = 256
dense_dim = 2048
num_heads = 8

# Encode the source sentence.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
encoder_embedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
x = encoder_embedding(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# Encode the target sentence and combine it with the encoded source sentence.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="spanish")
decoder_embedding = PositionalEmbedding(sequence_length, vocab_size, embed_dim)
x = decoder_embedding(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)

# Predict a word for each output position.
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)


In [None]:
# Train the Transformer with the same optimizer, loss and metric as the RNN model.
transformer.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)
transformer.fit(train_ds, validation_data=val_ds, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7ca67c11ab60>

Finally, let’s try using our model to translate never-seen-before English sentences from
the test set. The setup is identical to what we used for the sequence-to-sequence RNN
model.

In [None]:
import numpy as np

spa_vocab = target_vectorization.get_vocabulary()
# Map token index predictions back to string tokens.
spa_index_lookup = dict(enumerate(spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """Greedily translate one English sentence with the Transformer model.

    Starts from "[start]" and appends the most probable next token at each
    step until "[end]" is produced or ``max_decoded_sentence_length`` tokens
    have been generated.

    Args:
        input_sentence: English sentence as a plain string.

    Returns:
        The generated sentence as a string, including the leading "[start]"
        and, when it was produced, the trailing "[end]".
    """
    source_tokens = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for step in range(max_decoded_sentence_length):
        # Drop the last position of the vectorized target, as is done for the
        # decoder input during training.
        target_tokens = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([source_tokens, target_tokens])
        # Sample the next token: the most probable one at the current step.
        next_index = np.argmax(predictions[0, step, :])
        # Convert the prediction to a string and append it to the sentence.
        next_token = spa_index_lookup[next_index]
        decoded_sentence = f"{decoded_sentence} {next_token}"
        if next_token == "[end]":
            break
    return decoded_sentence

# Translate 20 randomly chosen English test sentences.
test_eng_texts = [english for english, _ in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print("-", input_sentence, sep="\n")
    print(decode_sequence(input_sentence))

-
I was framed.
[start] fui un [UNK] [end]
-
Tom wanted you to think he'd died.
[start] tom quería que [UNK] [end]
-
We don't believe that Tom will be able to master French.
[start] no queremos que tom sea capaz de aprender francés tan pronto [end]
-
Cancer can be cured if discovered in time.
[start] el cáncer puede ser [UNK] en este momento [end]
-
Please make five copies of this document.
[start] por favor hagas cinco [UNK] de este avión [end]
-
She looked around the room.
[start] ella miró alrededor de la habitación [end]
-
I don't want visitors.
[start] no quiero quince [end]
-
Black and white photos have a special charm.
[start] negro y la fotos tienen un viejo [UNK] [end]
-
The wind blew hard.
[start] el viento le pasó duro [end]
-
We have a colleague in Spain.
[start] tenemos una naranja en alemania [end]
-
You ask questions about everything.
[start] tú hagas preguntas acerca de todo [end]
-
I'm from Canada.
[start] soy de canadá [end]
-
I assure you everything will be ready on 

Subjectively, the Transformer seems to perform significantly better than the GRU-based translation model. It’s still a toy model, but it’s a better toy model.

While the source sentence wasn’t
gendered, this translation assumes
a male speaker. Keep in mind that
translation models will often make
unwarranted assumptions about
their input data, which leads to
algorithmic bias. In the worst
cases, a model might hallucinate
memorized information that has
nothing to do with the data it’s
currently processing