# Seq2Seq Machine Translation (English → Portuguese)

In this notebook, we implement a **Sequence-to-Sequence (Seq2Seq)** model with Attention for translating **English sentences into Portuguese**.

### Why Seq2Seq?
- Unlike plain RNN/LSTM/GRU, Seq2Seq is designed for **input and output sequences of different lengths**.
- Example: An English sentence may have 5 words, while the Portuguese translation may have 8 words.

### Key Idea
- **Encoder** reads the input (English) and converts it into context vectors.
- **Decoder** generates the output (Portuguese), word by word, using **Attention** to focus on relevant input words.

### Applications
- Machine Translation (Google Translate, etc.)
- Text Summarization
- Chatbots
- Speech-to-Text systems


In [1]:
# Step 1: Import libraries & load dataset
import tensorflow as tf
import tensorflow_datasets as tfds

examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True
)

# Use small sample (300 train, 50 val) for fast testing
train_examples = examples['train'].take(300)
val_examples   = examples['validation'].take(50)

print("✅ Dataset loaded")


✅ Dataset loaded


In [2]:
# Step 2: Tokenize
tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (pt.numpy() for pt, en in train_examples), target_vocab_size=1000)

tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (en.numpy() for pt, en in train_examples), target_vocab_size=1000)

print("Portuguese vocab size:", tokenizer_pt.vocab_size)
print("English vocab size:", tokenizer_en.vocab_size)


Portuguese vocab size: 1032
English vocab size: 969


In [3]:
# Step 3: Encode and batch
MAX_LEN = 20
BATCH_SIZE = 16

def encode(pt, en):
    pt = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(pt.numpy())[:MAX_LEN] + [tokenizer_pt.vocab_size+1]
    en = [tokenizer_en.vocab_size] + tokenizer_en.encode(en.numpy())[:MAX_LEN] + [tokenizer_en.vocab_size+1]
    return pt, en

def tf_encode(pt, en):
    return tf.py_function(encode, [pt, en], [tf.int64, tf.int64])

train_dataset = (train_examples.map(tf_encode)
                 .padded_batch(BATCH_SIZE, padded_shapes=([None], [None])))

val_dataset = (val_examples.map(tf_encode)
               .padded_batch(BATCH_SIZE, padded_shapes=([None], [None])))

print("✅ Dataset prepared")


✅ Dataset prepared


In [4]:
# Step 4: Define Encoder
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units):
        super().__init__()
        self.units = units
        self.embedding = tf.keras.layers.Embedding(vocab_size+2, embedding_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def init_hidden(self, batch_sz):
        return tf.zeros((batch_sz, self.units))


In [5]:
# Step 5: Define Decoder
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size+2, embedding_dim)
        self.gru = tf.keras.layers.GRU(units, return_sequences=True, return_state=True)
        self.fc = tf.keras.layers.Dense(vocab_size+2)

    def call(self, x, hidden, enc_output):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        x = self.fc(output)
        return x, state


In [6]:
# Step 6: Initialize models
embedding_dim = 32
units = 128

encoder = Encoder(tokenizer_pt.vocab_size, embedding_dim, units)
decoder = Decoder(tokenizer_en.vocab_size, embedding_dim, units)

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


In [7]:
# Step 7: Define training step
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([tokenizer_en.vocab_size] * inp.shape[0], 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_object(targ[:, t], predictions[:, -1, :])
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = loss / int(targ.shape[1])
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss


In [8]:
# Step 8: Train (only 2 epochs, small dataset)
EPOCHS = 5
for epoch in range(EPOCHS):
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(train_dataset.take(20)):
        enc_hidden = encoder.init_hidden(inp.shape[0])
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
    print(f"Epoch {epoch+1}, Loss: {total_loss/(batch+1):.4f}")


Epoch 1, Loss: 6.4513
Epoch 2, Loss: 5.2274
Epoch 3, Loss: 4.9889
Epoch 4, Loss: 4.8332
Epoch 5, Loss: 4.6881


In [9]:
# Step 9: Test translation
def translate(sentence):
    inputs = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(sentence)[:MAX_LEN] + [tokenizer_pt.vocab_size+1]
    inputs = tf.expand_dims(inputs, 0)

    result = []
    hidden = encoder.init_hidden(1)
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tokenizer_en.vocab_size], 0)

    for t in range(MAX_LEN):
        predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0, -1, :]).numpy()
        if predicted_id == tokenizer_en.vocab_size+1:
            break
        result.append(predicted_id)
        dec_input = tf.expand_dims([predicted_id], 0)

    return tokenizer_en.decode(result)

print("✅ Example:", translate("olá mundo"))


✅ Example: and and  , 


In [14]:
def translate(sentence):
    # Encode the Portuguese sentence
    inputs = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(sentence) + [tokenizer_pt.vocab_size + 1]
    inputs = tf.expand_dims(inputs, 0)  # batch size = 1

    result = []

    hidden = tf.zeros((1, units))  # batch size = 1
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([tokenizer_en.vocab_size], 0)  # start token

    for t in range(50):
        predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_out)

        # Remove batch dimension
        predictions = tf.squeeze(predictions, axis=0)  # shape -> (seq_len, vocab_size)
        predicted_id = tf.argmax(predictions, axis=-1).numpy()[0]  # first token

        if predicted_id == tokenizer_en.vocab_size + 1:  # end token
            break

        result.append(tokenizer_en.decode([predicted_id]))
        dec_input = tf.expand_dims([predicted_id], 0)

    return ' '.join(result)

# Test
test_sentences = [
    "olá",
    "como você está?",
    "bom dia",
    "eu gosto de aprender"
]

for sent in test_sentences:
    print(f"PT: {sent}")
    print(f"EN: {translate(sent)}\n")


PT: olá
EN:                                                  

PT: como você está?
EN: and  and   ,   ,   ,   

PT: bom dia
EN: and                                                  

PT: eu gosto de aprender
EN: and  and   ,   ,   ,   



## Dataset and Training Overview

- **Dataset Size:** 300–500 sentences (Portuguese ↔ English)
- **Purpose:** Learn Seq2Seq with Attention quickly
- **Training:** Few epochs (5) for fast demonstration
- **What We Learn:** 
  - Tokenization & embeddings
  - Encoder-Decoder structure
  - Attention mechanism
  - Sample translations & loss reduction
- **Note:** Small dataset & few epochs → faster training, not production-ready
