<a href="https://colab.research.google.com/github/Michael-Sylvester/Ashesi-Deep-Learning/blob/main/Kaggle_Translation_RNN_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import kagglehub
# Fetch the English-to-French dataset through kagglehub and keep the returned
# value. A later cell repeats the same call and prints its result as the
# "Path to dataset files", so this is presumably the local download path.
rajpulapakura_english_to_french_small_dataset_path = kagglehub.dataset_download('rajpulapakura/english-to-french-small-dataset')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/rajpulapakura/english-to-french-small-dataset?dataset_version_number=3...


100%|██████████| 4.51M/4.51M [00:00<00:00, 107MB/s]

Extracting files...
Data source import complete.





In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio
from datasets import load_dataset, DatasetDict
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from kagglehub import KaggleDatasetAdapter
import spacy

# Report the installed library versions so the recorded outputs can be tied to them.
print(f"PyTorch version: {torch.__version__}")
print(f"Torchaudio version: {torchaudio.__version__}")


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read by later cells (collate_fn, model construction, translation).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

PyTorch version: 2.8.0+cu126
Torchaudio version: 2.8.0+cu126
Device: cpu


In [None]:

# Placeholder for a specific file inside the dataset; it is left empty and is
# not referenced by any other visible cell.
file_path = ""

# Download the dataset (kagglehub's default version) and print where the
# files were placed. `path` is what the next cell passes to load_dataset.
path = kagglehub.dataset_download("rajpulapakura/english-to-french-small-dataset")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'english-to-french-small-dataset' dataset.
Path to dataset files: /kaggle/input/english-to-french-small-dataset


In [None]:
# Load the downloaded files from `path` and print a summary of the resulting
# dataset object (splits, column names and row counts).
dataset = load_dataset(path)
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['English', 'French'],
        num_rows: 229803
    })
})


In [None]:
# Step 1: shuffle the training split with a fixed seed and keep the first 2000 rows.
# NOTE: this cell reads `dataset` and rebinds it at the end, so re-running it
# without first re-running the loading cell operates on the already-reduced splits.
sampled_dataset = dataset["train"].shuffle(seed=42).select(range(2000))

# Step 2: hold out 20% of the sample (400 rows); the other 1600 stay in "train".
split = sampled_dataset.train_test_split(test_size=0.2, seed=42)

# Step 3: split['test'] now acts as validation
train_valid = DatasetDict({
    "train": split["train"],
    "validation": split["test"]
})

# Step 4: carve a separate test split out of the remaining training portion
# (10% of it: 160 rows for test, 1440 left for training).
split2 = train_valid["train"].train_test_split(test_size=0.1, seed=42)

# Step 5: merge the three splits into one DatasetDict
dataset = DatasetDict({
    "train": split2["train"],
    "validation": train_valid["validation"],
    "test": split2["test"]
})

# Display the final split sizes as the cell output.
dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'French'],
        num_rows: 1440
    })
    validation: Dataset({
        features: ['English', 'French'],
        num_rows: 400
    })
    test: Dataset({
        features: ['English', 'French'],
        num_rows: 160
    })
})

In [None]:
# Different tokenizer
from spacy.lang.fr import French
from spacy.lang.en import English
#Old tokenizer
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize


def preprocess_function(examples):
    """Lower-case and tokenize a batch of sentence pairs with NLTK's word_tokenize.

    Args:
        examples: Mapping with an "English" and a "French" entry, each an
            iterable of sentence strings (a batch, as used with
            ``map(..., batched=True)``).

    Returns:
        dict: ``{"en_tokens": [...], "fr_tokens": [...]}`` holding one list of
        tokens per input sentence, in the same order as the input.
    """
    def _tokenize_column(column):
        # Lower-casing happens before tokenization for every sentence.
        tokenized = []
        for sentence in examples[column]:
            tokenized.append(word_tokenize(sentence.lower()))
        return tokenized

    english_tokens = _tokenize_column("English")
    french_tokens = _tokenize_column("French")
    return {"en_tokens": english_tokens, "fr_tokens": french_tokens}

# spaCy language objects; in this notebook they are only called to split
# sentences into tokens.
french_tokenizer = French()
english_tokenizer = English()

def newpreprocess_function(examples):
    """Lower-case and tokenize a batch of sentence pairs with the spaCy tokenizers.

    Alternative to ``preprocess_function`` producing the same output columns.

    Args:
        examples: Mapping with an "English" and a "French" entry, each an
            iterable of sentence strings.

    Returns:
        dict: ``{"en_tokens": [...], "fr_tokens": [...]}`` with one list of
        token texts per input sentence.
    """
    en_tokens = []
    for sentence in examples["English"]:
        doc = english_tokenizer(sentence.lower())
        en_tokens.append([token.text for token in doc])

    fr_tokens = []
    for sentence in examples["French"]:
        doc = french_tokenizer(sentence.lower())
        fr_tokens.append([token.text for token in doc])

    return {"en_tokens": en_tokens, "fr_tokens": fr_tokens}

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:

# Apply the NLTK-based preprocess_function to every split in batches, which
# adds the "en_tokens"/"fr_tokens" columns, then display the first training example.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset["train"][0]

Map:   0%|          | 0/2160 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

{'English': 'Am I a suspect?',
 'French': 'Suis-je un suspect\xa0?',
 'en_tokens': ['am', 'i', 'a', 'suspect', '?'],
 'fr_tokens': ['suis-je', 'un', 'suspect', '?']}

In [None]:
from collections import Counter

def build_vocab(sentences, min_freq=2):
    """Build token<->index lookup tables from tokenized sentences.

    Indices 0-3 are reserved for "<pad>", "<sos>", "<eos>" and "<unk>", in that
    order. The remaining entries are the tokens seen at least ``min_freq``
    times, in sorted order.

    Args:
        sentences: Iterable of token sequences.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        tuple: ``(word2idx, idx2word)`` dictionaries.
    """
    specials = ["<pad>", "<sos>", "<eos>", "<unk>"]

    counts = Counter()
    for sentence in sentences:
        counts.update(sentence)

    # Keep only the tokens that are frequent enough, in sorted order.
    frequent = [token for token in sorted(counts) if counts[token] >= min_freq]

    word2idx = {}
    for index, token in enumerate(specials + frequent):
        word2idx[token] = index
    # The reverse table is derived from word2idx so both stay in sync.
    idx2word = dict(zip(word2idx.values(), word2idx.keys()))
    return word2idx, idx2word

# Vocabularies are built from the training split only. The English side uses
# build_vocab's default min_freq=2, so tokens seen once are left out of the
# vocabulary; the French side keeps every training token (min_freq=1).
en_word2idx, en_idx2word = build_vocab(tokenized_dataset["train"]["en_tokens"])
fr_word2idx, fr_idx2word = build_vocab(tokenized_dataset["train"]["fr_tokens"], min_freq=1)


# Show the sizes and a short preview instead of dumping the full dictionaries,
# which would put thousands of entries into the cell output.
print(f"English vocabulary size: {len(en_word2idx)}")
print(f"First English entries: {list(en_word2idx.items())[:10]}")
print("---------------")
print(f"French vocabulary size: {len(fr_word2idx)}")
print(f"First French entries: {list(fr_word2idx.items())[:10]}")

{'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, '!': 4, "''": 5, "'d": 6, "'ll": 7, "'m": 8, "'re": 9, "'s": 10, "'ve": 11, ',': 12, '.': 13, '100': 14, '?': 15, '``': 16, 'a': 17, 'able': 18, 'about': 19, 'abroad': 20, 'absent': 21, 'accident': 22, 'acquainted': 23, 'acted': 24, 'actually': 25, 'addict': 26, 'advice': 27, 'advised': 28, 'afford': 29, 'afraid': 30, 'after': 31, 'afternoon': 32, 'again': 33, 'against': 34, 'agree': 35, 'ahead': 36, 'air': 37, 'airport': 38, 'all': 39, 'allow': 40, 'almost': 41, 'alone': 42, 'along': 43, 'already': 44, 'also': 45, 'always': 46, 'am': 47, 'amusing': 48, 'an': 49, 'and': 50, 'another': 51, 'answer': 52, 'answers': 53, 'any': 54, 'anybody': 55, 'anymore': 56, 'anyone': 57, 'anything': 58, 'anywhere': 59, 'apartment': 60, 'apples': 61, 'appreciate': 62, 'are': 63, 'argue': 64, 'argument': 65, 'arm': 66, 'around': 67, 'arrived': 68, 'art': 69, 'as': 70, 'ashamed': 71, 'ask': 72, 'asked': 73, 'asleep': 74, 'at': 75, 'attention': 76, 'australi

In [None]:
def encode(tokens, vocab, max_len=15):
    """Convert a token sequence into a fixed-length list of vocabulary ids.

    The result has the layout ``<sos> tok_1 ... tok_k <eos> <pad> ...`` and is
    exactly ``max_len`` long. When the sentence is too long, the *tokens* are
    truncated so that the sequence still ends with ``<eos>``; cutting the
    finished id list instead would drop the end-of-sentence marker.

    Args:
        tokens: Sequence of token strings.
        vocab: Mapping from token to id containing "<sos>", "<eos>", "<unk>"
            and, when padding is needed, "<pad>".
        max_len: Length of the returned id list.

    Returns:
        list: ``max_len`` ids.

    Raises:
        KeyError: If a required special token is missing from ``vocab``.
    """
    sos_id = vocab["<sos>"]
    eos_id = vocab["<eos>"]
    unk_id = vocab["<unk>"]

    # Reserve two positions for <sos> and <eos>.
    room = max(max_len - 2, 0)
    kept_tokens = list(tokens)[:room]

    ids = [sos_id] + [vocab.get(t, unk_id) for t in kept_tokens] + [eos_id]
    if len(ids) < max_len:
        ids += [vocab["<pad>"]] * (max_len - len(ids))
    # Only shortens the list when max_len < 2, where <sos>/<eos> cannot both fit.
    return ids[:max_len]

def encode_dataset(example):
    """Add the "en_ids" and "fr_ids" columns to a single tokenized example.

    Args:
        example: Mapping with "en_tokens" and "fr_tokens" entries; it is
            modified in place.

    Returns:
        The same ``example`` object with both id columns set.
    """
    # English tokens use the English vocabulary, French tokens the French one.
    source_ids = encode(example["en_tokens"], en_word2idx)
    example["en_ids"] = source_ids
    target_ids = encode(example["fr_tokens"], fr_word2idx)
    example["fr_ids"] = target_ids
    return example

# Add the fixed-length "en_ids"/"fr_ids" columns to every split (encode_dataset
# handles one example per call), then display one training example as a check.
encoded_dataset = tokenized_dataset.map(encode_dataset)
encoded_dataset["train"][8]


Map:   0%|          | 0/2160 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

{'English': "Bartender, I'd like to have a drink.",
 'French': 'Barmaid, je voudrais à boire.',
 'en_tokens': ['bartender',
  ',',
  'i',
  "'d",
  'like',
  'to',
  'have',
  'a',
  'drink',
  '.'],
 'fr_tokens': ['barmaid', ',', 'je', 'voudrais', 'à', 'boire', '.'],
 'en_ids': [1, 83, 12, 398, 6, 463, 820, 367, 17, 235, 13, 2, 0, 0, 0],
 'fr_ids': [1, 258, 6, 1342, 2924, 2962, 299, 7, 2, 0, 0, 0, 0, 0, 0]}

In [None]:
def collate_fn(batch):
    """Stack the id lists of a batch into two tensors on the module-level ``device``.

    Args:
        batch: List of examples, each providing "en_ids" and "fr_ids".

    Returns:
        tuple: ``(en_batch, fr_batch)`` tensors, one row per example.
    """
    source_rows = []
    target_rows = []
    for item in batch:
        source_rows.append(item["en_ids"])
        target_rows.append(item["fr_ids"])

    en_batch = torch.tensor(source_rows).to(device)
    fr_batch = torch.tensor(target_rows).to(device)
    return en_batch, fr_batch

# Mini-batch loaders built on collate_fn; only the training loader shuffles.
# The batch size is written as the literal 128 in both calls.
train_loader = DataLoader(encoded_dataset["train"], batch_size=128, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(encoded_dataset["validation"], batch_size=128, collate_fn=collate_fn)

In [None]:
class EncoderRNN(nn.Module):
    """Embeds a source sequence and returns the recurrent layer's final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Hidden dimension of the recurrent layer.
        rnn_type: 'GRU' selects ``nn.GRU``; any other value selects ``nn.RNN``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, rnn_type='RNN'):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        recurrent_cls = nn.GRU if rnn_type == 'GRU' else nn.RNN
        self.rnn = recurrent_cls(embed_size, hidden_size, batch_first=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch_size, seq_len) holding token ids.

        Returns:
            Tensor of shape (num_layers, batch_size, hidden_size): the final
            hidden state. The per-step outputs are discarded.
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, embed_size)
        embedded_tokens = self.embedding(x)
        _, final_hidden = self.rnn(embedded_tokens)
        return final_hidden

class DecoderRNN(nn.Module):
    """Single-step decoder: embedding, recurrent layer and a projection to vocabulary logits.

    Args:
        vocab_size: Size of the target vocabulary (embedding rows and output logits).
        embed_size: Embedding dimension.
        hidden_size: Hidden dimension of the recurrent layer.
        rnn_type: 'GRU' selects ``nn.GRU``; any other value selects ``nn.RNN``.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, rnn_type='RNN'):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        recurrent_cls = nn.GRU if rnn_type == 'GRU' else nn.RNN
        self.rnn = recurrent_cls(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: Tensor of shape (batch_size,) with the current input token ids.
            hidden: Tensor of shape (num_layers, batch_size, hidden_size).

        Returns:
            tuple: ``(prediction, hidden)`` where ``prediction`` has shape
            (batch_size, vocab_size) and ``hidden`` is the updated state.
        """
        # Add a length-1 sequence axis: (batch_size,) -> (batch_size, 1, embed_size)
        step_input = self.embedding(x.unsqueeze(1))
        step_output, hidden = self.rnn(step_input, hidden)
        # Drop the sequence axis again before projecting to logits.
        return self.fc(step_output.squeeze(1)), hidden

In [None]:
import random
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one position at a time.

    Args:
        encoder: Module mapping ``src`` (batch_size, src_len) to an initial
            decoder hidden state.
        decoder: Module called as ``decoder(x, hidden)`` returning
            ``(logits, hidden)``. Its ``fc`` output layer supplies the target
            vocabulary size.
        teacher_force_ratio: Probability, drawn independently at every step,
            of feeding the ground-truth token rather than the model's own
            prediction as the next decoder input.
    """

    def __init__(self, encoder, decoder, teacher_force_ratio):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_force_ratio = teacher_force_ratio

    def forward(self, src, tgt):
        """Compute logits for every target position.

        Args:
            src: Tensor (batch_size, src_len) of source token ids.
            tgt: Tensor (batch_size, tgt_len) of target token ids; column 0 is
                used as the first decoder input.

        Returns:
            Tensor (batch_size, tgt_len, tgt_vocab_size). Position 0 is never
            written and stays all zeros.
        """
        batch_size = src.shape[0]
        tgt_len = tgt.shape[1]
        # Take the vocabulary size from the decoder's output layer rather than
        # from a notebook-level global, so the module is self-contained.
        tgt_vocab_size = self.decoder.fc.out_features

        # The encoder's final hidden state initialises the decoder.
        hidden = self.encoder(src)

        # Allocate directly on the input's device.
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size, device=src.device)

        # The first decoder input is column 0 of the target (<sos> for the whole batch).
        x = tgt[:, 0]

        for t in range(1, tgt_len):
            # decoder_output: (batch_size, vocab_size)
            decoder_output, hidden = self.decoder(x, hidden)
            outputs[:, t, :] = decoder_output

            # One random draw per step decides for the whole batch. This is
            # not gated on self.training, so it also applies in eval mode.
            teacher_force = random.random() < self.teacher_force_ratio
            best_guess = decoder_output.argmax(1)

            # Next input: ground-truth token or the model's own prediction.
            x = tgt[:, t] if teacher_force else best_guess

        return outputs

In [None]:
# Set up hyperparameters
teacher_forcing_rate = 0.5
num_epochs = 20
learning_rate = 0.01
hidden_size = 256
embedded_size = 256
teacher_force = 0.5 # Same value as teacher_forcing_rate; not referenced anywhere else in this cell.
batch_size = 128 # Not referenced in this cell; the DataLoader calls use their own literal 128.

# Vocabulary sizes determine the embedding tables and the decoder's output layer.
encoder_vocabsize = len(en_word2idx)
decoder_vocabsize = len(fr_word2idx)

# GRU-based encoder and decoder wrapped in the Seq2Seq model, all moved to `device`.
encoder = EncoderRNN(encoder_vocabsize,embedded_size, hidden_size, 'GRU').to(device)
decoder = DecoderRNN(decoder_vocabsize, embedded_size, hidden_size, 'GRU').to(device)
model = Seq2Seq(encoder, decoder, teacher_forcing_rate).to(device)

# Padding positions of the French target (<pad> id) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=fr_word2idx["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    print(f"Epoch [{epoch+1}/{num_epochs}]")
    # Re-enable training mode every epoch (translate_sentence switches the model to eval mode).
    model.train()
    total_loss = 0

    for i, (src, tgt) in enumerate(train_loader):
        # collate_fn stacks the per-example id lists, so both tensors are batch-first:
        # src shape: (batch_size, src_len)
        # tgt shape: (batch_size, tgt_len)

        # Forward pass
        # output shape: (batch_size, tgt_len, vocab_size)
        output = model(src, tgt)

        # Reshape outputs and targets for loss calculation
        # output shape required for CrossEntropyLoss: (N, C) where C is num_classes
        # target shape required for CrossEntropyLoss: (N)
        # Position 0 is dropped from both: the model never predicts it (it is the <sos> input).
        output = output[:, 1:].reshape(-1, output.shape[2]) # drop position 0, flatten batch*seq_len to N
        target = tgt[:, 1:].reshape(-1) # drop position 0, flatten batch*seq_len to N


        # Standard optimisation step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Mean of the per-batch losses (every batch weighs the same, whatever its size).
    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} average loss: {average_loss:.4f}")

Epoch [1/20]
Epoch 1 average loss: 6.0740
Epoch [2/20]
Epoch 2 average loss: 5.0581
Epoch [3/20]
Epoch 3 average loss: 4.4027
Epoch [4/20]
Epoch 4 average loss: 3.9831
Epoch [5/20]
Epoch 5 average loss: 3.4489
Epoch [6/20]
Epoch 6 average loss: 2.9559
Epoch [7/20]
Epoch 7 average loss: 2.7821
Epoch [8/20]
Epoch 8 average loss: 2.3479
Epoch [9/20]
Epoch 9 average loss: 2.1715
Epoch [10/20]
Epoch 10 average loss: 1.8613
Epoch [11/20]
Epoch 11 average loss: 1.6193
Epoch [12/20]
Epoch 12 average loss: 1.5500
Epoch [13/20]
Epoch 13 average loss: 1.1537
Epoch [14/20]
Epoch 14 average loss: 1.0010
Epoch [15/20]
Epoch 15 average loss: 0.7732
Epoch [16/20]
Epoch 16 average loss: 0.6319
Epoch [17/20]
Epoch 17 average loss: 0.5135
Epoch [18/20]
Epoch 18 average loss: 0.4916
Epoch [19/20]
Epoch 19 average loss: 0.3850
Epoch [20/20]
Epoch 20 average loss: 0.3327


In [None]:
def decode(ids, idx2word, pad_id=None, sos_id=None, eos_id=None):
    """Turn a sequence of ids back into a space-separated string.

    Args:
        ids: Iterable of vocabulary ids.
        idx2word: Mapping from id to token.
        pad_id: If given, ids equal to it are skipped.
        sos_id: If given, ids equal to it are skipped.
        eos_id: If given, decoding stops at the first id equal to it.

    Returns:
        str: The decoded tokens joined by single spaces.
    """
    def _matches(token_id, special_id):
        # A special id left as None never matches anything.
        return special_id is not None and token_id == special_id

    pieces = []
    for token_id in ids:
        if _matches(token_id, pad_id) or _matches(token_id, sos_id):
            continue
        if _matches(token_id, eos_id):
            break
        pieces.append(idx2word[token_id])
    return " ".join(pieces)


In [None]:
def translate_sentence(sentence_id, model, en_word2idx, fr_word2idx, fr_idx2word, max_len=15):
    """Greedily translate one encoded English sentence into a French string.

    The model is switched to evaluation mode and left in it.

    Args:
        sentence_id: Sequence of English vocabulary ids for one sentence.
        model: Object exposing ``encoder`` and ``decoder`` sub-modules.
        en_word2idx: Unused; kept so existing callers keep working.
        fr_word2idx: French token -> id mapping; supplies "<sos>" and "<eos>".
        fr_idx2word: French id -> token mapping.
        max_len: Maximum number of tokens to generate.

    Returns:
        str: Generated tokens joined by spaces, without "<eos>".
    """
    model.eval()

    # Build inputs on the device the model's weights live on, so the function
    # does not depend on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")
    eos_id = fr_word2idx["<eos>"]

    with torch.no_grad():
        # (seq_len,) -> (1, seq_len): a batch holding the single sentence.
        src_ids = torch.as_tensor(sentence_id, dtype=torch.long, device=run_device).unsqueeze(0)

        # The encoder's final hidden state initialises the decoder.
        hidden = model.encoder(src_ids)

        # Decoding starts from the French <sos> token; decoder input has shape (1,).
        decoder_input = torch.tensor([fr_word2idx["<sos>"]], device=run_device)

        output_sentence = []
        for _ in range(max_len):  # limit sentence length
            # decoder_output shape: (1, vocab_size)
            decoder_output, hidden = model.decoder(decoder_input, hidden)

            # Greedy choice; the (1,)-shaped argmax is reused as the next
            # decoder input instead of building a new tensor every step.
            decoder_input = decoder_output.argmax(1)
            next_word_id = decoder_input.item()

            if next_word_id == eos_id:
                break

            output_sentence.append(fr_idx2word[next_word_id])

        return " ".join(output_sentence)

In [None]:
# Display the first test example with its raw text, tokens and encoded ids.
encoded_dataset["test"][0]

{'English': 'Quit acting like a child.',
 'French': 'Arrête de te comporter comme un enfant.',
 'en_tokens': ['quit', 'acting', 'like', 'a', 'child', '.'],
 'fr_tokens': ['arrête',
  'de',
  'te',
  'comporter',
  'comme',
  'un',
  'enfant',
  '.'],
 'en_ids': [1, 645, 3, 463, 17, 162, 13, 2, 0, 0, 0, 0, 0, 0, 0],
 'fr_ids': [1, 168, 717, 2704, 508, 491, 2827, 928, 7, 2, 0, 0, 0, 0, 0]}

In [None]:
# Tokenized references and hypotheses collected for corpus-level BLEU.
references = []
hypotheses = []

# Set the model to evaluation mode
model.eval()

with torch.no_grad():
    # Iterate through the first 5 examples in the test set
    for example in encoded_dataset["test"].select(range(5)):
        english_sentence_id = example["en_ids"]
        English_sentence = example["English"]
        reference_french = example["French"]


        # Translate the encoded English sentence to French
        translated_french_sentence = translate_sentence(english_sentence_id, model, en_word2idx, fr_word2idx, fr_idx2word)
        print(f"English: {English_sentence.lower()}")
        print(f"Translated French: {translated_french_sentence}")
        print(f"Expected French Sentence:  {reference_french.lower()}")

        # Score against the tokens produced during preprocessing, so reference
        # and hypothesis share the tokenization the model was trained on.
        french_tokenised = example['fr_tokens']
        references.append([french_tokenised]) # corpus_bleu expects a list of lists for references
        # The translation is already a space-joined token list; split it back
        # rather than re-tokenizing the generated text.
        hypotheses.append(translated_french_sentence.split())


# Calculate BLEU score. With only a few short sentences the higher-order
# n-gram counts are often zero, which makes unsmoothed BLEU collapse to 0;
# smoothing keeps the score informative.
smoothing = nltk.translate.bleu_score.SmoothingFunction().method1
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothing)

print(f"Corpus BLEU score: {bleu_score:.4f}")

English: quit acting like a child.
Translated French: c'est un plage publique .
Expected French Sentence:  arrête de te comporter comme un enfant.
English: they were somewhere else.
Translated French: ils ont été attaqués .
Expected French Sentence:  ils étaient ailleurs.
English: no one loves you.
Translated French: personne ne t ' a appelé .
Expected French Sentence:  personne ne t'aime.
English: i got lucky.
Translated French: je suis hâlée .
Expected French Sentence:  j'ai eu du bol.
English: we're a bit late.
Translated French: nous nous sommes , nous sommes , nous sommes arrivés au lac .
Expected French Sentence:  nous sommes un tantinet en retard.
Corpus BLEU score: 0.0000


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
