In [5]:
# Basic packages
import os
import re
import random
import time
import gc
from collections import Counter, defaultdict
import heapq

# Torch & DL utilities
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

# NLP utilities
import nltk
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu
from datasets import load_dataset

# Progress bar
from tqdm import tqdm


In [6]:
from datasets import load_dataset

dataset = load_dataset("opus100","en-it")

train_data = dataset["train"].shuffle(seed=42).select(range(500_000))
val_data = dataset["validation"]
test_data = dataset["test"]

print("Train size:", len(train_data))
print("Validation size:", len(val_data))
print("Test size:", len(test_data))


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/223k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/91.7M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/220k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Train size: 500000
Validation size: 2000
Test size: 2000


In [7]:
def clean_text(text):
    text = re.sub(r"[^a-zA-ZàèéìòùÀÈÉÌÒÙ\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def apply_cleaning(dataset_split):
    return dataset_split.map(lambda x: {
        "translation": {
            "en": clean_text(x["translation"]["en"]),
            "it": clean_text(x["translation"]["it"]),
        }
    })

train_data = apply_cleaning(train_data)
val_data = apply_cleaning(val_data)
test_data = apply_cleaning(test_data)


Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [8]:
def clean_and_tokenize(example):
    # Lowercase and whitespace-tokenize both source and target
    src = example["translation"]["en"].lower().strip().split()
    tgt = example["translation"]["it"].lower().strip().split()
    return {"src_tokens": src, "tgt_tokens": tgt}

# Apply to all splits
train_data = train_data.map(clean_and_tokenize)
val_data = val_data.map(clean_and_tokenize)
test_data = test_data.map(clean_and_tokenize)


Map:   0%|          | 0/500000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [9]:
def remove_long_sentences(example, max_len=10):
    return len(example["src_tokens"]) <= max_len and len(example["tgt_tokens"]) <= max_len
train_data = train_data.filter(remove_long_sentences)
val_data = val_data.filter(remove_long_sentences)
test_data = test_data.filter(remove_long_sentences)


Filter:   0%|          | 0/500000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
def build_vocab(token_lists, max_vocab_size=12000):
    counter = Counter()
    for tokens in token_lists:
        counter.update(tokens)
    most_common = counter.most_common(max_vocab_size - 4)

    vocab = {
        "<PAD>": 0,
        "<SOS>": 1,
        "<EOS>": 2,
        "<UNK>": 3
    }

    for idx, (token, _) in enumerate(most_common):
        vocab[token] = idx + 4

    return vocab

# Build English (source) and Italian (target) vocabularies
en_vocab = build_vocab(train_data["src_tokens"], max_vocab_size=10000)
it_vocab = build_vocab(train_data["tgt_tokens"], max_vocab_size=10000)
en_itos = {i: s for s, i in en_vocab.items()}
it_itos = {i: s for s, i in it_vocab.items()}

In [11]:
# Count token frequencies
src_counter = Counter(tok for sent in train_data["src_tokens"] for tok in sent)
tgt_counter = Counter(tok for sent in train_data["tgt_tokens"] for tok in sent)

# Replace infrequent tokens with <UNK>
def mark_rare_tokens(tokens, counter, threshold):
    return [tok if counter[tok] >= threshold else "<UNK>" for tok in tokens]

def replace_with_unk(example):
    src = mark_rare_tokens(example["src_tokens"], src_counter, threshold=2)
    tgt = mark_rare_tokens(example["tgt_tokens"], tgt_counter, threshold=2)
    return {"src_tokens": src, "tgt_tokens": tgt}

# Apply UNK replacement
train_data = train_data.map(replace_with_unk)
val_data = val_data.map(replace_with_unk)
test_data = test_data.map(replace_with_unk)

# Remove pairs containing <UNK>
def remove_unk_pairs(example):
    return "<UNK>" not in example["src_tokens"] and "<UNK>" not in example["tgt_tokens"]

train_data = train_data.filter(remove_unk_pairs)
val_data = val_data.filter(remove_unk_pairs)
test_data = test_data.filter(remove_unk_pairs)

# Show resulting dataset sizes
print(f"Filtered Train Size: {len(train_data):,}")
print(f"Filtered Validation Size: {len(val_data):,}")
print(f"Filtered Test Size: {len(test_data):,}")


Map:   0%|          | 0/348175 [00:00<?, ? examples/s]

Map:   0%|          | 0/1226 [00:00<?, ? examples/s]

Map:   0%|          | 0/1206 [00:00<?, ? examples/s]

Filter:   0%|          | 0/348175 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1226 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1206 [00:00<?, ? examples/s]

Filtered Train Size: 297,057
Filtered Validation Size: 883
Filtered Test Size: 863


In [12]:
print(train_data[12])

{'translation': {'en': 'Dont ever say that again', 'it': 'Non dirlo mai piu'}, 'src_tokens': ['dont', 'ever', 'say', 'that', 'again'], 'tgt_tokens': ['non', 'dirlo', 'mai', 'piu']}


In [13]:
class TranslationDataset(Dataset):
    def __init__(self, src_tokens_list, tgt_tokens_list, src_vocab, tgt_vocab, max_len=10):
        self.src_tokens = src_tokens_list
        self.tgt_tokens = tgt_tokens_list
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.src_tokens)

    def __getitem__(self, idx):
        src_seq = self.src_tokens[idx]
        tgt_seq = self.tgt_tokens[idx]

        src_ids = [self.src_vocab.get(token, self.src_vocab["<UNK>"]) for token in src_seq]
        tgt_ids = [self.tgt_vocab["<SOS>"]] + \
                  [self.tgt_vocab.get(token, self.tgt_vocab["<UNK>"]) for token in tgt_seq] + \
                  [self.tgt_vocab["<EOS>"]]

        # Pad or truncate
        src_ids = self._pad_or_truncate(src_ids, self.src_vocab["<PAD>"])
        tgt_ids = self._pad_or_truncate(tgt_ids, self.tgt_vocab["<PAD>"])

        return torch.tensor(src_ids, dtype=torch.long), torch.tensor(tgt_ids, dtype=torch.long)

    def _pad_or_truncate(self, ids, pad_idx):
        if len(ids) > self.max_len:
            return ids[:self.max_len]
        else:
            return ids + [pad_idx] * (self.max_len - len(ids))


In [14]:
# Create dataset instances
train_dataset = TranslationDataset(
    src_tokens_list=train_data["src_tokens"],
    tgt_tokens_list=train_data["tgt_tokens"],
    src_vocab=en_vocab,
    tgt_vocab=it_vocab,
    max_len=10
)

val_dataset = TranslationDataset(
    src_tokens_list=val_data["src_tokens"],
    tgt_tokens_list=val_data["tgt_tokens"],
    src_vocab=en_vocab,
    tgt_vocab=it_vocab,
    max_len=10
)

test_dataset = TranslationDataset(
    src_tokens_list=test_data["src_tokens"],
    tgt_tokens_list=test_data["tgt_tokens"],
    src_vocab=en_vocab,
    tgt_vocab=it_vocab,
    max_len=10
)


In [15]:
def get_dataloader(dataset, batch_size=64, shuffle=True):
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=4,
        pin_memory=True
    )

batch_size = 128

train_loader = get_dataloader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = get_dataloader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = get_dataloader(test_dataset, batch_size=batch_size, shuffle=False)


In [16]:
class Seq2SeqAttention(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embedding_dim, hidden_dim,
                 src_pad_idx, tgt_sos_idx, tgt_eos_idx, n_layers=1, dropout=0.2):
        super().__init__()

        # Embedding layers
        self.encoder_embedding = nn.Embedding(src_vocab_size, embedding_dim, padding_idx=src_pad_idx)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, embedding_dim, padding_idx=src_pad_idx)

        # Encoder: bidirectional LSTM
        self.encoder_lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True,
            bidirectional=True
        )

        # Decoder: unidirectional LSTM
        self.decoder_lstm = nn.LSTM(
            input_size=embedding_dim + hidden_dim * 2,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0,
            batch_first=True
        )

        # Attention: project decoder hidden → same size as encoder outputs
        self.attn = nn.Linear(hidden_dim, hidden_dim * 2)

        # Final output layer
        self.fc_out = nn.Linear(hidden_dim + hidden_dim * 2, tgt_vocab_size)

        # Project encoder hidden/cell → decoder hidden/cell
        self.bridge = nn.Linear(hidden_dim * 2, hidden_dim)

        self.dropout = nn.Dropout(dropout)
        self.hidden_dim = hidden_dim
        self.tgt_sos_idx = tgt_sos_idx
        self.tgt_eos_idx = tgt_eos_idx
        self.n_layers = n_layers

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.size()
        vocab_size = self.fc_out.out_features

        # ---- Encoder ----
        embedded_src = self.dropout(self.encoder_embedding(src))  # [B, src_len, emb]
        encoder_outputs, (hidden, cell) = self.encoder_lstm(embedded_src)  # encoder_outputs: [B, src_len, 2H]

        hidden = self._combine_directions(hidden)  # [n_layers, B, 2H]
        cell = self._combine_directions(cell)

        hidden = torch.tanh(self.bridge(hidden))  # [n_layers, B, H]
        cell = torch.tanh(self.bridge(cell))      # [n_layers, B, H]

        # ---- Decoder ----
        inputs = tgt[:, 0]  # <SOS>
        outputs = torch.zeros(batch_size, tgt_len - 1, vocab_size).to(src.device)

        for t in range(1, tgt_len):
            input_emb = self.dropout(self.decoder_embedding(inputs)).unsqueeze(1)  # [B, 1, emb]

            # Attention
            attn_query = self.attn(hidden[-1]).unsqueeze(1)  # [B, 1, 2H]
            attn_weights = torch.bmm(attn_query, encoder_outputs.transpose(1, 2))  # [B, 1, src_len]
            attn_weights = F.softmax(attn_weights, dim=2)
            context = torch.bmm(attn_weights, encoder_outputs)  # [B, 1, 2H]

            # Decoder step
            rnn_input = torch.cat((input_emb, context), dim=2)  # [B, 1, emb + 2H]
            output, (hidden, cell) = self.decoder_lstm(rnn_input, (hidden, cell))  # output: [B, 1, H]

            pred = self.fc_out(torch.cat((output.squeeze(1), context.squeeze(1)), dim=1))  # [B, vocab]
            outputs[:, t - 1] = pred

            # Next input
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = pred.argmax(1)
            inputs = tgt[:, t] if teacher_force else top1

        return outputs  # [B, tgt_len - 1, vocab]

    def predict(self, src, max_len=50, device='cpu'):
        self.eval()
        src = src.unsqueeze(0).to(device)  # [1, src_len]

        with torch.no_grad():
            embedded_src = self.encoder_embedding(src)  # [1, src_len, emb]
            encoder_outputs, (hidden, cell) = self.encoder_lstm(embedded_src)

            hidden = self._combine_directions(hidden)  # [n_layers, 1, 2H]
            cell = self._combine_directions(cell)

            hidden = torch.tanh(self.bridge(hidden))  # [n_layers, 1, H]
            cell = torch.tanh(self.bridge(cell))      # [n_layers, 1, H]

            inputs = torch.tensor([self.tgt_sos_idx], device=device)  # [1]
            outputs = []

            for _ in range(max_len):
                input_emb = self.decoder_embedding(inputs).unsqueeze(1)  # [1, 1, emb]

                attn_query = self.attn(hidden[-1]).unsqueeze(1)  # [1, 1, 2H]
                attn_weights = torch.bmm(attn_query, encoder_outputs.transpose(1, 2))  # [1, 1, src_len]
                attn_weights = F.softmax(attn_weights, dim=2)
                context = torch.bmm(attn_weights, encoder_outputs)  # [1, 1, 2H]

                rnn_input = torch.cat((input_emb, context), dim=2)  # [1, 1, emb + 2H]
                output, (hidden, cell) = self.decoder_lstm(rnn_input, (hidden, cell))  # output: [1, 1, H]

                pred = self.fc_out(torch.cat((output.squeeze(1), context.squeeze(1)), dim=1))  # [1, vocab]
                top1 = pred.argmax(1).item()

                if top1 == self.tgt_eos_idx:
                    break
                outputs.append(top1)
                inputs = torch.tensor([top1], device=device)

        return outputs  # List[int]

    def _combine_directions(self, states):
        # [2*n_layers, B, H] → [n_layers, B, 2H]
        return torch.cat((states[0::2], states[1::2]), dim=2)


In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Seq2SeqAttention(
    src_vocab_size=len(en_vocab),
    tgt_vocab_size=len(it_vocab),
    embedding_dim=256,
    hidden_dim=512,
    src_pad_idx=en_vocab["<PAD>"],
    tgt_sos_idx=it_vocab["<SOS>"],
    tgt_eos_idx=it_vocab["<EOS>"],
    n_layers=2,
    dropout=0.3
).to(device)


In [18]:
# Padding index to be ignored in loss calculation
pad_idx = it_vocab["<PAD>"]

# Loss function: ignores padded positions
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Aliases for vocab access during translation/decoding
src_vocab = en_vocab
tgt_vocab = it_vocab


In [19]:
def train_one_epoch(model, dataloader, optimizer, criterion, device, clip=1.0, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0

    for src, tgt in tqdm(dataloader, desc="Training"):
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()

        # Forward pass with teacher forcing
        output = model(src, tgt, teacher_forcing_ratio)  # [B, tgt_len-1, vocab]

        # Align shapes for loss computation
        output = output.contiguous().view(-1, output.shape[-1])  # [B*(tgt_len-1), vocab]
        tgt = tgt[:, 1:].contiguous().view(-1)                  # [B*(tgt_len-1)]

        loss = criterion(output, tgt)
        loss.backward()

        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [20]:
def evaluate_one_epoch(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        for src, tgt in tqdm(dataloader, desc="Validation"):
            src, tgt = src.to(device), tgt.to(device)

            # No teacher forcing during evaluation
            output = model(src, tgt, teacher_forcing_ratio=0.0)  # [B, tgt_len-1, vocab]

            # Align for loss computation
            output = output.contiguous().view(-1, output.shape[-1])  # [B*(tgt_len-1), vocab]
            tgt = tgt[:, 1:].contiguous().view(-1)                  # [B*(tgt_len-1)]

            loss = criterion(output, tgt)
            epoch_loss += loss.item()

    return epoch_loss / len(dataloader)


In [21]:
def train_loop(model, train_loader, val_loader, optimizer, criterion, device,
               n_epochs=10, clip=1.0, teacher_forcing_ratio=0.5):
    
    best_val_loss = float('inf')

    for epoch in range(n_epochs):
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device,
                                     clip=clip, teacher_forcing_ratio=teacher_forcing_ratio)
        val_loss = evaluate_one_epoch(model, val_loader, criterion, device)

        end_time = time.time()
        mins, secs = divmod(int(end_time - start_time), 60)

        print(f"\nEpoch {epoch + 1:02} | Time: {mins}m {secs}s")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val   Loss: {val_loss:.4f}")

        # Save the model if it has improved
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pt")
            print("Saved best model.")


In [22]:
train_loop(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    criterion=criterion,
    n_epochs=10,
    clip=1.0,
    teacher_forcing_ratio=0.4,
    device="cuda" if torch.cuda.is_available() else "cpu"
)


Training: 100%|██████████| 2321/2321 [04:14<00:00,  9.12it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.96it/s]



Epoch 01 | Time: 4m 15s
  Train Loss: 4.9452
  Val   Loss: 4.6843
Saved best model.


Training: 100%|██████████| 2321/2321 [04:12<00:00,  9.18it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 10.66it/s]



Epoch 02 | Time: 4m 13s
  Train Loss: 4.1295
  Val   Loss: 4.3157
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.24it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 12.02it/s]



Epoch 03 | Time: 4m 11s
  Train Loss: 3.7791
  Val   Loss: 4.1158
Saved best model.


Training: 100%|██████████| 2321/2321 [04:10<00:00,  9.25it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.24it/s]



Epoch 04 | Time: 4m 11s
  Train Loss: 3.5443
  Val   Loss: 3.9616
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.21it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.72it/s]



Epoch 05 | Time: 4m 12s
  Train Loss: 3.3719
  Val   Loss: 3.9011
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.21it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.49it/s]



Epoch 06 | Time: 4m 12s
  Train Loss: 3.2350
  Val   Loss: 3.8525
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.24it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.68it/s]



Epoch 07 | Time: 4m 11s
  Train Loss: 3.1256
  Val   Loss: 3.7992
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.23it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.65it/s]



Epoch 08 | Time: 4m 12s
  Train Loss: 3.0350
  Val   Loss: 3.7498
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.22it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.37it/s]



Epoch 09 | Time: 4m 12s
  Train Loss: 2.9486
  Val   Loss: 3.7433
Saved best model.


Training: 100%|██████████| 2321/2321 [04:11<00:00,  9.22it/s]
Validation: 100%|██████████| 7/7 [00:00<00:00, 11.18it/s]



Epoch 10 | Time: 4m 12s
  Train Loss: 2.8781
  Val   Loss: 3.7002
Saved best model.


In [28]:
from nltk.translate.bleu_score import corpus_bleu

def compute_bleu(model, dataloader, tgt_vocab, idx_to_tgt, device, max_len=50, num_samples=100):
    model.eval()
    references = []
    hypotheses = []

    sos_idx = tgt_vocab["<SOS>"]
    eos_idx = tgt_vocab["<EOS>"]
    pad_idx = tgt_vocab["<PAD>"]

    with torch.no_grad():
        for i, (src_batch, tgt_batch) in enumerate(dataloader):
            if i * src_batch.size(0) > num_samples:
                break

            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            for src, tgt in zip(src_batch, tgt_batch):
                # Inference
                pred_ids = model.predict(src, max_len=max_len, device=device)

                # Reference (target) - skip <SOS>, <EOS>, <PAD>
                ref = [idx_to_tgt.get(idx.item(), "<UNK>") 
                       for idx in tgt if idx.item() not in {pad_idx, sos_idx, eos_idx}]

                # Hypothesis (prediction) - skip <EOS>
                hyp = [idx_to_tgt.get(idx, "<UNK>") 
                       for idx in pred_ids if idx != eos_idx]

                references.append([ref])  # reference must be a list of lists
                hypotheses.append(hyp)

    bleu = corpus_bleu(references, hypotheses)
    return bleu  # optionally return bleu * 100


In [29]:
def translate_and_print(model, sentence, src_vocab, tgt_vocab, idx_to_tgt, device, max_len=50):
    model.eval()

    # Preprocess and tokenize
    tokens = sentence.lower().strip().split()
    src_ids = [src_vocab.get(tok, src_vocab["<UNK>"]) for tok in tokens]

    # Convert to tensor and run prediction
    src_tensor = torch.tensor(src_ids, dtype=torch.long).to(device)
    pred_ids = model.predict(src_tensor, max_len=max_len, device=device)

    # Decode predicted token ids
    pred_tokens = [idx_to_tgt.get(idx, "<UNK>") for idx in pred_ids]

    print("Source:      ", sentence)
    print("Translation: ", " ".join(pred_tokens))


In [30]:
# Inverse vocab
idx_to_tgt = {idx: tok for tok, idx in it_vocab.items()}

# BLEU score on validation set (first 100 samples)
bleu_score = compute_bleu(model, test_loader, it_vocab, idx_to_tgt, device, num_samples=500)
print(f"\nBLEU Score (100 samples): {bleu_score:.4f}")

# Example translation
example_sentence = "how are you"
translate_and_print(model, example_sentence, en_vocab, it_vocab, idx_to_tgt, device)



BLEU Score (100 samples): 0.1146
Source:       how are you
Translation:  come sei come come


In [34]:
sentences = [
    "good morning",
    "how are you",
    "do you have money",
    "I am very happy",
    "nice to meet you ",
    "the weather is nice",
    "i have been waiting for you"
]
for sentence in sentences:
    translate_and_print(model,sentence,en_vocab,it_vocab,idx_to_tgt,device)

Source:       good morning
Translation:  buongiorno
Source:       how are you
Translation:  come sei come come
Source:       do you have money
Translation:  hai dei soldi
Source:       I am very happy
Translation:  sono molto felice felice
Source:       nice to meet you 
Translation:  piacere di conoscerti
Source:       the weather is nice
Translation:  la <UNK> è è un
Source:       i have been waiting for you
Translation:  ho aspettato aspettando per te


In [32]:

src_batch, tgt_batch = next(iter(train_loader))
output = model(src_batch.to(device), tgt_batch.to(device), teacher_forcing_ratio=0.0)

predictions = output.argmax(dim=-1)  # shape: [B, T]
unk_id = tgt_vocab["<UNK>"]
unk_ratio = (predictions == unk_id).float().mean().item()

print(f"Ratio of predicted <UNK>: {unk_ratio:.2%}")


Ratio of predicted <UNK>: 2.34%
