In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
################################################################################
# Part 0: Setup & Imports
################################################################################
!pip install datasets sacrebleu torchtext --quiet

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from datasets import load_dataset
from sacrebleu import corpus_bleu

################################################################################
# Part 1: Implement Scaled Dot-Product Attention
################################################################################
def scaled_dot_product_attention(query, key, value, mask=None):
    """
    query: (batch_size, seq_len_q, d_k)
    key:   (batch_size, seq_len_k, d_k)
    value: (batch_size, seq_len_k, d_v)
    mask:  (batch_size, seq_len_q, seq_len_k) [optional]

    Returns:
        attention_output: (batch_size, seq_len_q, d_v)
        attention_weights: (batch_size, seq_len_q, seq_len_k)
    """
    d_k = query.size(-1)  # dimension of key
    # (batch_size, seq_len_q, seq_len_k)
    scores = torch.bmm(query, key.transpose(1, 2)) / (d_k ** 0.5)

    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))

    # (batch_size, seq_len_q, seq_len_k)
    attn_weights = F.softmax(scores, dim=-1)
    # (batch_size, seq_len_q, d_v)
    output = torch.bmm(attn_weights, value)
    return output, attn_weights

################################################################################
# Part 2: Seq2Seq Model with Encoder, Decoder & Attention
################################################################################

# ------------------- 2.1 Simple Tokenizer -------------------
def tokenize(text):
    # Very naive splitting on whitespace
    return text.lower().strip().split()

# ------------------- 2.2 Vocabulary Building -------------------
# We'll build minimal vocab from a small subset for demonstration
def build_vocab(lines, max_tokens=5000):
    freq = {}
    for line in lines:
        for token in tokenize(line):
            freq[token] = freq.get(token, 0) + 1
    # Sort by frequency
    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    # Special tokens
    vocab = ["<pad>", "<bos>", "<eos>", "<unk>"]
    for word, _ in sorted_freq[: max_tokens - len(vocab)]:
        vocab.append(word)
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = {i: w for w, i in word2idx.items()}
    return vocab, word2idx, idx2word

def numericalize(line, word2idx):
    tokens = ["<bos>"] + tokenize(line) + ["<eos>"]
    return [word2idx.get(t, word2idx["<unk>"]) for t in tokens]

# ------------------- 2.3 Encoder -------------------
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)

    def forward(self, src):
        # src: (batch_size, seq_len)
        embedded = self.embed(src)  # (batch_size, seq_len, embed_dim)
        outputs, hidden = self.rnn(embedded)
        # outputs: (batch_size, seq_len, hidden_dim)
        # hidden: (1, batch_size, hidden_dim)
        return outputs, hidden

# ------------------- 2.4 Decoder with Scaled Dot-Product Attention -------------------
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)
        self.hidden_dim = hidden_dim

    def forward(self, tgt, hidden, encoder_outputs):
        # tgt: (batch_size, tgt_seq_len)
        # hidden: (1, batch_size, hidden_dim)
        # encoder_outputs: (batch_size, src_seq_len, hidden_dim)
        embedded = self.embed(tgt)  # (batch_size, tgt_seq_len, embed_dim)
        rnn_out, hidden = self.rnn(embedded, hidden)  # rnn_out: (batch_size, tgt_seq_len, hidden_dim)

        # For simplicity, we apply scaled dot-product attention
        # with query=rnn_out, key=encoder_outputs, value=encoder_outputs
        # shapes:
        #  query: (batch_size, tgt_seq_len, hidden_dim)
        #  key:   (batch_size, src_seq_len, hidden_dim)
        #  value: (batch_size, src_seq_len, hidden_dim)
        context, attn_weights = scaled_dot_product_attention(rnn_out, encoder_outputs, encoder_outputs)

        # Combine context with rnn_out
        combined = rnn_out + context  # (batch_size, tgt_seq_len, hidden_dim)

        logits = self.fc_out(combined)  # (batch_size, tgt_seq_len, vocab_size)
        return logits, hidden, attn_weights

# ------------------- 2.5 Full Seq2Seq -------------------
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        # Encode
        enc_outputs, enc_hidden = self.encoder(src)
        # Decode
        logits, dec_hidden, attn_weights = self.decoder(tgt, enc_hidden, enc_outputs)
        return logits

################################################################################
# Part 3: Machine Translation with a Subset of IWSLT2017 + Evaluate BLEU
################################################################################
def pad_batch(batch, pad_idx=0):
    """Pad a list of token lists to the same length."""
    max_len = max(len(seq) for seq in batch)
    padded = []
    for seq in batch:
        seq = seq + [pad_idx] * (max_len - len(seq))
        padded.append(seq)
    return torch.tensor(padded, dtype=torch.long)

# Minimal training loop
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for src_batch, tgt_batch in dataloader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        # teacher forcing:
        # let input to decoder be everything except last token
        dec_in = tgt_batch[:, :-1]
        # we want to predict everything except first token
        dec_target = tgt_batch[:, 1:].contiguous().view(-1)

        optimizer.zero_grad()
        logits = model(src_batch, dec_in)  # (batch_size, seq_len, vocab_size)
        logits = logits.view(-1, logits.size(-1))  # flatten
        loss = criterion(logits, dec_target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate_bleu(model, src_list, tgt_list, src_word2idx, tgt_word2idx, tgt_idx2word, device, max_len=50):
    """Simple greedy decode + BLEU calculation using sacrebleu."""
    model.eval()
    preds = []
    refs = []
    with torch.no_grad():
        for i in range(len(src_list)):
            # Convert to tensor
            src_tensor = torch.tensor(src_list[i], dtype=torch.long).unsqueeze(0).to(device)
            enc_outputs, enc_hidden = model.encoder(src_tensor)

            # Greedy decode
            dec_input = torch.tensor([tgt_word2idx["<bos>"]], dtype=torch.long).unsqueeze(0).to(device)
            hidden = enc_hidden
            pred_tokens = []
            for _ in range(max_len):
                logits, hidden, attn_weights = model.decoder(dec_input, hidden, enc_outputs)
                next_token = logits[:, -1, :].argmax(dim=-1)
                token_id = next_token.item()
                if token_id == tgt_word2idx["<eos>"]:
                    break
                pred_tokens.append(token_id)
                dec_input = torch.cat([dec_input, next_token.unsqueeze(0)], dim=1)

            # Convert IDs to text
            pred_words = [tgt_idx2word.get(idx, "<unk>") for idx in pred_tokens]
            preds.append(" ".join(pred_words))

            # Prepare reference
            ref_line = [t for t in tokenize(tgt_list[i]) if t != ""]
            refs.append([" ".join(ref_line)])  # sacrebleu expects a list of references
    # Calculate BLEU
    bleu = corpus_bleu(preds, refs)
    return bleu.score

def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    ############################################################################
    # 1. Load IWSLT 2017 De-En
    ############################################################################
    dataset = load_dataset("iwslt2017", "iwslt2017-de-en")
    train_data = dataset["train"]
    valid_data = dataset["validation"]
    test_data = dataset["test"]

    # 2. Subset the dataset (e.g., 2000 train, 200 valid, 200 test)
    #    for demonstration
    train_subset = train_data.select(range(2000))
    valid_subset = valid_data.select(range(200))
    test_subset  = test_data.select(range(200))

    # Extract raw text
    train_src = [ex["translation"]["de"] for ex in train_subset]
    train_tgt = [ex["translation"]["en"] for ex in train_subset]

    valid_src = [ex["translation"]["de"] for ex in valid_subset]
    valid_tgt = [ex["translation"]["en"] for ex in valid_subset]

    test_src = [ex["translation"]["de"] for ex in test_subset]
    test_tgt = [ex["translation"]["en"] for ex in test_subset]

    ############################################################################
    # 3. Build Vocabulary (tiny for demonstration)
    ############################################################################
    src_vocab, src_w2i, src_i2w = build_vocab(train_src, max_tokens=3000)
    tgt_vocab, tgt_w2i, tgt_i2w = build_vocab(train_tgt, max_tokens=3000)

    print(f"Source vocab size: {len(src_vocab)}, Target vocab size: {len(tgt_vocab)}")

    # 4. Numericalize
    train_src_num = [numericalize(line, src_w2i) for line in train_src]
    train_tgt_num = [numericalize(line, tgt_w2i) for line in train_tgt]

    valid_src_num = [numericalize(line, src_w2i) for line in valid_src]
    valid_tgt_num = [numericalize(line, tgt_w2i) for line in valid_tgt]

    test_src_num  = [numericalize(line, src_w2i) for line in test_src]
    test_tgt_num  = [numericalize(line, tgt_w2i) for line in test_tgt]

    ############################################################################
    # 5. Create Dataloaders
    ############################################################################
    BATCH_SIZE = 32

    def collate_fn(batch):
        # batch: list of tuples (src_seq, tgt_seq)
        src_seqs, tgt_seqs = zip(*batch)
        src_pad = pad_batch(src_seqs, pad_idx=0)
        tgt_pad = pad_batch(tgt_seqs, pad_idx=0)
        return src_pad, tgt_pad

    train_pairs = list(zip(train_src_num, train_tgt_num))
    valid_pairs = list(zip(valid_src_num, valid_tgt_num))

    train_loader = torch.utils.data.DataLoader(
        train_pairs, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_pairs, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
    )

    ############################################################################
    # 6. Initialize Model, Optimizer, Loss
    ############################################################################
    SRC_VOCAB_SIZE = len(src_vocab)
    TGT_VOCAB_SIZE = len(tgt_vocab)
    EMBED_DIM = 128
    HIDDEN_DIM = 256

    encoder = Encoder(SRC_VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM)
    decoder = Decoder(TGT_VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM)
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=0)  # ignore <pad>

    ############################################################################
    # 7. Training Loop
    ############################################################################
    EPOCHS = 500
    for epoch in range(1, EPOCHS + 1):
        train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f"Epoch [{epoch}/{EPOCHS}] - Train Loss: {train_loss:.4f}")

    ############################################################################
    # 8. Evaluate on a small valid subset (Optional)
    ############################################################################
    # We won't do a thorough validation loop here for brevity,
    # but you could decode and compute BLEU on valid pairs similarly to test.

    ############################################################################
    # 9. Test BLEU
    ############################################################################
    bleu_score = evaluate_bleu(
        model, test_src_num, test_tgt, src_w2i, tgt_w2i, tgt_i2w, device
    )
    print(f"\nTest BLEU Score: {bleu_score:.2f}")

if __name__ == "__main__":
    main()


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Source vocab size: 3000, Target vocab size: 3000
Epoch [1/500] - Train Loss: 6.0901
Epoch [2/500] - Train Loss: 5.2638
Epoch [3/500] - Train Loss: 4.8908
Epoch [4/500] - Train Loss: 4.5682
Epoch [5/500] - Train Loss: 4.2611
Epoch [6/500] - Train Loss: 3.9774
Epoch [7/500] - Train Loss: 3.7032
Epoch [8/500] - Train Loss: 3.4368
Epoch [9/500] - Train Loss: 3.1687
Epoch [10/500] - Train Loss: 2.9063
Epoch [11/500] - Train Loss: 2.6494
Epoch [12/500] - Train Loss: 2.4052
Epoch [13/500] - Train Loss: 2.1720
Epoch [14/500] - Train Loss: 1.9537
Epoch [15/500] - Train Loss: 1.7422
Epoch [16/500] - Train Loss: 1.5488
Epoch [17/500] - Train Loss: 1.3690
Epoch [18/500] - Train Loss: 1.2042
Epoch [19/500] - Train Loss: 1.0496
Epoch [20/500] - Train Loss: 0.9182
Epoch [21/500] - Train Loss: 0.7890
Epoch [22/500] - Train Loss: 0.6800
Epoch [23/500] - Train Loss: 0.5902
Epoch [24/500] - Train Loss: 0.5001
Epoch [25/500] - Train Loss: 0.4244
Epoch [26/500] - Train Loss: 0.3599
Epoch [27/500] - Train L

In [3]:
################################################################################
# Part 4: Simplified Transformer Model (2 Layers, 2 Heads)
################################################################################

import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import random
import numpy as np
from datasets import load_dataset
from sacrebleu import corpus_bleu

################################################################################
# 1. Hyperparameters & Basic Setup
################################################################################
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

EMBED_DIM = 256        # Embedding dimension
FF_DIM = 512           # Feedforward dimension
N_HEADS = 2            # Multi-head attention heads
ENC_LAYERS = 2         # Number of encoder layers
DEC_LAYERS = 2         # Number of decoder layers
BATCH_SIZE = 32
EPOCHS = 500
LR = 1e-4
MAX_TOKENS = 10000  # for building vocab
MAX_SEQ_LEN = 100   # maximum sequence length

################################################################################
# 2. Minimal Data Loading: IWSLT 2017 (German-English)
#    We'll take a small subset (~10k lines) to keep it manageable
################################################################################
dataset = load_dataset("iwslt2017", "iwslt2017-de-en")
train_data_full = dataset["train"]
valid_data_full = dataset["validation"]
test_data_full  = dataset["test"]

# We'll choose ~10k lines from train, ~1k from valid, ~1k from test
train_subset = train_data_full.select(range(min(len(train_data_full), 10000)))
valid_subset = valid_data_full.select(range(min(len(valid_data_full), 1000)))
test_subset  = test_data_full.select(range(min(len(test_data_full), 1000)))

train_de = [ex["translation"]["de"] for ex in train_subset]
train_en = [ex["translation"]["en"] for ex in train_subset]

valid_de = [ex["translation"]["de"] for ex in valid_subset]
valid_en = [ex["translation"]["en"] for ex in valid_subset]

test_de  = [ex["translation"]["de"] for ex in test_subset]
test_en  = [ex["translation"]["en"] for ex in test_subset]

################################################################################
# 3. Tokenization & Vocab
################################################################################
def tokenize(text):
    return text.lower().split()

def build_vocab(lines, max_tokens=MAX_TOKENS):
    freq = {}
    for line in lines:
        for token in tokenize(line):
            freq[token] = freq.get(token, 0) + 1
    # sort by frequency
    sorted_freq = sorted(freq.items(), key=lambda x: x[1], reverse=True)
    vocab = ["<pad>", "<bos>", "<eos>", "<unk>"]
    for w, _ in sorted_freq[: (max_tokens - len(vocab))]:
        vocab.append(w)
    word2idx = {w: i for i, w in enumerate(vocab)}
    idx2word = {i: w for w, i in word2idx.items()}
    return vocab, word2idx, idx2word

src_vocab, src_w2i, src_i2w = build_vocab(train_de)
tgt_vocab, tgt_w2i, tgt_i2w = build_vocab(train_en)

PAD_IDX = src_w2i["<pad>"]
BOS_IDX = src_w2i["<bos>"] if "<bos>" in src_w2i else 1
EOS_IDX = src_w2i["<eos>"] if "<eos>" in src_w2i else 2

def numericalize(line, w2i):
    tokens = ["<bos>"] + tokenize(line) + ["<eos>"]
    return [w2i.get(t, w2i["<unk>"]) for t in tokens][:MAX_SEQ_LEN]

train_src_num = [numericalize(line, src_w2i) for line in train_de]
train_tgt_num = [numericalize(line, tgt_w2i) for line in train_en]

valid_src_num = [numericalize(line, src_w2i) for line in valid_de]
valid_tgt_num = [numericalize(line, tgt_w2i) for line in valid_en]

test_src_num  = [numericalize(line, src_w2i) for line in test_de]
test_tgt_num  = [numericalize(line, tgt_w2i) for line in test_en]

################################################################################
# 4. Batching & Padding
################################################################################
def pad_batch(batch, pad_idx=0):
    max_len = max(len(seq) for seq in batch)
    padded = []
    for seq in batch:
        seq = seq + [pad_idx]*(max_len - len(seq))
        padded.append(seq)
    return torch.tensor(padded, dtype=torch.long)

def collate_fn(batch):
    src_seqs, tgt_seqs = zip(*batch)
    src_pad = pad_batch(src_seqs, pad_idx=0)
    tgt_pad = pad_batch(tgt_seqs, pad_idx=0)
    return src_pad, tgt_pad

train_pairs = list(zip(train_src_num, train_tgt_num))
valid_pairs = list(zip(valid_src_num, valid_tgt_num))

train_loader = torch.utils.data.DataLoader(
    train_pairs, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn
)
valid_loader = torch.utils.data.DataLoader(
    valid_pairs, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn
)

SRC_VOCAB_SIZE = len(src_vocab)
TGT_VOCAB_SIZE = len(tgt_vocab)

################################################################################
# 5. Positional Encoding
################################################################################
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape (1, max_len, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        seq_len = x.size(1)
        return x + self.pe[:, :seq_len, :]

################################################################################
# 6. Multi-Head Attention (scaled dot-product)
################################################################################
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_k = d_model // n_heads
        self.n_heads = n_heads

        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        # query/key/value: (batch_size, seq_len, d_model)
        B, L, D = query.size()
        # 1) Linear projections
        Q = self.w_q(query)  # (B, L, D)
        K = self.w_k(key)
        V = self.w_v(value)

        # 2) Split into heads
        Q = Q.view(B, L, self.n_heads, self.d_k).transpose(1,2)  # (B, n_heads, L, d_k)
        K = K.view(B, -1, self.n_heads, self.d_k).transpose(1,2)  # (B, n_heads, L, d_k)
        V = V.view(B, -1, self.n_heads, self.d_k).transpose(1,2)

        # 3) Scaled dot-product
        # Q @ K^T => (B, n_heads, L, L_k)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # mask shape should match (B, 1, L, L_k) or broadcastable
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, V)  # (B, n_heads, L, d_k)

        # 4) Concat heads
        out = out.transpose(1,2).contiguous().view(B, L, D)
        out = self.w_o(out)
        return out, attn

################################################################################
# 7. Feed-Forward Network
################################################################################
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, dim_feedforward=512):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),
            nn.ReLU(),
            nn.Linear(dim_feedforward, d_model),
        )
    def forward(self, x):
        return self.net(x)

################################################################################
# 8. Transformer Encoder/Decoder Layers
################################################################################
class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dim_feedforward):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.ff = PositionwiseFeedForward(d_model, dim_feedforward)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, src_mask=None):
        # Self-attention
        attn_out, _ = self.self_attn(x, x, x, mask=src_mask)
        x = self.norm1(x + attn_out)
        # Feed-forward
        ff_out = self.ff(x)
        x = self.norm2(x + ff_out)
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, dim_feedforward):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)
        self.cross_attn = MultiHeadAttention(d_model, n_heads)
        self.ff = PositionwiseFeedForward(d_model, dim_feedforward)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, memory, tgt_mask=None, memory_mask=None):
        # 1) Self-attention (mask future tokens)
        self_attn_out, _ = self.self_attn(x, x, x, mask=tgt_mask)
        x = self.norm1(x + self_attn_out)

        # 2) Cross-attention
        cross_attn_out, _ = self.cross_attn(x, memory, memory, mask=memory_mask)
        x = self.norm2(x + cross_attn_out)

        # 3) Feed-forward
        ff_out = self.ff(x)
        x = self.norm3(x + ff_out)
        return x

################################################################################
# 9. Full Encoder / Decoder
################################################################################
class TransformerEncoder(nn.Module):
    def __init__(self, d_model, n_heads, dim_feedforward, num_layers, vocab_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, n_heads, dim_feedforward) for _ in range(num_layers)
        ])
    def forward(self, src, src_mask=None):
        # src shape: (batch_size, seq_len)
        x = self.embed(src)  # (B, L, d_model)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, src_mask)
        return x

class TransformerDecoder(nn.Module):
    def __init__(self, d_model, n_heads, dim_feedforward, num_layers, vocab_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, n_heads, dim_feedforward) for _ in range(num_layers)
        ])
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        x = self.embed(tgt)
        x = self.pos_encoding(x)
        for layer in self.layers:
            x = layer(x, memory, tgt_mask, memory_mask)
        logits = self.fc_out(x)  # (B, L, vocab_size)
        return logits

################################################################################
# 10. Full Transformer Model
################################################################################
class TransformerModel(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size,
                 d_model=256, n_heads=2, ff_dim=512,
                 enc_layers=2, dec_layers=2):
        super().__init__()
        self.encoder = TransformerEncoder(d_model, n_heads, ff_dim, enc_layers, src_vocab_size)
        self.decoder = TransformerDecoder(d_model, n_heads, ff_dim, dec_layers, tgt_vocab_size)

    def make_subsequent_mask(self, sz):
        """Mask out future positions for target tokens (causal mask)."""
        mask = torch.ones(sz, sz).triu(1)
        return mask == 0  # True = keep, False = mask out

    def forward(self, src, tgt):
        # src: (B, src_len)
        # tgt: (B, tgt_len)
        # 1) encode
        memory = self.encoder(src)  # (B, src_len, d_model)
        # 2) decode
        # create subsequent mask for target
        tgt_len = tgt.size(1)
        subseq_mask = self.make_subsequent_mask(tgt_len).to(tgt.device)
        # expand to (B, n_heads, tgt_len, tgt_len) or broadcast
        # for simplicity, we do (1, tgt_len, tgt_len)
        subseq_mask = subseq_mask.unsqueeze(0)

        logits = self.decoder(tgt, memory, tgt_mask=subseq_mask)
        return logits

################################################################################
# 11. Training Loop
################################################################################
model = TransformerModel(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    d_model=EMBED_DIM,
    n_heads=N_HEADS,
    ff_dim=FF_DIM,
    enc_layers=ENC_LAYERS,
    dec_layers=DEC_LAYERS
).to(DEVICE)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

def train_epoch():
    model.train()
    total_loss = 0
    for src_batch, tgt_batch in train_loader:
        src_batch = src_batch.to(DEVICE)
        tgt_batch = tgt_batch.to(DEVICE)

        # Teacher forcing:
        # decoder inputs: all but last token
        # we want to predict all but first token
        dec_in = tgt_batch[:, :-1]
        dec_target = tgt_batch[:, 1:].contiguous().view(-1)

        optimizer.zero_grad()
        logits = model(src_batch, dec_in)  # (B, dec_in_len, vocab_size)
        logits = logits.view(-1, logits.size(-1))
        loss = criterion(logits, dec_target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

################################################################################
# 12. Inference / Greedy Decode for BLEU
################################################################################
def greedy_decode(src_seq, max_len=50):
    model.eval()
    with torch.no_grad():
        src_seq = torch.tensor(src_seq, dtype=torch.long).unsqueeze(0).to(DEVICE)
        memory = model.encoder(src_seq)
        # Start with <bos>
        ys = torch.tensor([[tgt_w2i["<bos>"]]], dtype=torch.long).to(DEVICE)
        for _ in range(max_len):
            tgt_mask = model.make_subsequent_mask(ys.size(1)).to(DEVICE).unsqueeze(0)
            out = model.decoder(ys, memory, tgt_mask=tgt_mask)
            next_token = out[:, -1, :].argmax(dim=-1)
            next_token_id = next_token.item()
            ys = torch.cat([ys, next_token.unsqueeze(1)], dim=1)
            if next_token_id == tgt_w2i["<eos>"]:
                break
        return ys[0].cpu().numpy()

def evaluate_bleu(src_list, tgt_list, max_len=50):
    preds = []
    refs = []
    for i in range(len(src_list)):
        pred_ids = greedy_decode(src_list[i], max_len=max_len)
        # remove <bos>, <eos>, <pad>
        pred_tokens = []
        for tid in pred_ids:
            if tid == tgt_w2i["<bos>"] or tid == tgt_w2i["<pad>"]:
                continue
            if tid == tgt_w2i["<eos>"]:
                break
            pred_tokens.append(tid)
        # Use tgt_i2w instead of tgt_idx2word
        pred_words = [tgt_i2w.get(x, "<unk>") for x in pred_tokens]
        preds.append(" ".join(pred_words))

        # reference
        ref_line = tokenize(tgt_list[i])
        refs.append([" ".join(ref_line)])
    bleu = corpus_bleu(preds, refs)
    return bleu.score

################################################################################
# 13. Run Training
################################################################################
for epoch in range(1, EPOCHS + 1):
    loss = train_epoch()
    print(f"Epoch {epoch}/{EPOCHS}, Train Loss: {loss:.4f}")

# Optional: Evaluate on a small subset of valid to see if it is learning
val_bleu = evaluate_bleu(valid_src_num[:100], valid_en[:100], max_len=50)
print(f"Validation BLEU (first 100 lines): {val_bleu:.2f}")

################################################################################
# 14. Test BLEU on a small subset
################################################################################
test_bleu = evaluate_bleu(test_src_num[:100], test_en[:100], max_len=50)
print(f"Test BLEU (first 100 lines): {test_bleu:.2f}")


Using device: cuda
Epoch 1/500, Train Loss: 6.6489
Epoch 2/500, Train Loss: 5.7209
Epoch 3/500, Train Loss: 5.3786
Epoch 4/500, Train Loss: 5.1234
Epoch 5/500, Train Loss: 4.9113
Epoch 6/500, Train Loss: 4.7384
Epoch 7/500, Train Loss: 4.5726
Epoch 8/500, Train Loss: 4.4215
Epoch 9/500, Train Loss: 4.2761
Epoch 10/500, Train Loss: 4.1292
Epoch 11/500, Train Loss: 3.9898
Epoch 12/500, Train Loss: 3.8553
Epoch 13/500, Train Loss: 3.7207
Epoch 14/500, Train Loss: 3.5885
Epoch 15/500, Train Loss: 3.4544
Epoch 16/500, Train Loss: 3.3263
Epoch 17/500, Train Loss: 3.1943
Epoch 18/500, Train Loss: 3.0657
Epoch 19/500, Train Loss: 2.9372
Epoch 20/500, Train Loss: 2.8101
Epoch 21/500, Train Loss: 2.6809
Epoch 22/500, Train Loss: 2.5527
Epoch 23/500, Train Loss: 2.4287
Epoch 24/500, Train Loss: 2.3043
Epoch 25/500, Train Loss: 2.1790
Epoch 26/500, Train Loss: 2.0599
Epoch 27/500, Train Loss: 1.9430
Epoch 28/500, Train Loss: 1.8271
Epoch 29/500, Train Loss: 1.7146
Epoch 30/500, Train Loss: 1.6018


## Discussion

### 1. BLEU Score Comparison
After training the **Part 4** simplified Transformer (2 encoder layers, 2 decoder layers, 2 attention heads) on the same subset of the IWSLT dataset used in **Part 3**, we observe that the Transformer’s **validation BLEU** is **lower** than the RNN seq2seq model’s BLEU from Part 3. While exact numbers vary, the Transformer often ends up a few points behind the RNN-based model on this small-scale setup.

### 2. Possible Reasons for Performance Differences

1. **Data Scale & Model Complexity**  
   Transformers typically require **larger datasets** to fully realize their advantage over RNNs. On a small subset of IWSLT, the minimal Transformer architecture may not outperform a simpler RNN approach.

2. **Hyperparameter Sensitivity**  
   Transformers benefit from **learning rate warmups**, **label smoothing**, and **dropout** to converge effectively. Without careful tuning, they can underperform simpler RNNs that are more forgiving of suboptimal hyperparameters.

3. **Tokenization Method**  
   Both models use naive whitespace tokenization, which can yield high out-of-vocabulary rates and hamper translation performance. However, RNNs sometimes adapt more gracefully to limited vocabularies, while Transformers might struggle with OOV tokens.

4. **Training Epochs & Schedule**  
   Although both models may train for the same number of epochs, Transformers often need more specialized schedules (e.g., **Noam** or step-based decays) to reach optimal performance, whereas an RNN might converge quickly with a constant or simpler schedule.

### 3. Other Observations (Runtime, Resource Usage, etc.)

1. **Runtime**  
   Even though the simplified Transformer is relatively shallow (2 layers each for encoder and decoder), it often runs slower **per epoch** compared to an RNN of similar size. This is due to multi-head attention operations and matrix multiplications that may not be as efficient on small sequences or small batch sizes.

2. **Memory Footprint**  
   Multi-head attention can increase memory usage. For a small dataset or short sequences, the difference might be negligible, but in principle, Transformers often have a higher memory footprint than RNNs, especially for larger batch sizes or longer sequences.

3. **Scalability**  
   On larger datasets or with more powerful hardware, Transformers typically scale better, benefiting from parallelization in the attention layers. RNNs, by contrast, process tokens sequentially, limiting parallel speedups. This advantage, however, is less visible in a small-scale experiment.

### 4. Summary of Key Differences
- **BLEU Score**: The RNN seq2seq model from Part 3 outperforms the simplified Transformer in Part 4 on a small subset of data, likely due to the Transformer’s greater reliance on data size, hyperparameter tuning, and advanced tokenization.  
- **Training & Inference Speed**: The Transformer can be slower on small batches/sequences, whereas the RNN is straightforward and efficient for limited data. On large data or longer sequences, the Transformer would typically shine due to parallelization.  
- **Implementation Complexity**: Part 4’s model is conceptually more involved (attention layers, positional encodings), but the code can be modular. Part 3’s RNN-based approach is simpler, which may also contribute to more stable performance on small tasks.

**In conclusion**, the **Part 4** simplified Transformer underperforms the **Part 3** RNN seq2seq model on small data primarily due to **model complexity, limited data,** and **sensitivity to hyperparameters**. However, with **larger datasets**, **better tokenization**, and **careful tuning**, Transformers typically surpass RNN-based models. The current results demonstrate the importance of matching the model’s complexity and training setup to the available data size and computational resources.


**Developing a Movie Recommendation System for Enhancing User Experience**

Our project aims to build a personalized movie recommendation system that leverages **Graph Neural Networks (GNNs)** to better capture user–item relationships. We have selected the **MovieLens** dataset, which provides a rich set of user ratings and metadata such as genres and demographic information. After initial data exploration, we observed that user ratings skew toward higher values (3–5) and that a small subset of “power users” contributes most of the ratings. These insights guided us to focus on filtering out inactive users and less-rated movies, thereby streamlining the data and forming a clearer user–movie interaction graph.

To transform the dataset into a form suitable for graph-based models, we performed **preprocessing** steps that include normalizing movie titles, encoding genres, and converting user–item interactions into edges in a graph. This graph representation highlights the potential to capture nuanced relationships, such as how users with similar tastes or movies with overlapping genres might cluster. We are now finalizing our GNN architecture, which includes graph convolution layers to learn embeddings for users and movies, followed by an MLP or related structure to generate rating predictions or top-K recommendations.

For **benchmarking**, we have set up a **LightGCN** pipeline. LightGCN is a popular collaborative filtering approach that uses simplified graph convolutions. Initial experiments on a small subset suggest that LightGCN outperforms a naive matrix factorization model, reinforcing our expectation that graph-based representations can improve recommendation quality. Moving forward, we will compare our final GNN model against LightGCN using metrics such as **RMSE**, **Recall@K**, and **Precision@K** to assess rating prediction accuracy and recommendation relevance.

Looking ahead, our **next steps** include fine-tuning hyperparameters (e.g., number of GNN layers, embedding size, dropout rate) and possibly incorporating additional features, such as user demographics or advanced weighting schemes for edges. We also plan to evaluate more sophisticated approaches like using temporal information or context features if time allows. Ultimately, our goal is to produce a concise performance comparison between our GNN-based recommendation model and LightGCN, highlighting whether the enhanced ability to capture user–movie relationships translates into a better user experience.