## Scaled dot-product Attention Mechanism

In [34]:
# Implement Scaled dot-procuct attention

# Imports
import numpy as np

# Softmax
def softmax(x):
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

# Scaled dot product attention (reference [1])
# Q, K, V are numpy arrays with shape (batch_size, seq_len, d_k)
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.transpose(0, 2, 1)) / np.sqrt(d_k)
    weights = softmax(scores)
    output = np.matmul(weights, V)
    return output, weights


## Integrating our Scaled dot-product Attention Mechanism into the encoder of a Seq2Seq Encoder/Decoder Style model based on Bahdanau attention (see refernce[2])

In [35]:
# Imports
import torch
import torch.nn as nn
import numpy as np

class EncoderRNNWithAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(0.2)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.hidden_dim = hidden_dim

        # Linear layers to produce Q, K, V
        self.to_Q = nn.Linear(hidden_dim, hidden_dim)
        self.to_K = nn.Linear(hidden_dim, hidden_dim)
        self.to_V = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, src):
        # src: (batch_size, seq_len)
        embedded = self.embedding(src)   # (batch_size, seq_len, embedding_dim)
        embedded = self.dropout(embedded)   # apply dropout
        outputs, hidden = self.rnn(embedded)  # (batch, seq_len, hidden_dim)

        # Q, K, V from encoder hidden states
        Q = self.to_Q(outputs).detach().cpu().numpy()
        K = self.to_K(outputs).detach().cpu().numpy()
        V = self.to_V(outputs).detach().cpu().numpy()

        # Use our attention from earlier
        with torch.no_grad():
            context_np, _ = scaled_dot_product_attention(Q, K, V)
        context = torch.tensor(context_np, dtype=torch.float32).to(outputs.device)

        return context, hidden

class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(0.2)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, tgt, context, hidden):
        # encoders final state (1, batch, hidden_dim)
        init_hidden = hidden

        # embed all target tokens
        embedded = self.embedding(tgt)    # (batch, tgt_len, embedding_dim)
        embedded = self.dropout(embedded)   # apply dropout

        # decode using encoder state as init
        outputs, last_hidden = self.rnn(embedded, init_hidden)

        # map to vocabulary logits
        logits = self.fc_out(outputs)  # (batch, tgt_len, vocab_size)

        return logits, last_hidden


In [36]:
# Training loop for encoder-decoder model
def train(model_enc, model_dec, dataloader, optimizer, criterion, device):
    model_enc.train()
    model_dec.train()

    total_loss = 0

    for src, tgt in dataloader:
        # Move input and target to device
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()

        # Forward pass through encoder
        context, enc_hidden = model_enc(src)

        # Forward pass through decoder
        logits, _ = model_dec(tgt[:, :-1], context, enc_hidden)

        # Compute loss
        loss = criterion(logits.reshape(-1, logits.shape[-1]), tgt[:, 1:].reshape(-1))

        # Backprop and update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


In [37]:
# Load data (see reference[3])

# Imports
import pandas as pd

# Load CSV 
df = pd.read_csv("eng_-french.csv")
df = df.dropna().sample(n=50000, random_state=42)

# Rename columns for ease of use
df.columns = ["english", "french"]

# Create sentence pairs
sentence_pairs = list(zip(df["english"], df["french"]))

# Preview
print("Example pair:", sentence_pairs[0])


Example pair: ('Take a seat.', 'Prends place !')


In [None]:
# Imports
from collections import defaultdict

# Tokenizer
def tokenize(sentence):
    return sentence.lower().strip().split()

# Build vocab (see reference[4])
def build_vocab(sentences):
    vocab = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    index = 4
    for sent in sentences:
        for token in tokenize(sent):
            if token not in vocab:
                vocab[token] = index
                index += 1
    return vocab

# Encode a sentence as token IDs
def encode(sentence, vocab, max_len=100):
    tokens = ["<SOS>"] + tokenize(sentence) + ["<EOS>"]
    ids = [vocab.get(t, vocab["<UNK>"]) for t in tokens]
    ids = ids[:max_len] + [vocab["<PAD>"]] * (max_len - len(ids))
    return ids


In [39]:
# Split source and target sentences
src_sentences = [src for src, _ in sentence_pairs]
tgt_sentences = [tgt for _, tgt in sentence_pairs]

# Build vocabularies
source_vocab = build_vocab(src_sentences)
target_vocab = build_vocab(tgt_sentences)


In [40]:
# Imports
import torch
from torch.utils.data import Dataset

# Dataset for loading and encoding translation pairs
class TranslationDataset(Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab, max_len=100):
        # Store sentence pairs and vocabularies
        self.pairs = pairs
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        # Return total number of examples
        return len(self.pairs)

    def __getitem__(self, idx):
        # Retrieve source and target sentences
        src, tgt = self.pairs[idx]
        # Encode to fixed‐length token ID lists
        src_ids = encode(src, self.src_vocab, self.max_len)
        tgt_ids = encode(tgt, self.tgt_vocab, self.max_len)
        # Convert lists to long tensors
        return (
            torch.tensor(src_ids, dtype=torch.long),
            torch.tensor(tgt_ids, dtype=torch.long),
        )


In [None]:
# Imports
from torch.utils.data import DataLoader

# Create DataLoader for batching and shuffling
train_dataset = TranslationDataset(sentence_pairs, source_vocab, target_vocab)
train_loader = DataLoader(
    train_dataset,
    batch_size=16,  
    shuffle=True   
)


In [42]:
# Imports
import torch.nn as nn

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize encoder and decoder models on device
encoder = EncoderRNNWithAttention(len(source_vocab), 128, 256).to(device)
decoder = DecoderRNN(len(target_vocab), 128, 256).to(device)

# Loss function (ignore padding) and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=target_vocab["<PAD>"])
optimizer = torch.optim.Adam(
    list(encoder.parameters()) + list(decoder.parameters()),
    lr=0.001  # learning rate
)


In [43]:
NUM_EPOCHS = 5

# Training loop
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train(encoder, decoder, train_loader, optimizer, criterion, device)
    # Print loss 
    if epoch % 1 == 0:
        print(f"Epoch {epoch}: Loss = {loss:.4f}")


Epoch 1: Loss = 4.7643
Epoch 2: Loss = 3.5288
Epoch 3: Loss = 2.9924
Epoch 4: Loss = 2.6298
Epoch 5: Loss = 2.3622


In [56]:
# Imports for BLEU evaluation
from nltk.translate.bleu_score import corpus_bleu

def greedy_translate(src_sentence):
    encoder.eval()
    decoder.eval()

    # Prepare source tensor
    src_ids = encode(src_sentence, source_vocab)
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)

    with torch.no_grad():
        # Get context and initial hidden state from encoder
        context, enc_hidden = encoder(src_tensor)
        hidden = enc_hidden
        tgt_ids = [target_vocab["<SOS>"]]

        # Decode 
        for _ in range(40):
            tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)
            # Unpack logits and hidden state
            logits, hidden = decoder(tgt_tensor, context, hidden)
            next_token = logits[0, -1].argmax().item()
            if next_token == target_vocab["<EOS>"]:
                break
            tgt_ids.append(next_token)

    return tgt_ids[1:]

# Reverse vocab for decoding
inv_tgt_vocab = {i: w for w, i in target_vocab.items()}

# BLEU eval over 1000 examples
references = []
hypotheses = []

for src, tgt in sentence_pairs[:1000]:
    pred_ids = greedy_translate(src)
    ref_ids = encode(tgt, target_vocab)

    # Remove PAD, SOS, EOS tokens
    clean_ref = [i for i in ref_ids if i not in [0, 1, 2]]
    clean_pred = [i for i in pred_ids if i not in [0, 1, 2]]

    references.append([clean_ref])
    hypotheses.append(clean_pred)

bleu_score = corpus_bleu(references, hypotheses)
print(f"BLEU Score: {bleu_score:.4f}")


BLEU Score: 0.0397


# Evaluation

As you can see it did not have that great of a blue score. I want to attribute this to the fact that I do not have the attention mechanism in the decoder.
Maybe there is somthing else that I am missing. 

## Simplified Transformer Implementation (see reference[5])
This was difficult task so I heavily refernced [5] 

In [None]:
# Imports
import torch
import torch.nn as nn
import math

# Positional encoding using sinusoidal functions
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
        div = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)  # sine on even dimensions
        pe[:, 1::2] = torch.cos(pos * div)  # cosine on odd dimensions
        self.register_buffer('pe', pe)  # store encoding as buffer 

    def forward(self, x):
        # x: (batch, seq_len, d_model)
        seq_len = x.size(1)
        return x + self.pe[:seq_len]  # add positional encoding to input


In [None]:
class MultiHeadAttention(nn.Module): # https://docs.pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
    def __init__(self, d_model=64, n_heads=2):
        super().__init__()
        assert d_model % n_heads == 0  # make sure heads divide evenly
        self.d_k = d_model // n_heads  # head dimension
        self.n_heads = n_heads

        self.W_Q = nn.Linear(d_model, d_model)  # linear projection for queries
        self.W_K = nn.Linear(d_model, d_model)  # linear projection for keys
        self.W_V = nn.Linear(d_model, d_model)  # linear projection for values
        self.W_O = nn.Linear(d_model, d_model)  # output projection

    def forward(self, Q, K, V, mask=None):
        batch = Q.size(0)

        # linear projection and head split
        def split(x, W):
            x = W(x)  # (batch, seq, d_model)
            x = x.view(batch, -1, self.n_heads, self.d_k).transpose(1, 2)  # (batch, heads, seq, d_k)
            return x

        Qh, Kh, Vh = split(Q, self.W_Q), split(K, self.W_K), split(V, self.W_V)

        # compute scaled dot-product attention
        scores = (Qh @ Kh.transpose(-2, -1)) / math.sqrt(self.d_k)  # (batch, heads, seq, seq)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)  # apply mask
        A = torch.softmax(scores, dim=-1)  # attention weights

        # combine heads
        out = (A @ Vh).transpose(1, 2).contiguous().view(batch, -1, self.n_heads * self.d_k)
        return self.W_O(out), A  # final output and attention weights


In [None]:
class FeedForward(nn.Module):
    def __init__(self, d_model=64, d_ff=128, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, d_ff),  # expand hidden size
            nn.ReLU(),  # non-linearity
            nn.Linear(d_ff, d_model),  # project back to model dim
            nn.Dropout(dropout)  # regularization
        )

    def forward(self, x):
        return self.net(x)  # apply feedforward network


In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model=64, n_heads=2, d_ff=128, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)  # multi-head self attention
        self.ff = FeedForward(d_model, d_ff, dropout)  # position-wise feedforward
        self.norm1 = nn.LayerNorm(d_model)  # layer norm after attention
        self.norm2 = nn.LayerNorm(d_model)  # layer norm after FFN
        self.dropout = nn.Dropout(dropout)  # dropout for regularization

    def forward(self, x, mask=None):
        # apply self-attention with residual and norm
        attn_out, _ = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_out))
        # apply feedforward with residual and norm
        ff_out = self.ff(x)
        return self.norm2(x + self.dropout(ff_out))


In [None]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model=64, n_heads=2, d_ff=128, dropout=0.1):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_heads)  # masked self-attention
        self.cross_attn = MultiHeadAttention(d_model, n_heads)  # encoder-decoder attention
        self.ff = FeedForward(d_model, d_ff, dropout)  # position-wise feedforward
        self.norm1 = nn.LayerNorm(d_model)  # norm after self-attn
        self.norm2 = nn.LayerNorm(d_model)  # norm after cross-attn
        self.norm3 = nn.LayerNorm(d_model)  # norm after FFN
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask=None, tgt_mask=None):
        sa_out, _ = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(sa_out))
        # apply encoder-decoder cross-attn with residual and norm
        ca_out, _ = self.cross_attn(x, enc_out, enc_out, src_mask)
        x = self.norm2(x + self.dropout(ca_out))
        # apply feedforward with residual and norm
        ff_out = self.ff(x)
        return self.norm3(x + self.dropout(ff_out))


In [None]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=64, n_heads=2,
                 d_ff=128, num_enc=2, num_dec=2, max_len=100, dropout=0.1):
        super().__init__()
        self.src_tok = nn.Embedding(src_vocab_size, d_model)  # source token embedding
        self.tgt_tok = nn.Embedding(tgt_vocab_size, d_model)  # target token embedding
        self.pos_enc = PositionalEncoding(d_model, max_len)  # sinusoidal position encoding

        self.enc_layers = nn.ModuleList([EncoderLayer(d_model,n_heads,d_ff,dropout)
                                         for _ in range(num_enc)])  # encoder stack
        self.dec_layers = nn.ModuleList([DecoderLayer(d_model,n_heads,d_ff,dropout)
                                         for _ in range(num_dec)])  # decoder stack
        self.out_proj = nn.Linear(d_model, tgt_vocab_size)  # final projection layer

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # encode source
        enc = self.pos_enc(self.src_tok(src))
        for layer in self.enc_layers:
            enc = layer(enc, src_mask)
        # decode target
        dec = self.pos_enc(self.tgt_tok(tgt))
        for layer in self.dec_layers:
            dec = layer(dec, enc, src_mask, tgt_mask)
        # output vocab distribution
        return self.out_proj(dec)


In [None]:
# Imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from nltk.translate.bleu_score import corpus_bleu

# Dataset definition
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, pairs, src_vocab, tgt_vocab, max_len=50):
        self.pairs = pairs
        self.sv = src_vocab
        self.tv = tgt_vocab
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, i):
        src, tgt = self.pairs[i]
        src_ids = encode(src, self.sv, self.max_len)
        tgt_ids = encode(tgt, self.tv, self.max_len)
        return torch.tensor(src_ids), torch.tensor(tgt_ids)

# Collate function
def collate(batch):
    src_batch, tgt_batch = zip(*batch)
    src = torch.stack(src_batch)
    tgt = torch.stack(tgt_batch)
    return src, tgt

# Create dataloader
dataset = SimpleDataset(sentence_pairs, source_vocab, target_vocab)
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate)

# Create masks
def make_src_mask(src):
    return (src != source_vocab['<PAD>']).unsqueeze(1).unsqueeze(2)

def make_tgt_mask(tgt):
    seq_len = tgt.size(1)
    pad_mask = (tgt != target_vocab['<PAD>']).unsqueeze(1).unsqueeze(3)
    subseq = torch.triu(torch.ones((seq_len, seq_len)), diagonal=1).bool()
    return pad_mask & ~subseq.to(pad_mask.device)

# Initialize model, optimizer, loss
model = Transformer(len(source_vocab), len(target_vocab), d_model=64, n_heads=2,
                    d_ff=128, num_enc=2, num_dec=2, max_len=50, dropout=0.1).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
crit = nn.CrossEntropyLoss(ignore_index=target_vocab['<PAD>'])

# Training loop
NUM_EPOCHS = 10
for epoch in range(1, NUM_EPOCHS+1):
    model.train()
    total_loss = 0
    for src, tgt in loader:
        src, tgt = src.to(device), tgt.to(device)
        opt.zero_grad()
        src_mask = make_src_mask(src)
        tgt_mask = make_tgt_mask(tgt[:,:-1])
        out = model(src, tgt[:,:-1], src_mask, tgt_mask)
        loss = crit(out.view(-1, out.size(-1)), tgt[:,1:].reshape(-1))
        loss.backward()
        opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss = {total_loss/len(loader):.4f}")


Epoch 1, Loss = 4.9036
Epoch 2, Loss = 3.4696
Epoch 3, Loss = 2.8391
Epoch 4, Loss = 2.4010
Epoch 5, Loss = 2.0820
Epoch 6, Loss = 1.8397
Epoch 7, Loss = 1.6505
Epoch 8, Loss = 1.5103
Epoch 9, Loss = 1.4020
Epoch 10, Loss = 1.3156


In [None]:
# set model to evaluation mode
model.eval()

refs, hyps = [], []

# number of evaluation pairs
MAX_EVAL_PAIRS = 1000
processed = 0

with torch.no_grad():
    for src, tgt in loader:
        if processed >= MAX_EVAL_PAIRS:
            break

        # clip batch if it would exceed limit
        batch_size = src.size(0)
        if processed + batch_size > MAX_EVAL_PAIRS:
            batch_size = MAX_EVAL_PAIRS - processed
            src = src[:batch_size]
            tgt = tgt[:batch_size]

        src = src.to(device)
        src_mask = make_src_mask(src)

        # start sequence with <SOS> token
        ys = torch.full((batch_size, 1), target_vocab['<SOS>'], dtype=torch.long).to(device)

        # autoregressive decoding
        for _ in range(49):
            tgt_mask = make_tgt_mask(ys)
            out = model(src, ys, src_mask, tgt_mask)
            next_tok = out[:, -1, :].argmax(dim=-1, keepdim=True)
            ys = torch.cat([ys, next_tok], dim=1)

        # decode predictions and references
        for i in range(batch_size):
            pred = ys[i, 1:].cpu().tolist()
            ref  = tgt[i, 1:].cpu().tolist()
            pred = [t for t in pred if t not in [0, 1, 2]]
            ref  = [t for t in ref  if t not in [0, 1, 2]]
            refs.append([ref])
            hyps.append(pred)

        processed += batch_size

# BLEU score
bleu = corpus_bleu(refs, hyps)
print(f"Transformer BLEU Score: {bleu:.4f}")


Transformer BLEU Score: 0.1779


### BLEU Score Comparison

- RNN with Scaled Dot-Product Attention: BLEU = 0.0397
- Simplified Transformer: BLEU = 0.1779

### Explanation of Differences in Performance

- The transformer handles longer sequences better because of self-attention unlike the RNN which processes sequentially.
- Multi-head attention in the transformer allows it to capture multiple types of relationships in the data.
- The transformer includes attention in both the encoder and decoder, while the RNN used attention only in the encoder.
- Positional encoding in the transformer helps preserve word order without recurrence.

### Runtime Differences

- The transformer trains faster because it processes sequences in parallel.


## References

1. Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, Ł., & Polosukhin, I. (2017). *Attention Is All You Need*.  
   Retrieved from https://arxiv.org/pdf/1706.03762

2. PyTorch. (n.d.). *Sequence to Sequence Translation Tutorial*.  
   Retrieved from https://docs.pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

3. Devicharith. (2021). *Language Translation - English to French* [Dataset].  
   Retrieved from https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench

4. Battu, A. (2022). *Understanding How a Seq2Seq Model Works for Machine Translation — Comprehensive Explanation for Each Component*.  
   Retrieved from https://medium.com/@abhinavbattu88/understanding-how-a-seq2seq-model-works-for-machine-translation-comprehensive-explanation-for-each-d1d872d67e9a

5. Bird of Paradise. (2023). *Transformer from Scratch Tutorial* [Notebook].  
   Retrieved from https://huggingface.co/datasets/bird-of-paradise/transformer-from-scratch-tutorial/blob/main/Transformer_Implementation_Tutorial.ipynb
