<a href="https://colab.research.google.com/github/Hamza-Ali0237/PyTorch-Transformer-From-Scratch/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Encoder-Decoder Tranformer From Scratch Using PyTorch

Implemeting The Encoder-Decoder Transformer Architecture From The 2017 Paper Published By Google ["*Attention Is All You Need* "](https://arxiv.org/abs/1706.03762)

In [12]:
!pip install -U datasets fsspec huggingface_hub

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.32.3-py3-none-any.whl.metadata (14 kB)
Collecting fsspec
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Downloading hf_xet-1.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.32.3-py3-none-any.whl (512 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.1/512.1

In [1]:
# Importing Libraries
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
import datasets
from datasets import load_dataset

In [2]:
class InputEmbeddings(nn.Module):
  def __init__(self, vocab_size, d_model):
    super().__init__()

    self.d_model = d_model
    self.vocab_size = vocab_size
    self.embeddings = nn.Embedding(
        vocab_size, d_model
    )

  def forward(self, x):
    return self.embeddings(x) * math.sqrt(self.d_model)

In [3]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_seq_len):
    super().__init__()

    pe = torch.zeros(max_seq_len, d_model)
    position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0)/d_model))

    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)

    self.register_buffer('pe', pe.unsqueeze(0))

  def forward(self, x):
    return x + self.pe[:, :x.size(1)]

In [4]:
class MultiHeadAttention(nn.Module):
  def __init__(self, d_model, num_heads):
    super().__init__()

    print(f"Initializing MultiHeadAttention with d_model={d_model} and num_heads={num_heads}")

    assert d_model % num_heads == 0, 'd_model must be divisible by num_heads.'

    self.num_heads = num_heads
    self.d_model = d_model
    self.head_dim = d_model // num_heads

    self.query_linear = nn.Linear(d_model, d_model, bias=False)
    self.key_linear = nn.Linear(d_model, d_model, bias=False)
    self.value_linear = nn.Linear(d_model, d_model, bias=False)

    self.output_linear = nn.Linear(d_model, d_model)

  def split_heads(self, x, batch_size):
    seq_len = x.size(1)
    x = x.reshape(batch_size, seq_len, self.num_heads, self.head_dim)

    return x.permute(0, 2, 1, 3)

  def compute_attention(self, query, key, value, mask=None):
    scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)

    if mask is not None:
      scores = scores.masked_fill(mask == 0, float('-inf'))

    attention_weights = F.softmax(scores, dim=-1)

    return torch.matmul(attention_weights, value)

  def combine_heads(self, x, batch_size):
    x = x.permute(0, 2, 1, 3).contiguous()
    return x.view(batch_size, -1, self.d_model)

  def forward(self, q, k, v, mask=None):
    batch_size = q.size(0)

    query = self.split_heads(self.query_linear(q), batch_size)
    key = self.split_heads(self.key_linear(k), batch_size)
    value = self.split_heads(self.value_linear(v), batch_size)

    attention_weights = self.compute_attention(query, key, value, mask)

    output = self.combine_heads(attention_weights, batch_size)

    return self.output_linear(output)

In [5]:
class FeedForwardSubLayer(nn.Module):
  def __init__(self, d_model, d_ff):
    super().__init__()
    self.fc1 = nn.Linear(d_model, d_ff)
    self.fc2 = nn.Linear(d_ff, d_model)
    self.relu = nn.ReLU()

  def forward(self, x):
    return self.fc2(self.relu(self.fc1(x)))

In [6]:
class EncoderLayer(nn.Module):
  def __init__(self, d_model, num_heads, d_ff, dropout):
    super().__init__()

    self.self_attn = MultiHeadAttention(d_model, num_heads)

    self.ff_sublayer = FeedForwardSubLayer(d_model, d_ff)

    self.norm1 = nn.LayerNorm(d_model)
    self.norm2 = nn.LayerNorm(d_model)

    self.dropout = nn.Dropout(dropout)

  def forward(self, x, src_mask):
    attn_output = self.self_attn(x, x, x, src_mask)

    x = self.norm1(x + self.dropout(attn_output))

    ff_output = self.ff_sublayer(x)

    x = self.norm2(x + self.dropout(ff_output))

    return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ff_sublayer = FeedForwardSubLayer(d_model, d_ff)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output, tgt_mask, cross_mask):
        # Self-attention
        self_attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(self_attn_output))

        # Cross-attention
        cross_attn_output = self.cross_attn(x, encoder_output, encoder_output, cross_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))

        # Feed-forward
        ff_output = self.ff_sublayer(x)
        x = self.norm3(x + self.dropout(ff_output))

        return x

In [7]:
class TransformerEncoder(nn.Module):
  def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
    super().__init__()

    self.embedding = InputEmbeddings(vocab_size, d_model)

    self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

    self.layers = nn.ModuleList([
        EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
    ])

  def forward(self, x, src_mask):
    x = self.embedding(x)

    x = self.positional_encoding(x)

    for layer in self.layers:
      x = layer(x, src_mask)

    return x

class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_length):
        super().__init__()

        self.embedding = InputEmbeddings(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.layers = nn.ModuleList([
            DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, encoder_output, tgt_mask, cross_mask):
        x = self.embedding(x)
        x = self.positional_encoding(x)

        for layer in self.layers:
            x = layer(x, encoder_output, tgt_mask, cross_mask)

        x = self.fc(x)

        return x

In [8]:
class ClassificationHead(nn.Module):
  def __init__(self, d_model, num_classes):
    super().__init__()
    self.fc = nn.Linear(d_model, num_classes)

  def forward(self, x):
    logits = self.fc(x)
    return F.log_softmax(logits, dim=-1)

In [9]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout):
        super().__init__()

        self.encoder = TransformerEncoder(vocab_size, d_model, num_heads, num_layers, d_ff, dropout, max_seq_len)
        self.decoder = TransformerDecoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_seq_len)

    def forward(self, src, src_mask, tgt, tgt_mask):
        encoder_output = self.encoder(src, src_mask)
        decoder_output = self.decoder(tgt, encoder_output, tgt_mask, src_mask)
        return decoder_output

In [10]:
# Load "WMT 2014 English-to-German" Dataset
dataset = load_dataset("wmt14", "de-en")

train_dataset = dataset['train']
test_dataset = dataset['test']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/280M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/474k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

In [11]:
print(train_dataset.features)

{'translation': Translation(languages=['de', 'en'], id=None)}


In [12]:
# Tokenize the data
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

def preprocess(batch):
    src_texts = [example["en"] for example in batch["translation"]]
    tgt_texts = [example["de"] for example in batch["translation"]]

    src = tokenizer(
        src_texts, padding="max_length",
        truncation=True, max_length=128, return_tensors="pt"
    )
    tgt = tokenizer(
        tgt_texts, padding="max_length",
        truncation=True, max_length=128, return_tensors="pt"
    )

    return {
        "src_input_ids": src["input_ids"].tolist(),
        "tgt_input_ids": tgt["input_ids"].tolist()
    }



train_data = train_dataset.map(preprocess, batched=True, batch_size=64)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/4508785 [00:00<?, ? examples/s]

In [13]:
print(train_data[0])

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}, 'src_input_ids': [101, 32070, 94118, 10108, 10105, 30066, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tgt_input_ids': [101, 23789, 24053, 21598, 26413, 10118, 21564, 10716, 34185, 10638, 10253, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [20]:
# Define collate_fn to pad sequences dynamically
def collate_fn(batch):
    src_batch = [torch.tensor(item["src_input_ids"]) for item in batch]
    tgt_batch = [torch.tensor(item["tgt_input_ids"]) for item in batch]

    # Pad sequences in each batch
    src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=tokenizer.pad_token_id)
    tgt_batch = torch.nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=tokenizer.pad_token_id)

    return src_batch, tgt_batch

train_dataloader = DataLoader(train_data, batch_size=8, collate_fn=collate_fn, shuffle=True)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
# Define Hyperparameters
vocab_size = tokenizer.vocab_size
d_model = 256
num_layers = 4
num_heads = 8
d_ff = 2048
max_seq_len = 128
dropout = 0.1


model = Transformer(vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_len, dropout)
model.to(device)

Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=4
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAttention with d_model=256 and num_heads=8
Initializing MultiHeadAtt

Transformer(
  (encoder): TransformerEncoder(
    (embedding): InputEmbeddings(
      (embeddings): Embedding(119547, 256)
    )
    (positional_encoding): PositionalEncoding()
    (layers): ModuleList(
      (0-7): 8 x EncoderLayer(
        (self_attn): MultiHeadAttention(
          (query_linear): Linear(in_features=256, out_features=256, bias=False)
          (key_linear): Linear(in_features=256, out_features=256, bias=False)
          (value_linear): Linear(in_features=256, out_features=256, bias=False)
          (output_linear): Linear(in_features=256, out_features=256, bias=True)
        )
        (ff_sublayer): FeedForwardSubLayer(
          (fc1): Linear(in_features=256, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=256, bias=True)
          (relu): ReLU()
        )
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, 

In [17]:
# Define loss function
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

In [22]:
# Define function to train the model
def train(model, dataloader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for src_batch, tgt_batch in dataloader:
            src_batch, tgt_batch = src_batch.to("cuda"), tgt_batch.to("cuda")

            # Create masks
            src_mask = (src_batch != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

            # Shift target for teacher forcing
            tgt_input = tgt_batch[:, :-1]
            tgt_output = tgt_batch[:, 1:]

            # Create tgt_mask for tgt_input
            tgt_mask = (tgt_input != tokenizer.pad_token_id).unsqueeze(1).unsqueeze(2)

            # Forward pass
            outputs = model(src_batch, src_mask, tgt_input, tgt_mask)
            loss = criterion(outputs.reshape(-1, vocab_size), tgt_output.reshape(-1))

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            torch.cuda.empty_cache()

            epoch_loss += loss.item()

        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss / len(dataloader):.4f}")

In [None]:
train(model, train_dataloader, criterion, optimizer, num_epochs=10)