# Transformers



In [2]:
#Example
import torch.nn as nn
model = nn.Transformer(d_model=512,
                       nhead=8,
                       num_encoder_layers=6,
                       num_decoder_layers=6)
print(model)



Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, o

# Text Summariser

**Note:** This is an **extractive summarizer** that uses transformer components to score and select 
existing sentences from the input text based on heuristics. It does NOT perform true learned 
summarization like BART or T5 (which can paraphrase, combine, or generate new sentences). 
This demonstrates transformer architecture without requiring training on large datasets.

In [None]:
"""
Transformer Model for Text Summarization


This script implements a Transformer architecture using PyTorch, demonstrating
the actual components used in models like BERT, GPT, and T5.

IMPORTANT: This is an EXTRACTIVE summarizer - it selects and ranks existing sentences
from the input text using transformer-based representations. It does NOT generate new 
text or paraphrase like modern seq2seq models (BART, T5, GPT). For true learned 
summarization, you would need a trained encoder-decoder model with a large dataset.

What is a Transformer? 

A transformer is a neural network architecture that uses:
1. Self-Attention: Learns which words relate to each other
2. Multi-Head Attention: Multiple attention mechanisms working in parallel
3. Positional Encoding: Encodes the position of words in the sequence
4. Feed-Forward Networks: Deep learning layers that process information
5. Layer Normalization: Keeps training stable
6. Residual Connections: Helps gradient flow during training

Unlike rule-based systems, transformers LEARN these patterns from data through training.

Architecture Overview:
INPUT TEXT → TOKENIZATION → EMBEDDINGS → ENCODER → DECODER → OUTPUT SUMMARY

Components:
- Encoder: Processes the input text, builds understanding
- Decoder: Generates the summary based on encoder's understanding
- Attention: Learns what to focus on at each step

Note: This is a minimal transformer for educational purposes. Production models
like GPT-4 or BERT have billions of parameters and are trained on massive datasets.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import re
from typing import List, Dict
import math


class PositionalEncoding(nn.Module):
    """
    Positional Encoding: Adds information about word positions.
    
    Why do we need this?
    - Transformers process all words simultaneously (unlike RNNs which go one by one)
    - Without position info, "dog bites man" = "man bites dog" to the model
    - We add a unique position "signature" to each word's embedding
    
    How it works:
    - Uses sine and cosine functions at different frequencies
    - Creates a unique pattern for each position
    - This pattern is added to the word embeddings
    """
    
    def __init__(self, d_model, max_len=5000):
        """
        Args:
            d_model: Dimension of embeddings (e.g., 512)
            max_len: Maximum sequence length we'll handle
        """
        super(PositionalEncoding, self).__init__()
        
        # Create a matrix to hold positional encodings
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        
        # Create the division term for the sine/cosine functions
        # This creates different frequencies for different dimensions
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        # Apply sine to even indices, cosine to odd indices
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        # Add batch dimension
        pe = pe.unsqueeze(0)
        
        # Register as buffer (not a parameter, but part of model state)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Add positional encoding to input embeddings.
        
        Args:
            x: Input tensor of shape (batch_size, seq_len, d_model)
        """
        # Add positional encoding to input
        x = x + self.pe[:, :x.size(1), :]
        return x


class MultiHeadAttention(nn.Module):
    """
    Multi-Head Self-Attention: The core of the transformer!
    
    What is Attention?
    - A mechanism to focus on relevant parts of the input
    - Learns relationships between words (e.g., "it" refers to "car")
    - Each word can attend to (look at) every other word
    
    Why Multi-Head?
    - One attention head might focus on syntax (grammar)
    - Another might focus on semantics (meaning)
    - Another might track long-range dependencies
    - Multiple heads capture different types of relationships
    
    How it works:
    1. For each word, create Query (what I'm looking for), 
       Key (what I have), and Value (what information I carry)
    2. Compare Queries with Keys to find relevance (attention scores)
    3. Use attention scores to weight the Values
    4. This tells each word what to pay attention to
    """
    
    def __init__(self, d_model, num_heads):
        """
        Args:
            d_model: Dimension of embeddings (must be divisible by num_heads)
            num_heads: Number of attention heads
        """
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # Dimension per head
        
        # Linear layers to create Queries, Keys, Values
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        
        # Output projection
        self.W_o = nn.Linear(d_model, d_model)
    
    def split_heads(self, x, batch_size):
        """
        Split embeddings into multiple heads.
        
        Changes shape from (batch, seq_len, d_model) 
        to (batch, num_heads, seq_len, d_k)
        """
        x = x.view(batch_size, -1, self.num_heads, self.d_k)
        return x.transpose(1, 2)
    
    def forward(self, query, key, value, mask=None):
        """
        Compute multi-head attention.
        
        Args:
            query: What we're looking for
            key: What we're comparing against
            value: The information to extract
            mask: Optional mask to prevent attention to certain positions
        """
        batch_size = query.size(0)
        
        # 1. Linear projections to get Q, K, V
        Q = self.W_q(query)
        K = self.W_k(key)
        V = self.W_v(value)
        
        # 2. Split into multiple heads
        Q = self.split_heads(Q, batch_size)
        K = self.split_heads(K, batch_size)
        V = self.split_heads(V, batch_size)
        
        # 3. Compute attention scores
        # Scores = Q * K^T / sqrt(d_k)
        # This tells us how much each word should attend to every other word
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        # 4. Apply mask if provided (e.g., for padding or future tokens)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        # 5. Apply softmax to get attention weights (probabilities)
        attention_weights = F.softmax(scores, dim=-1)
        
        # 6. Apply attention to values
        # This creates the weighted sum based on attention
        output = torch.matmul(attention_weights, V)
        
        # 7. Concatenate heads back together
        output = output.transpose(1, 2).contiguous()
        output = output.view(batch_size, -1, self.d_model)
        
        # 8. Final linear projection
        output = self.W_o(output)
        
        return output


class FeedForward(nn.Module):
    """
    Feed-Forward Network: Processes each position independently.
    
    What does it do?
    - After attention tells us what to focus on, FFN processes that information
    - Two linear layers with ReLU activation in between
    - Applied to each position (word) separately
    - Adds non-linearity and complexity to the model
    
    Think of it as:
    - Attention = "What should I look at?"
    - Feed-Forward = "What should I do with what I'm looking at?"
    """
    
    def __init__(self, d_model, d_ff, dropout=0.1):
        """
        Args:
            d_model: Input/output dimension
            d_ff: Hidden layer dimension (usually 4x d_model)
            dropout: Dropout rate for regularization
        """
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        Apply feed-forward network.
        
        x -> Linear -> ReLU -> Dropout -> Linear -> output
        """
        x = self.linear1(x)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x


class EncoderLayer(nn.Module):
    """
    Encoder Layer: One layer of the transformer encoder.
    
    Structure:
    1. Multi-Head Self-Attention
    2. Add & Normalize (residual connection + layer norm)
    3. Feed-Forward Network
    4. Add & Normalize
    
    Why residual connections?
    - Helps gradients flow during training (prevents vanishing gradients)
    - Allows model to learn identity function if needed
    - Makes training deeper networks possible
    
    Why layer normalization?
    - Stabilizes training
    - Makes optimization easier
    - Reduces internal covariate shift
    """
    
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        """
        Process input through one encoder layer.
        
        Args:
            x: Input tensor
            mask: Optional attention mask
        """
        # Self-attention with residual connection
        attn_output = self.self_attn(x, x, x, mask)
        x = x + self.dropout1(attn_output)  # Residual connection
        x = self.norm1(x)  # Layer normalization
        
        # Feed-forward with residual connection
        ff_output = self.feed_forward(x)
        x = x + self.dropout2(ff_output)  # Residual connection
        x = self.norm2(x)  # Layer normalization
        
        return x


class TransformerEncoder(nn.Module):
    """
    Transformer Encoder: Stack of encoder layers.
    
    The encoder's job:
    - Read and understand the input text
    - Build rich representations of each word in context
    - Output: contextualized embeddings that capture meaning
    """
    
    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff, max_len, dropout=0.1):
        """
        Args:
            vocab_size: Size of vocabulary
            d_model: Embedding dimension
            num_heads: Number of attention heads
            num_layers: Number of encoder layers to stack
            d_ff: Feed-forward hidden dimension
            max_len: Maximum sequence length
            dropout: Dropout rate
        """
        super(TransformerEncoder, self).__init__()
        
        # Convert token IDs to dense vectors
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        
        # Stack of encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask=None):
        """
        Encode input sequence.
        
        Args:
            x: Input token IDs (batch_size, seq_len)
            mask: Optional attention mask
        """
        # Convert tokens to embeddings
        x = self.embedding(x) * math.sqrt(self.embedding.embedding_dim)
        
        # Add positional information
        x = self.pos_encoding(x)
        x = self.dropout(x)
        
        # Pass through all encoder layers
        for layer in self.layers:
            x = layer(x, mask)
        
        return x


class SimpleVocab:
    """
    Simple vocabulary for tokenization.
    
    In real transformers:
    - Use sophisticated tokenizers (BPE, WordPiece, SentencePiece)
    - Handle subwords (e.g., "unhappiness" → "un", "happiness")
    - Have vocabularies of 30k-50k tokens
    
    This is a simplified version for demonstration.
    """
    
    def __init__(self):
        self.word2idx = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
        self.idx2word = {0: '<PAD>', 1: '<SOS>', 2: '<EOS>', 3: '<UNK>'}
        self.next_idx = 4
    
    def add_word(self, word):
        """Add a word to vocabulary."""
        if word not in self.word2idx:
            self.word2idx[word] = self.next_idx
            self.idx2word[self.next_idx] = word
            self.next_idx += 1
    
    def encode(self, text):
        """Convert text to token IDs."""
        words = text.lower().split()
        return [self.word2idx.get(word, self.word2idx['<UNK>']) for word in words]
    
    def decode(self, indices):
        """Convert token IDs back to text."""
        return ' '.join([self.idx2word.get(idx, '<UNK>') for idx in indices])


class TransformerSummarizer:
    """
    Main summarizer class that ties everything together.
    
    This implements extractive summarization using a real transformer encoder.
    The encoder learns to understand the text, then we select important sentences.
    """
    
    def __init__(self, d_model=128, num_heads=4, num_layers=2, d_ff=512):
        """
        Initialize the transformer summarizer.
        
        Args:
            d_model: Embedding dimension (smaller than production for demo)
            num_heads: Number of attention heads
            num_layers: Number of transformer layers
            d_ff: Feed-forward hidden dimension
        """
        self.vocab = SimpleVocab()
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.d_ff = d_ff
        self.encoder = None
    
    def build_vocab(self, texts):
        """Build vocabulary from training texts."""
        for text in texts:
            words = text.lower().split()
            for word in words:
                self.vocab.add_word(word)
    
    def initialize_model(self):
        """Initialize the transformer encoder."""
        self.encoder = TransformerEncoder(
            vocab_size=len(self.vocab.word2idx),
            d_model=self.d_model,
            num_heads=self.num_heads,
            num_layers=self.num_layers,
            d_ff=self.d_ff,
            max_len=512,
            dropout=0.1
        )
    
    def extract_sentences(self, text):
        """Extract sentences from text."""
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]
    
    def summarize(self, text, num_sentences=3):
        """
        Summarize text using the transformer encoder.
        
        Note: Since we're not training this model, it won't produce good summaries.
        This is to demonstrate the architecture. For real summarization, you'd need:
        1. A large dataset of (document, summary) pairs
        2. Training loop with loss function and optimizer
        3. Many hours/days of training on GPUs
        
        Args:
            text: Input text to summarize
            num_sentences: Number of sentences to extract
        """
        if self.encoder is None:
            self.build_vocab([text])
            self.initialize_model()
        
        sentences = self.extract_sentences(text)
        
        if len(sentences) <= num_sentences:
            return text
        
        # Encode the full text
        token_ids = self.vocab.encode(text)
        input_tensor = torch.tensor([token_ids])
        
        # Get encoder outputs (contextualized embeddings)
        with torch.no_grad():
            encoder_output = self.encoder(input_tensor)
        
        # Score sentences based on their representation strength
        # In a trained model, this would be learned. Here we use a heuristic.
        sentence_scores = []
        for idx, sentence in enumerate(sentences):
            sent_tokens = self.vocab.encode(sentence)
            if not sent_tokens:
                continue
            
            # Get embeddings for this sentence's tokens
            # Use mean pooling as a simple aggregation
            sent_embeddings = encoder_output[0, :len(sent_tokens), :]
            score = torch.norm(sent_embeddings.mean(dim=0)).item()
            
            sentence_scores.append((sentence, score, idx))
        
        # Select top sentences
        top_sentences = sorted(sentence_scores, key=lambda x: x[1], reverse=True)[:num_sentences]
        top_sentences = sorted(top_sentences, key=lambda x: x[2])
        
        summary = '. '.join(sent[0] for sent in top_sentences) + '.'
        return summary



if __name__ == "__main__":
    """
    Demonstration of the real transformer architecture.
    """
    
    
    print("TRANSFORMER ARCHITECTURE DEMO")
    print("\nThis implements actual transformer components:")
    print("- Multi-Head Self-Attention")
    print("- Positional Encoding")
    print("- Feed-Forward Networks")
    print("- Layer Normalization")
    print("- Residual Connections")
    
    
    # Sample text
    sample_text = """
    Artificial intelligence has revolutionized many industries in recent years. 
    Machine learning algorithms can now process vast amounts of data to identify 
    patterns that humans might miss. The transformer architecture became a 
    breakthrough in natural language processing. It uses self-attention mechanisms 
    to weigh the importance of different words in a sequence. This allows the model 
    to capture long-range dependencies in text. Companies are now using transformers 
    for translation and summarization. The technology continues to evolve rapidly 
    with new models being released frequently.
    """
    
    print("\nINPUT TEXT:")
    print(sample_text.strip())
    
    # Create summarizer
    summarizer = TransformerSummarizer(
        d_model=128,      # Embedding dimension
        num_heads=4,      # Number of attention heads
        num_layers=2,     # Number of encoder layers
        d_ff=512          # Feed-forward dimension
    )
    
    # Generate summary
    summary = summarizer.summarize(sample_text, num_sentences=3)
    
    print("SUMMARY:")
    print(summary)
    


TRANSFORMER ARCHITECTURE DEMO

This implements actual transformer components:
- Multi-Head Self-Attention
- Positional Encoding
- Feed-Forward Networks
- Layer Normalization
- Residual Connections

INPUT TEXT:
Artificial intelligence has revolutionized many industries in recent years. 
    Machine learning algorithms can now process vast amounts of data to identify 
    patterns that humans might miss. The transformer architecture became a 
    breakthrough in natural language processing. It uses self-attention mechanisms 
    to weigh the importance of different words in a sequence. This allows the model 
    to capture long-range dependencies in text. Companies are now using transformers 
    for translation and summarization. The technology continues to evolve rapidly 
    with new models being released frequently.
SUMMARY:
Artificial intelligence has revolutionized many industries in recent years. The transformer architecture became a 
    breakthrough in natural language processin

In [3]:
def interactive_mode(loop=False):
    """
    Interactive mode
    """
    print("INTERACTIVE MODE - TRY YOUR OWN TEXT")
    while True:
        if loop:
            print("\nOptions:")
            print("1. Enter your own text to summarize")
            print("2. Exit")
            
            choice = input("\nEnter your choice (1 or 2): ").strip()
            
            if choice == "2":
                print("\nGoodbye!")
                break
            elif choice != "1":
                print("\nInvalid choice. Please enter 1 or 2.")
                continue
        
        print("\nEnter your text (press Enter twice when done):")
        
        lines = []
        empty_count = 0
        while True:
            line = input()
            if line == "":
                empty_count += 1
                if empty_count >= 2:
                    break
            else:
                empty_count = 0
                lines.append(line)
        
        custom_text = " ".join(lines).strip()
        
        if custom_text:
            
            # Create a new summarizer for the custom text
            custom_summarizer = TransformerSummarizer(
                d_model=128,
                num_heads=4,
                num_layers=2,
                d_ff=512
            )
            
            # Ask how many sentences they want
            try:
                num_sent = int(input("\nHow many sentences in the summary? (default 3): ").strip() or "3")
            except:
                num_sent = 3
            
            custom_summary = custom_summarizer.summarize(custom_text, num_sentences=num_sent)
            
            print("YOUR SUMMARY:")
            print(custom_summary)
            
            # Statistics
            original_sentences = len(custom_summarizer.extract_sentences(custom_text))
            summary_sentences = len(custom_summarizer.extract_sentences(custom_summary))
            
            print(f"Original: {original_sentences} sentences")
            print(f"Summary: {summary_sentences} sentences")
            
            # Exit after one summarization if loop=False
            if not loop:
                print("\nDone!")
                break
        else:
            print("\nNo text entered. Please try again.")
            if not loop:
                break

In [16]:
interactive_mode()

INTERACTIVE MODE - TRY YOUR OWN TEXT

Enter your text (press Enter twice when done):
YOUR SUMMARY:
It's a vicious circle. The more useful our phones become, the more we use them. It's true for everyday tasks that are less high-stakes, too.
Original: 19 sentences
Summary: 3 sentences

Done!


## Mini Transformer
This exmaple trains a simple sentiment classifier on 10 movie reviews

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import re
from sklearn.model_selection import train_test_split

In [5]:
data = {
    "review": [
        "I loved this movie it was fantastic",
        "Absolutely terrible film I hated it",
        "This was a wonderful experience",
        "The plot was boring and predictable",
        "Amazing direction and great acting",
        "Worst movie ever a total waste of time",
        "Enjoyed every bit of this movie",
        "The movie was awful and disappointing",
        "One of the best films I have seen",
        "Poorly written and badly acted"
    ],
    "label": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = Positive, 0 = Negative
}

# We have text reviews paired with labels
# Each review needs to be converted to numbers (computers can't understand words directly)
# We'll create a "vocabulary" - a dictionary mapping words to numbers

In [6]:
import pandas as pd
df = pd.DataFrame(data)

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text) # Remove punctuation
    return text.strip()

df["review"] = df["review"].apply(clean_text)

Why clean text?

- Consistency: "Good", "good", and "GOOD" should be the same word
- Simplicity: Remove unnecessary characters
- Better learning: Model focuses on meaningful patterns

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
words = set(" ".join(train_df["review"]).split())
vocab = {"<PAD>": 0, "<UNK>": 1} #Special tokens
for i, w in enumerate(sorted(words)):
    vocab[w] = i + 2

#Special tokens:
# <PAD>: Padding token (fills empty spaces in shorter sentences)
# <UNK>: Unknown token (for words not seen during training)

vocab_size = len(vocab)
max_len = 8  # short reviews --> fixed length 8
print(f"Vocab size: {vocab_size}")

# Example Vocab:
#     "<PAD>": 0,
#     "<UNK>": 1,
#     "amazing": 2,
#     "awful": 3,
#     "movie": 4,
#     ...

Vocab size: 36


In [9]:
def text_to_indices(text, vocab, max_len):
  '''
    Input: "I loved this movie"
    Output: [45, 23, 67, 89, 0, 0, 0, 0]  # Padded to length 8
  '''
  tokens = [vocab.get(w, vocab["<UNK>"]) for w in text.split()]
  tokens = tokens[:max_len]
  tokens += [vocab["<PAD>"]] * (max_len - len(tokens))
  return tokens

X_train = torch.tensor([text_to_indices(t, vocab, max_len) for t in train_df["review"]])
y_train = torch.tensor(train_df["label"].values, dtype=torch.float32)
X_test = torch.tensor([text_to_indices(t, vocab, max_len) for t in test_df["review"]])
y_test = torch.tensor(test_df["label"].values, dtype=torch.float32)

In [10]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim):
      super().__init__()
      self.embed_dim = embed_dim

    def forward(self, x):
      # Add position information using sine and cosine functions
      batch, seq_len, d_model = x.size()
      pos = torch.arange(seq_len).unsqueeze(1)
      div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
      pe = torch.zeros(seq_len, d_model)
      pe[:, 0::2] = torch.sin(pos * div_term)
      pe[:, 1::2] = torch.cos(pos * div_term)
      return x + pe.unsqueeze(0)


Why do we need this?
Attention mechanisms don't inherently understand word order. These two sentences would look identical without positional encoding:

- "The cat chased the dog"
- "The dog chased the cat"

Positional encoding adds unique "position fingerprints" to each word using mathematical functions (sine/cosine waves at different frequencies).

In [11]:
class MiniTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, num_heads=2):
      super().__init__()
      self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
      self.pos_encoding = PositionalEncoding(embed_dim)
      self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
      self.ff = nn.Sequential(
          nn.Linear(embed_dim, embed_dim * 2),
          nn.ReLU(),
          nn.Linear(embed_dim * 2, embed_dim)
      )
      self.fc = nn.Linear(embed_dim, 1)
      self.sigmoid = nn.Sigmoid()

    def forward(self, x):
      mask = (x == 0)
      x = self.embedding(x)
      x = self.pos_encoding(x)
      attn_out, _ = self.attn(x, x, x, key_padding_mask=mask)
      x = x + attn_out  # residual connection
      x = x + self.ff(x)
      x = x.mean(dim=1)  # average pooling
      return self.sigmoid(self.fc(x)).squeeze()

Input Numbers -> Embeddings -> + Positional Encoding -> Self-Attention
-> Feed-Forward -> Average Pooling -> Classification -> Output

Each component's role:

1. Embedding Layer:
  - Converts word numbers into dense vectors

  - Input: [4, 23, 67] (word IDs)
Output: [[0.2, -0.5, ...], [0.1, 0.3, ...], ...] (vectors)


2. Self-Attention: Lets words interact

- "not good" → attention helps "good" consider "not"
- Creates context-aware representations


3. Feed-Forward Network: Processes attended information

- Simple neural network:Linear -> ReLU -> Linear
- Adds non-linear transformations


4. Residual Connections: x = x + attention_output

- Helps training by preserving original information
- Prevents "vanishing gradient" problem


5. Average Pooling: Combines all word vectors into one

- Reduces sequence to single representation
- Used for classification tasks

In [12]:
device = torch.device("cpu")
model = MiniTransformer(vocab_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    model.train()
    optimizer.zero_grad()
    preds = model(X_train)
    loss = criterion(preds, y_train)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1:02d} | Loss: {loss.item():.4f}")

Epoch 05 | Loss: 0.4433
Epoch 10 | Loss: 0.0157
Epoch 15 | Loss: 0.0000
Epoch 20 | Loss: 0.0000


**Training Process:**

1. Forward Pass: Input flows through model → predictions
2. Loss Calculation: Compare predictions to true labels
3. Backward Pass: Calculate how to adjust weights (gradients)
4. Weight Update: Optimizer adjusts model parameters
5. Repeat: Do this for many epochs (full passes through data)

**What's an Epoch?**

- One complete pass through all training data
- More epochs = more learning (but risk overfitting)

**What's the Loss Function?**

- BCELoss(Binary Cross-Entropy):
- Measures prediction error
- Lower loss = better predictions
- Goal: Minimize this over training

In [39]:
model.eval()
with torch.no_grad():
    preds = model(X_test)
    preds_binary = (preds > 0.5).float()
    accuracy = (preds_binary == y_test).float().mean().item()

print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.50


In [40]:
examples = [
    "I really enjoyed this movie",
    "this was awful and boring",
    "great film with wonderful acting",
    "waste of time and energy"
]

print("Predictions:")
for sentence in examples:
    tokens = torch.tensor([text_to_indices(clean_text(sentence), vocab, max_len)])
    with torch.no_grad():
        pred = model(tokens).item()
    sentiment = "POSITIVE" if pred > 0.5 else "NEGATIVE"
    print(f"'{sentence}' --> {sentiment} ({pred:.2f})")

Predictions:
'I really enjoyed this movie' --> POSITIVE (1.00)
'this was awful and boring' --> POSITIVE (1.00)
'great film with wonderful acting' --> POSITIVE (1.00)
'waste of time and energy' --> NEGATIVE (0.00)


# Transformer practice: Sentiment Classification

Now lets scale up to a real dataset with 50,000 IMDB movie reviews.

This transformer is slightly more complex and is a simplified version of an end to end transformer.

**Disclaimer:**
Please use **GPU** to run the code blocks below. Using CPU will result in very long training time. If you do not have GPU locally in your device you can use use google Colab gpu by changing runtime type

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Set seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the IMDB dataset
# Note: Update the path if your CSV file is in a different location
df = pd.read_csv('IMDB Dataset.csv')


In [None]:
df = pd.read_csv('IMDB Dataset.csv')

print(f"Total samples: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}\n")

Total samples: 50000
Columns: ['review', 'sentiment']
Sentiment distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64



In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
def preprocess_text(text):
    """Clean and normalize text"""
    text = str(text).lower()
    # Replace <br /> tags with spaces
    text = re.sub(r'<br\s*/?\s*>', ' ', text)
    # Remove other HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

Real-world data is messy

This handles:
- HTML tags
- Special characters:@, #, $, %, &
- Multiple spaces and formatting issues

In [None]:
df['review'] = df['review'].apply(preprocess_text)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [None]:
df.head()

Unnamed: 0,review,sentiment,label
0,one of the other reviewers has mentioned that ...,positive,1
1,a wonderful little production the filming tech...,positive,1
2,i thought this was a wonderful way to spend ti...,positive,1
3,basically theres a family where a little boy j...,negative,0
4,petter matteis love in the time of money is a ...,positive,1


In [None]:
# First split: separate test set (80% for training+validation, 20% for final test)
train_val_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)

In [None]:
# Second split: separate validation set (80% train, 20% validation)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.2, random_state=42, stratify=train_val_df['label']
)

Why three splits?

- Training set: Model learns from this
- Validation set: Check performance during training (tune hyperparameters)
- Test set: Final evaluation (never seen during training)


In [None]:
print(f"  Training: {len(train_df)}")
print(f"  Validation: {len(val_df)}")
print(f"  Test: {len(test_df)}\n")

  Training: 32000
  Validation: 8000
  Test: 10000



In [None]:
# A vocabulary is a dictionary that maps each word to a unique number
# We only use the TRAINING data to build it (to prevent data leakage)

words = set()  # Use a set to collect unique words
for review in train_df['review']:
    words.update(review.split())  # Split review into words

# Create the vocabulary dictionary
vocab = {
    '<PAD>': 0,  # Special token for padding short sentences
    '<UNK>': 1   # Special token for unknown words
}

# Add all words from training data
for idx, word in enumerate(sorted(list(words))):
    vocab[word] = idx + 2  # Start from 2 (0 and 1 are special tokens)

vocab_size = len(vocab)

In [None]:
# Neural networks can't understand words directly, we need to convert them to numbers!

review_lengths = train_df['review'].apply(lambda x: len(x.split()))
max_length = int(review_lengths.quantile(0.95))

# Why not use the longest review?

# Outliers waste computation
# 95% coverage balances efficiency and completeness
# Typical result: ~400 words instead of 2000+

def text_to_numbers(sentence, vocab, max_len=max_length):
    """
    Convert a sentence to a list of numbers.
    Each word becomes its number from the vocabulary.
    Unknown words become <UNK> token.
    Sequences are padded/truncated to max_len
    """
    # Convert each word to its vocabulary number
    tokens = [vocab.get(word, vocab['<UNK>']) for word in sentence.split()]

    # Pad if too short (add zeros at the end)
    if len(tokens) < max_len:
        tokens = tokens + [vocab['<PAD>']] * (max_len - len(tokens))
    else:
        tokens = tokens[:max_len]
    return tokens

# Convert all reviews to numbers
X_train = torch.tensor([text_to_numbers(r, vocab, max_length) for r in train_df['review']])
X_val = torch.tensor([text_to_numbers(r, vocab, max_length) for r in val_df['review']])
X_test = torch.tensor([text_to_numbers(r, vocab, max_length) for r in test_df['review']])

y_train = torch.tensor(train_df['label'].values, dtype=torch.float32)
y_val = torch.tensor(val_df['label'].values, dtype=torch.float32)
y_test = torch.tensor(test_df['label'].values, dtype=torch.float32)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}\n")

X_train shape: torch.Size([32000, 580])
X_val shape: torch.Size([8000, 580])
X_test shape: torch.Size([10000, 580])



In [None]:
# DataLoaders help us process data in small batches instead of all at once
# This is more efficient and allows training on large datasets

class IMDBDataset(Dataset):
    """
    Simple dataset wrapper for PyTorch
    """
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return self.reviews[idx], self.labels[idx]


# Create datasets
train_dataset = IMDBDataset(X_train, y_train)
val_dataset = IMDBDataset(X_val, y_val)
test_dataset = IMDBDataset(X_test, y_test)

# Create loaders (batch_size = how many reviews to process at once)
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



What's a DataLoader?

- Feeds data in small batches (e.g., 64 reviews at a time)
- Shuffles data each epoch (prevents learning order patterns)
- Handles memory efficiently (can't load 50K reviews at once!)

In [None]:
class PositionalEncoding(nn.Module):
    """
    Positional Encoding adds information about word position.

    Why? Self-attention doesn't care about word order by default.
    "I hate this movie" vs "This movie I hate" would look the same!

    Positional encoding fixes this by adding position information to each word.
    """
    def __init__(self, embed_dim, max_len=max_length):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_len = max_len

        # Create a matrix of shape (max_len, embed_dim)
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1).float()

        # Use sine and cosine functions at different frequencies
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() *
                             (-np.log(10000.0) / embed_dim))

        pe[:, 0::2] = torch.sin(position * div_term)  # Even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)  # Odd dimensions

        # Register as buffer (part of model but not a learnable parameter)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        # Add positional encoding to embeddings
        return x + self.pe[:, :x.size(1)]

Mathematical Intuition:

- Uses sine and cosine waves at different frequencies
- Each position gets a unique "signature"
- Similar positions have similar encodings
- Model can learn relative positions

In [None]:
class SimpleTransformer(nn.Module):
    """
    A Simple Transformer for Sentiment Analysis

    Architecture:
    1. Embedding Layer: Converts word numbers to dense vectors
    2. Positional Encoding: Adds position information
    3. Self-Attention: Lets words "look at" other words in the sentence
    4. Feed-Forward Network: Processes the attended information
    5. Classification Head: Makes the final positive/negative decision
    """

    def __init__(self, vocab_size, embed_dim=64, num_heads=4, dropout=0.1):
        super().__init__()

        # COMPONENT 1: WORD EMBEDDINGS
        # Converts each word number into a vector of size embed_dim
        # Example: word #234 -> [0.2, -0.5, 0.1, ...] (64 numbers)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # COMPONENT 2: POSITIONAL ENCODING
        # Adds position information so the model knows word order
        self.pos_encoding = PositionalEncoding(embed_dim, max_len=max_length)

        # COMPONENT 3: SELF-ATTENTION
        # This is the KEY component of transformers
        # It lets each word "attend to" (look at) other words
        # Example: In "The movie was not good", "not" attends to "good"
        self.attention = nn.MultiheadAttention(
            embed_dim,      # Size of embeddings
            num_heads,      # Number of attention heads (parallel attention mechanisms)
            dropout=dropout,
            batch_first=True
        )

        # COMPONENT 4: FEED-FORWARD NETWORK (FFN)
        # Processes the information from attention
        # Just a simple neural network: Linear -> ReLU -> Dropout -> Linear
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),  # Expand
            nn.ReLU(),                             # Activation
            nn.Dropout(dropout),                   # Regularization
            nn.Linear(embed_dim * 4, embed_dim)   # Compress back
        )

        # COMPONENT 5: LAYER NORMALIZATION
        # Helps with training stability (normalizes the values)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        # COMPONENT 6: DROPOUT
        # Randomly drops some connections during training to prevent overfitting
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # COMPONENT 7: CLASSIFIER
        # Final layers that make the positive/negative decision
        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(embed_dim, 1)  # Output: single number for binary classification
        )

    def forward(self, x):
        """
        Forward pass: how data flows through the model
        Input: x = batch of reviews as numbers, shape (batch_size, max_length)
        Output: predictions (batch_size,) where each value is between 0 and 1
        """

        # Create padding mask
        # We need to ignore padded tokens (the zeros we added)
        padding_mask = (x == 0)  # True where there's padding

        # Convert word numbers to embeddings
        # Shape: (batch_size, max_length, embed_dim)
        x = self.embedding(x)

        # Add positional information
        x = self.pos_encoding(x)

        # Self-Attention Block (with residual connection)
        # The attention mechanism lets words interact with each other
        attn_out, _ = self.attention(x, x, x, key_padding_mask=padding_mask)
        x = self.norm1(x + self.dropout1(attn_out))  # Add & Norm (residual connection)

        # Feed-Forward Block (with residual connection)
        # Process the attended information
        ffn_out = self.ffn(x)
        x = self.norm2(x + self.dropout2(ffn_out))  # Add & Norm (residual connection)

        # Average Pooling
        # Combine all word representations into one vector
        # We ignore padded positions by masking them out
        mask_expanded = (~padding_mask).unsqueeze(-1).float()
        x = (x * mask_expanded).sum(dim=1) / mask_expanded.sum(dim=1).clamp(min=1)

        # Classification
        # Pass through classifier to get final prediction
        x = self.classifier(x)

        # Sigmoid activation
        # Converts output to probability between 0 and 1
        return torch.sigmoid(x).squeeze()


Multi-Head Attention (num_heads=4)
Think of it as having 4 different reading strategies simultaneously:

- Head 1: Focuses on nearby words (local context)
- Head 2: Looks for negations ("not", "never")
- Head 3: Identifies subject-verb relationships
- Head 4: Captures overall sentiment patterns


Input: "The movie was not very good"

Head 1 attention: "not" <--> "good" (negation)

Head 2 attention: "movie" <--> "good" (subject-attribute)

Head 3 attention: "was" <--> "good" (temporal context)

Head 4 attention: Overall sentence sentiment

**Layer Normalisation**

Purpose: Stabilizes training

- Normalizes values to have mean=0, std=1
- Prevents values from getting too large or small
- Makes training faster and more stable

**Dropout Regularisation**

Purpose: Prevents overfitting

- Randomly "turns off" 10% of neurons during training
- Forces model to not rely on specific neurons
- Creates more robust, generalizable model


**FFN**
Architecture Pattern:

1. Expand: Increase dimensionality (more expressive)
2. Activate: ReLU adds non-linearity (learns complex patterns)
3. Compress: Return to original size

Why expand then compress?

- Creates a "bottleneck" that forces learning of important features
- More parameters = more learning capacity
- Used in every Transformer layer

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleTransformer(vocab_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
epochs = 3

In [None]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for reviews, labels in tqdm(train_loader):
        reviews, labels = reviews.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(reviews)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch [{epoch+1}/{epochs}] - Loss: {total_loss / len(train_loader):.4f}")


100%|██████████| 500/500 [00:27<00:00, 18.20it/s]


Epoch [1/3] - Loss: 0.5108


100%|██████████| 500/500 [00:26<00:00, 18.89it/s]


Epoch [2/3] - Loss: 0.3123


100%|██████████| 500/500 [00:26<00:00, 18.84it/s]

Epoch [3/3] - Loss: 0.2338





In [None]:
# Evaluate on test set
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for reviews, labels in test_loader:
        reviews, labels = reviews.to(device), labels.to(device)
        preds = model(reviews)
        preds = (preds > 0.5).float()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

# Print test accuracy
test_accuracy = 100 * correct / total
print(f"\nTest Accuracy: {test_accuracy:.2f}%")
print(f"Correct predictions: {correct}/{total}")

In [None]:
test_sentences = [
    "amazing wonderful best",
    "terrible awful worst",
    "love great film",
    "this movie made it into one of my top 10 most awful movies",
    "fantastic acting and story",
    "poorly written and boring"
]

for sentence in test_sentences:
    tokens = torch.tensor([text_to_numbers(sentence, vocab)])
    with torch.no_grad():
        pred = model(tokens.to(device)).item()
    sentiment = "POSITIVE" if pred > 0.5 else "NEGATIVE"
    print(f"'{sentence}' -> {sentiment} ({pred:.2f})")

'amazing wonderful best' → POSITIVE (1.00)
'terrible awful worst' → NEGATIVE (0.00)
'love great film' → POSITIVE (1.00)
'this movie made it into one of my top 10 most awful movies' → NEGATIVE (0.00)
'fantastic acting and story' → POSITIVE (1.00)
'poorly written and boring' → NEGATIVE (0.00)
