# LSTM-based Seq2Seq Model for Abstractive Summarization

If you have questions, you can ask them on Telegram: @FatemehNikkhoo

Name = ""

StudentId = ""

# Import Libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
import random

In [3]:
# Pick the compute device: CUDA when a GPU is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the Dataset

# Extreme Summarization (XSum) Dataset

The **XSUM** dataset is designed for the task of extreme summarization, where the goal is to generate a single-sentence summary for a news article. 

### Features:
- **document:** The input news article.
- **summary:** A one-sentence summary of the article.
- **id:** A unique BBC ID for each article.

For more details and to explore the dataset, you can visit the official [Hugging Face XSUM page](https://huggingface.co/datasets/xsum).


In [4]:
# 1. Load the XSUM dataset
print("Loading XSUM dataset...")

# Each split is loaded with slice syntax ("<split>[:<limit>]") so only the
# first few examples of every split are used.
raw_datasets = {
    split_name: load_dataset("xsum", split=f"{split_name}[:{limit}]")
    for split_name, limit in (("train", 500), ("validation", 100), ("test", 100))
}

Loading XSUM dataset...


In [5]:
# Data Inspection

# Report how many examples each split contains
for split, data in raw_datasets.items():
    print(f"{split} size: {len(data)}")

# Show every field of one randomly chosen training example
train_len = len(raw_datasets['train'])
# Valid indices run from 0 to train_len - 1 (both ends inclusive)
random_index = random.randint(0, train_len - 1)
print(f"Sample from random index: {random_index}\n")
sample = raw_datasets['train'][random_index]  # look the example up once
for key, value in sample.items():
    print(f"{key}: {value}\n")

train size: 500
validation size: 100
test size: 100
Sample from random index: 437

document: Eight hundred of the plants, as well as electrical equipment, were seized at a workshop in Millisle on Friday.
Police said it was one of the biggest and most sophisticated production operations they had uncovered recently.
Insp Andy Dunlop said there were "young plants, the remnants of a previous harvest, space for drying the product and space for waste products".
He added: "No arrests were made this morning but our enquiries will be continuing over the coming days."

summary: Cannabis plants worth an estimated £400,000 have been seized by police  in County Down.

id: 30452755



# Tokenization

### Question:
- What is the role of a tokenizer in Natural Language Processing (NLP)?
- What does it mean to "tokenize" text, and why is this step necessary?

In [None]:
# 2. Tokenization

# Apply tokenization on the 'document' (news article) and 'summary' (highlight).
def tokenize_function(example, tokenizer):
    """
    Tokenize the article and the summary of one dataset example.

    This is an exercise scaffold: `inputs` and `targets` are not defined until
    the TODO below is completed, so calling the function as-is raises NameError.

    Args:
        example (dict): A dictionary containing text data with keys like
            "document" and "summary". The call site in this file maps the
            function without batching, so this is a single example.
        tokenizer: A tokenizer instance (e.g., from `torchtext` or `transformers`).

    Returns:
        dict: The tokenized article under 'input_ids' and the tokenized summary
        under 'target_ids'.
        NOTE(review): despite the key names, these are presumably token strings
        rather than integer IDs at this stage — confirm against the tokenizer used.
    """
    # TODO: Apply tokenization
    # Place your code here
    # inputs  = ... # Tokenizing the article (input)
    # targets = ... # Tokenizing the summary (target)
    return {"input_ids": inputs, "target_ids": targets}


# Tokenizer (torchtext's "basic_english" tokenizer)
tokenizer = get_tokenizer("basic_english")  # Basic word-level tokenization
# Tokenize every split; the lambda binds the tokenizer so that `map` can call
# tokenize_function with a single example argument.
tokenized_datasets = {
    split: raw_datasets[split].map(lambda example: tokenize_function(example, tokenizer))
    for split in ["train", "validation", "test"]
}

# TODO: Inspect a sample of tokenized_datasets['train'] to better understand the results
# Print the keys and values of the sample at the random_index that was calculated earlier
# Loop through the keys of the random sample and print both key and value
# Place your code here

# Build Vocabulary

In NLP tasks, the vocabulary maps each token (word) to a unique integer ID.

### Question:
- What are the special tokens `"<unk>"` and `"<pad>"` used for in vocabulary generation?
- Why should we build the vocabulary using only the training data?


In [None]:
# 3. Build Vocabulary
def build_vocab(texts, tokenizer):
    """
    Create a token-to-ID vocabulary from raw text.

    Every text is tokenized lazily and the resulting token lists are handed to
    `build_vocab_from_iterator`, together with the special tokens <unk>
    (unknown words) and <pad> (padding).

    Args:
        texts (list of str): Raw text data (e.g., articles).
        tokenizer: Callable that turns one text into a sequence of tokens.

    Returns:
        vocab: A vocabulary object that maps each token to an integer ID.
    """
    # One token list per text, produced on demand
    token_lists = (tokenizer(text) for text in texts)
    special_tokens = ["<unk>", "<pad>"]
    return build_vocab_from_iterator(token_lists, specials=special_tokens)


# TODO: Build the vocabulary from the training data, using both the 'document' and 'summary' texts
# Place your code here
# vocab = ...

# Inspecting the vocabulary (requires `vocab` from the TODO above):
# Get the length of the vocabulary
vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")
# To better understand the vocabulary, print the first 10 tokens and their corresponding IDs.
# This helps confirm that the special tokens are included and that tokens map to the expected IDs.
print("Sample tokens and their corresponding IDs:")
for token in list(vocab.get_itos())[:10]:
    print(token, vocab[token])

## Padding Function

### Question: 
- Why is padding needed when preprocessing text for NLP models, and what problem does it solve when sequences are batched together?


In [None]:
# 4. Padding function (modified to accept token IDs)
# Constants
MAX_LENGTH = 128  # Maximum sequence length (the original comment also listed 512, presumably an earlier setting)
PAD_IDX = vocab["<pad>"]  # Padding token index
UNK_IDX = vocab["<unk>"]  # Unknown token index

def pad_to_max_length(seq, max_length=MAX_LENGTH, pad_idx=PAD_IDX):
    """
    Force a sequence of token IDs to exactly `max_length` entries.

    Sequences shorter than `max_length` get `pad_idx` appended; all other
    sequences are cut down to their first `max_length` entries.

    Args:
        seq (list): Sequence of token IDs.
        max_length (int): Target sequence length.
        pad_idx (int): Padding token ID.

    Returns:
        list: Padded/truncated sequence of token IDs.
    """
    missing = max_length - len(seq)
    if missing > 0:
        # Too short: fill the remainder with padding
        return seq + [pad_idx] * missing
    # Long enough already: keep only the leading max_length IDs
    return seq[:max_length]

# 5. Sequence processing function (ensure tokenization and conversion to token IDs)
def process_data(example, vocab, tokenizer):
    """
    Turn one raw example into fixed-length token-ID tensors.

    Tokenizes the text, converts tokens to token IDs, records the sequence
    lengths, and pads/truncates both sequences.

    This is an exercise scaffold: `input_tokens` and `target_tokens` are not
    defined until the first TODO is completed, and no padding/truncation
    happens until the second TODO is completed.

    Args:
        example (dict): Raw example with 'document' and 'summary'.
        vocab (Vocab): Vocabulary object.
        tokenizer: Tokenizer instance used for tokenizing text.

    Returns:
        dict: Tensors for input/target IDs ('input_ids', 'target_ids') and
        their lengths ('input_len', 'target_len'). The lengths are measured
        before padding/truncation, so they can exceed the padded length.
    """

    # TODO: Tokenize raw text (both 'document' and 'summary')
    # Place your code here
    # input_tokens  = ... # Tokenizing the document (input)
    # target_tokens = ... # Tokenizing the summary (target)

    # Convert tokens to token IDs using the vocabulary;
    # tokens missing from the vocabulary are mapped to UNK_IDX
    input_ids  = [vocab[token] if token in vocab else UNK_IDX for token in input_tokens]
    target_ids = [vocab[token] if token in vocab else UNK_IDX for token in target_tokens]

    input_len  = len(input_ids)   # Save length BEFORE padding
    target_len = len(target_ids)

    # TODO: Apply padding/truncation
    # Place your code here (Hint: use defined functions)
    # input_ids  = ...
    # target_ids = ...

    # Return the processed data as tensors, along with original lengths
    return {
        "input_ids": torch.tensor(input_ids, dtype=torch.long),
        "target_ids": torch.tensor(target_ids, dtype=torch.long),
        "input_len": torch.tensor(input_len, dtype=torch.long),
        "target_len": torch.tensor(target_len, dtype=torch.long),
    }


# Apply processing to every split of the raw datasets.
# NOTE(review): the values stored by `map` are presumably no longer torch tensors,
# which would explain why Seq2SeqDataset re-wraps them with torch.tensor — confirm.
processed_datasets = {
    split: raw_datasets[split].map(lambda example: process_data(example, vocab, tokenizer))
    for split in ["train", "validation", "test"]
}

# Creating Dataloaders and Custom Dataset Class

In [None]:
# 6. Custom Dataset Class
class Seq2SeqDataset(Dataset):
    """
    Thin PyTorch `Dataset` adapter around processed sequence-to-sequence examples.

    Wrapping the processed examples in this class lets a `DataLoader` batch
    and shuffle them.
    """

    def __init__(self, dataset):
        """
        Store the underlying examples.

        Args:
            dataset: An indexable collection (e.g. a HuggingFace-style dataset split)
                     where each example is a dict containing 'input_ids',
                     'target_ids', 'input_len', 'target_len'.
        """
        self.dataset = dataset

    def __len__(self):
        """
        Returns:
            int: Total number of samples in the dataset.
        """
        return len(self.dataset)

    def __getitem__(self, idx):
        """
        Look up one sample and convert its fields to tensors.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            dict: 'input_ids', 'target_ids', 'input_len' and 'target_len',
                  each as a `torch.long` tensor.
        """
        record = self.dataset[idx]
        field_names = ("input_ids", "target_ids", "input_len", "target_len")
        # Every field gets the same conversion, so build the dict in one pass
        return {
            name: torch.tensor(record[name], dtype=torch.long)
            for name in field_names
        }

BATCH_SIZE = 8

# Wrap each processed split (training, validation, testing) in a PyTorch-compatible dataset
train_dataset, valid_dataset, test_dataset = (
    Seq2SeqDataset(processed_datasets[split_name])
    for split_name in ("train", "validation", "test")
)

# Only the training loader shuffles its examples
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Sanity Check – Inspect One Batch
batch = next(iter(train_loader))
for label, field in (("Input", "input_ids"), ("Target", "target_ids")):
    print(f"{label} shape:", batch[field].shape)
for label, field in (("Input", "input_len"), ("Target", "target_len")):
    print(f"{label} lengths:", batch[field][:5])

Input shape: torch.Size([32, 512])
Target shape: torch.Size([32, 512])
Input lengths: tensor([266, 103, 129, 212, 386])
Target lengths: tensor([23, 18, 25, 23, 24])


### Seq2Seq Model

The following is a simple implementation of an LSTM-based Seq2Seq model for tasks like text summarization or machine translation.

#### Questions:
- **What is the Embedding Layer and Why is it Used?**  
- **What is Teacher Forcing and Why is it Used?**  

In [None]:
# Encoder class
class Encoder(nn.Module):
    """
    Encodes a batch of source token IDs into final LSTM hidden and cell states.

    Exercise scaffold: `self.embedding` and `self.lstm` must be defined in
    `__init__` (see the TODOs) before `forward` can run.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        """
        Args:
            vocab_size, embed_dim, hidden_dim: Sizes for the layers to be created
                in the TODOs below (presumably vocabulary size, embedding width
                and LSTM hidden width).
        """
        super().__init__()
        # TODO: Embedding layer to convert token IDs to embeddings
        # Place your code here
        # self.embedding = ...
        # TODO: LSTM layer to process sequences and output hidden and cell states
        # Place your code here
        # self.lstm = ...
        # NOTE(review): the decoder in this file feeds (B, 1, E) tensors to its LSTM,
        # which suggests batch-first LSTMs are intended — confirm when implementing.

    def forward(self, input_ids):
        """
        Args:
            input_ids: Batch of source token IDs.

        Returns:
            tuple: The LSTM's final (hidden, cell) states; the per-step outputs
            are discarded.
        """
        # Convert token IDs to embeddings
        embedded = self.embedding(input_ids)
        # Process the embeddings with the LSTM
        output, (hidden, cell) = self.lstm(embedded)
        return hidden, cell
    
    
# Decoder class
class Decoder(nn.Module):
    """
    Predicts the next token from the current token and the running LSTM state.

    Exercise scaffold: `self.embedding` and `self.lstm` must be defined in
    `__init__` (see the TODOs) before `forward` can run.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        """
        Args:
            vocab_size: Number of output classes of the final linear layer.
            embed_dim: Presumably the embedding width for the layer created in the TODO.
            hidden_dim: Input width of the final linear layer.
        """
        super().__init__()
        # TODO: Embedding layer to convert token IDs to embeddings
        # Place your code here
        # self.embedding = ...
        # TODO: LSTM layer to process the current token and hidden state
        # Place your code here
        # self.lstm = ...
        # Fully connected layer to predict the next token in the sequence
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_token, hidden, cell):
        """
        Run one decoding step.

        Args:
            input_token: Current token IDs, one per batch element.
            hidden, cell: LSTM states carried over from the previous step.

        Returns:
            tuple: (logits over the vocabulary, new hidden state, new cell state).
        """
        # Convert current token to embedding; unsqueeze adds a length-1 sequence axis
        embedded = self.embedding(input_token.unsqueeze(1))  # Shape: (B, 1, E)
        # Process the embedded token with the LSTM and pass hidden, cell states
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        # Get the logits for the next token prediction
        logits = self.fc(output.squeeze(1))  # Shape: (B, vocab_size)
        return logits, hidden, cell
    
# Seq2Seq class to combine the encoder and decoder
class Seq2Seq(nn.Module):
    """
    Combines an encoder and a decoder into one sequence-to-sequence model.

    Exercise scaffold: `__init__` must store the encoder and decoder (as
    `self.encoder` / `self.decoder`) and `forward` must obtain the initial
    `hidden, cell` states before the model can run.
    """

    def __init__(self, encoder, decoder):
        """
        Args:
            encoder: Module that produces the initial decoder states.
            decoder: Module that performs one decoding step; `forward` reads
                its `fc.out_features` to get the vocabulary size.
        """
        super().__init__()
        # TODO: Initialize the encoder and decoder here
        # Place your code here

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        """
        Decode step by step, optionally feeding ground-truth tokens back in.

        Args:
            src: Batch of source token IDs; the first dimension is the batch.
            tgt: Batch of target token IDs; the second dimension is the length.
            teacher_forcing_ratio (float): Probability, drawn once per time step
                for the whole batch, of feeding the true token instead of the
                model's own prediction.

        Returns:
            Tensor of shape (batch_size, max_len, vocab_size) with the logits of
            every step. Position 0 is never written and stays all zeros.
        """
        batch_size = src.size(0)  # Number of sequences in the batch
        max_len = tgt.size(1)     # Maximum length of the target sequence
        vocab_size = self.decoder.fc.out_features  # Size of the vocabulary

        # Tensor to hold all predictions (outputs) for each token.
        # It is allocated on the same device as the inputs: a CPU buffer would
        # make the later loss against device-resident targets fail with a
        # device mismatch when training on a GPU.
        outputs = torch.zeros(batch_size, max_len, vocab_size, device=src.device)

        # TODO: Get initial hidden and cell states from the encoder
        # Place your code here
        # hidden, cell = ...
        # NOTE(review): the vocabulary in this file only defines <unk> and <pad>, so
        # tgt[:, 0] is presumably the first summary token rather than a dedicated
        # <sos> marker — confirm whether a start token should be added.
        input_token = tgt[:, 0]  # Start token (usually <sos>)

        for t in range(1, max_len):
            # Pass the current token and states to the decoder
            output, hidden, cell = self.decoder(input_token, hidden, cell)
            outputs[:, t] = output  # Store the output for the current time step

            # Apply teacher forcing: decide whether to use true target or predicted token
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)  # Get the predicted token (max logit)

            # Use the true token (from the target) if teacher forcing is applied, otherwise use predicted token
            input_token = tgt[:, t] if teacher_force else top1

        return outputs

# Training and Evaluation Function

In [None]:
# Training function
def train(model, train_loader, optimizer, criterion, device, teacher_forcing_ratio=0.5):
    """
    Run one training epoch and return the average batch loss.

    Exercise scaffold: gradients are not reset and `output` is not defined
    until the TODOs below are completed.

    Args:
        model: The model to train.
        train_loader: Iterable of batches with 'input_ids' and 'target_ids'.
        optimizer: Optimizer whose `step()` is called after every batch.
        criterion: Loss function applied to flattened logits and targets.
        device: Device the batch tensors are moved to.
        teacher_forcing_ratio (float): Accepted for the forward pass; nothing
            in the scaffold uses it until the forward-pass TODO is completed.

    Returns:
        float: Sum of the batch losses divided by the number of batches.
    """
    model.train()  # Set model to training mode
    epoch_loss = 0  # Track total loss for the epoch

    for batch_idx, batch in enumerate(train_loader):
        # Move data to the device (GPU or CPU)
        src = batch['input_ids'].to(device)
        tgt = batch['target_ids'].to(device)

        # TODO: Zero the gradients before each backpropagation
        # Place your code here
        

        # TODO: Forward pass through the model
        # Place your code here
        # output = 
        
        # Flatten the output and target for loss calculation.
        # Position 0 is dropped from both so that the flattened predictions
        # and targets line up one-to-one.
        output_dim = output.shape[-1]  # Output dimension (vocab size)
        output = output[:, 1:, :].contiguous().view(-1, output_dim)  # Skip <sos> predictions
        tgt = tgt[:, 1:].contiguous().view(-1)  # Skip <sos> targets


        # Calculate the loss
        loss = criterion(output, tgt)
        epoch_loss += loss.item()  # Accumulate loss

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:  # Print training progress every 10 batches
            print(f"Batch {batch_idx}/{len(train_loader)} Loss: {loss.item():.4f}")

    return epoch_loss / len(train_loader)  # Return average loss for the epoch


# Evaluation function
def evaluate(model, valid_loader, criterion, device):
    """
    Compute the average batch loss over a validation loader without gradients.

    Exercise scaffold: `output` is not defined until the TODO below is completed.

    Args:
        model: The model to evaluate.
        valid_loader: Iterable of batches with 'input_ids' and 'target_ids'.
        criterion: Loss function applied to flattened logits and targets.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches.
    """
    model.eval()  # Set model to evaluation mode
    epoch_loss = 0  # Track total loss for the validation

    with torch.no_grad():  # No need to track gradients for validation
        for batch in valid_loader:
            # Move data to the device
            src = batch['input_ids'].to(device)
            tgt = batch['target_ids'].to(device)

            # TODO: Forward pass through the model
            # Place your code here
            # output = ... # No teacher forcing during eval


            # Flatten the output and target for loss calculation.
            # Position 0 is dropped from both so that the flattened predictions
            # and targets line up one-to-one.
            output_dim = output.shape[-1]  # Output dimension (vocab size)
            output = output[:, 1:, :].contiguous().view(-1, output_dim)  # Skip <sos> predictions
            tgt = tgt[:, 1:].contiguous().view(-1)  # Skip <sos> targets


            # Calculate the loss
            loss = criterion(output, tgt)
            epoch_loss += loss.item()  # Accumulate loss

    return epoch_loss / len(valid_loader)  # Return average loss for the validation set


# Training loop function
def train_loop(model, train_loader, valid_loader, optimizer, criterion, num_epochs, device):
    """
    Alternate one training pass and one validation pass for `num_epochs` epochs.

    The epoch header and both average losses are printed every epoch;
    nothing is returned.
    """
    for epoch_number in range(1, num_epochs + 1):
        print(f"Epoch {epoch_number}/{num_epochs}")

        # One pass over the training data
        train_loss = train(model, train_loader, optimizer, criterion, device)
        print(f"Training Loss: {train_loss:.4f}")

        # One pass over the validation data
        valid_loss = evaluate(model, valid_loader, criterion, device)
        print(f"Validation Loss: {valid_loss:.4f}")




# Configurations
vocab_size = len(vocab)  
embed_dim = 128  # Dimensionality of word embeddings
hidden_dim = 256 # Hidden state size of the LSTM

# TODO: Initialize Model
# Place your code here
# NOTE(review): train/evaluate move every batch to `device`, so the model
# presumably has to be moved to `device` as well — confirm when implementing.
# model = 

optimizer = optim.Adam(model.parameters(), lr=0.001)
# Positions whose target is the padding index do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX) 

# Define number of epochs for training
num_epochs = 10

# Train the model
train_loop(model, train_loader, valid_loader, optimizer, criterion, num_epochs, device)

# Predictions vs Ground Truth (Qualitative Evaluation)

In [None]:
def generate_prediction(model, src, tgt, device, vocab):
    """
    Produce predicted and reference text for a batch, for side-by-side comparison.

    Exercise scaffold: `output`, `predicted_text` and `target_text` are not
    defined until the TODOs below are completed.

    Args:
        model: The trained model.
        src: Batch of source token IDs (moved to `device` here).
        tgt: Batch of target token IDs (moved to `device` here).
        device: Device used for inference.
        vocab: Vocabulary used to map token IDs back to strings.

    Returns:
        tuple: (predicted_text, target_text). The caller in this file indexes
        both with [0], so each is presumably a per-example sequence of texts.
    """
    model.eval()  # Set model to evaluation mode
    
    # Move the source and target to the correct device (CPU/GPU)
    src = src.to(device)
    tgt = tgt.to(device)
    
    # TODO: Generate output using the model (disable teacher forcing here)
    # Place your code here
    # output = ...  # No teacher forcing during evaluation

    # Get the predicted tokens (taking argmax across vocab size)
    predicted_tokens = output.argmax(2)  # (batch_size, seq_len)
    
    # TODO: Convert token IDs back to text using the vocab's get_itos() method (index-to-string)
    # Place your code here
    # predicted_text = ...
    
    # TODO: Convert the target tokens to text as well for comparison
    # Place your code here
    # target_text = ...

    return predicted_text, target_text

# Generate a prediction for the first example of the test set
first_example = test_loader.dataset[0]
src_sample = first_example['input_ids']   # Input token IDs of that example
tgt_sample = first_example['target_ids']  # Target token IDs of that example

# unsqueeze(0) adds a leading batch dimension of size 1
predictions, actuals = generate_prediction(
    model,
    src_sample.unsqueeze(0),
    tgt_sample.unsqueeze(0),
    device,
    vocab,
)

# Show the prediction next to the reference
print("Predicted Text:", predictions[0])
print("Actual Target Text:", actuals[0])

## Bonus: Incorporate Attention into the Model and Evaluate the Results

Incorporating **Attention** mechanisms into the Seq2Seq model can significantly improve the model's ability to focus on relevant parts of the input sequence while generating output. This is particularly useful for longer sequences where the model might struggle to capture long-range dependencies with a standard encoder-decoder architecture.

In [None]:
# TODO:
# Place your code here
# Hint: You can modify the main code of LSTM-based Seq2Seq model