In [3]:
%pip install ipywidgets jupyter




# Install compatible versions to fix import errors
import subprocess
import sys

def install_package(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

print("Installing compatible package versions...")
print("This may take a few minutes...")

try:
    # Install NumPy first (compatible version)
    print("1. Installing NumPy < 2.0...")
    install_package("numpy<2.0")
    
    # Install PyTorch with CUDA support
    print("2. Installing PyTorch 2.2.0 with CUDA 12.1...")
    install_package("torch==2.2.0+cu121 --index-url https://download.pytorch.org/whl/cu121")
    
    # Install compatible torchvision
    print("3. Installing compatible torchvision...")
    install_package("torchvision==0.17.0+cu121 --index-url https://download.pytorch.org/whl/cu121")
    
    # Install transformers and other packages
    print("4. Installing transformers and other packages...")
    install_package("transformers==4.30.0")  # Older stable version
    install_package("datasets")
    install_package("tqdm")
    install_package("ipywidgets")
    
    print("✅ All packages installed successfully!")
    print("⚠️  IMPORTANT: Please restart the kernel after installation!")
    
except Exception as e:
    print(f"❌ Installation failed: {e}")
    print("Please try running these commands manually:")
    print("pip install 'numpy<2.0'")
    print("pip install 'torch==2.2.0+cu121' --index-url https://download.pytorch.org/whl/cu121")
    print("pip install 'torchvision==0.17.0+cu121' --index-url https://download.pytorch.org/whl/cu121")
    print("pip install 'transformers==4.30.0'")

print("\n" + "="*60)
print("RESTART KERNEL AFTER INSTALLATION COMPLETES!")Embeddings and GRU Architecture
This notebook implements a sequence-to-sequence model for text summarization using BERT embeddings with encoder-decoder GRU architecture.

In [1]:
# Import libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from IPython.display import display, Markdown
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load BERT tokenizer and model for embeddings

bert_model_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertModel.from_pretrained(bert_model_name)

# Freeze BERT parameters to use as fixed embeddings
for param in bert_model.parameters():
    param.requires_grad = False

print(f"BERT model loaded: {bert_model_name}")
print(f"BERT embedding dimension: {bert_model.config.hidden_size}")

BERT model loaded: bert-base-uncased
BERT embedding dimension: 768


In [3]:
# Load CNN/DailyMail dataset (use subset for faster training)
display(Markdown("**Downloading CNN/DailyMail dataset...**"))
dataset = load_dataset('abisee/cnn_dailymail', '3.0.0')

train_data = dataset['train']
# Use only 1/10th of the data for faster training
subset_size = len(train_data) // 10
train_data = train_data.select(range(subset_size))
print(f"Using {subset_size} samples out of {len(dataset['train'])} total.")

def preprocess(sample):
    article = sample['article']
    summary = sample['highlights']
    return article, summary

articles = []
summaries = []
for i in tqdm(range(len(train_data)), desc='Preprocessing dataset'):
    sample = train_data[i]
    a, s = preprocess(sample)
    articles.append(a)
    summaries.append(s)

**Downloading CNN/DailyMail dataset...**

Using 28711 samples out of 287113 total.


Preprocessing dataset: 100%|██████████| 28711/28711 [00:02<00:00, 13193.24it/s]


In [4]:
# Tokenization using BERT tokenizer
def tokenize_with_bert(texts, max_length=512):
    """Tokenize texts using BERT tokenizer and return input_ids"""
    tokenized = bert_tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    return tokenized['input_ids'], tokenized['attention_mask']

# Tokenize articles and summaries
max_article_len = 512
max_summary_len = 128

print("Tokenizing articles...")
article_ids, article_masks = tokenize_with_bert(articles, max_article_len)
print("Tokenizing summaries...")
summary_ids, summary_masks = tokenize_with_bert(summaries, max_summary_len)

print(f"Article tensor shape: {article_ids.shape}")
print(f"Summary tensor shape: {summary_ids.shape}")

# Vocabulary size from BERT tokenizer
vocab_size = bert_tokenizer.vocab_size
print(f"BERT vocabulary size: {vocab_size}")

Tokenizing articles...
Tokenizing summaries...
Article tensor shape: torch.Size([28711, 512])
Summary tensor shape: torch.Size([28711, 128])
BERT vocabulary size: 30522


In [5]:
# PyTorch Dataset with BERT embeddings
class SummarizationDataset(Dataset):
    def __init__(self, article_ids, article_masks, summary_ids, summary_masks):
        self.article_ids = article_ids
        self.article_masks = article_masks
        self.summary_ids = summary_ids
        self.summary_masks = summary_masks
    
    def __len__(self):
        return len(self.article_ids)
    
    def __getitem__(self, idx):
        return (
            self.article_ids[idx], 
            self.article_masks[idx],
            self.summary_ids[idx], 
            self.summary_masks[idx]
        )

dataset = SummarizationDataset(article_ids, article_masks, summary_ids, summary_masks)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)  # Reduced batch size for memory
print(f"Dataset created with {len(dataset)} samples")
print(f"DataLoader created with batch size 8")

Dataset created with 28711 samples
DataLoader created with batch size 8


In [6]:
# Encoder-Decoder Model with BERT Embeddings
class BERTEncoder(nn.Module):
    def __init__(self, bert_model, hidden_size, num_layers=1):
        super().__init__()
        self.bert = bert_model
        self.bert_hidden_size = bert_model.config.hidden_size  # 768 for bert-base
        self.rnn = nn.GRU(self.bert_hidden_size, hidden_size, 
                         num_layers=num_layers, bidirectional=True, batch_first=True)
        
    def forward(self, input_ids, attention_mask):
        # Get BERT embeddings
        with torch.no_grad():  # BERT parameters are frozen
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_outputs.last_hidden_state  # [batch_size, seq_len, 768]
        
        # Pass through GRU
        outputs, hidden = self.rnn(embeddings)
        # Concatenate forward and backward hidden states
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1).unsqueeze(0)
        return outputs, hidden

class BERTDecoder(nn.Module):
    def __init__(self, bert_model, hidden_size, vocab_size, num_layers=1):
        super().__init__()
        self.bert = bert_model
        self.bert_hidden_size = bert_model.config.hidden_size
        self.rnn = nn.GRU(self.bert_hidden_size, hidden_size*2, 
                         num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size*2, vocab_size)
        
    def forward(self, input_ids, hidden, attention_mask=None):
        # Get BERT embeddings
        with torch.no_grad():  # BERT parameters are frozen
            bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = bert_outputs.last_hidden_state
        
        # Pass through GRU
        outputs, hidden = self.rnn(embeddings, hidden)
        logits = self.fc(outputs)
        return logits, hidden

# Initialize models
hidden_size = 256
encoder = BERTEncoder(bert_model, hidden_size)
decoder = BERTDecoder(bert_model, hidden_size, vocab_size)

print(f"Encoder initialized with BERT embeddings")
print(f"Decoder initialized with BERT embeddings")
print(f"Hidden size: {hidden_size}")

Encoder initialized with BERT embeddings
Decoder initialized with BERT embeddings
Hidden size: 256


In [None]:
# Training Loop with BERT embeddings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

# Move models to device
bert_model.to(device)
encoder.to(device)
decoder.to(device)
print(f"Models moved to device: {device}")

criterion = nn.CrossEntropyLoss(ignore_index=bert_tokenizer.pad_token_id)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    epoch_loss = 0
    batch_count = 0
    
    for article_ids, article_masks, summary_ids, summary_masks in tqdm(dataloader, desc=f'Epoch {epoch+1}'):
        article_ids = article_ids.to(device)
        article_masks = article_masks.to(device)
        summary_ids = summary_ids.to(device)
        summary_masks = summary_masks.to(device)
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        # Encoder forward
        _, hidden = encoder(article_ids, article_masks)

        # Decoder forward (teacher forcing)
        decoder_input = summary_ids[:, :-1]
        decoder_masks = summary_masks[:, :-1]
        target = summary_ids[:, 1:]
        
        output, _ = decoder(decoder_input, hidden, decoder_masks)
        output = output.reshape(-1, output.size(-1))
        target = target.reshape(-1)
        
        loss = criterion(output, target)
        loss.backward()
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        epoch_loss += loss.item()
        batch_count += 1
    
    avg_loss = epoch_loss / batch_count
    print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

Using device: cuda
CUDA device count: 1
Current CUDA device: 0
CUDA device name: NVIDIA GeForce RTX 3050 Laptop GPU
Models moved to device: cuda


Epoch 1: 100%|██████████| 3589/3589 [26:05<00:00,  2.29it/s]


Epoch 1, Average Loss: 3.4079


Epoch 2: 100%|██████████| 3589/3589 [26:24<00:00,  2.26it/s]


Epoch 2, Average Loss: 1.8205


Epoch 3:  74%|███████▍  | 2664/3589 [19:25<06:46,  2.27it/s]

In [None]:
# Inference with BERT embeddings
def summarize_article_bert(article_text):
    encoder.eval()
    decoder.eval()
    
    with torch.no_grad():
        # Tokenize input article
        inputs = bert_tokenizer(
            article_text,
            max_length=max_article_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        article_ids = inputs['input_ids'].to(device)
        article_mask = inputs['attention_mask'].to(device)
        
        # Encode
        _, hidden = encoder(article_ids, article_mask)
        
        # Decode
        input_token = torch.tensor([[bert_tokenizer.cls_token_id]]).to(device)
        summary_tokens = []
        
        for _ in range(max_summary_len):
            # Create attention mask for single token
            attention_mask = torch.ones_like(input_token).to(device)
            
            output, hidden = decoder(input_token, hidden, attention_mask)
            token_id = output.argmax(-1)[:, -1].item()
            
            if token_id == bert_tokenizer.sep_token_id or token_id == bert_tokenizer.pad_token_id:
                break
                
            summary_tokens.append(token_id)
            input_token = torch.tensor([[token_id]]).to(device)
        
        # Decode tokens to text
        summary = bert_tokenizer.decode(summary_tokens, skip_special_tokens=True)
        return summary

# Example usage
new_article = "The morning began with a soft drizzle that painted the streets in a shimmering glaze, each raindrop bouncing off the pavement before sliding into shallow puddles that reflected fragments of the gray sky. Along the narrow lane, small shops opened one after another, their metal shutters creaking upward to reveal the colors and smells of daily life—fresh bread from the bakery, earthy spices from the grocer, and the faint tang of ink and paper from the bookstore at the corner. People moved in a slow rhythm, some clutching umbrellas while others simply embraced the rain, their clothes damp but their expressions calm, as though they had grown used to the unpredictability of the weather. A street dog trotted along confidently, pausing now and then at doorsteps where familiar hands reached out with scraps of food, a quiet reminder of the kindness woven into ordinary days. Somewhere in the distance, a bell rang—perhaps from a nearby temple or school—its sound cutting through the misty air and marking the passage of another hour. Inside a dimly lit café, the chatter of early customers blended with the hiss of steaming milk and the sharp click of ceramic cups being placed on wooden tables. The air carried a comforting warmth, wrapping around anyone who stepped inside, offering a small refuge from the damp world beyond the glass windows. It was one of those mornings that felt suspended between movement and stillness, inviting both reflection and the gentle anticipation of what the rest of the day might hold."

print("Generated Summary:")
print(summarize_article_bert(new_article))

In [None]:
# Save the trained models
torch.save(encoder.state_dict(), 'bert_encoder_model.pth')
torch.save(decoder.state_dict(), 'bert_decoder_model.pth')
print("Models saved successfully!")

# To load the models later:
# encoder.load_state_dict(torch.load('bert_encoder_model.pth'))
# decoder.load_state_dict(torch.load('bert_decoder_model.pth'))