In [None]:
"""
T5 Fine-tuning for Text Summarization on CNN/DailyMail Dataset
"""

# Install required packages
!pip install transformers datasets rouge-score nltk accelerate -q

import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW 
from transformers import (
    T5Tokenizer, 
    T5ForConditionalGeneration,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
from rouge_score import rouge_scorer
import nltk
from tqdm.auto import tqdm
import json
import warnings
warnings.filterwarnings('ignore')

# Fetch the NLTK "punkt" sentence-tokenization data without console output
nltk.download('punkt', quiet=True)

# Select the compute device: CUDA when PyTorch reports a usable GPU, else CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Report the name and total memory of GPU 0 when running on CUDA
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# ============================================
# 1. CONFIGURATION
# ============================================

class Config:
    """Hyper-parameters, subset sizes and output paths for the fine-tuning run."""

    # --- model / tokenization ---
    MODEL_NAME = "t5-small"
    MAX_SOURCE_LENGTH = 512   # articles are padded/truncated to this many tokens
    MAX_TARGET_LENGTH = 128   # summaries (and generated output) use this length

    # --- optimization ---
    BATCH_SIZE = 4
    GRADIENT_ACCUMULATION_STEPS = 4  # 4 micro-batches of 4 -> effective batch size 16
    LEARNING_RATE = 3e-4
    NUM_EPOCHS = 3
    WARMUP_STEPS = 500
    MAX_GRAD_NORM = 1.0

    # --- number of examples taken from each split (a falsy value means "use all") ---
    TRAIN_SIZE = 10000
    VAL_SIZE = 1000

    # --- output locations ---
    OUTPUT_DIR = "/kaggle/working/t5_summarization"
    CHECKPOINT_DIR = OUTPUT_DIR + "/checkpoints"

    # --- automatic mixed precision (only used when CUDA is available) ---
    USE_FP16 = True

# Shared settings object read by the functions below.
config = Config()
# Create the checkpoint directory up front; makedirs also creates its parent
# OUTPUT_DIR, and exist_ok=True makes re-running this cell harmless.
os.makedirs(config.CHECKPOINT_DIR, exist_ok=True)

# ============================================
# 2. DATASET PREPARATION
# ============================================

class SummarizationDataset(Dataset):
    """Map-style dataset that tokenizes (article, summary) pairs on access.

    Every item is a dict with ``input_ids`` and ``attention_mask`` for the
    article (prefixed with ``"summarize: "``) and ``labels`` for the summary,
    each padded/truncated to its fixed length. Padding positions in ``labels``
    are replaced with ``-100`` for the loss calculation.
    """

    def __init__(self, articles, summaries, tokenizer, max_source_len, max_target_len):
        self.articles, self.summaries = articles, summaries
        self.tokenizer = tokenizer
        self.max_source_len, self.max_target_len = max_source_len, max_target_len

    def __len__(self):
        return len(self.articles)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens as PyTorch tensors."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        article_text = str(self.articles[idx])
        summary_text = str(self.summaries[idx])

        # T5 is told which task to perform through a text prefix
        source = self._encode("summarize: " + article_text, self.max_source_len)
        target = self._encode(summary_text, self.max_target_len)

        # Swap padding ids for -100 in the labels
        labels = target['input_ids'].squeeze()
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return {
            'input_ids': source['input_ids'].squeeze(),
            'attention_mask': source['attention_mask'].squeeze(),
            'labels': labels
        }

def load_and_prepare_data(tokenizer):
    """Load CNN/DailyMail and wrap the train/validation texts in datasets.

    The Hugging Face copy is tried first. If anything in that path fails, the
    function falls back to a Kaggle CSV and carves both subsets out of it.
    ``config.TRAIN_SIZE`` / ``config.VAL_SIZE`` limit how many examples are
    used; a falsy value means "use the whole split".

    Args:
        tokenizer: Tokenizer handed to ``SummarizationDataset``.

    Returns:
        Tuple ``(train_dataset, val_dataset, val_articles, val_summaries)``.

    Raises:
        RuntimeError: If neither the Hugging Face nor the Kaggle source could
            be loaded (the Kaggle error is attached as the cause).
    """

    print("Loading dataset...")

    # Option 1: Load from Hugging Face (recommended)
    try:
        dataset = load_dataset("cnn_dailymail", "3.0.0")

        # Clamp the requested sizes to the split length: an over-sized request
        # would otherwise make select() raise and be misreported below as a
        # Hugging Face loading failure.
        train_data = dataset['train']
        if config.TRAIN_SIZE:
            train_data = train_data.select(range(min(config.TRAIN_SIZE, len(train_data))))

        val_data = dataset['validation']
        if config.VAL_SIZE:
            val_data = val_data.select(range(min(config.VAL_SIZE, len(val_data))))

        train_articles = train_data['article']
        train_summaries = train_data['highlights']
        val_articles = val_data['article']
        val_summaries = val_data['highlights']

        print("✓ Dataset loaded from Hugging Face")

    except Exception as e:
        print(f"Could not load from Hugging Face: {e}")
        print("Loading from Kaggle dataset...")

        # Option 2: Load from Kaggle CSV
        # Adjust path based on your Kaggle setup
        try:
            df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')

            # Split into train and validation: validation rows are taken
            # directly after the training rows of the same file.
            train_size = config.TRAIN_SIZE if config.TRAIN_SIZE else int(len(df) * 0.9)
            val_size = config.VAL_SIZE if config.VAL_SIZE else int(len(df) * 0.1)

            train_df = df.iloc[:train_size]
            val_df = df.iloc[train_size:train_size + val_size]

            train_articles = train_df['article'].tolist()
            train_summaries = train_df['highlights'].tolist()
            val_articles = val_df['article'].tolist()
            val_summaries = val_df['highlights'].tolist()

            print("✓ Dataset loaded from Kaggle")
        except Exception as e2:
            print(f"Error loading from Kaggle: {e2}")
            # Chain explicitly so the traceback names the Kaggle failure as the cause
            raise RuntimeError("Could not load dataset from either Hugging Face or Kaggle") from e2

    print(f"Train size: {len(train_articles)}")
    print(f"Validation size: {len(val_articles)}")

    # Create datasets
    train_dataset = SummarizationDataset(
        train_articles, train_summaries, tokenizer,
        config.MAX_SOURCE_LENGTH, config.MAX_TARGET_LENGTH
    )

    val_dataset = SummarizationDataset(
        val_articles, val_summaries, tokenizer,
        config.MAX_SOURCE_LENGTH, config.MAX_TARGET_LENGTH
    )

    return train_dataset, val_dataset, val_articles, val_summaries

# ============================================
# 3. EVALUATION METRICS
# ============================================

def compute_rouge_scores(predictions, references):
    """Compute mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Predictions and references are paired positionally; if the two inputs
    differ in length, the extra items of the longer one are ignored.

    Args:
        predictions: Iterable of generated summaries (strings).
        references: Iterable of reference summaries (strings).

    Returns:
        Dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` mapping to
        plain Python floats. All three are ``0.0`` when there are no pairs to
        score (instead of ``nan``).
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    per_metric = {name: [] for name in metric_names}

    for pred, ref in zip(predictions, references):
        # RougeScorer.score takes the reference (target) first, then the prediction
        scores = scorer.score(ref, pred)
        for name in metric_names:
            per_metric[name].append(scores[name].fmeasure)

    # float(...) keeps the result a built-in float; the empty guard avoids
    # np.mean([]) returning nan.
    return {
        name: float(np.mean(values)) if values else 0.0
        for name, values in per_metric.items()
    }

# ============================================
# 4. TRAINING LOOP
# ============================================

def train_epoch(model, dataloader, optimizer, scheduler, scaler=None):
    """Train for one epoch with gradient accumulation and optional AMP.

    Gradients are accumulated over ``config.GRADIENT_ACCUMULATION_STEPS``
    micro-batches before each optimizer/scheduler step. The final, possibly
    shorter, group of micro-batches in the epoch is also applied, so no
    backpropagated batch is discarded.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Sized iterable of batches (dicts with those three keys).
        optimizer: Optimizer over ``model.parameters()``.
        scheduler: Learning-rate scheduler stepped once per optimizer step.
        scaler: Optional gradient scaler. Mixed precision is used only when
            ``config.USE_FP16`` is set, a scaler is given and CUDA is available.

    Returns:
        Mean un-scaled loss over the batches of this epoch (``0.0`` for an
        empty dataloader).
    """
    model.train()

    accum_steps = config.GRADIENT_ACCUMULATION_STEPS
    use_amp = bool(config.USE_FP16 and scaler and torch.cuda.is_available())
    num_batches = len(dataloader)

    total_loss = 0.0
    progress_bar = tqdm(dataloader, desc="Training")

    optimizer.zero_grad()

    for step, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward/backward; the loss is divided so that the accumulated
        # gradient corresponds to the average over the micro-batches.
        if use_amp:
            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / accum_steps
            scaler.scale(loss).backward()
        else:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / accum_steps
            loss.backward()

        # Update on every full accumulation window AND on the last batch, so a
        # trailing partial window is applied rather than zeroed at the start
        # of the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accum_steps == 0 or is_last_batch:
            if use_amp:
                # Gradients must be unscaled before clipping by norm
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.MAX_GRAD_NORM)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.MAX_GRAD_NORM)
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Report the loss on its original (undivided) scale
        batch_loss = loss.item() * accum_steps
        total_loss += batch_loss
        progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    return total_loss / num_batches if num_batches else 0.0

def validate(model, dataloader, tokenizer):
    """Evaluate the model: average loss plus generated summaries.

    For every batch the labelled loss is computed and summaries are generated
    with 4-beam search, capped at ``config.MAX_TARGET_LENGTH`` tokens.

    Returns:
        Tuple ``(avg_loss, predictions)`` where ``predictions`` is the list of
        decoded summaries in dataloader order.
    """
    model.eval()

    generation_kwargs = dict(
        max_length=config.MAX_TARGET_LENGTH,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    batch_keys = ('input_ids', 'attention_mask', 'labels')

    predictions = []
    running_loss = 0
    progress_bar = tqdm(dataloader, desc="Validation")

    with torch.no_grad():
        for batch in progress_bar:
            input_ids, attention_mask, labels = (batch[key].to(device) for key in batch_keys)

            # Teacher-forced pass for the validation loss
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            batch_loss = outputs.loss.item()
            running_loss += batch_loss

            # Free-running generation for the ROUGE evaluation
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_kwargs
            )
            predictions += tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            progress_bar.set_postfix({'loss': f'{batch_loss:.4f}'})

    return running_loss / len(dataloader), predictions

# ============================================
# 5. MAIN TRAINING FUNCTION
# ============================================

def _save_checkpoint(model, tokenizer, path):
    """Write the model and tokenizer files into ``path``."""
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)


def train_model():
    """Fine-tune the configured T5 model and track validation metrics.

    Loads tokenizer, model and data, then for ``config.NUM_EPOCHS`` epochs
    trains, validates, computes ROUGE, and saves an ``epoch_<n>`` checkpoint.
    The checkpoint with the highest ROUGE-L so far is additionally stored as
    ``best_model``; the first evaluated epoch is always stored, so that
    directory exists even when every ROUGE-L score is 0. The per-epoch history
    is written to ``training_history.json`` in ``config.OUTPUT_DIR``.

    Returns:
        Tuple ``(model, tokenizer, history)`` where ``history`` maps
        ``'train_loss'``, ``'val_loss'``, ``'rouge1'``, ``'rouge2'`` and
        ``'rougeL'`` to lists with one float per epoch.
    """
    print("=" * 50)
    print("T5 FINE-TUNING FOR TEXT SUMMARIZATION")
    print("=" * 50)

    # Initialize tokenizer and model
    print(f"\nLoading {config.MODEL_NAME}...")
    tokenizer = T5Tokenizer.from_pretrained(config.MODEL_NAME, legacy=False)
    model = T5ForConditionalGeneration.from_pretrained(config.MODEL_NAME)
    model.to(device)

    print(f"✓ Model loaded: {sum(p.numel() for p in model.parameters()):,} parameters")

    # Load and prepare data
    train_dataset, val_dataset, val_articles, val_summaries = load_and_prepare_data(tokenizer)

    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        pin_memory=True
    )

    # shuffle=False keeps predictions aligned with val_summaries for ROUGE
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )

    # Setup optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=0.01)

    # The scheduler is stepped once per optimizer step, i.e. once per
    # accumulation window, not once per batch.
    total_steps = len(train_loader) * config.NUM_EPOCHS // config.GRADIENT_ACCUMULATION_STEPS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config.WARMUP_STEPS,
        num_training_steps=total_steps
    )

    print(f"✓ Total training steps: {total_steps}")

    # Mixed precision scaler (only meaningful on CUDA)
    scaler = torch.cuda.amp.GradScaler() if (config.USE_FP16 and torch.cuda.is_available()) else None

    # Training history
    history = {
        'train_loss': [],
        'val_loss': [],
        'rouge1': [],
        'rouge2': [],
        'rougeL': []
    }

    best_rougeL = 0
    best_saved = False  # becomes True once a best_model checkpoint exists

    # Training loop
    for epoch in range(config.NUM_EPOCHS):
        print(f"\n{'=' * 50}")
        print(f"Epoch {epoch + 1}/{config.NUM_EPOCHS}")
        print(f"{'=' * 50}")

        # Train
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, scaler)
        print(f"\nTrain Loss: {train_loss:.4f}")

        # Validate
        val_loss, predictions = validate(model, val_loader, tokenizer)
        print(f"Validation Loss: {val_loss:.4f}")

        # Compute ROUGE scores
        rouge_scores = compute_rouge_scores(predictions, val_summaries[:len(predictions)])
        print(f"\nROUGE Scores:")
        print(f"  ROUGE-1: {rouge_scores['rouge1']:.4f}")
        print(f"  ROUGE-2: {rouge_scores['rouge2']:.4f}")
        print(f"  ROUGE-L: {rouge_scores['rougeL']:.4f}")

        # Save history (cast to built-in floats so json.dump never meets a
        # NumPy scalar type it cannot serialize)
        history['train_loss'].append(float(train_loss))
        history['val_loss'].append(float(val_loss))
        history['rouge1'].append(float(rouge_scores['rouge1']))
        history['rouge2'].append(float(rouge_scores['rouge2']))
        history['rougeL'].append(float(rouge_scores['rougeL']))

        # Save best model. The first epoch always counts as "best so far";
        # with a plain `> 0` comparison a run scoring 0.0 would never write it.
        if not best_saved or rouge_scores['rougeL'] > best_rougeL:
            best_rougeL = rouge_scores['rougeL']
            best_saved = True
            print(f"\n✓ New best model! Saving checkpoint...")
            _save_checkpoint(model, tokenizer, f"{config.CHECKPOINT_DIR}/best_model")

        # Save checkpoint
        checkpoint_path = f"{config.CHECKPOINT_DIR}/epoch_{epoch + 1}"
        _save_checkpoint(model, tokenizer, checkpoint_path)
        print(f"✓ Checkpoint saved to {checkpoint_path}")

    # Save training history
    with open(f"{config.OUTPUT_DIR}/training_history.json", 'w') as f:
        json.dump(history, f, indent=2)

    print("\n" + "=" * 50)
    print("TRAINING COMPLETED!")
    print("=" * 50)
    print(f"Best ROUGE-L Score: {best_rougeL:.4f}")

    return model, tokenizer, history

# ============================================
# 6. EXAMPLE GENERATION & EVALUATION
# ============================================

def generate_examples(model, tokenizer, articles, references, num_examples=5):
    """Generate summaries for the first few articles, for manual inspection.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer used to encode the article and decode the output.
        articles: Indexable, sized collection of article texts.
        references: Indexable, sized collection of reference summaries,
            aligned with ``articles``.
        num_examples: Maximum number of examples to produce.

    Returns:
        List of dicts with keys ``'article'`` (first 500 characters followed
        by ``"..."``), ``'reference'`` and ``'generated'``. At most
        ``num_examples`` entries, and never more than the shorter of
        ``articles`` and ``references``.
    """
    model.eval()

    # Also bound by len(references) so a shorter reference list cannot raise IndexError
    count = min(num_examples, len(articles), len(references))

    examples = []

    for i in range(count):
        article_text = str(articles[i])

        inputs = tokenizer(
            "summarize: " + article_text,
            max_length=config.MAX_SOURCE_LENGTH,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            # Pass the attention mask explicitly, matching how validate() generates
            summary_ids = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=config.MAX_TARGET_LENGTH,
                num_beams=4,
                length_penalty=2.0,
                early_stopping=True
            )

        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        examples.append({
            'article': article_text[:500] + "...",  # Truncate for display
            'reference': str(references[i]),
            'generated': generated_summary
        })

    return examples

# ============================================
# 7. RUN TRAINING
# ============================================

if __name__ == "__main__":
    # Train the model
    model, tokenizer, history = train_model()

    # Load validation data for examples
    print("\nGenerating example summaries...")
    try:
        dataset = load_dataset("cnn_dailymail", "3.0.0")
        val_data = dataset['validation'].select(range(10))

        examples = generate_examples(
            model,
            tokenizer,
            val_data['article'],
            val_data['highlights'],
            num_examples=5
        )
    except Exception as exc:
        # Best-effort step: training already succeeded, so report the problem
        # and continue. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit propagate.
        print("Could not load validation data for examples")
        print(f"  Reason: {type(exc).__name__}: {exc}")
        examples = []

    # Display examples
    if examples:
        print("\n" + "=" * 50)
        print("EXAMPLE SUMMARIES")
        print("=" * 50)

        for i, ex in enumerate(examples, 1):
            print(f"\n--- Example {i} ---")
            print(f"\nArticle (truncated):\n{ex['article']}")
            print(f"\nReference Summary:\n{ex['reference']}")
            print(f"\nGenerated Summary:\n{ex['generated']}")
            print("\n" + "-" * 50)

    print("\n✓ All done! Model saved to:", config.CHECKPOINT_DIR)
    print(f"✓ Training history saved to: {config.OUTPUT_DIR}/training_history.json")

In [None]:

!pip uninstall transformers -y
!pip install transformers==4.36.0 torch==2.1.0 -q

In [None]:
import shutil

# Paths are spelled out here (not read from `config`), so this cell has no
# dependency on variables defined by earlier cells.
folder_path = "/kaggle/working/t5_summarization/checkpoints"
zip_path = "/kaggle/working/t5_checkpoints.zip"

# make_archive adds the ".zip" extension itself, so it receives the path without it
shutil.make_archive(
    base_name=zip_path.replace('.zip', ''),
    format='zip',
    root_dir=folder_path,
)

print(
    "✅ Zipped successfully! Now check the right panel under 'Output > Files' for:",
    zip_path,
    sep="\n",
)
