In [None]:
# ---
# Title: Adjacent Possible Exploration (APE) - Scaled-Down Experiment
# Description: This notebook implements the scaled-down experiment for the NeurIPS 2025 submission
#              "APE: A Data-Centric Benchmark for Efficient LLM Adaptation in Text Summarization".
#              It fine-tunes T5-base on 1,200 CNN/DailyMail articles over 15 iterations with 80-article batches,
#              evaluates qualitative results on 100 test articles, and saves outputs for reproducibility.
# Author: [Anonymous for Submission]
# Institution: [Anonymous for Submission]
# Date: April 10, 2025
# License: CC BY 4.0 (Attribution for non-commercial use; anonymized for review)
# Contact: [Anonymous email or submission portal reference]
# Requirements: Python 3.8+, transformers>=4.28, datasets>=2.14, torch>=2.0, nltk>=3.8,
#               rouge_score>=0.1, bert-score>=0.3, matplotlib>=3.7, seaborn>=0.12, pandas>=2.0,
#               numpy>=1.23
# Hardware: Google Colab T4 GPU (16 GB VRAM, 7.5 TFLOPS recommended)
# Expected Runtime: ~2-3 hours on T4 GPU
# Overview:
#   1. Environment Setup and Verification
#   2. Model and Dataset Loading
#   3. APE Perturbation and Fine-Tuning
#   4. Qualitative Analysis and Output Saving
# ---


In [None]:
# Import core libraries for environment checks
import datetime
import os
import pickle
import platform
import sys
import warnings
from typing import List, Dict

import pkg_resources
import torch
from google.colab import drive

In [None]:
# Minimum supported version of each dependency, keyed by distribution name.
# Entries are checked in the order they are listed here.
REQUIRED_PACKAGES = dict(
    transformers="4.28.0",
    datasets="2.14.0",
    torch="2.0.0",
    nltk="3.8.0",
    rouge_score="0.1.0",
    bert_score="0.3.0",
    matplotlib="3.7.0",
    seaborn="0.12.0",
    pandas="2.0.0",
    numpy="1.23.0",
)

def check_environment() -> bool:
    """Check that the runtime can execute this notebook.

    The checks run in a fixed order and the function stops at the first failure:

    1. Python interpreter is at least 3.8.
    2. Every entry of ``REQUIRED_PACKAGES`` is installed at or above its minimum version.
    3. GPU availability (a missing GPU only produces a warning).
    4. Google Drive can be mounted at ``/content/drive`` (forced remount).

    Returns:
        True when all mandatory checks pass, False otherwise. A progress line is
        printed for every check.
    """
    print("\n=== Environment Verification ===")

    # --- Interpreter ---
    python_version = sys.version_info
    if python_version < (3, 8):
        print(f"Error: Python {python_version.major}.{python_version.minor} detected. Requires 3.8+.")
        return False
    print(f"Python Version: {platform.python_version()} - OK")

    # --- Installed packages ---
    for pkg, min_version in REQUIRED_PACKAGES.items():
        try:
            installed_version = pkg_resources.get_distribution(pkg).version
        except pkg_resources.DistributionNotFound:
            print(f"Error: {pkg} is not installed. Install with `pip install {pkg}`.")
            return False

        installed = pkg_resources.parse_version(installed_version)
        required = pkg_resources.parse_version(min_version)
        if installed < required:
            print(f"Error: {pkg} version {installed_version} is below required {min_version}.")
            return False
        print(f"{pkg}: {installed_version} - OK")

    # --- GPU (advisory only: the run falls back to CPU) ---
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"GPU: {gpu_name} ({gpu_memory:.1f} GiB) - OK")
    else:
        print("Warning: No GPU detected. CPU will be used, but T4 GPU is recommended.")

    # --- Google Drive ---
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as e:
        print(f"Error: Google Drive mount failed - {str(e)}")
        return False
    print("Google Drive: Mounted successfully - OK")

    return True

# Run the environment check once; when check_environment() returns False,
# raise SystemExit with the message below instead of continuing.
if not check_environment():
    raise SystemExit("Environment check failed. Please resolve issues and rerun.")

# Install missing packages (optional; uncomment if needed)
# !pip install transformers datasets torch nltk rouge_score bert-score

In [None]:
# Install missing packages (optional; uncomment if needed)
# !pip install transformers datasets torch nltk rouge_score bert-score matplotlib seaborn pandas numpy

# Import remaining libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset


# Set random seeds for reproducibility across libraries
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)  # For multi-GPU setups

# Suppress FutureWarning for cleaner output
warnings.filterwarnings("ignore", category=FutureWarning)

# 1. Notebook configuration

In [None]:
# Log when this run started: the runtime's current local time from
# datetime.now(), formatted as YYYY-MM-DD HH:MM:SS
print(f"Notebook executed on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Select the compute device: CUDA when a GPU is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Output root on Google Drive. It is defined here so the usage note can show the
# real path; keep it identical to the BASE_DIR assignment in the storage-setup
# cell further down.
BASE_DIR = "/content/drive/MyDrive/NeurIPS2025_Results"

# Usage note. This has to be an f-string: as a plain string the placeholder was
# printed literally as "{BASE_DIR}" instead of the output path.
USAGE_NOTE = f"""
=== Usage Note ===
Run all cells sequentially. Expected runtime is ~2-3 hours on a T4 GPU.
Outputs (models, results) are saved to Google Drive at {BASE_DIR}.
Ensure sufficient storage (~5 GB) and a stable connection.
"""
print(USAGE_NOTE)

In [None]:
# Mount Google Drive at /content/drive for persistent storage.
# NOTE(review): check_environment() already mounts this path with
# force_remount=True, and `drive` is already imported in the first import cell,
# so this cell looks redundant in a top-to-bottom run - confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Fetch the NLTK "punkt" and "punkt_tab" resources
nltk.download('punkt')
nltk.download('punkt_tab')

# Root folder for all outputs on Google Drive; create the model and
# qualitative-analysis sub-folders if they do not exist yet
BASE_DIR = "/content/drive/MyDrive/NeurIPS2025_Results"
for subdir in ("models", "qualitative_analysis"):
    os.makedirs(
        os.path.join(BASE_DIR, subdir),
        exist_ok=True,
    )

# Helper that reports GPU memory usage
def print_gpu_memory() -> None:
    """Print the CUDA memory PyTorch has allocated and reserved, in GiB.

    Does nothing (and prints nothing) when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gib = 1024**3
    allocated = torch.cuda.memory_allocated() / bytes_per_gib
    reserved = torch.cuda.memory_reserved() / bytes_per_gib
    print(f"GPU Memory - Allocated: {allocated:.2f} GiB, Reserved: {reserved:.2f} GiB")

# 2. Load model

In [None]:
# Load the pretrained T5-base tokenizer and model (float32 weights) onto `device`
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", torch_dtype=torch.float32).to(device)

# Load CNN/DailyMail (config 3.0.0) and keep only the leading rows of each split.
# Rows are sliced first and the column picked second: `split[:n]` returns a dict
# of column lists for just those n rows, whereas `split['article'][:n]` can
# materialise the entire column before slicing.
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_subset = dataset['train'][:1200]  # 1,200 training samples
test_subset = dataset['test'][:300]  # pool of 300 test samples; 100 are sampled from it for evaluation
train_articles = train_subset['article']
train_summaries = train_subset['highlights']
test_articles = test_subset['article']
test_summaries = test_subset['highlights']

# 3. Functions

In [None]:
def perturb_model(
    model: T5ForConditionalGeneration,
    articles: List[str],
    summaries: List[str],
    learning_rate: float,
    epochs: int = 3,
    accum_steps: int = 4
) -> None:
    """Fine-tune ``model`` in place on one batch of article/summary pairs (one APE perturbation).

    Each pair is tokenized on its own with the module-level ``tokenizer``
    (articles truncated to 512 tokens, summaries to 128) and moved to ``device``.
    The forward pass runs under CUDA autocast and gradients are scaled with a
    ``GradScaler``. Gradients are accumulated over ``accum_steps`` consecutive
    pairs; then the gradient norm is clipped to 1.0 and the optimizer takes one
    step. A fresh AdamW optimizer is created on every call. GPU memory usage is
    printed before and after training.

    Args:
        model: Model to update; it is left in training mode.
        articles: Source documents.
        summaries: Reference summaries, paired with ``articles`` by position.
        learning_rate: AdamW learning rate.
        epochs: Number of passes over the batch.
        accum_steps: Number of examples whose gradients are accumulated before
            each optimizer step.

    Raises:
        ValueError: If ``accum_steps`` is smaller than 1.
    """
    if accum_steps < 1:
        raise ValueError(f"accum_steps must be >= 1, got {accum_steps}")

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scaler = torch.cuda.amp.GradScaler()
    model.train()

    # zip() stops at the shorter list, so the final-flush check below has to use
    # the number of pairs that are actually iterated.
    num_pairs = min(len(articles), len(summaries))

    print("Before fine-tuning:")
    print_gpu_memory()

    for epoch in range(epochs):
        raw_loss = 0
        # Gradients are cleared once per accumulation window (here and right after
        # every optimizer step), NOT once per example. Clearing them at the top of
        # each iteration discarded everything accumulated so far, so only the last
        # example of each window contributed to the update.
        optimizer.zero_grad(set_to_none=True)
        for i, (article, summary) in enumerate(zip(articles, summaries)):
            inputs = tokenizer(
                article, return_tensors="pt", max_length=512,
                truncation=True, padding=True
            ).to(device)
            targets = tokenizer(
                summary, return_tensors="pt", max_length=128,
                truncation=True, padding=True
            ).to(device)

            with torch.amp.autocast('cuda'):
                outputs = model(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    labels=targets.input_ids
                )
                # Divide so the summed gradients of a full window correspond to
                # the mean loss of that window.
                loss = outputs.loss / accum_steps

            scaler.scale(loss).backward()
            raw_loss += loss.item()

            if (i + 1) % accum_steps == 0 or (i + 1) == num_pairs:
                # Unscale first so clipping sees the true gradient magnitudes.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                print(f"Epoch {epoch+1}, Step {i+1}: Loss = {raw_loss:.4f}, Grad Norm = {grad_norm:.4f}")
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
                raw_loss = 0

    print("After fine-tuning:")
    print_gpu_memory()

def generate_summary(model: T5ForConditionalGeneration, article: str) -> str:
    """Summarize one article with ``model`` and return the decoded text.

    The article is tokenized with the module-level ``tokenizer`` (truncated to
    512 tokens) and moved to ``device``. Generation runs with gradients disabled
    and under CUDA autocast, with ``max_length=128``. The model is switched to
    evaluation mode as a side effect.

    Args:
        model: Model used for generation.
        article: Source document text.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    model.eval()

    encoded = tokenizer(
        article,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    ).to(device)

    with torch.no_grad(), torch.amp.autocast('cuda'):
        generated = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_length=128,
        )

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


# 4. Generate summaries

In [None]:
# Save the baseline model (initial T5-base) and its tokenizer to Google Drive.
# BASE_DIR is the output root defined earlier in the notebook; the name used here
# before, GDRIVE_BASE_DIR, is not defined in this notebook and raised NameError.
model.save_pretrained(os.path.join(BASE_DIR, "models/baseline_model"))
tokenizer.save_pretrained(os.path.join(BASE_DIR, "models/baseline_model"))

In [None]:
# Draw 100 distinct test articles (without replacement, via NumPy's global RNG)
# for the qualitative comparison, together with their reference summaries
subset_size = 100
subset_indices = np.random.choice(
    len(test_articles),
    size=subset_size,
    replace=False,
)
subset_articles = [test_articles[idx] for idx in subset_indices]
subset_references = [test_summaries[idx] for idx in subset_indices]

# Summaries from the model before any fine-tuning
baseline_summaries = [generate_summary(model, text) for text in subset_articles]
print(f"Generated {len(baseline_summaries)} baseline summaries.")

# APE perturbation schedule: cut the training data into consecutive batches of
# 80 article/summary pairs, every batch using the same learning rate
fixed_lr = 3e-6
batch_size = 80
perturbations = [
    (
        train_articles[start:start + batch_size],
        train_summaries[start:start + batch_size],
        fixed_lr,
    )
    for start in range(0, len(train_articles), batch_size)
]

# Apply the perturbations one after another to the same model (iterations are numbered from 1)
for i, (articles, summaries, lr) in enumerate(perturbations, start=1):
    print(f"Iteration {i}: Perturbing with lr={lr}, {len(articles)} articles")
    perturb_model(model, articles, summaries, lr)

# Persist the fine-tuned model and tokenizer
model.save_pretrained(os.path.join(BASE_DIR, "models/final_model"))
tokenizer.save_pretrained(os.path.join(BASE_DIR, "models/final_model"))
print("Final model saved.")

# Summaries from the fine-tuned model for the same articles as the baseline
final_summaries = [generate_summary(model, text) for text in subset_articles]
print(f"Generated {len(final_summaries)} final summaries.")

# One record per sampled article: the source text, its reference summary, and the
# summaries generated before and after fine-tuning
qualitative_results = [
    dict(
        article=source_text,
        reference_summary=reference,
        baseline_summary=before,
        final_summary=after,
    )
    for source_text, reference, before, after in zip(
        subset_articles, subset_references, baseline_summaries, final_summaries
    )
]

# Save qualitative results as a pickle file.
# `pickle` is imported with the other standard-library modules in the first
# import cell; without that import this line raised NameError after training.
with open(os.path.join(BASE_DIR, "qualitative_analysis/results.pkl"), "wb") as f:
    pickle.dump(qualitative_results, f)
print("Qualitative results saved.")

# Display example summaries for paper (first three records)
print("\nQualitative Analysis Examples:")
for i, result in enumerate(qualitative_results[:3], 1):
    print(f"\nExample {i}:")
    print(f"Article (excerpt): {result['article'][:200]}...")
    print(f"Reference Summary: {result['reference_summary']}")
    print(f"Baseline Summary: {result['baseline_summary']}")
    print(f"Final Summary: {result['final_summary']}")

# Save summaries for human evaluation.
# The encoding is explicit so non-ASCII characters in articles are written the
# same way regardless of the runtime's default locale.
file_path = os.path.join(BASE_DIR, "summaries_for_evaluation.txt")
with open(file_path, 'w', encoding='utf-8') as f:
    f.write("Human Evaluation Texts for Summarization Experiment\n")
    f.write("================================================\n\n")
    f.write("Instructions: Rate Baseline and Final summaries (1-5) for Informativeness, Fluency, Factual Accuracy.\n\n")
    for idx, result in enumerate(qualitative_results, 1):
        # Title = text after the "(CNN)" marker up to the first period, capped at
        # 50 characters. Articles that do not contain the marker fall back to
        # their own opening text instead of raising IndexError.
        segments = result['article'].split('(CNN)')
        lead = segments[1] if len(segments) > 1 else segments[0]
        title = lead.split('.')[0].strip()[:50]
        f.write(f"Example {idx}: {title}\n")
        f.write(f"Article (excerpt): {result['article'][:200]}...\n")
        f.write(f"Reference Summary: {result['reference_summary']}\n")
        f.write(f"Baseline Summary: {result['baseline_summary']}\n")
        f.write(f"Final Summary: {result['final_summary']}\n\n")
print(f"Summaries for evaluation saved to {file_path}")


In [None]:
# Flush outstanding writes to Google Drive and unmount it, then report where
# the results were written
drive.flush_and_unmount()
print(f"All results saved to {BASE_DIR}")
