In [None]:
# Import core libraries for environment checks
import sys
import pkg_resources
import platform
import datetime
import os
from typing import List, Dict
import warnings
import torch
from google.colab import drive

# Hide FutureWarning messages so the cell outputs stay readable
warnings.filterwarnings("ignore", category=FutureWarning)

# Record when this run was started (datetime.now() without a tz, i.e. local time)
print(f"Notebook executed on: {datetime.datetime.now():%Y-%m-%d %H:%M:%S}")

In [None]:
# Required packages mapped to their minimum acceptable version.
# check_environment() looks each key up with pkg_resources.get_distribution()
# and compares the installed version against the value with parse_version().
REQUIRED_PACKAGES: Dict[str, str] = {
    "transformers": "4.28.0",
    "datasets": "2.14.0",
    "torch": "2.0.0",
    "nltk": "3.8.0",
    "rouge_score": "0.1.0",
    "bert_score": "0.3.0",
    "matplotlib": "3.7.0",
    "seaborn": "0.12.0",
    "pandas": "2.0.0",
    "numpy": "1.23.0"
}

def check_environment() -> bool:
    """Run the notebook's pre-flight checks and print one line per result.

    Checks are performed in this order:

    1. The interpreter is Python 3.8 or newer.
    2. Every entry of ``REQUIRED_PACKAGES`` is installed at or above its
       minimum version.
    3. CUDA availability (a missing GPU only prints a warning).
    4. Google Drive can be (re)mounted at ``/content/drive``.

    Returns:
        ``True`` when every blocking check passes. ``False`` as soon as the
        first blocking check fails; the remaining checks are then skipped.
    """
    print("\n=== Environment Verification ===")

    # --- Interpreter -------------------------------------------------------
    interpreter = sys.version_info
    if interpreter < (3, 8):
        print(f"Error: Python {interpreter.major}.{interpreter.minor} detected. Requires 3.8+.")
        return False
    print(f"Python Version: {platform.python_version()} - OK")

    # --- Packages ----------------------------------------------------------
    for name, minimum in REQUIRED_PACKAGES.items():
        try:
            found = pkg_resources.get_distribution(name).version
        except pkg_resources.DistributionNotFound:
            print(f"Error: {name} is not installed. Install with `pip install {name}`.")
            return False

        too_old = pkg_resources.parse_version(found) < pkg_resources.parse_version(minimum)
        if too_old:
            print(f"Error: {name} version {found} is below required {minimum}.")
            return False
        print(f"{name}: {found} - OK")

    # --- GPU (informational only, never fails the check) -------------------
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
        print(f"GPU: {gpu_name} ({total_gib:.1f} GiB) - OK")
    else:
        print("Warning: No GPU detected. CPU will be used, but T4 GPU is recommended.")

    # --- Google Drive ------------------------------------------------------
    # Note: this really mounts the drive (forcing a remount), it is not a dry run.
    try:
        drive.mount('/content/drive', force_remount=True)
    except Exception as exc:
        print(f"Error: Google Drive mount failed - {exc!s}")
        return False
    print("Google Drive: Mounted successfully - OK")

    return True

# Run the environment check; stop this cell with SystemExit if any blocking
# check (Python version, package versions, Drive mount) reported a failure.
if not check_environment():
    raise SystemExit("Environment check failed. Please resolve issues and rerun.")


In [None]:
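# Optional: install the dependencies if they are not pre-installed.
# The environment check above stops at the first missing or outdated package,
# so uncomment and run this line first in that case, then re-run the check.
# !pip install transformers datasets torch nltk rouge_score bert-score matplotlib seaborn pandas numpy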

In [None]:
import warnings
import os
import pickle
from typing import List, Tuple, Dict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from nltk.translate import bleu_score
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import nltk

# Set random seeds for reproducibility across libraries.
# Only NumPy and PyTorch (CPU and CUDA) are seeded here; Python's built-in
# `random` module is not touched by this cell.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)  # For multi-GPU setups

# Suppress FutureWarning for cleaner output (same filter as in the first cell)
warnings.filterwarnings("ignore", category=FutureWarning)

# 1. Notebook configuration

In [None]:
# Print the current local time again; relies on `datetime` imported in the first cell
print(f"Notebook executed on: {datetime.datetime.now():%Y-%m-%d %H:%M:%S}")

In [None]:
# Set device: use the GPU when CUDA is available, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Output location on Google Drive. It is defined here so the usage note can
# show the real path; later cells assign the same value again before using it.
BASE_DIR = "/content/drive/MyDrive/NeurIPS2025_Results"

# Usage note (f-string: previously the text "{BASE_DIR}" was printed verbatim
# because the string was not formatted and BASE_DIR did not exist yet)
print(f"""
=== Usage Note ===
Run all cells sequentially. Expected runtime is ~2-3 hours on a T4 GPU.
Outputs (models, results) are saved to Google Drive at {BASE_DIR}.
Ensure sufficient storage (~5 GB) and a stable connection.
""")

In [None]:
# Mount Google Drive for persistent storage.
# check_environment() has already mounted the same path with force_remount=True;
# this call mounts it again without forcing a remount.
from google.colab import drive
drive.mount('/content/drive')

# Define base directory in Google Drive and create the output sub-folders.
# "plots" is the folder plot_metric() saves its figures into.
BASE_DIR = "/content/drive/MyDrive/NeurIPS2025_Results"
for subdir in ["models", "plots"]:
    os.makedirs(os.path.join(BASE_DIR, subdir), exist_ok=True)

In [None]:
# Download the NLTK 'punkt' and 'punkt_tab' resources
nltk.download('punkt')
nltk.download('punkt_tab')

# Re-assign the base directory (same value as in the previous cell) and make
# sure the "models" and "qualitative_analysis" sub-folders exist.
# Note: "plots" is NOT created here; it is created by the previous cell.
BASE_DIR = "/content/drive/MyDrive/NeurIPS2025_Results"
for subdir in ["models", "qualitative_analysis"]:
    os.makedirs(os.path.join(BASE_DIR, subdir), exist_ok=True)

# Helper for reporting CUDA memory usage
def print_gpu_memory() -> None:
    """Print PyTorch's allocated and reserved CUDA memory in GiB.

    Does nothing (prints nothing) when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return

    bytes_per_gib = 1024**3
    allocated_gib = torch.cuda.memory_allocated() / bytes_per_gib
    reserved_gib = torch.cuda.memory_reserved() / bytes_per_gib
    print(f"GPU Memory - Allocated: {allocated_gib:.2f} GiB, Reserved: {reserved_gib:.2f} GiB")

In [None]:
# Set up plotting style.
# Order matters: the matplotlib style sheet and the seaborn context are applied
# first, then the explicit rcParams below are written last, so any key listed
# here ends up with the value given here.
plt.style.use('seaborn-v0_8-paper')
sns.set_context("paper")
plt.rcParams.update({
    # Fonts
    'font.family': 'serif',
    'font.serif': ['DejaVu Serif'],
    'font.size': 12,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    # Figure size / resolution (on screen vs. saved file)
    'figure.figsize': (7, 5),
    'figure.dpi': 150,
    'savefig.dpi': 300,
    # Saved figures default to tightly-cropped PDF
    'savefig.format': 'pdf',
    'savefig.bbox': 'tight',
    # Dashed, semi-transparent grid on every axes
    'axes.grid': True,
    'grid.linestyle': '--',
    'grid.alpha': 0.7
})

# 2. Model and tokenizer

In [None]:
# Load T5-base with FP32 weights and move it to the selected device.
# (Mixed precision is applied later, per forward pass, via torch.amp.autocast.)
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained(
    "t5-base",
    torch_dtype=torch.float32
    ).to(device)

# Load dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Slice the rows first and select the column second: `split[:n]` returns a
# dict of column -> list for just those n rows, whereas `split['article'][:n]`
# asks for the whole column of the split before slicing it.
# The training slices are named train_* because that is what the experiment
# cell reads (it previously failed with NameError: train_articles).
train_articles = dataset['train'][:4000]['article']
train_summaries = dataset['train'][:4000]['highlights']
test_articles = dataset['test'][:1000]['article']
test_summaries = dataset['test'][:1000]['highlights']

# Original short names, kept as aliases of the training slices for compatibility.
articles = train_articles
summaries = train_summaries


# 3. Model functions

In [None]:
def perturb_model(
    model: T5ForConditionalGeneration,
    articles: List[str],
    summaries: List[str],
    learning_rate: float,
    epochs: int = 3,
    accum_steps: int = 4
) -> None:
    """Fine-tune ``model`` in place on (article, summary) pairs.

    This is the step the notebook calls an "APE perturbation". Examples are
    processed one at a time; gradients are accumulated over ``accum_steps``
    examples, clipped to a global norm of 1.0, and then applied with a fresh
    AdamW optimizer created for this call.

    Relies on the notebook-level ``tokenizer``, ``device`` and
    ``print_gpu_memory`` being defined.

    Args:
        model: Model to update; it is switched to training mode and left there.
        articles: Source texts (truncated to 512 tokens).
        summaries: Target texts, paired positionally with ``articles``
            (truncated to 128 tokens).
        learning_rate: AdamW learning rate.
        epochs: Number of passes over the given pairs.
        accum_steps: Number of examples whose gradients are accumulated before
            each optimizer step.

    Returns:
        None. Progress (mean loss per accumulation window and the gradient
        norm before clipping) is printed at every optimizer step.
    """
    # Mixed precision and loss scaling only make sense on CUDA; disabling them
    # explicitly keeps the CPU fallback quiet and behaving as plain FP32.
    use_amp = torch.cuda.is_available()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    model.train()

    # zip() stops at the shorter list, so that is the real number of examples.
    n_examples = min(len(articles), len(summaries))

    print("Before fine-tuning:")
    print_gpu_memory()

    for epoch in range(epochs):
        window_loss = 0.0
        # Clear gradients once per epoch, NOT once per example: clearing them
        # inside the loop would discard everything accumulated so far and
        # leave only the last example of each window in the update.
        optimizer.zero_grad(set_to_none=True)

        for i, (article, summary) in enumerate(zip(articles, summaries)):
            # The last window of an epoch may hold fewer than accum_steps
            # examples; normalise by its real size so the update is a mean.
            window_start = (i // accum_steps) * accum_steps
            window_size = min(accum_steps, n_examples - window_start)

            inputs = tokenizer(
                article, return_tensors="pt", max_length=512,
                truncation=True, padding=True
            ).to(device)
            targets = tokenizer(
                summary, return_tensors="pt", max_length=128,
                truncation=True, padding=True
            ).to(device)

            with torch.amp.autocast('cuda', enabled=use_amp):
                outputs = model(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    labels=targets.input_ids
                )
                loss = outputs.loss / window_size

            scaler.scale(loss).backward()
            window_loss += loss.item()

            if (i + 1) % accum_steps == 0 or (i + 1) == n_examples:
                # Unscale before clipping so the threshold applies to the true gradients.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                print(f"Epoch {epoch+1}, Step {i+1}: Loss = {window_loss:.4f}, Grad Norm = {grad_norm:.4f}")
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
                window_loss = 0.0

    print("After fine-tuning:")
    print_gpu_memory()


def generate_summaries(
    model: T5ForConditionalGeneration,
    articles: List[str]
    ) -> List[str]:
    """Summarise every article, one at a time, with ``model.generate``.

    The model is switched to eval mode. Each article is truncated to 512
    tokens and generation is capped at ``max_length=128``. Uses the
    notebook-level ``tokenizer`` and ``device``.

    Args:
        model: Model used for generation.
        articles: Source texts.

    Returns:
        Decoded summaries (special tokens removed), in the order of ``articles``.
    """
    model.eval()

    def _summarise(text: str) -> str:
        # Encode a single article and move the tensors next to the model
        encoded = tokenizer(
            text, return_tensors="pt", max_length=512,
            truncation=True, padding=True
        ).to(device)
        with torch.no_grad(), torch.amp.autocast('cuda'):
            generated = model.generate(
                input_ids=encoded.input_ids,
                attention_mask=encoded.attention_mask,
                max_length=128
            )
        # Only one sequence was passed in, so only the first output is decoded
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarise(text) for text in articles]


def compute_metrics(
    generated_summaries: List[str],
    reference_summaries: List[str]
) -> Tuple[Dict[str, float], Dict[str, float]]:
    """Score generated summaries with BLEU, ROUGE-1 F, BERTScore F1 and perplexity.

    BLEU, ROUGE-1 and BERTScore compare each generated summary with the
    reference at the same position. "Perplexity" is ``exp(loss)`` of the
    notebook-level ``model`` when the *generated* summary is used as the label
    for the article at the same position in the notebook-level
    ``test_articles``. The function therefore depends on the globals
    ``model``, ``tokenizer``, ``device`` and ``test_articles`` in addition to
    its arguments, and it switches ``model`` to eval mode.

    Args:
        generated_summaries: Model outputs.
        reference_summaries: Gold summaries, paired positionally.

    Returns:
        ``(means, stds)``: two dicts keyed by ``"bleu"``, ``"rouge1"``,
        ``"bertscore"`` and ``"perplexity"`` holding the per-example mean and
        (population) standard deviation of each metric.
    """
    # --- BLEU on whitespace tokens ---
    bleu_values = []
    for reference, candidate in zip(reference_summaries, generated_summaries):
        bleu_values.append(
            bleu_score.sentence_bleu([reference.split()], candidate.split())
        )

    # --- ROUGE-1 F-measure (with stemming) ---
    rouge = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    rouge1_values = []
    for reference, candidate in zip(reference_summaries, generated_summaries):
        rouge1_values.append(rouge.score(reference, candidate)['rouge1'].fmeasure)

    # --- BERTScore: keep only the third returned tensor (F1) ---
    _precision, _recall, f1 = bert_score(generated_summaries, reference_summaries, lang="en")
    bertscore_values = f1.tolist()

    # --- Perplexity of the generated text under the current global model ---
    model.eval()
    perplexity_values = []
    for source_text, candidate in zip(test_articles, generated_summaries):
        encoded_source = tokenizer(
            source_text, return_tensors="pt", max_length=512,
            truncation=True, padding=True
        ).to(device)
        encoded_candidate = tokenizer(
            candidate, return_tensors="pt", max_length=128,
            truncation=True, padding=True
        ).to(device)
        with torch.no_grad(), torch.amp.autocast('cuda'):
            result = model(
                input_ids=encoded_source.input_ids,
                attention_mask=encoded_source.attention_mask,
                labels=encoded_candidate.input_ids
            )
        perplexity_values.append(torch.exp(result.loss).item())

    # --- Aggregate ---
    per_example = {
        "bleu": bleu_values,
        "rouge1": rouge1_values,
        "bertscore": bertscore_values,
        "perplexity": perplexity_values,
    }
    means = {metric: np.mean(values) for metric, values in per_example.items()}
    stds = {metric: np.std(values) for metric, values in per_example.items()}
    return means, stds

def plot_metric(
    iterations: List[int],
    means: List[float],
    stds: List[float],
    title: str,
    ylabel: str,
    color: str,
    filename: str
) -> plt.Figure:
    """Plot one metric across iterations and save it under ``BASE_DIR/plots``.

    The figure shows the mean with error bars and a shaded +/- one-std band,
    a horizontal baseline at ``means[0]``, and a dashed polynomial trend line.

    Args:
        iterations: X positions (iteration numbers); element 0 is treated as
            the baseline.
        means: Metric mean per iteration.
        stds: Metric standard deviation per iteration.
        title: Figure title.
        ylabel: Y-axis label.
        color: Matplotlib colour for the line and band.
        filename: File name of the saved figure inside ``BASE_DIR/plots``.

    Returns:
        The figure, already saved and closed.

    Raises:
        ValueError: If the three series are empty or differ in length.
    """
    # Fail early with a readable message instead of a shape error from inside
    # matplotlib (or an IndexError on means[0]) after a figure was created.
    n_points = len(iterations)
    if n_points == 0 or not (n_points == len(means) == len(stds)):
        raise ValueError(
            "iterations, means and stds must be non-empty and equally long "
            f"(got {len(iterations)}, {len(means)}, {len(stds)})"
        )

    lower = [m - s for m, s in zip(means, stds)]
    upper = [m + s for m, s in zip(means, stds)]

    fig, ax = plt.subplots()
    ax.errorbar(
        iterations, means, yerr=stds, fmt='-o', color=color, ecolor='gray',
        capsize=4, capthick=1, linewidth=2, markersize=6, label='APE Iterations'
    )
    ax.axhline(y=means[0], color='red', linestyle='--', linewidth=2, label='Baseline')
    ax.fill_between(iterations, lower, upper, alpha=0.2, color=color)

    # Trend line: cubic when there are enough points; with fewer than four
    # points the degree is lowered so the fit stays well-determined, and a
    # single point gets no trend line at all.
    degree = min(3, n_points - 1)
    if degree >= 1:
        trend = np.poly1d(np.polyfit(iterations, means, degree))
        x_trend = np.linspace(min(iterations), max(iterations), 100)
        ax.plot(x_trend, trend(x_trend), '--', color='darkgray', alpha=0.5)

    ax.set_xlabel('Iteration', fontweight='bold')
    ax.set_ylabel(ylabel, fontweight='bold')
    ax.set_title(title, fontweight='bold')
    ax.set_xticks(iterations)
    ax.legend(loc='best')

    # Operate on this figure explicitly rather than on pyplot's "current" one.
    fig.tight_layout()
    plots_dir = os.path.join(BASE_DIR, "plots")
    os.makedirs(plots_dir, exist_ok=True)  # do not depend on which setup cell ran
    fig.savefig(os.path.join(plots_dir, filename))
    plt.close(fig)
    return fig


# 4. Run experiment

In [None]:
# NOTE: this cell mutates `model` in place. Re-running it without reloading the
# model would save and score an already-perturbed model as the "baseline".

# Save baseline model
model.save_pretrained(os.path.join(BASE_DIR, "models/baseline_model"))
tokenizer.save_pretrained(os.path.join(BASE_DIR, "models/baseline_model"))
print("Baseline model saved.")

# Generate baseline summaries and compute metrics
baseline_summaries = generate_summaries(model, test_articles)
baseline_means, baseline_stds = compute_metrics(baseline_summaries, test_summaries)
print("Baseline metrics computed.")

# Run APE perturbations: split the training pool (`articles` / `summaries`,
# loaded in the dataset cell) into exactly N_ITERATIONS contiguous batches.
# The batch size is the ceiling of pool size / N_ITERATIONS (236 for 4,000
# articles). A fixed size of 235 produced an 18th batch of 5 leftover articles.
N_ITERATIONS = 17
fixed_lr = 3e-6
batch_size = max(1, -(-len(articles) // N_ITERATIONS))  # ceil division
perturbations = [
    (articles[start:start + batch_size], summaries[start:start + batch_size], fixed_lr)
    for start in range(0, len(articles), batch_size)
]

# Store metrics across iterations (index 0 = baseline)
METRIC_NAMES = ("bleu", "rouge1", "bertscore", "perplexity")
metrics_history = {}
for metric_name in METRIC_NAMES:
    metrics_history[f"{metric_name}_means"] = [baseline_means[metric_name]]
    metrics_history[f"{metric_name}_stds"] = [baseline_stds[metric_name]]

# The loop variables deliberately do NOT reuse the names `articles` /
# `summaries`, so the training pool is still intact after the loop.
for iteration, (batch_articles, batch_summaries, lr) in enumerate(perturbations, 1):
    print(f"Iteration {iteration}: Perturbing with lr={lr}, {len(batch_articles)} articles")
    perturb_model(model, batch_articles, batch_summaries, lr)
    generated_summaries = generate_summaries(model, test_articles)
    means, stds = compute_metrics(generated_summaries, test_summaries)
    for metric_name in METRIC_NAMES:
        metrics_history[f"{metric_name}_means"].append(means[metric_name])
        metrics_history[f"{metric_name}_stds"].append(stds[metric_name])

# Save final model
model.save_pretrained(os.path.join(BASE_DIR, "models/final_model"))
tokenizer.save_pretrained(os.path.join(BASE_DIR, "models/final_model"))
print("Final model saved.")


# 5. Results visualization

In [None]:
# Plot results.
# The x-axis is derived from the recorded history (entry 0 = baseline, then one
# entry per perturbation) instead of a hard-coded count, so it always has the
# same length as the series being plotted.
iterations = list(range(len(metrics_history["bleu_means"])))

# (history key prefix, title, y-axis label, colour, output file name)
plot_specs = [
    ("bleu", "BLEU Score Across APE Iterations (T5-base, Fixed LR)",
     "BLEU Score", "blue", "bleu_plot_t5base_fixed_lr.pdf"),
    ("rouge1", "ROUGE-1 Score Across APE Iterations (T5-base, Fixed LR)",
     "ROUGE-1 Score", "darkorange", "rouge1_plot_t5base_fixed_lr.pdf"),
    ("bertscore", "BERTScore Across APE Iterations (T5-base, Fixed LR)",
     "BERTScore F1", "purple", "bertscore_plot_t5base_fixed_lr.pdf"),
    ("perplexity", "Perplexity Across APE Iterations (T5-base, Fixed LR)",
     "Perplexity", "green", "perplexity_plot_t5base_fixed_lr.pdf"),
]
for key, plot_title, plot_ylabel, plot_color, plot_filename in plot_specs:
    plot_metric(
        iterations, metrics_history[f"{key}_means"], metrics_history[f"{key}_stds"],
        plot_title, plot_ylabel, plot_color, plot_filename
    )
print("Plots saved to Google Drive.")

# Save metrics history (a plain dict of lists) next to the plots and models
with open(os.path.join(BASE_DIR, "results_summary.pkl"), "wb") as f:
    pickle.dump(metrics_history, f)
print("Metrics history saved.")


In [None]:
# Flush pending writes and unmount Google Drive.
# This is the last step: cells that read or write under BASE_DIR need the
# drive mounted again before they can be re-run.
drive.flush_and_unmount()
print(f"All results saved to {BASE_DIR}")
