In [1]:
! pip install datasets
! pip install transformers
! pip install rouge_score
! pip install sacrebleu



In [2]:
! pip install rouge_score --upgrade



In [3]:
! pip install accelerate -U




In [None]:
import os
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, TrainingArguments, Trainer
from datasets import load_dataset, DatasetDict
from rouge_score import rouge_scorer
from sacrebleu import corpus_bleu

# Hub checkpoint to start from, and the location of any fine-tuned copy
model_name = "google/pegasus-xsum"
model_save_path = "/content/drive/MyDrive/trained_model"

# Base tokenizer capped at 128 tokens; replaced below when a saved one exists
tokenizer = PegasusTokenizer.from_pretrained(model_name, model_max_length=128)

if not os.path.exists(model_save_path):
    # Nothing saved yet: start from the pretrained checkpoint
    model = PegasusForConditionalGeneration.from_pretrained(model_name)
else:
    # A saved model is present: reuse it together with its own tokenizer
    print("Loading existing model...")
    model = PegasusForConditionalGeneration.from_pretrained(model_save_path)
    tokenizer = PegasusTokenizer.from_pretrained(model_save_path)

def preprocess_function(examples):
    """Tokenize documents and summaries into model inputs with labels.

    Args:
        examples: Mapping providing "document" and "summary" entries.

    Returns:
        The tokenizer output for the documents (truncated/padded to 128
        tokens) with an added "labels" entry holding the summary token ids
        (truncated/padded to 32 tokens).
    """
    shared_options = {"truncation": True, "padding": "max_length"}
    model_inputs = tokenizer(examples["document"], max_length=128, **shared_options)
    summary_encoding = tokenizer(examples["summary"], max_length=32, **shared_options)
    # NOTE(review): padded label positions keep the tokenizer's pad id, so they
    # presumably count towards the training loss -- confirm whether they
    # should be masked out instead.
    model_inputs["labels"] = summary_encoding["input_ids"]
    return model_inputs

def save_datasets(dataset, save_dir="./data", limit=None):
    """
    Saves preprocessed train and validation datasets to a directory,
    with optional limiting of dataset size.

    Each split is optionally truncated to its first `limit` records, run
    through `preprocess_function` in batched mode, and the two results are
    written to `save_dir` as a single `DatasetDict`.

    Args:
        dataset: The loaded XSum dataset from `load_dataset`; must provide
            "train" and "validation" splits.
        save_dir (str, optional): The directory to save the datasets. Defaults to "./data".
        limit (int, optional): The maximum number of records to save for each
            split (train and validation). A limit larger than a split keeps
            the whole split. Defaults to None (no limit).
    """
    processed = {}
    for split_name in ("train", "validation"):
        records = dataset[split_name]
        if limit is not None:
            # Clamp to the split size: selecting indices past the end raises,
            # whereas `limit` is documented as a maximum.
            records = records.select(range(min(limit, len(records))))
        processed[split_name] = records.map(preprocess_function, batched=True)

    saved_dataset = DatasetDict(processed)
    saved_dataset.save_to_disk(save_dir)
    print(f"Datasets saved to directory: {save_dir}")

# Fetch the XSum summarization corpus
dataset = load_dataset("xsum")

# Cap on the number of records kept per split (kept small for this run)
limit = 80

# Tokenize the capped splits and persist them to the default "./data" directory
save_datasets(dataset, limit=limit)

# Read the persisted splits back so the following steps use the on-disk copy
saved_dataset = DatasetDict.load_from_disk("./data")
print("Saved datasets loaded!")
train_dataset, eval_dataset = saved_dataset["train"], saved_dataset["validation"]

# Pick the compute device: CUDA when available, otherwise fall back to CPU
if torch.cuda.is_available():
    print("GPU detected!")
    device = "cuda"
else:
    print("GPU not detected, using CPU.")
    device = "cpu"

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./output",  # Output directory for checkpoints
    per_device_train_batch_size=1,  # Minimal batch size
    per_device_eval_batch_size=1,
    save_steps=80,  # Checkpoint interval, in optimizer steps
    eval_steps=50,  # Evaluation interval, in optimizer steps
    num_train_epochs=1,  # Adjust number of epochs for fine-tuning
    learning_rate=1e-5,  # Adjust learning rate based on your dataset (potentially lower due to smaller batches)
    logging_dir='./logs',  # Directory for storing logs
    evaluation_strategy="steps",
    # Mixed precision needs a GPU: requesting fp16 on a CPU-only machine makes
    # TrainingArguments raise, so enable it only when CUDA was detected above.
    fp16=(device == "cuda"),
    gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps
)

# Fine-tune only when no saved model exists at model_save_path; when the path
# exists, training is skipped entirely.
if not os.path.exists(model_save_path):
    # Fine-tuning
    # Move model to the chosen device
    model = model.to(device)

    # Create the Trainer with the training and validation splits
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer to model_save_path; its existence is what
    # makes the check above skip training on a later run
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    print(f"Model and tokenizer saved to {model_save_path}")

def compute_metrics(pred):
    """Calculates ROUGE and SacreBLEU scores.

    Predicted token ids are taken as the argmax over the last axis of the
    predictions, decoded with the module-level `tokenizer`, and compared
    against the decoded labels.

    Args:
        pred: Prediction output exposing `predictions` (an array of per-token
            scores, or a tuple whose first element is that array) and
            `label_ids` (an array of reference token ids).

    Returns:
        dict: {"rouge": {"rouge1", "rouge2", "rougeL"} mean F-measures,
        "sacrebleu": corpus-level BLEU score}. All values are 0.0 when there
        are no examples to score.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']

    scores = pred.predictions
    # Seq2seq models emit several outputs, so the predictions can arrive as a
    # tuple; the token scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    pred_ids = scores.argmax(-1)

    # -100 marks label positions ignored by the loss and cannot be decoded;
    # swap it for the pad id on a copy so the caller's array is untouched.
    labels_ids = pred.label_ids.copy()
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    if not pred_str:
        # Nothing to score: avoid dividing by zero below.
        return {"rouge": dict.fromkeys(rouge_types, 0.0), "sacrebleu": 0.0}

    rouge = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    # RougeScorer.score takes the reference first, then the prediction.
    rouge_scores = [rouge.score(l, p) for l, p in zip(labels_str, pred_str)]

    # corpus_bleu expects a list of reference streams, hence the extra list.
    bleu = corpus_bleu(pred_str, [labels_str])

    count = len(rouge_scores)
    avg_rouge = {
        rouge_type: sum(score[rouge_type].fmeasure for score in rouge_scores) / count
        for rouge_type in rouge_types
    }

    return {"rouge": avg_rouge, "sacrebleu": bleu.score}

# Evaluate the model and calculate metrics.
# A Trainer is built here unconditionally because the one used for training is
# only created when fine-tuning actually ran.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=eval_dataset,
)
# NOTE(review): a plain Trainer.predict presumably scores the model with the
# reference summaries fed to the decoder rather than free-running generation --
# confirm this is the evaluation that is wanted.
predictions = trainer.predict(eval_dataset)
metrics = compute_metrics(predictions)
print(metrics)
