In [1]:
import os
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from rouge import Rouge
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# Check for CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("CUDA available:", torch.cuda.is_available())

# XSum Dataset (Small Subset)
dataset_path = "xsum_dataset"

if not os.path.exists(dataset_path):
    print("Downloading dataset...")
    # Download and save the full dataset (if needed for future training)
    dataset = load_dataset("xsum")
    dataset.save_to_disk(dataset_path)
    print("Dataset downloaded and saved locally.")
else:
    print("Loading dataset from local disk...")
    dataset = load_from_disk(dataset_path)
    print("Dataset loaded from local disk.")

print("Dataset preparing")

# Create a small subset for evaluation and training (adjust split size as needed)
train_size = 0.6  # Use a small portion for faster evaluation and training
dataset_split = dataset["train"].train_test_split(test_size=1 - train_size, shuffle=True)
train_data = dataset_split["train"]
eval_data = dataset_split["test"]
print(f"Train data size: {len(train_data)}")
print(f"Evaluation data size: {len(eval_data)}")
print("done")

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        cleaned_text = clean_text(sample["document"])
        input_ids = tokenize_text(cleaned_text, self.tokenizer).squeeze()
        summary = tokenize_text(sample["summary"], self.tokenizer).squeeze()  # Tokenize the summary as well if needed

        return {"input_ids": input_ids, "labels": summary}

def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text

def tokenize_text(text, tokenizer):
    input_ids = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)["input_ids"]
    return input_ids

def evaluate_model(model, data_loader, tokenizer):
    rouge = Rouge()
    model.eval()  # Set model to evaluation mode

    predictions = []
    references = []
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)

        # Generate summary
        summary_ids = model.generate(
            input_ids=input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
        )
        # Decode summaries
        for summary_id in summary_ids:
            summary = tokenizer.decode(summary_id, skip_special_tokens=True)
            predictions.append(summary)

        for label in labels:
            reference = tokenizer.decode(label, skip_special_tokens=True)
            references.append(reference)

    # Calculate ROUGE score
    rouge_score = rouge.get_scores(predictions, references, avg=True)
    print(f"ROUGE Score: {rouge_score}")
    return rouge_score["rouge-l"]["f"]

def train_model(model, train_data, tokenizer, epochs=2, batch_size=1, gradient_accumulation_steps=5):
    optimizer = Adam(model.parameters(), lr=1e-5)  # Adjust learning rate as needed
    model.train()  # Set model to training mode

    training_loss = []  # Track training loss for visualization (optional)
    best_rouge = 0  # Track best ROUGE score for early stopping (optional)
    patience = 3  # Number of epochs to wait for improvement before stopping (optional)

    for epoch in range(epochs):
        epoch_loss = 0.0

        # Create DataLoader for the training data
        train_dataset = CustomDataset(train_data, tokenizer)
        train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        for step, batch in enumerate(train_data_loader):
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss / gradient_accumulation_steps  # Normalize loss

            # Backward pass
            loss.backward()

            # Gradient accumulation
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()

            epoch_loss += loss.item()

        # Print training loss
        print(f"Epoch: {epoch+1}/{epochs}, Training Loss: {epoch_loss:.4f}")
        training_loss.append(epoch_loss)  # Track training loss (optional)

        # Evaluate model on validation set (optional)
        eval_dataset = CustomDataset(eval_data, tokenizer)
        eval_data_loader = DataLoader(eval_dataset, batch_size=1)
        val_rouge = evaluate_model(model, eval_data_loader, tokenizer)

        # Early stopping (optional)
        if val_rouge > best_rouge:
            best_rouge = val_rouge
            patience = 3  # Reset patience counter
        else:
            patience -= 1
            if patience == 0:
                print("Early stopping triggered!")
                break

    return model  # Return the trained model

# Pre-trained Model Selection (T5-Tiny)
model_name = "t5-small"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)  # Move model to appropriate device

# Evaluate the pre-trained model on the evaluation set
eval_dataset = CustomDataset(eval_data, tokenizer)
eval_data_loader = DataLoader(eval_dataset, batch_size=1)
pre_trained_rouge = evaluate_model(model, eval_data_loader, tokenizer)
print(f"\nROUGE Score (Pre-trained Model): {pre_trained_rouge}")

# Train the model (adjust epochs, batch size, and gradient accumulation steps for desired training time and memory constraints)
trained_model = train_model(model, train_data, tokenizer, epochs=1, batch_size=2, gradient_accumulation_steps=16)

# Evaluate the trained model on the evaluation set
trained_rouge = evaluate_model(trained_model, eval_data_loader, tokenizer)
print(f"\nROUGE Score (Trained Model): {trained_rouge}")

print(f"\nROUGE Score Improvement: {trained_rouge - pre_trained_rouge:.4f}")

def summarize_text(text_to_summarize, trained_model, tokenizer):
    cleaned_text = clean_text(text_to_summarize)
    input_ids = tokenize_text(cleaned_text, tokenizer).to(device)

    summary_ids = trained_model.generate(
        input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Optional: Summarize Text Using Trained Model
text_to_summarize = """
Once on a very dark, cold night there lived the Nicholls family.  They were a kind hearted, happy family.
Their house was quite grand but there was only one thing about their house, their house was a ……………. HAUNTED HOUSE!!
The story begins with all the Nicholls family fast asleep in their warm cosy beds.  During the night when the clock struck midnight, 
a silvery white ghost appeared.  After the ghost appeared, he yowled a very spooky sound.  Whenever he yowled that sound, lightening struck!\
Even though the sounds were so loud, the family were still asleep.  Well asleep except for two – Molly and Holly, the 9 year old twins.
"""

summary = summarize_text(text_to_summarize, trained_model, tokenizer)
print(f"Summary: {summary}")


  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
Loading dataset from local disk...
Dataset loaded from local disk.
Dataset preparing
Train data size: 1000
Evaluation data size: 1000
done
ROUGE Score: {'rouge-1': {'r': 0.2517467263193698, 'p': 0.10449615256322187, 'f': 0.144403534904559}, 'rouge-2': {'r': 0.0335487556787004, 'p': 0.011295964763606652, 'f': 0.01639923082383876}, 'rouge-l': {'r': 0.20085080797869231, 'p': 0.08306493580539231, 'f': 0.11479753671209264}}

ROUGE Score (Pre-trained Model): 0.11479753671209264
Epoch: 1/1, Training Loss: 259.8775
ROUGE Score: {'rouge-1': {'r': 0.25087947493771123, 'p': 0.10432261711866546, 'f': 0.14423557594271008}, 'rouge-2': {'r': 0.033515552139916374, 'p': 0.011090959721455492, 'f': 0.016338473540057054}, 'rouge-l': {'r': 0.2009444200972446, 'p': 0.08324854983230438, 'f': 0.11516210407238227}}
ROUGE Score: {'rouge-1': {'r': 0.25087947493771123, 'p': 0.10432261711866546, 'f': 0.14423557594271008}, 'rouge-2': {'r': 0.033515552139916374, 'p': 0.011090959721455492, 'f': 0