In [2]:
import os
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from rouge import Rouge
import torch
import subprocess
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check for CUDA availability
print("Available torch Version", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Number of CUDA devices:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
  print("CUDA is not available")

Available torch Version 2.3.1
CUDA available: True
CUDA version: 11.8
Number of CUDA devices: 1
CUDA device name: NVIDIA GeForce GTX 1650 Ti


In [None]:
# Define the path to save the trained model 
model_save_path = "saved_model"

# XSum Dataset (Small Subset)
dataset_path = "E:\\NIKHIL\\ML\\Text Summerizer Using Deep Learning\\xsum_dataset"


if not os.path.exists(dataset_path):
  print("Downloading dataset...")
  # Download and save the full dataset 
  dataset = load_dataset("xsum")
  dataset.save_to_disk(dataset_path)
  print("Dataset downloaded and saved locally.")
else:
  print("Loading dataset from local disk...")
  dataset = load_from_disk(dataset_path)
  print("Dataset loaded from local disk.")

print("Dataset preparing")
# Create a small subset for evaluation and training
train_size = 0.9 # Use a small portion for faster evaluation and training
dataset_split = dataset["test"].train_test_split(test_size=1 - train_size, shuffle=True)
train_data = dataset_split["train"].select(range(20))
eval_data = dataset_split["test"].select(range(10))
# print(f"Train data size: {len(train_data)}")
# print(f"Evaluation data size: {len(eval_data)}")
print("done")

class CustomDataset(Dataset):
  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data[idx]
    cleaned_text = clean_text(sample["document"])
    input_ids = tokenize_text(cleaned_text, self.tokenizer).squeeze()
    summary = tokenize_text(sample["summary"], self.tokenizer).squeeze()  # Tokenize the summary as well

    return {"input_ids": input_ids, "labels": summary}


Loading dataset from local disk...
Dataset loaded from local disk.
Dataset preparing
done


In [None]:

def clean_text(text):
 
  text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
  text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
  text = text.lower()  # Convert to lowercase
  text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove non-alphanumeric characters
  return text

def tokenize_text(text, tokenizer):
  
  input_ids = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)["input_ids"]
  return input_ids


In [None]:

def evaluate_model(model, data_loader, tokenizer):
 
  rouge = Rouge()
  model.eval()  # Set model to evaluation mode

  predictions = []
  references = []
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)

      labels = batch["labels"].to(device)

      # Generate summary
      summary_ids = model.generate(
          input_ids=input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
      )

      # Decode summaries
      for summary_id in summary_ids:
        summary = tokenizer.decode(summary_id, skip_special_tokens=True)
        predictions.append(summary)

      for label in labels:
        reference = tokenizer.decode(label, skip_special_tokens=True)
        references.append(reference)

  # Calculate ROUGE score
  rouge_score = rouge.get_scores(predictions, references, avg=True)
  print(f"ROUGE Score: {rouge_score}")
  return rouge_score["rouge-l"]["f"]


In [None]:

def summarize_text(text_to_summarize, model, tokenizer):
 
  cleaned_text = clean_text(text_to_summarize)
  input_ids = tokenize_text(cleaned_text, tokenizer).to(device)

  summary_ids = model.generate(
      input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
  )
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

  return summary


In [None]:

def train_model(model, train_data, tokenizer, epochs=3, batch_size=2, gradient_accumulation_steps=16):
 
  optimizer = Adam(model.parameters(), lr=1e-5)  # Adjust learning rate as needed
  model.train()  # Set model to training mode

  training_loss = []  # Track training loss for visualization
  best_rouge = 0  # Track best ROUGE score for early stopping 
  patience = 3  # Number of epochs to wait for improvement before stopping 

  for epoch in range(epochs):
    epoch_loss = 0.0

    # Create DataLoader for the training data
    train_dataset = CustomDataset(train_data, tokenizer)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for step, batch in enumerate(train_data_loader):
      input_ids = batch["input_ids"].to(device)
      labels = batch["labels"].to(device)

      # Forward pass
      outputs = model(input_ids=input_ids, labels=labels)
      loss = outputs.loss / gradient_accumulation_steps  # Normalize loss

      # Backward pass
      loss.backward()

      # Gradient accumulation
      if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

      epoch_loss += loss.item()

    # Print training loss
    print(f"Epoch: {epoch+1}/{epochs}, Training Loss: {epoch_loss:.4f}")
    training_loss.append(epoch_loss)  # Track training loss 

    # Evaluate model on validation set (optional)
    eval_dataset = CustomDataset(eval_data, tokenizer)
    eval_data_loader = DataLoader(eval_dataset, batch_size=1)
    val_rouge = evaluate_model(model, eval_data_loader, tokenizer)

    # Early stopping 
    if val_rouge > best_rouge:
      best_rouge = val_rouge
      patience = 3  # Reset patience counter
    else:
      patience -= 1
      if patience == 0:
        print("Early stopping triggered!")
        break

  return model  # Return the trained model


In [None]:

# Pre-trained Model Selection
model_name = "facebook/bart-base"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the trained model if it exists, otherwise initialize a new model
if os.path.exists(model_save_path):
  print("Loading trained model from disk...")
  model = AutoModelForSeq2SeqLM.from_pretrained(model_save_path)
  # Train the model (adjust epochs, batch size, and gradient accumulation steps for desired training time and memory constraints)
  trained_model = train_model(model, train_data, tokenizer, epochs=10, batch_size=2, gradient_accumulation_steps=16)

else:
  print("Loading pre-trained model...")
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  model.to(device)  # Move model to appropriate device

  # Train the model (adjust epochs, batch size, and gradient accumulation steps for desired training time and memory constraints)
  trained_model = train_model(model, train_data, tokenizer, epochs=10, batch_size=2, gradient_accumulation_steps=16)

  # Save the trained model
  trained_model.save_pretrained(model_save_path)
  print(f"Model saved to {model_save_path}")

# Ensure the model is on the correct device
model = trained_model
model.to(device)




Loading pre-trained model...


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch: 1/10, Training Loss: 9.8122
ROUGE Score: {'rouge-1': {'r': 0.3960110860931294, 'p': 0.08847719380371691, 'f': 0.14404494360313747}, 'rouge-2': {'r': 0.07212418859090686, 'p': 0.01253075034940521, 'f': 0.02125603274122903}, 'rouge-l': {'r': 0.26453162429246024, 'p': 0.05951817923806291, 'f': 0.09679213322078727}}
Epoch: 2/10, Training Loss: 9.8818
ROUGE Score: {'rouge-1': {'r': 0.3960110860931294, 'p': 0.08847719380371691, 'f': 0.14404494360313747}, 'rouge-2': {'r': 0.07212418859090686, 'p': 0.01253075034940521, 'f': 0.02125603274122903}, 'rouge-l': {'r': 0.26453162429246024, 'p': 0.05951817923806291, 'f': 0.09679213322078727}}
Epoch: 3/10, Training Loss: 9.8818
ROUGE Score: {'rouge-1': {'r': 0.3960110860931294, 'p': 0.08847719380371691, 'f': 0.14404494360313747}, 'rouge-2': {'r': 0.07212418859090686, 'p': 0.01253075034940521, 'f': 0.02125603274122903}, 'rouge-l': {'r': 0.26453162429246024, 'p': 0.05951817923806291, 'f': 0.09679213322078727}}
Epoch: 4/10, Training Loss: 9.8818


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


ROUGE Score: {'rouge-1': {'r': 0.3960110860931294, 'p': 0.08847719380371691, 'f': 0.14404494360313747}, 'rouge-2': {'r': 0.07212418859090686, 'p': 0.01253075034940521, 'f': 0.02125603274122903}, 'rouge-l': {'r': 0.26453162429246024, 'p': 0.05951817923806291, 'f': 0.09679213322078727}}
Early stopping triggered!
Model saved to saved_model


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): Laye

In [None]:

# Evaluate the model on the evaluation set
eval_dataset = CustomDataset(eval_data, tokenizer)
eval_data_loader = DataLoader(eval_dataset, batch_size=2)
rouge_score = evaluate_model(model, eval_data_loader, tokenizer)
print(f"\nROUGE Score: {rouge_score}")


ROUGE Score: {'rouge-1': {'r': 0.3960110860931294, 'p': 0.08847719380371691, 'f': 0.14404494360313747}, 'rouge-2': {'r': 0.07212418859090686, 'p': 0.01253075034940521, 'f': 0.02125603274122903}, 'rouge-l': {'r': 0.26453162429246024, 'p': 0.05951817923806291, 'f': 0.09679213322078727}}

ROUGE Score: 0.09679213322078727
