In [1]:
import os
import re
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from rouge import Rouge
import torch
import subprocess
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check for CUDA availability

print("Available torch Version",torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Number of CUDA devices:", torch.cuda.device_count())
print("CUDA device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device")

if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor = torch.rand(3, 3).cuda()
    print("Tensor on GPU:", tensor)
else:
    print("CUDA is not available")


def print_nvidia_smi():
    try:
        # Run the nvidia-smi command
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        
        # Check if the command was successful
        if result.returncode == 0:
            print("nvidia-smi output:\n")
            print(result.stdout)
        else:
            print(f"nvidia-smi failed with error code {result.returncode}")
            print(result.stderr)
    
    except FileNotFoundError:
        print("nvidia-smi command not found. Make sure NVIDIA drivers are installed.")

# Call the function to print nvidia-smi output
print_nvidia_smi()

Available torch Version 2.3.1
CUDA available: True
CUDA version: 11.8
Number of CUDA devices: 1
CUDA device name: NVIDIA GeForce GTX 1650 Ti
Tensor on GPU: tensor([[0.2538, 0.2948, 0.6243],
        [0.0917, 0.1094, 0.3214],
        [0.3831, 0.1380, 0.4759]], device='cuda:0')
nvidia-smi output:

Sun Jun 30 22:44:16 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 522.06       Driver Version: 522.06       CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   58C    P0    16W /  N/A |    660MiB /  4096MiB |     19%      Default |
|                               

In [3]:
# Define paths and hyperparameters
model_name = "facebook/bart-base"  # Pre-trained model
model_save_path =  "E:\\NIKHIL\\ML\\Text Summerizer Using Deep Learning\\Bart-fine_tuned_model"
print(model_save_path)  # Path to save the trained model
epochs = 3  # Training epochs 
batch_size = 2  # Training batch size 
gradient_accumulation_steps = 16  # Gradient accumulation for memory efficiency

E:\NIKHIL\ML\Text Summerizer Using Deep Learning\Bart-fine_tuned_model


In [4]:
# XSum Dataset (Small Subset)
dataset_path = "E:\\NIKHIL\\ML\\Text Summerizer Using Deep Learning\\src\\xsum_dataset"

if not os.path.exists(dataset_path):
  print("Downloading dataset...")
  dataset = load_dataset("xsum")
  dataset.save_to_disk(dataset_path)
  print("Dataset downloaded and saved locally.")
else:
  print("Loading dataset from local disk...")
  dataset = load_from_disk(dataset_path)
  print("Dataset loaded from local disk.")

print("Dataset preparing")

# Create a small subset for evaluation and training 
train_size = 0.9  # Use a small portion for faster evaluation and training
dataset_split = dataset["validation"].train_test_split(test_size=1 - train_size, shuffle=True)
train_data = dataset_split["train"].select(range(5))
eval_data = dataset_split["test"].select(range(5))

class CustomDataset(Dataset):
  def __init__(self, data, tokenizer):
    self.data = data
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    sample = self.data[idx]
    cleaned_text = clean_text(sample["document"])
    input_ids = tokenize_text(cleaned_text, self.tokenizer).squeeze()
    summary = tokenize_text(sample["summary"], self.tokenizer).squeeze()  # Tokenize the summary as well if needed
    return {"input_ids": input_ids, "labels": summary}

Loading dataset from local disk...


Dataset loaded from local disk.
Dataset preparing


In [5]:
def clean_text(text):
  text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
  text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
  text = text.lower()  # Convert to lowercase
  text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove non-alphanumeric characters
  return text

def tokenize_text(text, tokenizer):
  """
  Tokenizes text using the provided tokenizer.
  """
  input_ids = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)["input_ids"]
  return input_ids

In [6]:
def evaluate_model(model, data_loader, tokenizer):
  rouge = Rouge()
  model.eval()  # Set model to evaluation mode

  predictions = []
  references = []
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      labels = batch["labels"].to(device)

      # Generate summary
      summary_ids = model.generate(
          input_ids=input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
      )

      # Decode summaries
      for summary_id in summary_ids:
        summary = tokenizer.decode(summary_id, skip_special_tokens=True)
        predictions.append(summary)

      for label in labels:
        reference = tokenizer.decode(label, skip_special_tokens=True)
        references.append(reference)

  # Calculate ROUGE score
  rouge_score = rouge.get_scores(predictions, references, avg=True)
  print(f"ROUGE Score: {rouge_score}")
  return rouge_score["rouge-l"]["f"]

In [7]:
def summarize_text(text_to_summarize, model, tokenizer):
  cleaned_text = clean_text(text_to_summarize)
  input_ids = tokenize_text(cleaned_text, tokenizer).to(device)

  summary_ids = model.generate(
      input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
  )
  summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

  return summary

In [8]:
def train_model(model, train_data, tokenizer, epochs=3, batch_size=2, gradient_accumulation_steps=16):
  optimizer = Adam(model.parameters(), lr=1e-5)  # Adjust learning rate as needed
  model.train()  # Set model to training mode

  training_loss = []  # Track training loss for visualization (optional)
  best_rouge = 0  # Track best ROUGE score for early stopping (optional)
  patience = 3  # Number of epochs to wait for improvement before stopping (optional)

  for epoch in range(epochs):
    epoch_loss = 0.0

    # Create DataLoader for the training data
    train_dataset = CustomDataset(train_data, tokenizer)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for step, batch in enumerate(train_data_loader):
      input_ids = batch["input_ids"].to(device)
      labels = batch["labels"].to(device)

      # Forward pass
      outputs = model(input_ids=input_ids, labels=labels)
      loss = outputs.loss / gradient_accumulation_steps  # Normalize loss

      # Backward pass
      loss.backward()

      # Gradient accumulation
      if (step + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

      epoch_loss += loss.item()

    # Print training loss
    print(f"Epoch: {epoch+1}/{epochs}, Training Loss: {epoch_loss:.4f}")
    training_loss.append(epoch_loss)  # Track training loss 

    # Evaluate model on validation set
    eval_dataset = CustomDataset(eval_data, tokenizer)
    eval_data_loader = DataLoader(eval_dataset, batch_size=1)
    val_rouge = evaluate_model(model, eval_data_loader, tokenizer)
  

    # Early stopping
    if val_rouge > best_rouge:
      best_rouge = val_rouge
      patience = 3  # Reset patience counter
    else:
      patience -= 1
      if patience == 0:
        print("Early stopping triggered!")
        break


 

  return model  # Return the trained model

In [9]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load or create model
if os.path.exists(model_save_path):
  model = AutoModelForSeq2SeqLM.from_pretrained(model_save_path)
  print("Loaded pre-trained model from:", model_save_path)
else:
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
  print("Created new model from:", model_name)

model.to(device)

# Train the model
trained_model = train_model(model, train_data, tokenizer, epochs=epochs, batch_size=batch_size, gradient_accumulation_steps=gradient_accumulation_steps)

# Save the trained model (optional)
if not os.path.exists(model_save_path):
  model.save_pretrained(model_save_path)
  print("Saved trained model to:", model_save_path)

Created new model from: facebook/bart-base


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch: 1/3, Training Loss: 2.9462
ROUGE Score: {'rouge-1': {'r': 0.3845454545454545, 'p': 0.099366196287821, 'f': 0.15562081869531363}, 'rouge-2': {'r': 0.060497835497835495, 'p': 0.010754658377043322, 'f': 0.018037140417443864}, 'rouge-l': {'r': 0.23666666666666666, 'p': 0.05896891685736079, 'f': 0.09312242642623349}}
Epoch: 2/3, Training Loss: 2.9047
ROUGE Score: {'rouge-1': {'r': 0.3845454545454545, 'p': 0.099366196287821, 'f': 0.15562081869531363}, 'rouge-2': {'r': 0.060497835497835495, 'p': 0.010754658377043322, 'f': 0.018037140417443864}, 'rouge-l': {'r': 0.23666666666666666, 'p': 0.05896891685736079, 'f': 0.09312242642623349}}
Epoch: 3/3, Training Loss: 2.9047


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


ROUGE Score: {'rouge-1': {'r': 0.3845454545454545, 'p': 0.099366196287821, 'f': 0.15562081869531363}, 'rouge-2': {'r': 0.060497835497835495, 'p': 0.010754658377043322, 'f': 0.018037140417443864}, 'rouge-l': {'r': 0.23666666666666666, 'p': 0.05896891685736079, 'f': 0.09312242642623349}}
Saved trained model to: E:\NIKHIL\ML\Text Summerizer Using Deep Learning\Bart-fine_tuned_model


In [10]:
# Evaluate model on validation set
eval_dataset = CustomDataset(eval_data, tokenizer)
eval_data_loader = DataLoader(eval_dataset, batch_size=1)
evaluate_model(model, eval_data_loader, tokenizer)

ROUGE Score: {'rouge-1': {'r': 0.3845454545454545, 'p': 0.099366196287821, 'f': 0.15562081869531363}, 'rouge-2': {'r': 0.060497835497835495, 'p': 0.010754658377043322, 'f': 0.018037140417443864}, 'rouge-l': {'r': 0.23666666666666666, 'p': 0.05896891685736079, 'f': 0.09312242642623349}}


0.09312242642623349

In [11]:
# Summarize new text (optional)
new_text = """Article: The Rise of Citizen Science
Citizen science, the involvement of the public in scientific research, is rapidly transforming how we understand the world around us.  Traditionally, scientific inquiry has been the domain of professional researchers working in labs and universities. However, citizen science projects are harnessing the power of the public to collect and analyze massive amounts of data, leading to groundbreaking discoveries across various fields.
One of the most prominent examples of citizen science is Galaxy Zoo, an online project where volunteers classify galaxies based on their morphology. This project has not only helped astronomers categorize millions of galaxies but also led to the discovery of new galaxy types. Similarly, eBird, a platform where birdwatchers log their sightings, has provided invaluable data on bird populations and migration patterns, crucial for conservation efforts.
Citizen science isn't limited to online projects. Initiatives like the National Audubon Society's Christmas Bird Count, a century-old tradition where volunteers conduct annual bird surveys, have yielded long-term datasets that track bird population trends. Likewise, projects involving water quality monitoring or invasive species tracking empower communities to become active participants in protecting their local environments.
The rise of citizen science presents several advantages. It allows scientists to gather data at a much larger scale and geographic scope than ever before. This can be particularly valuable in studying phenomena like climate change or species distribution that require global monitoring efforts. Additionally, citizen science fosters public engagement with science, promoting scientific literacy and empowering communities to take ownership of their environment.
However, citizen science also faces challenges. Data quality can be a concern, as volunteers may lack the expertise of professional researchers. Project design and training for volunteers are crucial to ensure data accuracy. Additionally, ensuring equitable access to citizen science opportunities is essential to avoid biases in data collection.
Despite these challenges, the future of citizen science is bright. Technological advancements like smartphones with built-in sensors and user-friendly online platforms are making participation even easier. As citizen science continues to evolve, it has the potential to revolutionize scientific research and empower communities to become active stewards of our planet.
"""
print("Original Text")
print(new_text)
summary = summarize_text(new_text, trained_model, tokenizer)
print("Summary:", summary)

Original Text
Article: The Rise of Citizen Science
Citizen science, the involvement of the public in scientific research, is rapidly transforming how we understand the world around us.  Traditionally, scientific inquiry has been the domain of professional researchers working in labs and universities. However, citizen science projects are harnessing the power of the public to collect and analyze massive amounts of data, leading to groundbreaking discoveries across various fields.
One of the most prominent examples of citizen science is Galaxy Zoo, an online project where volunteers classify galaxies based on their morphology. This project has not only helped astronomers categorize millions of galaxies but also led to the discovery of new galaxy types. Similarly, eBird, a platform where birdwatchers log their sightings, has provided invaluable data on bird populations and migration patterns, crucial for conservation efforts.
Citizen science isn't limited to online projects. Initiatives l