In [1]:
pip install transformers torch datasets rouge-score nltk ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk
# Fetch the NLTK 'punkt' package into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lopa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
import nltk
from nltk.tokenize import sent_tokenize
from rouge_score import rouge_scorer
from datasets import load_dataset
import re
import ipywidgets as widgets

# Verify that the NLTK "punkt" data is installed and download it on a miss.
# Any failure is reported together with manual instructions and re-raised.
try:
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # Resource is missing locally: download it, then confirm it resolves.
        print("Punkt data not found. Downloading now...")
        nltk.download('punkt')
        nltk.data.find('tokenizers/punkt')
        print("Punkt data downloaded successfully.")
    else:
        print("Punkt data already available.")
except Exception as download_error:
    print(f"Failed to download NLTK punkt data: {download_error}")
    print("Please run 'import nltk; nltk.download('punkt')' manually in a new cell and try again.")
    raise

# Load the pre-trained BART checkpoint and its matching tokenizer.
model_name = "facebook/bart-large-cnn"
try:
    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
except Exception as load_error:
    # Report the failure, then re-raise so the cell stops with a traceback.
    print(f"Error loading BART model: {load_error}")
    raise

# Load CNN/DailyMail (config "3.0.0") and keep the first three test rows
# as the evaluation samples.
try:
    dataset = load_dataset("cnn_dailymail", "3.0.0")
    test_samples = dataset["test"].select(range(3))
except Exception as dataset_error:
    # Report the failure, then re-raise so the cell stops with a traceback.
    print(f"Error loading CNN/DailyMail dataset: {dataset_error}")
    raise

# Simple text preprocessing function
def preprocess_text(text):
    """Normalize whitespace in ``text``.

    Every run of whitespace characters is collapsed to a single space and
    leading/trailing whitespace is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Summarization function using BART (abstractive)
def summarize_text(text, max_length=150, min_length=30):
    """Produce an abstractive summary of ``text`` with the notebook's BART model.

    The text is whitespace-normalized with ``preprocess_text``, tokenized
    (truncated to 1024 tokens), and summarized with 4-beam search using the
    module-level ``tokenizer`` and ``model``.

    Args:
        text: Source text to summarize.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        min_length: Forwarded to ``model.generate`` as ``min_length``.

    Returns:
        The decoded summary with special tokens removed, or ``""`` when
        ``text`` is ``None``, empty, or whitespace-only.
    """
    # Nothing to summarize: skip the model instead of decoding whatever it
    # would generate from an empty prompt.
    if not text or not text.strip():
        return ""

    clean_text = preprocess_text(text)
    inputs = tokenizer(clean_text, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)



# Evaluate summary using ROUGE
def evaluate_summary(generated_summary, reference_summary):
    """Score a generated summary against a reference with ROUGE.

    Args:
        generated_summary: The summary text to evaluate.
        reference_summary: The reference text to compare against.

    Returns:
        The result of ``RougeScorer.score`` for the ``rouge1``, ``rouge2``
        and ``rougeL`` metrics, computed with stemming enabled.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    # score() is called with the reference first and the candidate second.
    return rouge_scorer.RougeScorer(rouge_types, use_stemmer=True).score(
        reference_summary, generated_summary
    )

# Test summarization on CNN/DailyMail samples
def test_on_dataset():
    print("Testing Summarization on CNN/DailyMail Dataset Samples:\n")
    for i, sample in enumerate(test_samples):
        article = sample["article"]
        reference_summary = sample["highlights"]
        
        print(f"Sample {i+1}:")
        print("Original Article (first 200 chars):", article[:200], "...")
        print("Reference Summary:", reference_summary)
        
        abstractive = summarize_text(article)
        print("\nAbstractive Summary (BART):", abstractive)
        
        
        print("\nROUGE Scores for Abstractive Summary:")
        rouge_scores = evaluate_summary(abstractive, reference_summary)
        for metric, score in rouge_scores.items():
            print(f"{metric}: Precision={score.precision:.2f}, Recall={score.recall:.2f}, F1={score.fmeasure:.2f}")
        print("-" * 50)

# Run the evaluation. On failure, print a short message and re-raise so the
# traceback is preserved, matching the loading steps earlier in this cell.
try:
    test_on_dataset()
except Exception as e:
    print(f"Error during dataset testing: {e}")
    raise



Punkt data already available.
Testing Summarization on CNN/DailyMail Dataset Samples:

Sample 1:
Original Article (first 200 chars): (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territor ...
Reference Summary: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

Abstractive Summary (BART): The Palestinian Authority becomes the 123rd member of the International Criminal Court. The move gives the court jurisdiction over alleged crimes in Palestinian territories. Israel and the United States opposed the Palestinians' efforts to join the body.

ROUGE Scores for Abstractive Summary:
rouge1: Precision=0.51, Recall=0.56, F1=0.54
rouge2: Precision=0.36, Recall=0.39, F1=0