In [1]:

import numpy as np
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
import numpy as np
import nltk
import re
import spacy
import torch
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load English tokenizer, POS tagger, parser, NER from spaCy
import spacy.cli

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def normalize_whitespace(text):
    """Remove extra whitespace, newlines, and tabs."""
    return re.sub(r'\s+', ' ', text).strip()

def remove_noise_and_references(text):
    """
    Remove references like [1], (Smith et al., 2020), and figure/table mentions.
    You can extend the patterns as needed.
    """
    # Remove square bracket citations like [1], [12]
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove in-text references like (Smith et al., 2020)
    text = re.sub(r'\(([^)]*et al\.,?\s?\d{4})\)', '', text)
    
    # Remove "Fig. 1", "Table 2", etc.
    text = re.sub(r'(Fig\.?|Figure|Table)\s?\d+[a-zA-Z]?', '', text, flags=re.IGNORECASE)
    
    # Remove licensing and copyright boilerplate
    text = re.sub(r'©.*?(\.|\n)', '', text)
    text = re.sub(r'This article is licensed.*?(\.|\n)', '', text, flags=re.IGNORECASE)
    
    return text

def sentence_segmentation(text):
    """Segment text into individual sentences using spaCy."""
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def preprocess_text(raw_text):
    """Complete preprocessing pipeline."""
    step1 = normalize_whitespace(raw_text)
    step2 = remove_noise_and_references(step1)
    sentences = sentence_segmentation(step2)
    return ' '.join(str(sentence) for sentence in sentences)

In [4]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from transformers import BartForConditionalGeneration, BartTokenizer

# Load the model
model = BertForSequenceClassification.from_pretrained(
    "./bertsum_bbc_news/bertsum_finetuned_model",
    num_labels=1,  # Binary classification per sentence
    problem_type="multi_label_classification"
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('./bertsum_bbc_news/bertsum_finetuned_model')
# Detect MPS device on Mac
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)  # Move model to MPS

def bertsum_summary(model, tokenizer, text, device):    
    # call preprocess
    clean_text=preprocess_text(text)
   
    model.eval()
    
    # Tokenize input text
    inputs = tokenizer(clean_text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Move inputs to the correct device
    inputs = {key: val.to(device) for key, val in inputs.items()}
    
    with torch.no_grad():
        output = model(**inputs)  # Forward pass on MPS
        logits = output.logits.squeeze(-1)  # Extract logits
        
        # Ensure logits are moved to CPU before processing
        logits = logits.cpu()

        # Select sentences using thresholding
        predicted_labels = (logits > 0.5).int()
        
        sentences = text.split(". ")  # Sentence tokenization
        min_length = min(len(sentences), len(predicted_labels))
        # print("Logits:", logits)
        # print("Predicted Labels:", predicted_labels)
        # print("Sentences:", sentences)
        selected_sentences = [sentences[i] for i in range(min_length) if predicted_labels[i] == 1]        
        summary = " ".join(selected_sentences)
        return summary

# Example usage
text = """
Artificial intelligence is rapidly evolving. Many industries are integrating AI to enhance performance.
Applications include healthcare, finance, and transportation. AI also raises ethical and employment concerns.
Researchers are focused on creating explainable and fair AI. Governments are responding with new regulations and policies.
"""

summary = bertsum_summary(model, tokenizer, text, device)
print(summary)





In [5]:
testdf=pd.read_csv('bbc_news_with_articles_and_extractive_summary.csv')

In [6]:
testdf['generated_summary'] = testdf['Article'].apply(lambda x:str(bertsum_summary(model, tokenizer, x, device)))

In [7]:
import numpy as np
testdf.replace(r'^\s*$', np.nan, regex=True)
testdf.dropna(axis=0, how='any')
# Remove rows where "Title" is empty or null
testdf = testdf[testdf["generated_summary"].notna() & (testdf["generated_summary"] != "")]
len(testdf)

1462

In [8]:
testdf=testdf.drop_duplicates(subset=["generated_summary"])
testdf['generated_summary'].value_counts()

generated_summary
Musicians to tackle US red tape\n\nMusicians' groups are to tackle US visa regulations which are blamed for hindering British acts' chances of succeeding across the Atlantic.\n\nA singer hoping to perform in the US can expect to pay $1,300 (£680) simply for obtaining a visa                                                                                                         1
Trial date is set for Balco case\n\nA US judge has set a preliminary trial date for the Balco steroid distribution case which has rocked athletics.\n\nUS district court judge Susan Ilston rejected an attempt by the defence team to have the case dismissed at a pre-trial hearing in San Francisco                                                                                     1
Donor attacks Blair-Brown 'feud'\n\nThe reported feud between Tony Blair and Gordon Brown has prompted a Labour donor to say he will almost certainly refuse to give more funds.\n\nDuncan Bannatyne also attacked the gover

In [9]:
import re
# Function to clean text
def clean_text(text):
    return re.sub(r'[^\w\s]', '', str(text)).strip().lower()

In [10]:
testdf['is_match'] = testdf.apply(lambda row: clean_text(row['Article']) == clean_text(testdf['generated_summary']), axis=1)

In [11]:
testdf.head()

Unnamed: 0.1,Unnamed: 0,Title,Article,Summary,Category,extractive_summary,generated_summary,is_match
0,0,289,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment,Nigel McCune from the Musicians' Union said Br...,Musicians to tackle US red tape\n\nMusicians' ...,False
2,2,276,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment,"Babyshambles, which he formed after his acrimo...",Rocker Doherty in on-stage fight\n\nRock singe...,False
6,6,114,Pete Doherty misses bail deadline\n\nSinger Pe...,Mr Wass was also given a curfew and told to su...,entertainment,"Mr Doherty, 25, was arrested following an alle...",Pete Doherty misses bail deadline\n\nSinger Pe...,False
7,7,100,Fockers retain film chart crown\n\nComedy Meet...,Meet the Fockers also broke the box office rec...,entertainment,Meet the Fockers also broke the box office rec...,Fockers retain film chart crown\n\nComedy Meet...,False
11,11,303,Film production 'falls' 40% in UK\n\nThe numbe...,The UK Film Council said the drop was partly d...,entertainment,Twenty-seven British films were made in the UK...,Film production 'falls' 40% in UK\n\nThe numbe...,False


In [12]:
from rouge import Rouge

rouge = Rouge()

def truncate_text(text, max_words=100):
    return " ".join(text.split()[:max_words])

def compute_rouge(reference, generated):
    if not generated.strip():  # Handle empty strings
        return {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
    
    # Truncate long summaries
    reference = truncate_text(reference)
    generated = truncate_text(generated)
    
    scores = rouge.get_scores(generated, reference)
    return scores[0]

# Apply function
testdf["rouge_scores"] = testdf.apply(lambda row: compute_rouge(row["Summary"], row["generated_summary"]), axis=1)


In [13]:
testdf.head()

Unnamed: 0.1,Unnamed: 0,Title,Article,Summary,Category,extractive_summary,generated_summary,is_match,rouge_scores
0,0,289,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment,Nigel McCune from the Musicians' Union said Br...,Musicians to tackle US red tape\n\nMusicians' ...,False,"{'rouge-1': {'r': 0.2987012987012987, 'p': 0.6..."
2,2,276,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment,"Babyshambles, which he formed after his acrimo...",Rocker Doherty in on-stage fight\n\nRock singe...,False,"{'rouge-1': {'r': 0.2857142857142857, 'p': 0.6..."
6,6,114,Pete Doherty misses bail deadline\n\nSinger Pe...,Mr Wass was also given a curfew and told to su...,entertainment,"Mr Doherty, 25, was arrested following an alle...",Pete Doherty misses bail deadline\n\nSinger Pe...,False,"{'rouge-1': {'r': 0.4358974358974359, 'p': 0.7..."
7,7,100,Fockers retain film chart crown\n\nComedy Meet...,Meet the Fockers also broke the box office rec...,entertainment,Meet the Fockers also broke the box office rec...,Fockers retain film chart crown\n\nComedy Meet...,False,"{'rouge-1': {'r': 0.32051282051282054, 'p': 0...."
11,11,303,Film production 'falls' 40% in UK\n\nThe numbe...,The UK Film Council said the drop was partly d...,entertainment,Twenty-seven British films were made in the UK...,Film production 'falls' 40% in UK\n\nThe numbe...,False,"{'rouge-1': {'r': 0.37142857142857144, 'p': 0...."


In [14]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-l"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.4143


In [15]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-2"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-2 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-2 Score: 0.2971


In [16]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-1"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-1 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-1 Score: 0.4284


In [17]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')


def compute_similarity(reference, generated):
    ref_embedding = model.encode(reference)
    gen_embedding = model.encode(generated)
    return cosine_similarity([ref_embedding], [gen_embedding])[0][0]


testdf["cosine_similarity"] = testdf.apply(lambda row: compute_similarity(row["Summary"], row["generated_summary"]),
                                           axis=1)
print(f"Average Cosine Similarity: {testdf['cosine_similarity'].mean():.4f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Average Cosine Similarity: 0.7437
