In [1]:

import numpy as np
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
import numpy as np
import nltk
import re
import spacy
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load spaCy's small English pipeline into the module-level `nlp` object that
# sentence_segmentation() calls. spacy.load raises OSError when the model
# package is not installed, in which case it is downloaded once and loaded again.
import spacy.cli

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.cli is already imported above, so no second import is needed here.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def normalize_whitespace(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is removed from the result.
    """
    single_spaced = re.sub(r'\s+', ' ', text)
    return single_spaced.strip()

def remove_noise_and_references(text):
    """
    Strip citation markers, figure/table mentions and licence boilerplate.

    Removed patterns:
      * numeric bracket citations such as ``[1]`` or ``[12]``
      * parenthesised "et al." citations such as ``(Smith et al., 2020)``
      * whole-word figure/table mentions such as ``Fig. 1``, ``Figure 2a``,
        ``Table 3`` (case-insensitive)
      * text from ``©`` or "This article is licensed" up to the next period
        or newline

    Only the matched text is deleted; surrounding whitespace is left as-is,
    so callers that need single spacing should normalise afterwards.
    You can extend the patterns as needed.

    Args:
        text: Input string.

    Returns:
        The string with the patterns above removed.
    """
    # Remove square bracket citations like [1], [12]
    text = re.sub(r'\[\d+\]', '', text)

    # Remove in-text references like (Smith et al., 2020)
    text = re.sub(r'\(([^)]*et al\.,?\s?\d{4})\)', '', text)

    # Remove "Fig. 1", "Table 2", etc. The leading \b keeps the pattern from
    # firing inside longer words ("config 1", "portable 2").
    text = re.sub(r'\b(Fig\.?|Figure|Table)\s?\d+[a-zA-Z]?', '', text, flags=re.IGNORECASE)

    # Remove licensing and copyright boilerplate
    text = re.sub(r'©.*?(\.|\n)', '', text)
    text = re.sub(r'This article is licensed.*?(\.|\n)', '', text, flags=re.IGNORECASE)

    return text

def sentence_segmentation(text):
    """Split text into sentences with the module-level spaCy pipeline `nlp`.

    Each sentence is stripped of surrounding whitespace; sentences that are
    empty after stripping are omitted from the returned list.
    """
    sentences = []
    for span in nlp(text).sents:
        stripped = span.text.strip()
        if stripped:
            sentences.append(stripped)
    return sentences

def preprocess_text(raw_text):
    """Complete preprocessing pipeline.

    Steps: collapse whitespace, remove citations / figure mentions / licence
    boilerplate, collapse whitespace again, split into sentences and re-join
    them with single spaces.

    Args:
        raw_text: Unprocessed input string.

    Returns:
        The cleaned text as a single string.
    """
    collapsed = normalize_whitespace(raw_text)
    denoised = remove_noise_and_references(collapsed)
    # Deleting a citation or figure mention leaves gaps such as "word  word";
    # collapse them so the summariser input is single-spaced throughout.
    denoised = normalize_whitespace(denoised)
    # sentence_segmentation already returns plain strings, so no cast is needed.
    return ' '.join(sentence_segmentation(denoised))

In [4]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "facebook/bart-large-cnn"
# `tokenizer` and `model` are module-level globals; bert_summary() looks them
# up by these names when it is called.
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [5]:

def bert_summary(text, num_sentences=3):
    """Generate an abstractive summary with the module-level BART model.

    Despite the name, this uses the globals `tokenizer` and `model` loaded
    from the "facebook/bart-large-cnn" checkpoint.

    Args:
        text: Raw article text. It is passed through preprocess_text() first.
        num_sentences: Accepted for backward compatibility but not used;
            output length is governed by the token limits passed to
            generate() (min_length=40, max_length=150).

    Returns:
        The decoded summary string, or "" when `text` is not a string, is
        blank, or is empty after preprocessing.
    """
    # Missing cells in a DataFrame arrive as NaN/None; summarising them would
    # raise inside the regex-based preprocessing, so treat them as "no text".
    if not isinstance(text, str) or not text.strip():
        return ""

    # call preprocess
    cleaned = preprocess_text(text)
    if not cleaned:
        return ""

    # Inputs longer than 1024 tokens are truncated by the tokenizer.
    inputs = tokenizer.encode(cleaned, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
# NOTE: bert_summary() does not read its num_sentences argument, so the value
# passed below does not limit the number of sentences that get printed.
text = """
Artificial intelligence is rapidly evolving. Many industries are integrating AI to enhance performance.
Applications include healthcare, finance, and transportation. AI also raises ethical and employment concerns.
Researchers are focused on creating explainable and fair AI. Governments are responding with new regulations and policies.
"""

summary = bert_summary(text, num_sentences=2)
print(summary)


Many industries are integrating AI to enhance performance. Applications include healthcare, finance, and transportation. AI also raises ethical and employment concerns. Researchers are focused on creating explainable and fair AI. Governments are responding with new regulations and policies.


In [6]:
# Load the evaluation data; later cells read its 'Article' and 'Summary' columns.
testdf=pd.read_csv('bbc_news_with_articles_and_extractive_summary.csv')

In [7]:
# Summarise every article; str() guarantees the new column holds text.
testdf['generated_summary'] = testdf['Article'].apply(
    lambda article: str(bert_summary(article, 5))
)

In [8]:
import numpy as np
# Remove rows whose "generated_summary" is missing, empty or whitespace-only.
# DataFrame.replace()/dropna() return new objects rather than modifying the
# frame, so the filter is written as an explicit mask that is assigned back.
summary_text = testdf["generated_summary"]
has_summary = summary_text.notna() & summary_text.astype(str).str.strip().ne("")
testdf = testdf[has_summary]
len(testdf)

2225

In [9]:
# Drop rows whose generated summary duplicates another row's, then display
# how often each remaining summary occurs.
testdf=testdf.drop_duplicates(subset=["generated_summary"])
testdf['generated_summary'].value_counts()

generated_summary
Musicians' groups are calling for an end to the "raw deal" faced by British performers. US acts are not faced with comparable expense and bureaucracy. Musicians face tougher regulations than athletes and journalists.                                                                                                                                                                         1
MPs voted 272-219 in favour of the Bill after key concessions from Mr Clarke. Labour's majority in the Commons was reduced to 14. Bill now faces opposition from peers angry at house arrest proposals. Lord Strathclyde, Tory leader in the Lords, said ministers should expect it to be "substantially re-written"                                                                             1
Blair uses speech to set out his broad brush election manifesto. Little in terms of concrete proposals or what might form manifesto pledges. More a speech designed to remind people what New Labour stood for. 

In [10]:
import re
# Function to clean text
def clean_text(text):
    """Return `text` as a lowercase string with punctuation removed.

    The value is converted with str(), every character that is neither a
    word character nor whitespace is deleted, and the result is stripped.
    """
    as_string = str(text)
    without_punctuation = re.sub(r'[^\w\s]', '', as_string)
    return without_punctuation.strip().lower()

In [11]:
# Flag rows where the generated summary is just the article text again
# (compared after removing punctuation and case). Both sides must come from
# the current row: passing the whole column to clean_text() would compare
# every article against the printed representation of the Series.
testdf['is_match'] = testdf.apply(
    lambda row: clean_text(row['Article']) == clean_text(row['generated_summary']),
    axis=1,
)

In [12]:
# Preview the first rows, now including the is_match column.
testdf.head()

Unnamed: 0.1,Unnamed: 0,Title,Article,Summary,Category,extractive_summary,generated_summary,is_match
0,0,289,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment,Nigel McCune from the Musicians' Union said Br...,Musicians' groups are calling for an end to th...,False
1,1,262,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...,entertainment,They have to want to be the biggest band ever ...,"U2's new album, How To Dismantle An Atomic Bom...",False
2,2,276,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment,"Babyshambles, which he formed after his acrimo...",Former Libertines singer Pete Doherty involved...,False
3,3,60,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...,entertainment,"A Series of Unfortunate Events, starring Jim C...",A Series of Unfortunate Events took $30.2m (£1...,False
4,4,74,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin...",entertainment,Ocean's Twelve raids box office\n\nOcean's Twe...,Ocean's Twelve is the fourth-biggest opening f...,False


In [13]:
from rouge import Rouge

# Module-level scorer instance; compute_rouge() calls its get_scores method.
rouge = Rouge()

def truncate_text(text, max_words=100):
    """Keep at most `max_words` whitespace-separated words of `text`.

    The kept words are re-joined with single spaces, so runs of whitespace
    in the input are collapsed as a side effect.
    """
    words = text.split()
    kept = words[:max_words]
    return " ".join(kept)

def compute_rouge(reference, generated):
    """Score a generated summary against its reference with the `rouge` scorer.

    Both texts are cut to their first 100 words (truncate_text default)
    before scoring.

    Args:
        reference: Reference summary text.
        generated: Generated summary text.

    Returns:
        A dict keyed by "rouge-1", "rouge-2" and "rouge-l", each mapping to a
        dict with "r", "p" and "f" entries. When either text is not a string
        or is blank, every entry is 0.0; the nested shape is kept so callers
        can always index ``result["rouge-l"]["f"]``.
    """
    def _zero():
        # Fresh dict per metric so callers cannot mutate a shared object.
        return {"r": 0.0, "p": 0.0, "f": 0.0}

    both_text = isinstance(reference, str) and isinstance(generated, str)
    if not both_text or not reference.strip() or not generated.strip():
        return {"rouge-1": _zero(), "rouge-2": _zero(), "rouge-l": _zero()}

    # Truncate long summaries
    reference = truncate_text(reference)
    generated = truncate_text(generated)

    # get_scores takes the hypothesis first and returns one result per pair.
    scores = rouge.get_scores(generated, reference)
    return scores[0]

# Score each generated summary against the reference in the "Summary" column.
testdf["rouge_scores"] = testdf.apply(
    lambda record: compute_rouge(record["Summary"], record["generated_summary"]),
    axis=1,
)


In [14]:
# Preview the first rows, now including the rouge_scores column.
testdf.head()

Unnamed: 0.1,Unnamed: 0,Title,Article,Summary,Category,extractive_summary,generated_summary,is_match,rouge_scores
0,0,289,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment,Nigel McCune from the Musicians' Union said Br...,Musicians' groups are calling for an end to th...,False,"{'rouge-1': {'r': 0.15584415584415584, 'p': 0...."
1,1,262,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...,entertainment,They have to want to be the biggest band ever ...,"U2's new album, How To Dismantle An Atomic Bom...",False,"{'rouge-1': {'r': 0.22972972972972974, 'p': 0...."
2,2,276,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment,"Babyshambles, which he formed after his acrimo...",Former Libertines singer Pete Doherty involved...,False,"{'rouge-1': {'r': 0.19480519480519481, 'p': 0...."
3,3,60,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...,entertainment,"A Series of Unfortunate Events, starring Jim C...",A Series of Unfortunate Events took $30.2m (£1...,False,"{'rouge-1': {'r': 0.4745762711864407, 'p': 0.9..."
4,4,74,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin...",entertainment,Ocean's Twelve raids box office\n\nOcean's Twe...,Ocean's Twelve is the fourth-biggest opening f...,False,"{'rouge-1': {'r': 0.19230769230769232, 'p': 0...."


In [15]:
# Mean ROUGE-L F-score over all rows
rouge_l_scores = [row_scores["rouge-l"]["f"] for row_scores in testdf["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.3579


In [16]:
# Mean ROUGE-2 F-score over all rows. Stored under its own name so the
# ROUGE-L list from the previous cell is not overwritten with bigram scores.
rouge_2_scores = [score["rouge-2"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-2 Score: {sum(rouge_2_scores) / len(rouge_2_scores):.4f}")

Average ROUGE-2 Score: 0.2209


In [17]:
# Mean ROUGE-1 F-score over all rows. Stored under its own name rather than
# reusing the ROUGE-L variable, so each metric's list stays available.
rouge_1_scores = [score["rouge-1"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-1 Score: {sum(rouge_1_scores) / len(rouge_1_scores):.4f}")

Average ROUGE-1 Score: 0.3722


In [18]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# The sentence encoder gets its own name. bert_summary() looks up the global
# `model` at call time, so rebinding `model` here would make any later
# summarisation call run against the encoder instead of the BART model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def compute_similarity(reference, generated):
    """Cosine similarity between the sentence embeddings of two texts.

    Args:
        reference: Reference summary text.
        generated: Generated summary text.

    Returns:
        The single similarity value taken from the 1x1 matrix returned by
        cosine_similarity.
    """
    ref_embedding = embedding_model.encode(reference)
    gen_embedding = embedding_model.encode(generated)
    # cosine_similarity expects 2-D inputs, hence the one-row lists.
    return cosine_similarity([ref_embedding], [gen_embedding])[0][0]


testdf["cosine_similarity"] = testdf.apply(lambda row: compute_similarity(row["Summary"], row["generated_summary"]),
                                           axis=1)
print(f"Average Cosine Similarity: {testdf['cosine_similarity'].mean():.4f}")


Average Cosine Similarity: 0.7268
