In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import re
import spacy
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load English tokenizer, POS tagger, parser, NER from spaCy
import spacy.cli

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def normalize_whitespace(text):
    """Remove extra whitespace, newlines, and tabs."""
    return re.sub(r'\s+', ' ', text).strip()

def remove_noise_and_references(text):
    """
    Remove references like [1], (Smith et al., 2020), and figure/table mentions.
    You can extend the patterns as needed.
    """
    # Remove square bracket citations like [1], [12]
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove in-text references like (Smith et al., 2020)
    text = re.sub(r'\(([^)]*et al\.,?\s?\d{4})\)', '', text)
    
    # Remove "Fig. 1", "Table 2", etc.
    text = re.sub(r'(Fig\.?|Figure|Table)\s?\d+[a-zA-Z]?', '', text, flags=re.IGNORECASE)
    
    # Remove licensing and copyright boilerplate
    text = re.sub(r'©.*?(\.|\n)', '', text)
    text = re.sub(r'This article is licensed.*?(\.|\n)', '', text, flags=re.IGNORECASE)
    
    return text

def sentence_segmentation(text):
    """Segment text into individual sentences using spaCy."""
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def preprocess_text(raw_text):
    """Complete preprocessing pipeline."""
    step1 = normalize_whitespace(raw_text)
    step2 = remove_noise_and_references(step1)
    sentences = sentence_segmentation(step2)
    return sentences

In [3]:
def summarize_with_tfidf(text, num_sentences=3):
    # call preprocess
    clean_text=preprocess_text(text)
    # Step 1: Sentence Tokenization
    sentences = sent_tokenize(str(clean_text))

    # Check if there are enough sentences
    if len(sentences) == 0:
        return "No content to summarize."

    try:
        # Step 2: Compute TF-IDF matrix
        vectorizer = TfidfVectorizer(stop_words='english')  # Filter stopwords
        tfidf_matrix = vectorizer.fit_transform(sentences)

        if tfidf_matrix.shape[1] == 0:
            return "The text contains only stopwords or insufficient content for summarization."

        # Step 3: Calculate sentence scores using cosine similarity
        sentence_scores = cosine_similarity(tfidf_matrix, tfidf_matrix).sum(axis=1)

        # Step 4: Rank sentences
        ranked_sentences_idx = np.argsort(sentence_scores)[::-1]

        # Step 5: Select top N sentences
        selected_sentences = [sentences[i] for i in sorted(ranked_sentences_idx[:num_sentences])]

        # Step 6: Combine them into a summary
        summary = " ".join(selected_sentences)
        return summary
    
    except ValueError as e:
        return f"Error: {str(e)}"


In [4]:
text = """
Musicians to tackle US red tape\n\nMusicians' groups are to tackle US visa regulations which are blamed for hindering British acts' chances of succeeding across the Atlantic.\n\nA singer hoping to perform in the US can expect to pay $1,300 (£680) simply for obtaining a visa. Groups including the Musicians' Union are calling for an end to the "raw deal" faced by British performers. US acts are not faced with comparable expense and bureaucracy when visiting the UK for promotional purposes.\n\nNigel McCune from the Musicians' Union said British musicians are "disadvantaged" compared to their US counterparts. A sponsor has to make a petition on their behalf, which is a form amounting to nearly 30 pages, while musicians face tougher regulations than athletes and journalists. "If you make a mistake on your form, you risk a five-year ban and thus the ability to further your career," says Mr McCune.\n\n"The US is the world's biggest music market, which means something has to be done about the creaky bureaucracy," says Mr McCune. "The current situation is preventing British acts from maintaining momentum and developing in the US," he added.\n\nThe Musicians' Union stance is being endorsed by the Music Managers' Forum (MMF), who say British artists face "an uphill struggle" to succeed in the US, thanks to the tough visa requirements, which are also seen as impractical. The MMF's general secretary James Seller said: "Imagine if you were an orchestra from the Orkneys? Every member would have to travel to London to have their visas processed."\n\n"The US market is seen as the holy grail and one of the benchmarks of success, and we're still going to fight to get in there. "It's still very important, but there are other markets like Europe, India and China," added Mr Seller. A Department for Media, Culture and Sport spokeswoman said: "We're aware that people are experiencing problems, and are working with the US embassy and record industry to see what we can do about it." A US Embassy spokesman said: "We are aware that entertainers require visas for time-specific visas and are doing everything we can to process those applications speedily." "We are aware of the importance of cultural exchange and we will do our best to facilitate that," he added.\n
"""

summary = summarize_with_tfidf(text, num_sentences=5)
print(str(summary))


["Musicians to tackle US red tape Musicians' groups are to tackle US visa regulations which are blamed for hindering British acts' chances of succeeding across the Atlantic. ', 'Groups including the Musicians\' Union are calling for an end to the "raw deal" faced by British performers. ', 'Nigel McCune from the Musicians\' Union said British musicians are "disadvantaged" compared to their US counterparts. ', '"The US is the world\'s biggest music market, which means something has to be done about the creaky bureaucracy," says Mr McCune. ', 'The Musicians\' Union stance is being endorsed by the Music Managers\' Forum (MMF), who say British artists face "an uphill struggle" to succeed in the US, thanks to the tough visa requirements, which are also seen as impractical.


In [5]:
#evaluation  

In [6]:
import numpy as np
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# BBC News Dataset
# testdf=pd.read_csv('bbc_news_with_articles_and_extractive_summary.csv')
# gov report dataset

testdf=pd.read_csv('gov_report_with_articles_and_extractive_summary.csv')

In [8]:
testdf.head()

Unnamed: 0.1,Unnamed: 0,Article,Summary,extractive_summary
0,0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...
1,1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...
2,2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...
3,3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...
4,4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...


In [9]:
testdf['generated_summary'] = testdf['Article'].apply(lambda x:str(summarize_with_tfidf( x, 25)))

In [10]:
testdf.replace(r'^\s*$', np.nan, regex=True)
testdf.dropna(axis=0, how='any')
# Remove rows where "Title" is empty or null
testdf = testdf[testdf["generated_summary"].notna() & (testdf["generated_summary"] != "")]
len(testdf)

973

In [11]:
testdf=testdf.drop_duplicates(subset=["generated_summary"])
testdf['generated_summary'].value_counts()

generated_summary
', 'As a result, certain information is not yet available to stakeholders—including cell-cultured meat firms themselves, regulators, and the public—about specific aspects of the technology and commercial production methods that will be used, such as the composition of the growth medium and of the final products. ', 'Each cell-cultured meat firm is developing detailed information on its own eventual commercial production methods for making cell-cultured meat. ', 'Specifically, FDA and USDA officials said they have limited information on cell-cultured meat production methods and products and need more in order to regulate this new food. ', 'According to FDA officials and representatives from one cell-cultured meat firm, it is likely that some firms will use genetic engineering in their commercial cell-cultured meat production methods. ', 'Some studies and stakeholders we interviewed, including representatives from cell-cultured meat firms, said that the high production 

In [12]:
import re
# Function to clean text
def clean_text(text):
    return re.sub(r'[^\w\s]', '', str(text)).strip().lower()

In [13]:
testdf['is_match'] = testdf.apply(lambda row: clean_text(row['Article']) == clean_text(testdf['generated_summary']), axis=1)

In [14]:
testdf.head()

Unnamed: 0.1,Unnamed: 0,Article,Summary,extractive_summary,generated_summary,is_match
0,0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...,"', 'As a result, certain information is not ye...",False
1,1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...,"', 'Based on our review of EPA’s Federal Advis...",False
2,2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...,"', 'However, 8 of the 23 installations we visi...",False
3,3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...,"', 'In addition to standard surveys, state sur...",False
4,4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...,"', 'As of March 2019, EM’s IWTU reengineering ...",False


In [15]:
from rouge import Rouge

rouge = Rouge()

def truncate_text(text, max_words=100):
    return " ".join(text.split()[:max_words])

def compute_rouge(reference, generated):
    if not generated.strip():  # Handle empty strings
        return {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
    
    # Truncate long summaries
    reference = truncate_text(reference)
    generated = truncate_text(generated)
    
    scores = rouge.get_scores(generated, reference)
    return scores[0]

# Apply function
testdf["rouge_scores"] = testdf.apply(lambda row: compute_rouge(row["extractive_summary"], row["generated_summary"]), axis=1)


In [16]:
testdf.head()

Unnamed: 0.1,Unnamed: 0,Article,Summary,extractive_summary,generated_summary,is_match,rouge_scores
0,0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...,"', 'As a result, certain information is not ye...",False,"{'rouge-1': {'r': 0.27631578947368424, 'p': 0...."
1,1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...,"', 'Based on our review of EPA’s Federal Advis...",False,"{'rouge-1': {'r': 0.13333333333333333, 'p': 0...."
2,2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...,"', 'However, 8 of the 23 installations we visi...",False,"{'rouge-1': {'r': 0.2, 'p': 0.2539682539682539..."
3,3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...,"', 'In addition to standard surveys, state sur...",False,"{'rouge-1': {'r': 0.2112676056338028, 'p': 0.2..."
4,4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...,"', 'As of March 2019, EM’s IWTU reengineering ...",False,"{'rouge-1': {'r': 0.18309859154929578, 'p': 0...."


In [17]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-l"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.2818


In [18]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-2"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-2 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-2 Score: 0.1421


In [19]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-1"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-1 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-1 Score: 0.3115


In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
def compute_similarity(reference, generated):
    ref_embedding = model.encode(reference)
    gen_embedding = model.encode(generated)
    return cosine_similarity([ref_embedding], [gen_embedding])[0][0]

testdf["cosine_similarity"] = testdf.apply(lambda row: compute_similarity(row["extractive_summary"], row["generated_summary"]), axis=1)
print(f"Average Cosine Similarity: {testdf['cosine_similarity'].mean():.4f}")


Average Cosine Similarity: 0.7873


In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter
def precision_recall_f1(reference_texts, generated_summaries):
    precisions, recalls, f1s = [], [], []

    for ref, gen in zip(reference_texts, generated_summaries):
        ref_tokens = ref.lower().split()
        gen_tokens = gen.lower().split()

        ref_counter = Counter(ref_tokens)
        gen_counter = Counter(gen_tokens)

        common = ref_counter & gen_counter
        tp = sum(common.values())
        fp = sum(gen_counter.values()) - tp
        fn = sum(ref_counter.values()) - tp

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return {
        "average_precision": np.mean(precisions),
        "average_recall": np.mean(recalls),
        "average_f1_score": np.mean(f1s)
    }


In [22]:
reference_summaries = testdf['extractive_summary'].tolist()  
generated_summaries = testdf['generated_summary'].tolist()  

# Evaluate
results = precision_recall_f1(reference_summaries, generated_summaries)
# Print results
print("Average Precision:", round(results['average_precision'], 4))
print("Average Recall:", round(results['average_recall'], 4))
print("Average F1 Score:", round(results['average_f1_score'], 4))

Average Precision: 0.4577
Average Recall: 0.6099
Average F1 Score: 0.5108
