In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load English tokenizer, POS tagger, parser, NER from spaCy
import spacy.cli
import re
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def normalize_whitespace(text):
    """Remove extra whitespace, newlines, and tabs."""
    return re.sub(r'\s+', ' ', text).strip()

def remove_noise_and_references(text):
    """
    Remove references like [1], (Smith et al., 2020), and figure/table mentions.
    You can extend the patterns as needed.
    """
    # Remove square bracket citations like [1], [12]
    text = re.sub(r'\[\d+\]', '', text)
    
    # Remove in-text references like (Smith et al., 2020)
    text = re.sub(r'\(([^)]*et al\.,?\s?\d{4})\)', '', text)
    
    # Remove "Fig. 1", "Table 2", etc.
    text = re.sub(r'(Fig\.?|Figure|Table)\s?\d+[a-zA-Z]?', '', text, flags=re.IGNORECASE)
    
    # Remove licensing and copyright boilerplate
    text = re.sub(r'©.*?(\.|\n)', '', text)
    text = re.sub(r'This article is licensed.*?(\.|\n)', '', text, flags=re.IGNORECASE)
    
    return text

def sentence_segmentation(text):
    """Segment text into individual sentences using spaCy."""
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents if sent.text.strip()]

def preprocess_text(raw_text):
    """Complete preprocessing pipeline."""
    step1 = normalize_whitespace(raw_text)
    step2 = remove_noise_and_references(step1)
    sentences = sentence_segmentation(step2)
    return ' '.join(str(sentence) for sentence in sentences)

In [3]:
def cluster_summary(text, num_sentences=3):
    # call preprocess
    clean_text=preprocess_text(text)
    return sbert_with_clustering(clean_text,num_sentences,num_sentences)

def sbert_with_clustering(text, num_sentences=3, num_clusters=3):

    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text
    embeddings = model.encode(sentences)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(embeddings)
    cluster_summaries = []
    for cluster_id in range(num_clusters):
        cluster_sentences = [sentences[i] for i in range(len(sentences)) if clusters[i] == cluster_id]
        cluster_embeddings = [embeddings[i] for i in range(len(sentences)) if clusters[i] == cluster_id]
        sim_matrix = cosine_similarity(cluster_embeddings)
        scores = sim_matrix.sum(axis=1)
        top_sentence_idx = np.argmax(scores)
        cluster_summaries.append(cluster_sentences[top_sentence_idx])
    summary = " ".join(cluster_summaries)
    return summary

In [4]:
text = """
Artificial intelligence (AI) has seen rapid advancements in recent years. Many industries are adopting AI for automation and efficiency. 
Healthcare, finance, and manufacturing are among the top sectors leveraging AI technologies. 
Despite its benefits, there are concerns about job displacement and ethical implications. 
AI researchers are working on developing responsible and transparent systems. 
Governments are also introducing regulations to govern AI development and deployment.
"""

summary = cluster_summary(text, num_sentences=5)
print(summary)

Healthcare, finance, and manufacturing are among the top sectors leveraging AI technologies. Despite its benefits, there are concerns about job displacement and ethical implications. Artificial intelligence (AI) has seen rapid advancements in recent years. Governments are also introducing regulations to govern AI development and deployment. AI researchers are working on developing responsible and transparent systems.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
#evaluation  

In [6]:
import numpy as np
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict

In [7]:
# Load BBC dataset
testdf=pd.read_csv('bbc_news_with_articles_and_extractive_summary.csv')
#load gov report dataset
testdf = pd.read_csv('gov_report_with_articles_and_extractive_summary.csv')

testdf.head()

Unnamed: 0.1,Unnamed: 0,Article,Summary,extractive_summary
0,0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...
1,1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...
2,2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...
3,3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...
4,4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...


In [8]:
testdf['generated_summary'] = testdf['Article'].apply(lambda x:str(cluster_summary( x, 25)))

In [9]:
testdf.replace(r'^\s*$', np.nan, regex=True)
testdf.dropna(axis=0, how='any')
# Remove rows where "Title" is empty or null
testdf = testdf[testdf["generated_summary"].notna() & (testdf["generated_summary"] != "")]
len(testdf)

973

In [10]:
testdf=testdf.drop_duplicates(subset=["generated_summary"])
testdf['generated_summary'].value_counts()

generated_summary
As of December 2019, these firms had not provided regulators with specific information detailing the composition of their cell-cultured meat prototypes, according to FDA and USDA officials. Have participating agencies articulated and agreed to a process for making and enforcing decisions? In addition, as part of their oversight of the food supply, FDA and USDA oversee food labeling of the products under their respective jurisdictions. The ability to regularly attend activities of the collaborative mechanism? Developing and updating written guidance and agreements. We shared our description of the process for making cell-cultured meat, and associated questions, with representatives from three cell-cultured meat firms and academic researchers at two universities for their technical review and incorporated revisions as appropriate. What type of growth medium will producers use, and how might variations in the media affect the final product? Instead, the agencies stated t

In [11]:
import re
# Function to clean text
def clean_text(text):
    return re.sub(r'[^\w\s]', '', str(text)).strip().lower()

In [12]:
from rouge import Rouge

rouge = Rouge()

def truncate_text(text, max_words=100):
    return " ".join(text.split()[:max_words])

def compute_rouge(reference, generated):
    if not generated.strip():  # Handle empty strings
        return {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
    
    # Truncate long summaries
    reference = truncate_text(reference)
    generated = truncate_text(generated)
    
    scores = rouge.get_scores(generated, reference)
    return scores[0]

# Apply function
testdf["rouge_scores"] = testdf.apply(lambda row: compute_rouge(row["extractive_summary"], row["generated_summary"]), axis=1)

In [13]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-l"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.2223


In [14]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-2"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-2 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-2 Score: 0.0729


In [15]:
# Display average ROUGE scores
rouge_l_scores = [score["rouge-1"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-1 Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-1 Score: 0.2532


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
def compute_similarity(reference, generated):
    ref_embedding = model.encode(reference)
    gen_embedding = model.encode(generated)
    return cosine_similarity([ref_embedding], [gen_embedding])[0][0]

testdf["cosine_similarity"] = testdf.apply(lambda row: compute_similarity(row["extractive_summary"], row["generated_summary"]), axis=1)
print(f"Average Cosine Similarity: {testdf['cosine_similarity'].mean():.4f}")


Average Cosine Similarity: 0.7733


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter
def precision_recall_f1(reference_texts, generated_summaries):
    precisions, recalls, f1s = [], [], []

    for ref, gen in zip(reference_texts, generated_summaries):
        ref_tokens = ref.lower().split()
        gen_tokens = gen.lower().split()

        ref_counter = Counter(ref_tokens)
        gen_counter = Counter(gen_tokens)

        common = ref_counter & gen_counter
        tp = sum(common.values())
        fp = sum(gen_counter.values()) - tp
        fn = sum(ref_counter.values()) - tp

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    return {
        "average_precision": np.mean(precisions),
        "average_recall": np.mean(recalls),
        "average_f1_score": np.mean(f1s)
    }


In [18]:
reference_summaries = testdf['extractive_summary'].tolist()
generated_summaries = testdf['generated_summary'].tolist()

# Evaluate
results = precision_recall_f1(reference_summaries, generated_summaries)
# Print results
print("Average Precision:", round(results['average_precision'], 4))
print("Average Recall:", round(results['average_recall'], 4))
print("Average F1 Score:", round(results['average_f1_score'], 4))

Average Precision: 0.5092
Average Recall: 0.5276
Average F1 Score: 0.5058
