In [1]:

import numpy as np
import pandas as pd
from evaluate import load
from scipy.ndimage import label
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
import numpy as np
import nltk
import re
import spacy
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load spaCy's small English pipeline (tokenizer, POS tagger, parser, NER).
# The OSError branch covers the case where the model package is not installed:
# it is downloaded once and the load is retried.
import spacy.cli

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def normalize_whitespace(text):
    """Collapse each run of whitespace (spaces, tabs, newlines) to one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def remove_noise_and_references(text):
    """
    Strip citation markers, figure/table mentions and licence boilerplate.

    Removes, in order:
      * numeric bracket citations such as ``[1]`` or ``[12]``;
      * parenthesised citations that end in ``et al.`` plus a four-digit
        year, e.g. ``(Smith et al., 2020)``;
      * ``Fig. 1`` / ``Figure 2a`` / ``Table 3`` style mentions
        (case-insensitive, whole words only);
      * text starting at a ``©`` sign, or at "This article is licensed",
        up to and including the next period or newline.

    Matched spans are deleted rather than replaced, so the whitespace that
    surrounded them is left in place. Extend the patterns as needed.

    Args:
        text: Input string.

    Returns:
        The input with all matched spans removed.
    """
    # Square-bracket citations like [1], [12]
    text = re.sub(r'\[\d+\]', '', text)

    # In-text references like (Smith et al., 2020)
    text = re.sub(r'\(([^)]*et al\.,?\s?\d{4})\)', '', text)

    # "Fig. 1", "Table 2", etc. The leading \b keeps the pattern from firing
    # inside longer words: without it, "config 1" lost "fig 1" and
    # "vegetable 5" lost "table 5".
    text = re.sub(r'\b(?:Fig\.?|Figure|Table)\s?\d+[a-zA-Z]?', '', text, flags=re.IGNORECASE)

    # Licensing and copyright boilerplate
    text = re.sub(r'©.*?(\.|\n)', '', text)
    text = re.sub(r'This article is licensed.*?(\.|\n)', '', text, flags=re.IGNORECASE)

    return text

def sentence_segmentation(text):
    """Split text into sentences with the spaCy pipeline, dropping blank ones.

    Returns a list of sentence strings with surrounding whitespace stripped.
    """
    sentences = []
    for span in nlp(text).sents:
        stripped = span.text.strip()
        if stripped:
            sentences.append(stripped)
    return sentences

def preprocess_text(raw_text):
    """Clean raw text and return it as one string of space-joined sentences.

    Stages, in order: whitespace normalisation, noise/reference removal,
    sentence segmentation.
    """
    cleaned = remove_noise_and_references(normalize_whitespace(raw_text))
    return ' '.join(map(str, sentence_segmentation(cleaned)))

In [3]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def lexrank_summary(text, num_sentences=3):
    """Return an extractive LexRank summary of ``text``.

    The text is first passed through ``preprocess_text``; the sentences
    returned by sumy's ``LexRankSummarizer`` are joined with single spaces.

    Args:
        text: Raw document text.
        num_sentences: Sentence count handed to the summarizer.

    Returns:
        The selected sentences joined into one string.
    """
    document = PlaintextParser.from_string(preprocess_text(text), Tokenizer("english")).document
    selected = LexRankSummarizer()(document, num_sentences)
    return ' '.join(map(str, selected))

# Example usage: smoke-test lexrank_summary on a short six-sentence passage,
# asking for a two-sentence summary.
text = """
Artificial intelligence is rapidly evolving. Many industries are integrating AI to enhance performance.
Applications include healthcare, finance, and transportation. AI also raises ethical and employment concerns.
Researchers are focused on creating explainable and fair AI. Governments are responding with new regulations and policies.
"""

summary = lexrank_summary(text, num_sentences=2)
print(summary)


Artificial intelligence is rapidly evolving. Many industries are integrating AI to enhance performance.


In [4]:
# Evaluation data. Two CSVs are referenced here; only the gov-report file is loaded.
# BBC News dataset (disabled) - uncomment to evaluate on it instead:
# testdf=pd.read_csv('bbc_news_with_articles_and_extractive_summary.csv')
# Gov-report dataset (active). Later cells read its 'Article', 'Summary'
# and 'extractive_summary' columns.

testdf = pd.read_csv('gov_report_with_articles_and_extractive_summary.csv')

In [5]:
# Summarise every article with LexRank, requesting 25 sentences per document.
testdf['generated_summary'] = testdf['Article'].apply(
    lambda article: str(lexrank_summary(article, 25))
)

In [6]:
import numpy as np

# Drop rows whose generated summary is missing or blank (empty / whitespace-only).
# DataFrame.replace() and DataFrame.dropna() return new objects instead of
# modifying testdf, so their results must be assigned to have any effect.
# Only 'generated_summary' is checked, so a missing value in an unrelated
# column does not discard an otherwise usable row.
testdf = testdf.assign(
    generated_summary=testdf["generated_summary"].replace(r'^\s*$', np.nan, regex=True)
).dropna(subset=["generated_summary"])
len(testdf)

973

In [7]:
# Keep only the first row for each distinct generated summary, then show how
# often each remaining summary occurs.
testdf = testdf.drop_duplicates(subset=["generated_summary"], keep="first")
testdf["generated_summary"].value_counts()

generated_summary
The technology to produce cell-cultured meat at a commercial scale is still in development, and information about the methods to be used for commercial production and the composition of the final product are not yet known. As a result, certain information is not yet available to stakeholders—including cell-cultured meat firms themselves, regulators, and the public—about specific aspects of the technology and commercial production methods that will be used, such as the composition of the growth medium and of the final products. For example, if the scaffold on which the cell-cultured meat is grown is not edible, the agencies may require firms to disclose certain aspects of their commercial production methods, such as how they removed the cell- cultured meat from the scaffold. According to FDA officials and representatives from one cell-cultured meat firm, it is likely that some firms will use genetic engineering in their commercial cell-cultured meat production methods.

In [8]:
import re
def clean_text(text):
    """Return ``text`` as a string with punctuation removed, ends trimmed and lowercased.

    Every character that is neither a word character nor whitespace is deleted.
    """
    without_punct = re.sub(r'[^\w\s]', '', str(text))
    trimmed = without_punct.strip()
    return trimmed.lower()

In [9]:
# Flag rows where the generated summary equals the full article once
# punctuation, case and surrounding whitespace are ignored. Both sides must be
# per-row values: handing the whole testdf['generated_summary'] Series to
# clean_text stringifies the Series itself, so the comparison could never be True.
testdf['is_match'] = testdf.apply(
    lambda row: clean_text(row['Article']) == clean_text(row['generated_summary']),
    axis=1,
)

In [10]:
# Preview the first rows, including the is_match column.
testdf.head()

Unnamed: 0.1,Unnamed: 0,Article,Summary,extractive_summary,generated_summary,is_match
0,0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...,The technology to produce cell-cultured meat a...,False
1,1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...,EPA’s Ethics Office is responsible for helping...,False
2,2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...,Individual DOD facilities projects within inst...,False
3,3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...,"In addition to standard surveys, state survey ...",False
4,4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...,"As of March 2019, EM’s IWTU reengineering proj...",False


In [11]:
from rouge import Rouge

# Single Rouge scorer instance, used by compute_rouge below for every row.
rouge = Rouge()

def truncate_text(text, max_words=100):
    """Return at most the first ``max_words`` whitespace-separated words of ``text``, joined by single spaces."""
    words = text.split()
    return " ".join(words[:max_words])

def compute_rouge(reference, generated, max_words=100):
    """Score a generated summary against a reference with ROUGE-1/2/L.

    Both texts are cut to their first ``max_words`` words before scoring.

    Args:
        reference: Reference summary text.
        generated: Generated summary text (the hypothesis).
        max_words: Word cap applied to both texts; defaults to 100.

    Returns:
        A dict keyed by "rouge-1", "rouge-2" and "rouge-l"; each value is a
        dict with the keys "r", "p" and "f". When either text is blank or is
        not a string, every score is 0.0 and the scorer is not called.
    """
    def _zero_scores():
        # Same nested shape as real scores, so lookups such as
        # score["rouge-l"]["f"] work for skipped rows too. Built fresh on every
        # call so callers never share one mutable dict.
        return {metric: {"r": 0.0, "p": 0.0, "f": 0.0}
                for metric in ("rouge-1", "rouge-2", "rouge-l")}

    def _is_blank(value):
        return not isinstance(value, str) or not value.strip()

    if _is_blank(reference) or _is_blank(generated):
        return _zero_scores()

    # Truncate long summaries
    reference = truncate_text(reference, max_words)
    generated = truncate_text(generated, max_words)

    # A non-positive max_words can leave nothing to score.
    if not reference or not generated:
        return _zero_scores()

    scores = rouge.get_scores(generated, reference)
    return scores[0]

# Score every generated summary against its extractive reference summary.
testdf["rouge_scores"] = testdf.apply(
    lambda row: compute_rouge(row["extractive_summary"], row["generated_summary"]),
    axis=1,
)


In [12]:
# Preview the first rows, including the rouge_scores column.
testdf.head()

Unnamed: 0.1,Unnamed: 0,Article,Summary,extractive_summary,generated_summary,is_match,rouge_scores
0,0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...,The technology to produce cell-cultured meat a...,False,"{'rouge-1': {'r': 0.23684210526315788, 'p': 0...."
1,1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...,EPA’s Ethics Office is responsible for helping...,False,"{'rouge-1': {'r': 0.17333333333333334, 'p': 0...."
2,2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...,Individual DOD facilities projects within inst...,False,"{'rouge-1': {'r': 0.2, 'p': 0.2105263157894736..."
3,3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...,"In addition to standard surveys, state survey ...",False,"{'rouge-1': {'r': 0.18309859154929578, 'p': 0...."
4,4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...,"As of March 2019, EM’s IWTU reengineering proj...",False,"{'rouge-1': {'r': 0.36619718309859156, 'p': 0...."


In [13]:
# Mean ROUGE-L F1 over all scored rows
rouge_l_scores = [row_scores["rouge-l"]["f"] for row_scores in testdf["rouge_scores"]]
print(f"Average ROUGE-L Score: {sum(rouge_l_scores) / len(rouge_l_scores):.4f}")

Average ROUGE-L Score: 0.2911


In [14]:
# Mean ROUGE-2 F1 over all scored rows.
# The list is named for the metric it holds; it was previously called
# rouge_l_scores although it contained ROUGE-2 values.
rouge_2_scores = [score["rouge-2"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-2 Score: {sum(rouge_2_scores) / len(rouge_2_scores):.4f}")

Average ROUGE-2 Score: 0.1425


In [15]:
# Mean ROUGE-1 F1 over all scored rows.
# The list is named for the metric it holds; it was previously called
# rouge_l_scores although it contained ROUGE-1 values.
rouge_1_scores = [score["rouge-1"]["f"] for score in testdf["rouge_scores"]]
print(f"Average ROUGE-1 Score: {sum(rouge_1_scores) / len(rouge_1_scores):.4f}")

Average ROUGE-1 Score: 0.3237


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used by compute_similarity; created once here and reused for every row.
model = SentenceTransformer('all-MiniLM-L6-v2')


def compute_similarity(reference, generated):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded separately with the module-level ``model``.
    """
    embeddings = [model.encode(passage) for passage in (reference, generated)]
    similarity_matrix = cosine_similarity([embeddings[0]], [embeddings[1]])
    return similarity_matrix[0][0]


# Embedding-based similarity of each generated summary to its reference.
# NOTE(review): the reference here is the 'Summary' column, whereas the ROUGE
# and token-overlap cells compare against 'extractive_summary' - confirm the
# difference is intentional.
testdf["cosine_similarity"] = testdf.apply(
    lambda row: compute_similarity(row["Summary"], row["generated_summary"]),
    axis=1,
)
print(f"Average Cosine Similarity: {testdf['cosine_similarity'].mean():.4f}")


Average Cosine Similarity: 0.7756


In [17]:
from sklearn.metrics import precision_score, recall_score, f1_score
from collections import Counter
def precision_recall_f1(reference_texts, generated_summaries):
    """Average token-overlap precision, recall and F1 over paired texts.

    Each text is lowercased and split on whitespace. For every pair, the
    overlap is the multiset intersection of the two token bags, so a repeated
    token only counts as often as it appears on both sides.

    Args:
        reference_texts: Iterable of reference strings.
        generated_summaries: Iterable of generated strings, paired by position
            with ``reference_texts`` (pairing stops at the shorter input).

    Returns:
        A dict with "average_precision", "average_recall" and
        "average_f1_score". All three are 0.0 when there are no pairs.
    """
    precisions, recalls, f1s = [], [], []

    for ref, gen in zip(reference_texts, generated_summaries):
        ref_counter = Counter(ref.lower().split())
        gen_counter = Counter(gen.lower().split())

        # Counter & Counter keeps the minimum count per token.
        tp = sum((ref_counter & gen_counter).values())
        gen_total = sum(gen_counter.values())  # tp + fp
        ref_total = sum(ref_counter.values())  # tp + fn

        precision = tp / gen_total if gen_total > 0 else 0.0
        recall = tp / ref_total if ref_total > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    if not f1s:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return {
            "average_precision": 0.0,
            "average_recall": 0.0,
            "average_f1_score": 0.0
        }

    return {
        "average_precision": np.mean(precisions),
        "average_recall": np.mean(recalls),
        "average_f1_score": np.mean(f1s)
    }

In [18]:
# Token-overlap metrics of the generated summaries against the extractive references
reference_summaries = testdf['extractive_summary'].tolist()
generated_summaries = testdf['generated_summary'].tolist()

results = precision_recall_f1(reference_summaries, generated_summaries)

# Report each average rounded to four decimals
print(f"Average Precision: {round(results['average_precision'], 4)}")
print(f"Average Recall: {round(results['average_recall'], 4)}")
print(f"Average F1 Score: {round(results['average_f1_score'], 4)}")

Average Precision: 0.4739
Average Recall: 0.6392
Average F1 Score: 0.5321
