In [2]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [4]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=6417ac73a12a3bb2db29bbfab960a8ddfd5df6d62a8bf2f51e48ebd73cc1bdf2
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [5]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from rouge_score import rouge_scorer
import nltk
import pandas as pd

# Download NLTK's 'punkt' sentence-tokenizer data. It is needed by sumy's
# Tokenizer("english") and, presumably, by the nltk.sent_tokenize call made in
# textrank_summarize. The call returns True when the resource is available.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
# Also fetch the 'punkt_tab' resource.
# NOTE(review): presumably required because the installed NLTK release looks up
# 'punkt_tab' (rather than 'punkt') when tokenizing sentences -- confirm against
# the NLTK version in use.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
def lexrank_summarize(text, num_sentences=3):
    """
    Produce an extractive summary of ``text`` with sumy's LexRank summarizer.

    Args:
        text (str): Plain-text document to summarize.
        num_sentences (int): Number of sentences requested from the summarizer.
    Returns:
        str: The selected sentences, joined by single spaces.
    """
    language = "english"

    # Parse the raw string into a sumy document using the English tokenizer.
    document = PlaintextParser.from_string(text, Tokenizer(language)).document

    # Configure LexRank with English stemming and stop words.
    ranker = LexRankSummarizer(Stemmer(language))
    ranker.stop_words = get_stop_words(language)

    # Let the summarizer pick the sentences, then stitch them together.
    picked = ranker(document, num_sentences)
    return " ".join(map(str, picked))





In [7]:
# Loaded SentenceTransformer instances keyed by model name, so that repeated
# summarization calls reuse one model instead of reloading it every time.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, constructing it at most once."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


def textrank_summarize(text, num_sentences=3):
    """
    Generate a summary using TextRank with sentence embeddings.

    Sentences are embedded with the 'all-MiniLM-L6-v2' SentenceTransformer, a
    similarity graph is built from pairwise cosine similarities, and a damped
    PageRank iteration scores each sentence. The highest-scoring sentences are
    returned in the order they appear in ``text``.

    Args:
        text (str): Input text to summarize.
        num_sentences (int): Number of sentences in the summary.
    Returns:
        str: Extracted summary. Empty string when ``text`` is empty/blank or
        ``num_sentences`` is not positive. When the text has no more than
        ``num_sentences`` sentences, all of them are returned unchanged.
    """
    # Nothing to extract: avoid tokenizing or loading the model at all.
    if num_sentences <= 0 or not text or not text.strip():
        return ""

    # Split text into sentences
    sentences = nltk.sent_tokenize(text)
    if len(sentences) <= num_sentences:
        # Every sentence would be selected anyway, so skip the embedding work.
        return " ".join(sentences)

    # Compute sentence embeddings (the model is cached across calls)
    embeddings = _get_embedding_model().encode(sentences, convert_to_tensor=False)

    # Pairwise cosine similarity. Negative values are clipped to zero because
    # the rows are turned into transition probabilities, which cannot be
    # negative; the diagonal is zeroed so a sentence does not vote for itself.
    similarity = np.clip(np.asarray(cosine_similarity(embeddings), dtype=float), 0.0, None)
    np.fill_diagonal(similarity, 0.0)

    # Row-normalize; the epsilon guards against rows that sum to zero.
    transition = similarity / (similarity.sum(axis=1, keepdims=True) + 1e-10)

    # Apply PageRank: fixed cap of 100 iterations, stop early once stable.
    sentence_count = len(sentences)
    damping_factor = 0.85
    scores = np.full(sentence_count, 1.0 / sentence_count)
    for _ in range(100):
        new_scores = (1 - damping_factor) / sentence_count + damping_factor * transition.T.dot(scores)
        converged = np.allclose(scores, new_scores, atol=1e-5)
        scores = new_scores  # keep the latest iterate even when converged
        if converged:
            break

    # Select by position rather than by sentence text so repeated sentences
    # keep their own place; the stable sort breaks score ties by earliest index.
    top_indices = np.argsort(-scores, kind="stable")[:num_sentences]

    # Preserve original order of selected sentences
    return " ".join(sentences[i] for i in sorted(top_indices))

In [8]:

def evaluate_rouge(generated_summaries, reference_summaries):
    """
    Compute ROUGE scores for generated summaries against reference summaries using rouge_score.

    Summaries are paired positionally with ``zip``, so if the two inputs differ
    in length the surplus items of the longer one are ignored.

    Args:
        generated_summaries (list): List of generated summaries.
        reference_summaries (list): List of reference summaries.
    Returns:
        dict: Mean ROUGE scores keyed by metric ('rouge1', 'rouge2', 'rougeL'),
        each a dict with 'precision', 'recall' and 'fmeasure'. When there are
        no summary pairs to score, every value is 0.0.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    stat_names = ('precision', 'recall', 'fmeasure')

    pairs = list(zip(generated_summaries, reference_summaries))
    if not pairs:
        # Avoid dividing by zero below; there is nothing to average.
        return {metric: {stat: 0.0 for stat in stat_names} for metric in metric_names}

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    # Running sums per metric/statistic; divided by the pair count at the end.
    totals = {metric: {stat: 0.0 for stat in stat_names} for metric in metric_names}
    for gen_summary, ref_summary in pairs:
        # rouge_score expects (target, prediction), i.e. the reference first.
        scores = scorer.score(ref_summary, gen_summary)
        for metric in metric_names:
            for stat in stat_names:
                totals[metric][stat] += getattr(scores[metric], stat)

    # Aggregate scores (average)
    pair_count = len(pairs)
    return {
        metric: {stat: totals[metric][stat] / pair_count for stat in stat_names}
        for metric in metric_names
    }

In [9]:
from datasets import load_dataset


In [10]:
# Fetch the BillSum dataset and turn its training split into a DataFrame.
ds = load_dataset("FiscalNote/billsum")
df_billsum = pd.DataFrame(ds['train'])

# Draw a reproducible 1,000-row sample, renumber its rows from zero, and expose
# the 'text' / 'summary' columns under the 'source' / 'target' names that the
# later cells read.
sample_df = (
    df_billsum
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
    .rename(columns={'text': 'source', 'summary': 'target'})
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

data/ca_test-00000-of-00001.parquet:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [13]:

# Try both summarizers on the first three sampled rows.
# 'source' holds the input texts and 'target' the matching reference summaries.
test_texts = sample_df['source'].iloc[:3].tolist()
reference_summaries = sample_df['target'].iloc[:3].tolist()

# Three-sentence LexRank summary for each input text
lexrank_summaries = [
    lexrank_summarize(document, num_sentences=3) for document in test_texts
]

# Three-sentence TextRank summary for each input text
textrank_summaries = [
    textrank_summarize(document, num_sentences=3) for document in test_texts
]





modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:

# Show each LexRank summary next to its reference summary
print("LexRank Summaries vs Reference Summaries:")
for example_no, (generated, reference) in enumerate(
    zip(lexrank_summaries, reference_summaries), start=1
):
    print(f"\nExample {example_no}:")
    print(f"LexRank Summary: {generated}")
    print(f"Reference Summary: {reference}")

# Average ROUGE scores of the LexRank summaries against the references
lexrank_rouge_scores = evaluate_rouge(lexrank_summaries, reference_summaries)
print("\nLexRank ROUGE Scores:")
for metric_name, metric_stats in lexrank_rouge_scores.items():
    print(f"{metric_name}:")
    for label, stat in (("Precision", 'precision'), ("Recall", 'recall'), ("F1 Score", 'fmeasure')):
        print(f"  {label}: {metric_stats[stat]:.4f}")

LexRank Summaries vs Reference Summaries:

Example 1:
LexRank Summary: (a) Procedures for PDP Sponsors To Identify Fraud and Abuse.-- Section 1860D-4(c) of the Social Security Act (42 U.S.C. 1320a-7(b)) is amended by adding at the end the following: ``(17) Inappropriate prescribing or dispensing.--Any individual or entity that the Secretary determines has prescribed or dispensed under title XVIII-- ``(A) a covered part D drug to an individual under a prescription drug plan or a MA-PD plan, as such terms are defined for purposes of part D of such title, that could not have been prescribed or dispensed to the individual on the date of such prescribing or dispensing; or ``(B) any drug under such title at a frequency or amount that-- ``(i) represents a practice or pattern of abusive prescribing or dispensing; or ``(ii) presents a risk to enrollee health or safety.''. (2) Delayed effective date for certain provisions.--The amendments made by subsection (a) shall apply with respect to plan y

In [15]:
# Show each TextRank summary next to its reference summary
print("\nTextRank Summaries vs Reference Summaries:")
for example_no, (generated, reference) in enumerate(
    zip(textrank_summaries, reference_summaries), start=1
):
    print(f"\nExample {example_no}:")
    print(f"TextRank Summary: {generated}")
    print(f"Reference Summary: {reference}")

# Average ROUGE scores of the TextRank summaries against the references
textrank_rouge_scores = evaluate_rouge(textrank_summaries, reference_summaries)
print("\nTextRank ROUGE Scores:")
for metric_name, metric_stats in textrank_rouge_scores.items():
    print(f"{metric_name}:")
    for label, stat in (("Precision", 'precision'), ("Recall", 'recall'), ("F1 Score", 'fmeasure')):
        print(f"  {label}: {metric_stats[stat]:.4f}")


TextRank Summaries vs Reference Summaries:

Example 1:
TextRank Summary: 1395w-104(c)) 
is amended--
            (1) in paragraph (1)(D)--
                    (A) by inserting ``, designed to'' after 
                ``program''; and
                    (B) by inserting ``, that includes the procedures 
                described in paragraph (4)'' after ``waste''; and
            (2) by adding at the end the following:
            ``(4) Procedures to prevent fraud and abuse.--
                    ``(A) PDP sponsor procedures.--A PDP sponsor shall 
                have in place procedures designed to--
                            ``(i) identify an individual that has 
                        obtained coverage for a covered part D drug at 
                        a frequency or amount not medically necessary, 
                        as determined in accordance with utilization 
                        guidelines established by the Secretary;
                            ``(ii) subject t