In [1]:
import json
import os
import pandas as pd

from datasets import load_dataset

# Load the GovReport summarization dataset; the printed DatasetDict below
# shows train / validation / test splits with 'report' and 'summary' columns.
ds = load_dataset(path="ccdv/govreport-summarization")


  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 17517/17517 [00:00<00:00, 18084.33 examples/s]
Generating validation split: 100%|██████████| 973/973 [00:00<00:00, 13300.50 examples/s]
Generating test split: 100%|██████████| 973/973 [00:00<00:00, 14597.78 examples/s]


In [2]:
# Show the split names, column names and row counts of the loaded dataset.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['report', 'summary'],
        num_rows: 17517
    })
    validation: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
    test: Dataset({
        features: ['report', 'summary'],
        num_rows: 973
    })
})


In [3]:
# Persist every split of the dataset to a local directory so it can be
# reloaded later without fetching it again.
save_dir = "gov_report_dataset"
ds.save_to_disk(save_dir)
print("Dataset saved successfully!")

Saving the dataset (2/2 shards): 100%|██████████| 17517/17517 [00:02<00:00, 7573.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 973/973 [00:00<00:00, 7011.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 973/973 [00:00<00:00, 8082.60 examples/s]

Dataset saved successfully!





In [4]:
# Peek at the first test report. Index the row first and the column second:
# this returns the same string but only reads one example instead of
# materialising the whole 'report' column.
ds['test'][0]['report']

'In our prior work, we have found that technological innovation involves not only creating new ideas but also translating those ideas into a new product or service. Innovation, and the research driving it, is inherently risky because the likelihood that research can be translated into a product or service and the ultimate value of that product or service are unknown. The Department of Commerce’s National Institute of Standards and Technology describes the path from innovation to commercialization as comprised of three overarching stages: inventing, transitioning to making, and selling. (See fig. 1 for a description of the path from innovation to commercialization.) FDA and USDA have responsibility for overseeing the safety of the food supply. In general, FDA is responsible for ensuring the safety of virtually all domestic and imported food products except those regulated by USDA. USDA is responsible for ensuring the safety of meat, poultry, processed egg products, and catfish. FDA and 

In [5]:
# Peek at the reference summary of the first test example. Row-first
# indexing returns the same string without reading the whole 'summary' column.
ds['test'][0]['summary']

"Multiple firms have produced cell-cultured meat as part of their research and development. These products appear likely to become available to consumers in coming years. FDA and USDA are the primary agencies responsible for overseeing the safety of the nation's food supply. However, some stakeholders have expressed concern about the agencies' oversight of cell-cultured meat amidst a fragmented federal food safety oversight system. GAO was asked to review federal oversight of cell-cultured meat. This report (1) describes what is known about methods for commercially producing cell-cultured meat, and (2) examines the extent to which FDA and USDA are collaborating to provide regulatory oversight of cell-cultured meat. GAO conducted a literature review; reviewed documentation from FDA, USDA, and stakeholder groups; analyzed public comments submitted to the agencies; compared agency efforts with leading practices for interagency collaboration; and conducted site visits to selected cell-cult

In [6]:
from dotenv import load_dotenv
from pprint import pprint
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import pandas as pd
import requests
import os

In [7]:
from sentence_transformers import SentenceTransformer, util
import nltk

# Sentence-tokenizer data used by nltk.sent_tokenize in the cells below.
nltk.download("punkt")

# Sentence-embedding model shared by get_summary().
model = SentenceTransformer("all-MiniLM-L6-v2")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def get_summary(article_text="SSS Document", abstractive_summary=""):
    """Build an extractive summary by aligning a reference summary to its article.

    Both texts are split into sentences with ``nltk.sent_tokenize`` and
    embedded with the notebook-level ``model``. For every sentence of
    ``abstractive_summary`` the article sentence with the highest cosine
    similarity is selected. Selections keep the order of the summary
    sentences, and the same article sentence may be selected more than once
    when several summary sentences map to it.

    Args:
        article_text: Full text of the source document.
        abstractive_summary: Reference summary whose sentences drive the
            selection.

    Returns:
        The selected article sentences joined by single spaces, or an empty
        string when either input contains no sentences.
    """
    # Nothing to align: return early instead of sending empty batches to the
    # encoder (the default abstractive_summary="" would otherwise do that).
    if not article_text or not abstractive_summary:
        return ""

    article_sentences = nltk.sent_tokenize(article_text)
    summary_sentences = nltk.sent_tokenize(abstractive_summary)
    if not article_sentences or not summary_sentences:
        return ""

    article_embeddings = model.encode(article_sentences, convert_to_tensor=True)
    summary_embeddings = model.encode(summary_sentences, convert_to_tensor=True)

    # One (summary sentences x article sentences) similarity matrix replaces
    # the per-sentence cos_sim calls; argmax over dim=1 picks, for each
    # summary sentence, the index of its closest article sentence.
    cosine_scores = util.cos_sim(summary_embeddings, article_embeddings)
    best_indices = cosine_scores.argmax(dim=1).tolist()

    # Tokenized sentences keep their own end punctuation, so join with a
    # plain space; joining with ". " produced doubled periods ("..").
    return " ".join(article_sentences[idx] for idx in best_indices)
    

In [9]:
# Convert the test split to a pandas DataFrame and preview the first rows.
test_split = ds["test"]
df = test_split.to_pandas()
df.head(n=5)

Unnamed: 0,report,summary
0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...
1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...
2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...
3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...
4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...


In [11]:
def _summarize_row(row):
    """Return the extractive summary for one row's report/summary pair."""
    return str(get_summary(row['report'], row['summary']))


# Row-wise pass over the frame: each report is matched against its own
# reference summary and the result is stored in a new column.
df['extractive_summary'] = df.apply(_summarize_row, axis=1)

In [13]:
# Rename the two source columns for the exported table; the
# 'extractive_summary' column keeps its name.
column_names = {"report": "Article", "summary": "Summary"}
df2 = df.rename(columns=column_names)
df2.head(n=5)

Unnamed: 0,Article,Summary,extractive_summary
0,"In our prior work, we have found that technolo...",Multiple firms have produced cell-cultured mea...,Some firms have developed prototypes of cell-c...
1,"A variety of federal laws, regulations, and po...",Federal advisory committees provide advice to ...,EPA’s federal advisory committees play an impo...
2,"According to the National Research Council, al...",DOD manages a global real-estate portfolio wit...,DOD has a global real estate portfolio that su...
3,Nursing homes are required to keep residents s...,Nursing homes provide care to about 1.4 millio...,Nursing home characteristics.. At the national...
4,This section provides an overview of (1) the l...,Decades of defense activities at DOE's Idaho N...,DOE has a process for determining that certain...


In [14]:
# Write the articles, reference summaries and extractive summaries to CSV.
# index=True keeps the row index as the first (unnamed) column of the file.
output_csv = "gov_report_with_articles_and_extractive_summary.csv"
df2.to_csv(output_csv, index=True)

