In [1]:
from dotenv import load_dotenv
from pprint import pprint
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import pandas as pd
import requests
import os

In [2]:
from sentence_transformers import SentenceTransformer, util
import nltk

nltk.download('punkt')

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jameelamer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def get_summary(article_text="SSS Document",abstractive_summary=""):
    # # Create parser and summarizer
    # parser = PlaintextParser.from_string(document_content, Tokenizer("english"))
    # summarizer = TextRankSummarizer()
    # summary = summarizer(parser.document, 5)
    # return ". ".join(str(sentence) for sentence in summary)  # Convert Sentence objects to string
    # Sample article and summary
    article_sentences = nltk.sent_tokenize(article_text)
    summary_sentences = nltk.sent_tokenize(abstractive_summary)
    
    # Encode
    article_embeddings = model.encode(article_sentences, convert_to_tensor=True)
    summary_embeddings = model.encode(summary_sentences, convert_to_tensor=True)
    
    # Match summary sentences to most similar article sentence
    extractive_summary = []
    for summary_emb in summary_embeddings:
        cosine_scores = util.cos_sim(summary_emb, article_embeddings)[0]
        best_idx = cosine_scores.argmax()
        extractive_summary.append(article_sentences[best_idx])
    return ". ".join(str(sentence) for sentence in extractive_summary)
    

In [4]:
# Load dataset
df = pd.read_csv("bbc_news_summary_with_articles.csv") 

In [6]:
df.head()

Unnamed: 0,Title,Article,Summary,Category
0,289,Musicians to tackle US red tape\n\nMusicians' ...,Nigel McCune from the Musicians' Union said Br...,entertainment
1,262,"U2's desire to be number one\n\nU2, who have w...",But they still want more.They have to want to ...,entertainment
2,276,Rocker Doherty in on-stage fight\n\nRock singe...,"Babyshambles, which he formed after his acrimo...",entertainment
3,60,Snicket tops US box office chart\n\nThe film a...,A Series of Unfortunate Events also stars Scot...,entertainment
4,74,Ocean's Twelve raids box office\n\nOcean's Twe...,"Ocean's Twelve, the crime caper sequel starrin...",entertainment


In [7]:
df['extractive_summary'] = df.apply(lambda x: str(get_summary(x['Article'], x['Summary'])), axis=1)

In [8]:
df['extractive_summary'][0]

'Nigel McCune from the Musicians\' Union said British musicians are "disadvantaged" compared to their US counterparts.. The Musicians\' Union stance is being endorsed by the Music Managers\' Forum (MMF), who say British artists face "an uphill struggle" to succeed in the US, thanks to the tough visa requirements, which are also seen as impractical.. "The US is the world\'s biggest music market, which means something has to be done about the creaky bureaucracy," says Mr McCune.. A singer hoping to perform in the US can expect to pay $1,300 (£680) simply for obtaining a visa.'

In [9]:
df.size

11125

In [10]:
df.to_csv("bbc_news_with_articles_and_extractive_summary.csv", index=True)

In [11]:
from datasets import Dataset, DatasetDict
# Convert Pandas DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
# Split into train (80%), validation (10%), test (10%)
dataset = dataset.train_test_split(test_size=0.4, seed=42)
test_valid = dataset["test"].train_test_split(test_size=0.5, seed=42)

# Create final dataset
dataset = DatasetDict({
    "train": dataset["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]
})

# Show dataset structure
print(dataset)
dataset.save_to_disk("bbc_dataset")
print("Dataset saved successfully!")

DatasetDict({
    train: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary'],
        num_rows: 1335
    })
    validation: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary'],
        num_rows: 445
    })
    test: Dataset({
        features: ['Title', 'Article', 'Summary', 'Category', 'extractive_summary'],
        num_rows: 445
    })
})


Saving the dataset (1/1 shards): 100%|██████████| 1335/1335 [00:00<00:00, 73618.14 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 445/445 [00:00<00:00, 72145.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 445/445 [00:00<00:00, 96110.47 examples/s] 

Dataset saved successfully!



