In [1]:
from collections import defaultdict
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from rouge import Rouge
import heapq
from newspaper import Article
import json
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split


In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [5]:
def generate_keyword_table(texts):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names()
    keyword_table = {}

    for i, text in enumerate(texts):
        words = word_tokenize(text)
        word_scores = defaultdict(int)
        for word in words:
            if word in feature_names:  
                idx = feature_names.index(word)
                word_scores[word] = tfidf_matrix[i, idx]
        keyword_table[i] = dict(word_scores)

    return keyword_table


def generate_lexical_chains(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    chain_dict = {}

    for i, (word, tag) in enumerate(tagged_words):
        if tag.startswith('NN'):
            synonyms = []
            for syn in wordnet.synsets(word):
                for lemma in syn.lemmas():
                    synonyms.append(lemma.name())
            synonyms = list(set(synonyms))
            chain_dict[word] = synonyms

    return chain_dict


def evaluate_rouge(generated_summary, reference_summary):
    rouge = Rouge()
    scores = rouge.get_scores(generated_summary, reference_summary)
    return scores


def summarize_article(url, reference_summary):
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()

        print(f"Title: {article.title}\n")

        text = article.text
        keyword_table = generate_keyword_table([text])[0]
        lexical_chains = generate_lexical_chains(text)

        sentences = sent_tokenize(text)
        sentence_scores = {}

        for sentence in sentences:
            score = 0
            words = word_tokenize(sentence)
            for word in words:
                for key, value in lexical_chains.items():
                    if word in value:
                        score += 1
                if word in keyword_table:
                    score += keyword_table[word]
            sentence_scores[sentence] = score

        summarized_sentences = heapq.nlargest(5, sentence_scores, key=sentence_scores.get)
        summarized_article = ' '.join(summarized_sentences)

        rouge_scores = evaluate_rouge(summarized_article, reference_summary)
        return summarized_article, rouge_scores
    
    except Exception as e:
        print(f"Skipping URL: {url} - Error: {e}")
        return None, None

In [6]:
data = data[:100]

In [7]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

def train_model(train_data):
    summaries = []
    rouge_scores = []
    
    for idx, row in train_data.iterrows():
        url = row['link']
        reference_summary = row['short_description']
        
        summarized_text, rouge_score = summarize_article(url, reference_summary)
        
        if summarized_text and rouge_score:
            summaries.append(summarized_text)
            rouge_scores.append({'summary': summarized_text, 'rouge': rouge_score}) 
    
    return summaries, rouge_scores


def test_model(test_data, trained_summaries, trained_rouge_scores):
    test_rouge_scores = []
    
    for idx, row in test_data.iterrows():
        url = row['link']
        reference_summary = row['short_description']
        
        summarized_text, rouge_score = summarize_article(url, reference_summary)
        
        if summarized_text and rouge_score:
            test_rouge_scores.append({'summary': summarized_text, 'rouge': rouge_score})
    

    average_trained_rouge = calculate_average_rouge(trained_rouge_scores)
    average_test_rouge = calculate_average_rouge(test_rouge_scores)
    
    return average_trained_rouge, average_test_rouge


def calculate_average_rouge(rouge_scores):
    if rouge_scores:
        rouge_1_scores = [score['rouge']['rouge-1']['f'] for score in rouge_scores if 'rouge-1' in score['rouge']]
        rouge_2_scores = [score['rouge']['rouge-2']['f'] for score in rouge_scores if 'rouge-2' in score['rouge']]
        rouge_l_scores = [score['rouge']['rouge-l']['f'] for score in rouge_scores if 'rouge-l' in score['rouge']]

        avg_rouge = {
            'rouge-1': sum(rouge_1_scores) / len(rouge_1_scores) if rouge_1_scores else 0,
            'rouge-2': sum(rouge_2_scores) / len(rouge_2_scores) if rouge_2_scores else 0,
            'rouge-l': sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0,
        }
        return avg_rouge
    else:
        return None

In [8]:
# Training the model
trained_summaries, trained_rouge_scores = train_model(train_data)

Title: TikTok Search Results Riddled With Misinformation: Report

Title: At Least 32 Dead In Fire At Karaoke Parlor In South Vietnam

Title: Las Vegas Aces Win First WNBA Title, Chelsea Gray Named MVP

Title: Racism Seen As Root Of Water Crisis In Mississippi Capital

Title: Biden Honors 9/11 Victims, Vows Commitment To Thwart Terror

Title: Russian Cosmonaut Valery Polyakov Who Broke Record With 437-Day Stay In Space Dies At 80

Title: Bill To Help Afghans Who Escaped Taliban Faces Long Odds In The Senate

Title: Kody Clemens Strikes Out MVP Shohei Ohtani, Trails Dad Roger By 4,671 Ks

Title: Biden At UN To Call Russian War An Affront To Body's Charter

Title: Payment Processor Visa To Start Categorizing Sales At Gun Stores

Title: Man Sets Himself On Fire In Apparent Protest Of Funeral For Japan's Abe

Title: Viola Davis Feared A Heart Attack During 'The Woman King' Training

Title: Norman Reedus Opens Up About 'Walking Dead' Injury: 'I Thought I Was Going To Die'

Title: James Camer

In [9]:
# Testing the model
average_trained_rouge, average_test_rouge = test_model(test_data, trained_summaries, trained_rouge_scores)

Title: Politician's DNA Connected To Las Vegas Journalist’s Murder, Police Say

Title: Possible Nationwide Rail Strike Is Already Impacting Travelers, Businesses

Title: 9/11 Attacks Still Reverberate As U.S. Marks 21st Anniversary

Title: Meet Alex Aster, The TikToker Changing The Publishing Industry For The Better

Title: Mark Meadows Complies With Justice Dept. Subpoena: Report

Title: Amazon Greenlights 'Blade Runner 2099' Limited Series Produced By Ridley Scott

Title: Hurricane Fiona Bears Down On Dominican Republic After Pounding Puerto Rico

Title: 'Our Hearts Are Broken': Historic Front Pages Mark The Queen's Death

Title: World Cup Captains Want To Wear Rainbow Armbands In Qatar

Title: Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters

Title: 4 Russian-Controlled Ukrainian Regions Schedule Votes This Week To Join Russia

Title: Biden Says Queen's Death Left 'Giant Hole' For Royal Family

Title: US, Trump Team Propose Names For Arbiter In Mar-A-Lago

In [10]:
print(f"Average ROUGE score on trained data: {average_trained_rouge}")
print(f"Average ROUGE score on test data: {average_test_rouge}")

Average ROUGE score on trained data: {'rouge-1': 0.32, 'rouge-2': 0.53, 'rouge-l': 0.27}
Average ROUGE score on test data: {'rouge-1': 0.67, 'rouge-2': 0.78, 'rouge-l': 0.52}


In [16]:
# Example usage
url = 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9'

reference_summary = "Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall."

summarized_text, rouge_scores = summarize_article(url, reference_summary)
print(f"Summarized Text:\n{summarized_text}\n")

Title: Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters

Summarized Text:
https://t.co/7SixTE3OMT pic.twitter.com/s5fyjRpYuX — 60 Minutes (@60Minutes) September 19, 2022

By Wednesday on Facebook, when a Kansas health department posted where residents could find the new booster shots, the first commenter remarked snidely:

Advertisement

“But Biden says the pandemic is over.”

The president’s statement, despite his attempts to clarify it, adds to public confusion, said Josh Michaud, associate director of global health policy with the Kaiser Family Foundation in Washington. “No one would go looking at our flu shot uptake at this point and be like, ‘Oh, what a disaster,’” said Dr. David Dowdy, an infectious disease epidemiologist at Johns Hopkins Bloomberg School of Public Health. “If we start to see a large uptick in cases, I think we’re going to see a lot of people getting the (new COVID) vaccine.”

A temporary shortage of Moderna vaccine caused some pharmac

In [17]:
data

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22
...,...,...,...,...,...,...
95,https://www.huffpost.com/entry/fairview-fire-c...,Fast-Moving Fairview Fire Kills At Least 2 In ...,U.S. NEWS,A second fire killed two people in Northern Ca...,Nick Visser,2022-09-06
96,https://www.huffpost.com/entry/kody-clemens-st...,"Kody Clemens Strikes Out MVP Shohei Ohtani, Tr...",SPORTS,The Detroit Tigers' rookie utility player got ...,,2022-09-06
97,https://www.huffpost.com/entry/jackson-water-p...,Mississippi Governor Says Water Pressure Is No...,U.S. NEWS,"The city remains under a boil water notice, wh...",Marita Vlachou,2022-09-06
98,https://www.huffpost.com/entry/meta-400-millio...,"Meta, Parent Company Of Instagram, Fined $400 ...",U.S. NEWS,The social media giant was fined for violation...,Nick Visser,2022-09-06
