In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("bbc_news_articles.csv")

# Remove duplicate rows based on Content
df = df.drop_duplicates(subset=["Content"])

# Check for missing values and drop rows where Content or Title is missing
df = df.dropna(subset=["Content", "Title"])

# Optional: If you're using 'bert-base-uncased', lowercase the text
df["Content"] = df["Content"].str.lower()

# Save the cleaned dataset to CSV
df.to_csv("processed_news_articles.csv", index=False)

# Content length statistics
df["Content Length"] = df["Content"].apply(len)
print(df["Content Length"].describe())

# Display the first few rows
print(df.head())


count      424.000000
mean      2803.912736
std       2876.872521
min        200.000000
25%       1716.250000
50%       2010.000000
75%       2693.750000
max      32131.000000
Name: Content Length, dtype: float64
                                               Title  \
0  Powell’s Fed Appears Headed for Another Collis...   
1  A Ride-Hailing Start-Up in Washington Tries to...   
3  Automakers Thrived in the Pandemic. Many Are N...   
4  An Epidemic of Vicious School Brawls, Fueled b...   

                                                 URL  \
0  https://www.nytimes.com/2024/12/16/business/ec...   
1  https://www.nytimes.com/2024/12/16/technology/...   
2  https://www.nytimes.com/2024/12/16/business/ec...   
3  https://www.nytimes.com/2024/12/15/business/au...   
4  https://www.nytimes.com/2024/12/15/technology/...   

                                             Content  Content Length  
0  trumptransition advertisement rates may not co...            2280  
1  a company called empower

# Bertsum extractive summarization

In [2]:
from summarizer import Summarizer

# Initialize the BERT summarizer model
model = Summarizer()

# Function to generate summaries for a given text
def generate_summary(text, ratio=0.2):
    return model(text, ratio=ratio)

# Apply summarization to each article's content and store the results in a new column 'Summary'
df["Summary"] = df["Content"].apply(lambda x: generate_summary(x, ratio=0.2))

# Save the summarized dataset to a new CSV file
df.to_csv("summarized_news_articles.csv", index=False)

# Display the first few rows with the summaries
print(df[["Title", "Content", "Summary"]].head())


  attn_output = torch.nn.functional.scaled_dot_product_attention(


                                               Title  \
0  Powell’s Fed Appears Headed for Another Collis...   
1  A Ride-Hailing Start-Up in Washington Tries to...   
3  Automakers Thrived in the Pandemic. Many Are N...   
4  An Epidemic of Vicious School Brawls, Fueled b...   

                                             Content  \
0  trumptransition advertisement rates may not co...   
1  a company called empower is trying to take on ...   
2   by a staff report by the senate labor committ...   
3   by changing technology, political turmoil and...   
4  undefined this is an increasingly familiar sce...   

                                             Summary  
0  trumptransition advertisement rates may not co...  
1  a company called empower is trying to take on ...  
2  by a staff report by the senate labor committe...  
3  by changing technology, political turmoil and ...  
4  undefined this is an increasingly familiar sce...  


In [3]:
from rouge import Rouge
import pandas as pd

# Initialize the ROUGE scorer
rouge = Rouge()

# Function to calculate ROUGE scores
def calculate_rouge_scores(original, summary):
    if not summary.strip():  # Skip empty summaries
        return None
    
    scores = rouge.get_scores(summary, original)[0]
    return {
        "ROUGE-1_F1": scores["rouge-1"]["f"],
        "ROUGE-2_F1": scores["rouge-2"]["f"],
        "ROUGE-L_F1": scores["rouge-l"]["f"],
    }

# Apply the ROUGE scoring function to each row (skip empty summaries)
rouge_scores = df.apply(lambda row: calculate_rouge_scores(row["Content"], row["Summary"]), axis=1)

# Filter out rows where the ROUGE score is None (due to empty summaries)
rouge_scores = rouge_scores.dropna()

# Convert the list of ROUGE scores dictionaries into a DataFrame
rouge_df = pd.DataFrame(list(rouge_scores))

# Calculate the average ROUGE scores across all articles
average_rouge = rouge_df.mean()

# Print the average ROUGE scores in the desired format
print("Model Performance")
print("The table below displays the model's performance for different seeds.")
print("Rouge-1 ↑\tRouge-2 ↑\tRouge-L ↑")
print(f"{average_rouge['ROUGE-1_F1']:.2f}\t{average_rouge['ROUGE-2_F1']:.2f}\t{average_rouge['ROUGE-L_F1']:.2f}")


Model Performance
The table below displays the model's performance for different seeds.
Rouge-1 ↑	Rouge-2 ↑	Rouge-L ↑
0.47	0.39	0.47


# Bart Abstractive Summarization

In [8]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import pandas as pd

# Load BART tokenizer and model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)

# Function to generate abstractive summaries
def generate_abstractive_summary(text, max_length=130, min_length=30, num_beams=4):
    if not isinstance(text, str) or not text.strip():  # Skip empty content
        return ""
    inputs = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(
        inputs['input_ids'], 
        max_length=max_length, 
        min_length=min_length, 
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Apply summarization to each article's content
df['Abstractive_Summary'] = df['Content'].apply(lambda x: generate_abstractive_summary(x))

# Save the results to a CSV file
df.to_csv('bart_summarized_articles.csv', index=False)

# Display the first few rows
print(df[['Title', 'Content', 'Abstractive_Summary']].head())


                                               Title  \
0  Powell’s Fed Appears Headed for Another Collis...   
1  A Ride-Hailing Start-Up in Washington Tries to...   
3  Automakers Thrived in the Pandemic. Many Are N...   
4  An Epidemic of Vicious School Brawls, Fueled b...   

                                             Content  \
0  trumptransition advertisement rates may not co...   
1  a company called empower is trying to take on ...   
2   by a staff report by the senate labor committ...   
3   by changing technology, political turmoil and...   
4  undefined this is an increasingly familiar sce...   

                                 Abstractive_Summary  
0  trumptransition advertisement rates may not co...  
1  Ride-hailing start-upempowerhas become a serio...  
2  A report by the senate labor committee, led by...  
3  Nissan, the japanese automaker, is laying off ...  
4  Fights like these reflect a troubling national...  


In [9]:
from rouge import Rouge
import pandas as pd

# Initialize the ROUGE scorer
rouge = Rouge()

# Function to calculate ROUGE scores
def calculate_rouge_scores(original, summary):
    if not summary.strip():  # Skip empty summaries
        return None
    
    scores = rouge.get_scores(summary, original)[0]
    return {
        "ROUGE-1_F1": scores["rouge-1"]["f"],
        "ROUGE-2_F1": scores["rouge-2"]["f"],
        "ROUGE-L_F1": scores["rouge-l"]["f"],
    }

# Calculate ROUGE scores for each article
rouge_scores = df.apply(lambda row: calculate_rouge_scores(row["Content"], row["Abstractive_Summary"]), axis=1)

# Filter out rows where the ROUGE score is None (due to empty summaries)
rouge_scores = rouge_scores.dropna()

# Convert the list of ROUGE scores dictionaries into a DataFrame
rouge_df = pd.DataFrame(list(rouge_scores))

# Calculate the average ROUGE scores across all articles
average_rouge = rouge_df.mean()

# Print the average ROUGE scores
print("Model Performance")
print("The table below displays the model's performance for different seeds.")
print("Rouge-1 ↑\tRouge-2 ↑\tRouge-L ↑")
print(f"{average_rouge['ROUGE-1_F1']:.2f}\t{average_rouge['ROUGE-2_F1']:.2f}\t{average_rouge['ROUGE-L_F1']:.2f}")


Model Performance
The table below displays the model's performance for different seeds.
Rouge-1 ↑	Rouge-2 ↑	Rouge-L ↑
0.27	0.19	0.27


# Combining Bart and Bertsum summarization 

In [12]:
from summarizer import Summarizer
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import pandas as pd

# Load BERT Summarizer model
bert_model = Summarizer()

# Load BART tokenizer and model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)

# Function for BERT extractive summarization
def generate_extractive_summary(text, ratio=0.2):
    if not isinstance(text, str) or not text.strip():  # Skip empty content
        return ""
    return bert_model(text, ratio=ratio)

# Function for BART abstractive summarization
def generate_abstractive_summary(text, max_length=130, min_length=30, num_beams=4):
    if not isinstance(text, str) or not text.strip():  # Skip empty content
        return ""
    inputs = tokenizer(text, return_tensors='pt', max_length=1024, truncation=True).to(device)
    summary_ids = bart_model.generate(
        inputs['input_ids'], 
        max_length=max_length, 
        min_length=min_length, 
        num_beams=num_beams,
        length_penalty=2.0,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Combined summarization pipeline
def generate_combined_summary(content, bert_ratio=0.2, bart_max_length=130, bart_min_length=30, bart_num_beams=4):
    # Step 1: Generate an extractive summary using BERT
    extractive_summary = generate_extractive_summary(content, ratio=bert_ratio)
    
    # Step 2: Generate an abstractive summary from the extractive summary using BART
    combined_summary = generate_abstractive_summary(extractive_summary, 
                                                    max_length=bart_max_length, 
                                                    min_length=bart_min_length, 
                                                    num_beams=bart_num_beams)
    return combined_summary

# Apply the combined summarization pipeline to each article's content
df['Combined_Summary'] = df['Content'].apply(lambda x: generate_combined_summary(x))

# Save the results to a new CSV file
df.to_csv('combined_summarized_articles.csv', index=False)

# Display the first few rows
print(df[['Title', 'Content', 'Combined_Summary']].head())


                                               Title  \
0  Powell’s Fed Appears Headed for Another Collis...   
1  A Ride-Hailing Start-Up in Washington Tries to...   
3  Automakers Thrived in the Pandemic. Many Are N...   
4  An Epidemic of Vicious School Brawls, Fueled b...   

                                             Content  \
0  trumptransition advertisement rates may not co...   
1  a company called empower is trying to take on ...   
2   by a staff report by the senate labor committ...   
3   by changing technology, political turmoil and...   
4  undefined this is an increasingly familiar sce...   

                                    Combined_Summary  
0  trumptransition advertisement rates may not co...  
1  a company called empower is trying to take on ...  
2  Staff report by the senate labor committee, le...  
3  A few years ago, automakers were celebrating r...  
4  Schools are struggling to remove phones, which...  


In [13]:
from rouge import Rouge
import pandas as pd

# Initialize the ROUGE scorer
rouge = Rouge()

# Function to calculate ROUGE scores
def calculate_rouge_scores(original, summary):
    if not isinstance(summary, str) or not summary.strip():  # Skip empty summaries
        return None
    
    scores = rouge.get_scores(summary, original)[0]
    return {
        "ROUGE-1_F1": scores["rouge-1"]["f"],
        "ROUGE-2_F1": scores["rouge-2"]["f"],
        "ROUGE-L_F1": scores["rouge-l"]["f"],
    }

# Apply the ROUGE scoring function to each row (skip rows with empty Combined_Summary)
rouge_scores = df.apply(lambda row: calculate_rouge_scores(row["Content"], row["Combined_Summary"]), axis=1)

# Filter out rows where the ROUGE score is None (due to empty summaries)
rouge_scores = rouge_scores.dropna()

# Convert the list of ROUGE scores dictionaries into a DataFrame
rouge_df = pd.DataFrame(list(rouge_scores))

# Calculate the average ROUGE scores across all articles
average_rouge = rouge_df.mean()

# Print the average ROUGE scores in the desired format
print("Model Performance for Combined Summarization")
print("The table below displays the model's performance for different seeds.")
print("Rouge-1 ↑\tRouge-2 ↑\tRouge-L ↑")
print(f"{average_rouge['ROUGE-1_F1']:.2f}\t{average_rouge['ROUGE-2_F1']:.2f}\t{average_rouge['ROUGE-L_F1']:.2f}")


Model Performance for Combined Summarization
The table below displays the model's performance for different seeds.
Rouge-1 ↑	Rouge-2 ↑	Rouge-L ↑
0.23	0.16	0.23


# Sentiment analysis 

In [17]:
from transformers import pipeline

# Load sentiment-analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis")

# Function to analyze sentiment of the summary or text
def get_sentiment(text):
    if not isinstance(text, str) or not text.strip():  # Skip empty content
        return None
    sentiment = sentiment_analyzer(text)
    return sentiment[0]['label']  # Returns the label ('POSITIVE', 'NEGATIVE', 'NEUTRAL')

# Add sentiment analysis to your summaries
df['Sentiment'] = df['Combined_Summary'].apply(lambda x: get_sentiment(x))

# Display the results
print(df[['Title', 'Combined_Summary', 'Sentiment']].head())


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


                                               Title  \
0  Powell’s Fed Appears Headed for Another Collis...   
1  A Ride-Hailing Start-Up in Washington Tries to...   
3  Automakers Thrived in the Pandemic. Many Are N...   
4  An Epidemic of Vicious School Brawls, Fueled b...   

                                    Combined_Summary Sentiment  
0  trumptransition advertisement rates may not co...  NEGATIVE  
1  a company called empower is trying to take on ...  POSITIVE  
2  Staff report by the senate labor committee, le...  NEGATIVE  
3  A few years ago, automakers were celebrating r...  NEGATIVE  
4  Schools are struggling to remove phones, which...  NEGATIVE  


In [19]:
df.to_csv('final_summarized_articles.csv', index=False)
print(df[['Title', 'Combined_Summary', 'Sentiment']].head())

                                               Title  \
0  Powell’s Fed Appears Headed for Another Collis...   
1  A Ride-Hailing Start-Up in Washington Tries to...   
3  Automakers Thrived in the Pandemic. Many Are N...   
4  An Epidemic of Vicious School Brawls, Fueled b...   

                                    Combined_Summary Sentiment  
0  trumptransition advertisement rates may not co...  NEGATIVE  
1  a company called empower is trying to take on ...  POSITIVE  
2  Staff report by the senate labor committee, le...  NEGATIVE  
3  A few years ago, automakers were celebrating r...  NEGATIVE  
4  Schools are struggling to remove phones, which...  NEGATIVE  
