INSTALL DEPENDENCIES

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install evaluate
!pip install numpy as np
!pip install request
!pip install bs4

LOAD BART-LARGE-CNN MODEL AND CNN_DAILYMAIL TEST DATASET FOR EVALUATION

In [None]:
from transformers import pipeline

# Build the summarization pipeline from the facebook/bart-large-cnn checkpoint.
bart_pipe = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)


In [None]:
from datasets import load_dataset
# Load only the "test" split of the cnn_dailymail dataset, configuration "3.0.0".
ds_test = load_dataset(
    path="abisee/cnn_dailymail",
    name="3.0.0",
    split="test",
)

EVALUATE BART-LARGE-CNN MODEL USING BERTSCORE

In [None]:
# Number of test-set articles (taken from the start of ds_test) to summarize and score.
no_eval_articles = 50

In [None]:
def chunked_text(text, chunk_size):
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` items.

    Slices are taken left to right without overlap; the last slice is
    shorter when ``len(text)`` is not a multiple of ``chunk_size``.

    Args:
        text: Sliceable sequence (a string in this notebook).
        chunk_size: Length of each slice; also the step between slice starts.

    Returns:
        A list of the slices in order (empty when ``text`` is empty).
    """
    starts = range(0, len(text), chunk_size)
    return [text[start:start + chunk_size] for start in starts]

In [None]:
def summarize(bart_pipe, text, chunk_size):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    The text is split with ``chunked_text`` into pieces of at most
    ``chunk_size`` characters, each piece is summarized on its own by
    ``bart_pipe``, and the partial summaries are joined with single spaces.

    Args:
        bart_pipe: Callable summarizer. It is invoked as
            ``bart_pipe(chunk, max_length=130, min_length=1, do_sample=False)``
            and must return a list whose first element has a
            ``'summary_text'`` entry.
        text: Text to summarize. ``None`` or an empty string yields ``''``.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        The space-joined chunk summaries as a single string.
    """
    # Callers may hand over None when no article text could be obtained
    # (the scraper signals failure that way); there is nothing to summarize.
    if not text:
        return ''

    summaries = []

    for chunk in chunked_text(text, chunk_size):
        # A chunk consisting only of whitespace carries no content to summarize.
        if not chunk.strip():
            continue

        summary = bart_pipe(chunk, max_length=130, min_length=1, do_sample=False)[0]['summary_text']
        summaries.append(summary)

    return ' '.join(summaries)
    

In [None]:
# Summarize the first `no_eval_articles` test articles and collect each
# generated summary next to its reference summary ('highlights').
bart_summaries = []
ref_summaries = []
CHUNK_SIZE = 1024  # maximum characters per chunk handed to the summarizer
for i in range(no_eval_articles):
    example = ds_test[i]
    article = example['article']
    summary = example['highlights']
    # SUMMARIZE
    bart_summary = summarize(bart_pipe, article, CHUNK_SIZE)

    # list.append accepts exactly one item. Store only the summary strings so
    # that predictions and references are parallel lists of plain strings.
    bart_summaries.append(bart_summary)
    ref_summaries.append(summary)

In [None]:
from evaluate import load

# Load the BERTScore metric and score the generated summaries against the
# reference summaries.
bert_score = load("bertscore")

results = bert_score.compute(
    predictions=bart_summaries,
    references=ref_summaries,
    model_type="facebook/bart-large-cnn",
)

# Keep the F1 and precision entries of the result for reporting.
f1s, precisions = results['f1'], results['precision']

In [None]:
import numpy as np

# Report the mean score over all evaluated articles, rounded to 4 decimals.
# (The second positional argument of np.average is `axis`, not a precision.)
print(f"F1: {np.round(np.mean(f1s), 4)}")
print(f"Precisions: {np.round(np.mean(precisions), 4)}")

GET DATA FROM CNN WEBSITE AND SUMMARIZE IT

In [None]:
import requests
from bs4 import BeautifulSoup

def scrape_cnn_article(url):
    """Download a CNN article and extract its headline and body text.

    Args:
        url: Address of the article. Only URLs containing ``"cnn.com"`` are
            handled, because the paragraph selector used below is
            CNN-specific.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the text of the first
        ``<h1>`` (``None`` if the page has none) and ``content`` is the text
        of all matching ``<p>`` elements joined by single spaces.
        ``(None, None)`` is returned when the URL is not a CNN URL or the
        server does not answer with HTTP 200.
    """
    # Reject unsupported sites before spending a network round trip on them.
    if "cnn.com" not in url:
        return None, None

    # Without a timeout a stalled connection would block the cell indefinitely.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None, None

    soup = BeautifulSoup(response.content, "html.parser")

    # find() returns None when the page has no <h1>; avoid calling
    # .get_text() on None in that case.
    headline = soup.find('h1')
    title = headline.get_text() if headline is not None else None

    article_body = soup.find_all('p', class_="paragraph inline-placeholder vossi-paragraph")
    content = " ".join(p.get_text() for p in article_body)
    return title, content

In [None]:
# Scrape one CNN article and bundle its headline and body text into a dict.
title, content = scrape_cnn_article(
    "https://edition.cnn.com/2024/10/20/politics/mcdonalds-donald-trump-pennsylvania/index.html"
)

data = dict(title=title, article=content)

In [None]:
# Summarize the scraped article body with the same chunk size used for evaluation.
cnn_summary = summarize(
    bart_pipe=bart_pipe,
    text=data['article'],
    chunk_size=CHUNK_SIZE,
)

In [None]:
# Show the generated summary of the scraped article.
print(cnn_summary)