<a href="https://colab.research.google.com/github/Khoawawa/text-summarization/blob/main/text_summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

INSTALL DEPENDENCIES

In [1]:
!pip install -U transformers
!pip install -U datasets
!pip install -U accelerate
!pip install -U evaluate
!pip install -U requests
!pip install -U bs4
!pip install -U bert-score
!pip install -U torch

Collecting transformers
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.46.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.20.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-

LOAD BART-LARGE-CNN MODEL AND CNN_DAILYMAIL TEST DATASET FOR EVALUATION

In [2]:
import torch
# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

In [11]:
# Name the checkpoint once so the tokenizer and the model are always loaded
# from the same pretrained weights.
MODEL_CHECKPOINT = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Load the seq2seq model and move it to the device selected earlier.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
model = model.to(device)

In [12]:
# Load only the "test" split of the CNN/DailyMail dataset, config "3.0.0".
ds_test = load_dataset(
    path="abisee/cnn_dailymail",
    name="3.0.0",
    split="test",
)

EVALUATE BART-LARGE-CNN MODEL USING BERTSCORE

In [13]:
def abstract_summarize(text, max_length=250, min_length=30, max_input_length=1024):
    """Generate an abstractive summary of ``text`` with the loaded seq2seq model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Article text to summarize.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
        min_length: Value forwarded to ``model.generate`` as ``min_length``.
        max_input_length: Token limit applied when tokenizing ``text``; longer
            inputs are truncated. Defaults to 1024, the value that used to be
            hard-coded here.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    tokenized_text = tokenizer(
        text,
        max_length=max_input_length,
        # Pad only up to the longest input in the call. For a single article
        # this adds no padding at all, so the encoder does not have to process
        # a sequence that is mostly pad tokens.
        padding="longest",
        truncation=True,
        return_tensors="pt",
    ).to(device)

    output = model.generate(
        tokenized_text["input_ids"],
        # Hand the mask over explicitly instead of relying on generate() to
        # infer it from the pad tokens found in input_ids.
        attention_mask=tokenized_text["attention_mask"],
        max_length=max_length,
        min_length=min_length,
    )

    # Only the first generated sequence is decoded and returned.
    summary = tokenizer.decode(output[0], skip_special_tokens=True)

    return summary


In [14]:
def summarize(text):
    """Return the abstractive summary of ``text``.

    Thin wrapper that delegates to ``abstract_summarize`` using that
    function's default length settings.
    """
    return abstract_summarize(text)

In [15]:
# How many articles, counted from the start of the test split, are evaluated.
no_eval_articles = 1
CHUNK_SIZE = 1024  # defined here but not referenced in this cell

# Parallel lists: bart_summaries[k] is the model output for the article whose
# reference highlights are stored in ref_summaries[k].
bart_summaries, ref_summaries = [], []

for idx in range(no_eval_articles):
    sample = ds_test[idx]
    source_text = sample['article']
    reference = sample['highlights']

    # SUMMARIZE
    generated = summarize(source_text)

    bart_summaries.append(generated)
    ref_summaries.append(reference)

In [16]:
from evaluate import load

# Load the "bertscore" metric through the `evaluate` library.
bert_score = load("bertscore")

# Score the generated summaries against the reference highlights.
results = bert_score.compute(
    predictions=bart_summaries,
    references=ref_summaries,
    model_type="facebook/bart-large-cnn",
)

# Keep only the three score entries that are averaged and reported afterwards.
score = {metric: results[metric] for metric in ('f1', 'precision', 'recall')}

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [17]:
import numpy

# Report the average of each BERTScore component, one line per component.
for label, key in (("F1", "f1"), ("Precisions", "precision"), ("Recall", "recall")):
    print(f"{label}: {numpy.average(score[key])}")

F1: 0.695124089717865
Precisions: 0.7186445593833923
Recall: 0.6730945110321045


GET DATA FROM CNN WEBSITE AND SUMMARIZE IT

In [18]:
import requests
from bs4 import BeautifulSoup

def scrape_cnn_article(url, timeout=10):
    """Download a CNN article page and extract its headline and body text.

    Args:
        url: Address of the article page. Only URLs containing ``"cnn.com"``
            are parsed.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``(title, content)`` tuple. ``content`` is the text of all matching
        paragraph tags joined by single spaces (an empty string when none
        match). ``title`` is ``None`` when the page has no ``<h1>`` element.
        ``(None, None)`` is returned when the request fails, the response
        status is not 200, or the URL is not a CNN URL.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same (None, None) contract as a
        # non-200 response instead of surfacing a traceback.
        print(f"Failed to retrieve the article. Error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to retrieve the article. Status code: {response.status_code}")
        return None, None

    # For CNN articles only; other sites are not parsed.
    if "cnn.com" not in url:
        return None, None

    soup = BeautifulSoup(response.content, "html.parser")

    # A page without an <h1> yields no title rather than an AttributeError.
    heading = soup.find('h1')
    title = heading.get_text() if heading is not None else None

    article_body = soup.find_all('p', class_="paragraph inline-placeholder vossi-paragraph")
    content = " ".join(p.get_text() for p in article_body)
    return title, content

In [19]:
# Scrape one live CNN article; both values are None if scraping failed.
title, content = scrape_cnn_article(
    "https://edition.cnn.com/2024/10/20/politics/mcdonalds-donald-trump-pennsylvania/index.html"
)

# Bundle the scraped headline and body text into a single record.
data = dict(title=title, article=content)

In [20]:
# The scraper returns None (failed request / unsupported URL) or an empty
# string (no matching paragraphs) when it could not extract any text; stop
# here with a clear message instead of feeding that to the model.
if not data['article']:
    raise ValueError("No article text was scraped; nothing to summarize.")

cnn_summary = summarize(data['article'])

In [21]:
# Display the summary stored in cnn_summary.
print(cnn_summary)

Donald Trump stopped by a McDonald’s in Pennsylvania during his Sunday swing. He handed customers food through the drive-thru window, telling them he had made it himself. It's the same job Vice President Kamala Harris has said she held as a young woman. Trump has grown fixated on Harris’ employment there.
