In [43]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse

import torch
from transformers import BertTokenizer, BertLMHeadModel

In [2]:
urls = [
    'https://www.wsj.com/',
    'https://www.cnn.com/',
    'https://www.nytimes.com/',
    'https://www.theguardian.com/international',
    'https://www.reuters.com/news/world'
]

In [3]:
# Initialize an empty list to store the scraped data
articles = []

In [4]:
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    articles += soup.find_all('article')

In [5]:
data = []
for article in articles:
    article_url = article.find('a')['href']
    if not article_url.startswith('http'):
        article_url = f'https://{urlparse(url).netloc}{article_url}'

    title = article.find('h3')
    if title:
        title = title.text.strip()

    date = article.find('time')
    if date:
        try:
            date = datetime.strptime(date['datetime'], '%Y-%m-%dT%H:%M:%S.%fZ')
            date = date.strftime('%Y-%m-%d %H:%M:%S')
        except ValueError:
            date = None

    author = article.find('span', class_='c-byline__name')
    if author:
        author = author.text.strip()

    content = ''
    content_url = article_url
    content_response = requests.get(content_url)
    content_soup = BeautifulSoup(content_response.content, 'html.parser')
    paragraphs = content_soup.find_all('p')
    for paragraph in paragraphs:
        content += paragraph.text

    data.append({
        'title': title,
        'date': date,
        'author': author,
        'content': content,
        'url': article_url
    })

In [22]:
df = pd.DataFrame(data)
print(df.tail())

                                                title  date author   
19  With abortion rights in the balance, Wisconsin...  None   None  \
20   Trump claims criminal charges driven by politics  None   None   
21  Investor pressure builds for Seven & i shake-u...  None   None   
22  In listening mode, new Bayer CEO keeps 'open m...  None   None   
23  Exclusive-Amundi discussed possible Italian op...  None   None   

                                              content   
19  April 4 (Reuters) - Wisconsin voters on Tuesda...  \
20  PALM BEACH, Florida, April 4 (Reuters) - A sub...   
21  NEW YORK/TOKYO, April 5 (Reuters) - Another ma...   
22  [1/3] Bill Anderson, who will take over as CEO...   
23  [1/2] A logo of Amundi is seen outside the com...   

                                                  url  
19  https://www.reuters.com/article/usa-election-w...  
20  https://www.reuters.com/article/usa-trump-spee...  
21  https://www.reuters.com/article/seven-i-hldgs-...  
22  https://

In [44]:
# load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertLMHeadModel.from_pretrained('bert-base-uncased')

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [53]:
# define function to preprocess text data for BERT
def preprocess_text(text):
    # tokenize text
    tokenized_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=127,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    return tokenized_text

In [54]:
# define function to generate summary using BERT
def generate_summary(text):
    # preprocess text
    tokenized_text = preprocess_text(text)

    # generate summary
    summary_ids = model.generate(
        input_ids=tokenized_text['input_ids'],
        attention_mask=tokenized_text['attention_mask'],
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    # decode summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [55]:
# preprocess content and generate summary for each article in dataframe
df['content_preprocessed'] = df['content'].apply(preprocess_text)
df['summary'] = df['content'].apply(generate_summary)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

In [57]:
# view final dataframe with summary column added
print(df.tail())

                                                title  date author   
19  With abortion rights in the balance, Wisconsin...  None   None  \
20   Trump claims criminal charges driven by politics  None   None   
21  Investor pressure builds for Seven & i shake-u...  None   None   
22  In listening mode, new Bayer CEO keeps 'open m...  None   None   
23  Exclusive-Amundi discussed possible Italian op...  None   None   

                                              content   
19  April 4 (Reuters) - Wisconsin voters on Tuesda...  \
20  PALM BEACH, Florida, April 4 (Reuters) - A sub...   
21  NEW YORK/TOKYO, April 5 (Reuters) - Another ma...   
22  [1/3] Bill Anderson, who will take over as CEO...   
23  [1/2] A logo of Amundi is seen outside the com...   

                                                  url   
19  https://www.reuters.com/article/usa-election-w...  \
20  https://www.reuters.com/article/usa-trump-spee...   
21  https://www.reuters.com/article/seven-i-hldgs-...   
22  http