# Scraping function
Scrape article titles, summaries and URLs from the desired section of BBC News. The number of pages to load can be specified.

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
import time


def scrape_bbcnews(base_url, n_pages=50):
    """Scrape article titles, summaries and URLs from a BBC News section page.

    Opens the page in Chrome, pauses so cookie overlays can be dismissed by
    hand, then walks through the paginated article stream by clicking the
    "Next" button.

    Scraping is best-effort: if a WebDriver error (timeout, missing element,
    lost session, ...) interrupts the pagination, the articles collected so
    far are returned. Errors raised while opening ``base_url`` and any
    non-WebDriver exception (including ``KeyboardInterrupt``) propagate to the
    caller; the browser is closed in every case.

    Args:
        base_url: URL of the section page to open.
        n_pages: Maximum number of pages to scrape. A page whose first
            already-seen title triggers the duplicate check is not counted.

    Returns:
        A list of ``[title, summary, url]`` lists, one per article for which
        all three pieces were found. ``url`` is the ``href`` value exactly as
        it appears in the page.
    """
    # Imported here so the function works without edits to the import cell.
    from selenium.common.exceptions import (
        NoSuchElementException,
        WebDriverException,
    )

    # Initialize variables
    articles = []
    titles = set()
    counter = 0

    driver = webdriver.Chrome()
    try:
        # Navigate to the webpage
        driver.get(base_url)

        # Wait for me to close cookie overlays
        time.sleep(7)

        try:
            # NOTE(review): a duplicate page does not advance `counter`, so a
            # site that keeps serving the same page would keep this loop
            # running until the "Next" click fails — confirm this is intended.
            while counter < n_pages:
                # Wait for new articles to load
                WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "lx-stream"))
                )

                # Find article containers
                soup = BeautifulSoup(driver.page_source, "html.parser")
                lx_stream_div = soup.find("div", id="lx-stream")
                if lx_stream_div is None:
                    # No article stream in the parsed page: nothing to scrape.
                    break
                article_containers = lx_stream_div.find_all(
                    "li", class_="lx-stream__post-container"
                )

                for container in article_containers:
                    title = container.find("header", class_="lx-stream-post__header")
                    if title is not None:
                        # Skip duplicate page if title was already present
                        if title in titles:
                            counter -= 1
                            break
                        titles.add(title)

                    summary = container.find("p", class_="lx-stream-related-story--summary")
                    link = container.find("a", class_="qa-story-cta-link")

                    # Keep only articles with a title, a summary and a URL.
                    if (
                        title is not None
                        and summary is not None
                        and link is not None
                        and "href" in link.attrs
                    ):
                        articles.append(
                            [title.text.strip(), summary.text.strip(), link["href"]]
                        )

                # Go to the next page; stop if the button is missing or
                # cannot be clicked.
                try:
                    driver.find_element(By.CLASS_NAME, "qa-pagination-next-page").click()
                except (NoSuchElementException, ElementClickInterceptedException):
                    break

                counter += 1
        except WebDriverException:
            # Deliberate best-effort: keep whatever was scraped before the
            # browser-side failure instead of losing it.
            pass
    finally:
        # Always close the browser, even if navigation or parsing failed.
        driver.quit()

    return articles

# Filtering function
Scrapes the webpage, filters the interesting articles and outputs an HTML file containing the articles' titles, summaries and links.

In [2]:
import webbrowser
import os

def predict_bbcnews(website_link, vectorizer, model):
    """Scrape a BBC News section, keep the interesting articles, show them as HTML.

    Prompts on stdin for the number of pages to scrape, scrapes
    ``website_link`` with ``scrape_bbcnews``, classifies each article from its
    lower-cased "title summary" text, writes the articles predicted as
    interesting to ``interesting_articles.html`` in the current working
    directory and opens that file with ``webbrowser``.

    Args:
        website_link: URL of the section page to scrape; also shown in the
            title of the generated HTML page.
        vectorizer: Object whose ``transform([text])`` output is accepted by
            ``model.predict``.
        model: Object whose ``predict(X)[0]`` is truthy for interesting
            articles.

    Raises:
        ValueError: If the value entered at the prompt is not an integer.
    """
    # Function-scope imports keep this cell self-contained.
    from html import escape
    from urllib.parse import urljoin

    n_pages = int(input("Enter the number of pages to scrape: "))

    articles = scrape_bbcnews(website_link, n_pages)

    # Predict the interestingness of each article and keep the interesting ones
    interesting_articles = []
    for title, summary, link in articles:
        # Vectorize the text
        text = (title + ' ' + summary).lower()
        X = vectorizer.transform([text])

        # Append article if interesting
        if model.predict(X)[0]:
            interesting_articles.append({'title': title, 'summary': summary, 'link': link})

    # Generate HTML content for interesting articles. Scraped text is escaped
    # so characters such as "&", "<" or quotes cannot break the markup, and
    # the charset is declared to match the UTF-8 encoding of the written file.
    page_title = escape(website_link)
    html_parts = [
        '<html><head><meta charset="utf-8">'
        f'<title>Interesting articles from {page_title}</title></head>'
        '<body><div style="width: 1000px; max-width: fit-content; margin: 20px auto">'
    ]

    for article in interesting_articles:
        # urljoin resolves site-relative hrefs against the BBC host and leaves
        # absolute URLs untouched.
        url = escape(urljoin("https://www.bbc.com", article['link']))
        title = escape(article['title'])
        summary = escape(article['summary'])

        html_parts.append(f'<h2><a href="{url}" target="_blank">{title}</a></h2>')
        html_parts.append(f"<p>{summary}</p>")
        html_parts.append('<br>')

    html_parts.append("</div></body></html>")
    html_output = "".join(html_parts)

    # Save the HTML file and open it
    output_filename = "interesting_articles.html"
    with open(output_filename, 'w', encoding='utf-8') as html_file:
        html_file.write(html_output)
    webbrowser.open(os.path.abspath(output_filename))


# Run the filtering function

In [3]:
# Landing pages of the BBC News sections that can be scraped, all derived
# from the common news root.
_BBC_NEWS_ROOT = "https://www.bbc.com/news"
bbc_world_url = f"{_BBC_NEWS_ROOT}/world"
bbc_science_url = f"{_BBC_NEWS_ROOT}/science_and_environment"
bbc_tech_url = f"{_BBC_NEWS_ROOT}/technology"

In [4]:
import joblib

# Load the persisted classifier and text vectorizer (the file names suggest
# they were fitted on the World section — confirm before reusing them for
# another section).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
model = joblib.load("bbc_world_model.pkl")
vectorizer = joblib.load("bbc_world_vectorizer.pkl")

# Prompts for a page count, scrapes the World section, then writes and opens
# interesting_articles.html with the articles predicted as interesting.
predict_bbcnews(bbc_world_url, vectorizer, model)