# Scraping function
Scrape article titles, summaries and URLs from a chosen section of BBC News. The number of pages to load is passed as an argument.

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException
from bs4 import BeautifulSoup
import time


def scrape_bbcnews(base_url, n_pages, cookie_wait=7):
    """Scrape article titles, summaries and URLs from a BBC News section page.

    Opens ``base_url`` in Chrome, reads the article stream of the current
    page, then clicks the "Next" pagination button and repeats until
    ``n_pages`` distinct pages have been read or pagination stops working.

    Args:
        base_url: URL of the BBC News section to open.
        n_pages: Number of distinct pages to read.
        cookie_wait: Seconds to pause after loading the page so cookie
            overlays can be dismissed by hand.

    Returns:
        A list of ``[title, summary, url]`` lists. Articles missing any of
        the three fields are left out. If the browser misbehaves part-way
        through (timeout, no usable "Next" button, ...) the articles
        collected so far are returned.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell listing this name.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Chrome()
    articles = []
    seen_titles = set()
    pages_read = 0

    try:
        # Navigate to the webpage (inside the try so the browser is always
        # closed, even when navigation itself fails)
        driver.get(base_url)

        # Wait for the user to close cookie overlays
        time.sleep(cookie_wait)

        while pages_read < n_pages:
            # Wait for the article stream to be present
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CLASS_NAME, "lx-stream"))
            )

            # Find article containers
            soup = BeautifulSoup(driver.page_source, "html.parser")
            stream = soup.find("div", id="lx-stream")
            if stream is None:
                # Nothing to parse on this page; stop with what we have.
                break

            duplicate_page = False
            for container in stream.find_all("li", class_="lx-stream__post-container"):
                header = container.find("header", class_="lx-stream-post__header")
                if header is None:
                    continue

                # Compare by title text rather than by parsed tag object.
                title = header.text.strip()
                if title in seen_titles:
                    # This page was already read: do not count it.
                    duplicate_page = True
                    break
                seen_titles.add(title)

                summary = container.find("p", class_="lx-stream-related-story--summary")
                link = container.find("a", class_="qa-story-cta-link")
                if summary is None or link is None or 'href' not in link.attrs:
                    continue

                articles.append([title, summary.text.strip(), link['href']])

            # Move on to the next page. A missing or blocked "Next" button
            # raises a WebDriverException subclass, handled below.
            driver.find_element(By.CLASS_NAME, "qa-pagination-next-page").click()

            # NOTE(review): a duplicate page never advances the counter, so
            # if the site keeps serving already-seen articles while "Next"
            # stays clickable this loop will not end on its own.
            if not duplicate_page:
                pages_read += 1

    except WebDriverException:
        # Best effort: timeouts and pagination failures end the scrape but
        # keep whatever was collected. Other exceptions (Ctrl+C, programming
        # errors) now propagate instead of being swallowed.
        pass
    finally:
        driver.quit()

    return articles

# Add to CSV function
Save the scraped information to CSV for later use

In [2]:
import csv
import os

def write_articles_to_csv(articles, file_name):
    """Append scraped articles to a CSV file, adding the header when needed.

    Args:
        articles: Iterable of sequences whose first three items are the
            title, summary and URL of an article.
        file_name: Path of the CSV file to append to; created if missing.

    Each article is written with an empty "Interesting" column.
    """
    fieldnames = ["Title", "Summary", "URL", "Interesting"]

    # A file that exists but is empty still needs the header row; otherwise
    # a later DictReader would treat the first article as the header.
    needs_header = not os.path.isfile(file_name) or os.path.getsize(file_name) == 0

    # Open the CSV file in append mode
    with open(file_name, mode='a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        if needs_header:
            writer.writeheader()

        writer.writerows(
            {"Title": article[0], "Summary": article[1], "URL": article[2], "Interesting": ""}
            for article in articles
        )

# Remove duplicates from CSV function

In [3]:
import csv
import os

def remove_duplicates_from_csv(file_name):
    """Rewrite a CSV file in place, keeping only the first row per title.

    Rows are compared on their "Title" column. The deduplicated data is
    written to ``file_name + ".tmp"`` and then moved over the original.

    Args:
        file_name: Path of the CSV file to deduplicate.

    A file without a header row (e.g. an empty file) is left untouched.
    """
    seen_titles = set()
    temp_file_name = file_name + ".tmp"

    with open(file_name, mode='r', newline='', encoding='utf-8') as csv_file:
        reader = csv.DictReader(csv_file)
        fieldnames = reader.fieldnames

        # No header means there is nothing to deduplicate, and DictWriter
        # cannot write a header from fieldnames=None.
        if fieldnames is None:
            return

        try:
            with open(temp_file_name, mode='w', newline='', encoding='utf-8') as temp_file:
                writer = csv.DictWriter(temp_file, fieldnames=fieldnames)
                writer.writeheader()

                for row in reader:
                    title = row["Title"]
                    if title not in seen_titles:
                        seen_titles.add(title)
                        writer.writerow(row)
        except BaseException:
            # Do not leave a half-written temporary file behind.
            if os.path.exists(temp_file_name):
                os.remove(temp_file_name)
            raise

    # Replace the original only after both files are closed.
    os.replace(temp_file_name, file_name)

# Fill Interesting column
Print each article's title and summary one by one and fill in its "Interesting" label ('0' or '1') based on keyboard input.

In [9]:
import csv
import time
import keyboard

def set_interesting(file_name):
    """Interactively fill the "Interesting" column of a CSV file.

    Every row whose "Interesting" cell is empty is printed (title, then
    summary) and the user is asked for a label:

    * ``0`` / ``1`` store the label,
    * an empty answer skips the row,
    * anything else re-prompts for the same row,
    * Ctrl+C, end of input, or Esc (checked once, right after the answer
      is entered) stops the session.

    The file is rewritten with the labels gathered so far whenever the
    session ends, including when it ends because of an error.

    Args:
        file_name: Path of the CSV file to label; it must have an
            "Interesting" column.

    Errors are reported with a printed message instead of being raised.
    """
    prompt = "Enter '0' for not interesting or '1' for interesting (or press Enter to skip, Esc to exit): "

    try:
        with open(file_name, mode='r', newline='', encoding='utf-8') as csv_file:
            reader = csv.DictReader(csv_file)
            fieldnames = reader.fieldnames

            # fieldnames is None for a file with no header row at all.
            if not fieldnames or "Interesting" not in fieldnames:
                print("'Interesting' column not found in the CSV file.")
                return

            # Read all rows into a list
            all_rows = list(reader)

        try:
            stop_requested = False
            for row in all_rows:
                # Rows that already carry a label are kept as they are.
                if (row["Interesting"] or "").strip():
                    continue

                print(row["Title"])
                print(row["Summary"])

                # Brief pause kept from the original before prompting.
                time.sleep(0.2)

                while True:
                    try:
                        choice = input(prompt).strip()
                    except (KeyboardInterrupt, EOFError):
                        stop_requested = True
                        break

                    if keyboard.is_pressed('esc'):
                        stop_requested = True
                        break

                    if choice == "":
                        # Enter alone skips the row, as the prompt promises.
                        print("Skipped.\n")
                        break

                    if choice in ('0', '1'):
                        row["Interesting"] = choice
                        print("Updated!\n")
                        break

                    # Ask again for the same row instead of silently moving on.
                    print("Invalid input. Please enter '0' or '1'.\n")

                if stop_requested:
                    break
        finally:
            # Persist whatever was labelled, even if the session was cut
            # short, so no manual work is lost.
            with open(file_name, mode='w', newline='', encoding='utf-8') as csv_file:
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(all_rows)

    except Exception as e:
        print(f"An error occurred: {str(e)}")


# Run functions

In [5]:
# BBC News section URLs used as the base_url argument of scrape_bbcnews.
bbc_world_url = "https://www.bbc.com/news/world"
bbc_science_url = "https://www.bbc.com/news/science_and_environment"
bbc_tech_url = "https://www.bbc.com/news/technology"

In [6]:
# Scrape the World section (n_pages=10) and append the results to its training CSV.
write_articles_to_csv(scrape_bbcnews(bbc_world_url, 10), 'bbc_world_train.csv')
# The science and technology runs are currently disabled.
# write_articles_to_csv(scrape_bbcnews(bbc_science_url, 50), 'bbc_science_train.csv')
# write_articles_to_csv(scrape_bbcnews(bbc_tech_url, 5), 'bbc_tech_train.csv')

In [7]:
# Drop rows with repeated titles from a training CSV (rewrites the file in place).
# NOTE(review): the scrape cell writes 'bbc_world_train.csv', but only the
# technology file is deduplicated here — confirm this is intended.
# remove_duplicates_from_csv('bbc_world_train.csv')
# remove_duplicates_from_csv('bbc_science_train.csv')
remove_duplicates_from_csv('bbc_tech_train.csv')

In [11]:
# Start an interactive labelling session over the World training CSV.
set_interesting('bbc_world_train.csv')

Fiction and truth about today's emergency test
False rumours are swirling online about a routine trial of the US alert system.
Updated!

Couple killed by bear in Canada 'loved the outdoors'
A family member said Doug Inglis and Jenny Gusse, both 62, were experienced backcountry hikers.
Updated!

