In [136]:
!pip install beautifulsoup4
!pip install requests
!pip install pandas
!pip install selenium

#!pip install newspaper4k

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [137]:
"""
BBC Search Results Scraper
Adapted from the Straits Times WebScraper code.

This script:
1. Reads BBC search result URLs (for "Organised Crime" / "Drug Trafficking") from the 'URL' column of BBC_Drug_Trafficking_URLs.csv and iterates over them.
2. Downloads each page using the requests module with a custom User-Agent.
3. Parses the page with BeautifulSoup.
4. Extracts the news article title and URL from each result card (<div data-testid="newport-card">).
5. Normalises relative URLs to full BBC URLs.
6. Aggregates the results and saves them as a CSV file.
"""

'\nBBC Search Results Scraper\nAdapted from the Straits Times WebScraper code.\n\nThis script:\n1. Iterates over a list of BBC search result URLs (for "Crime and Cybersecurity").\n2. Downloads each page using the requests module with a custom User-Agent.\n3. Parses the page with BeautifulSoup.\n4. Extracts the news article title and URL from each <article> element.\n5. Normalises relative URLs to full BBC URLs.\n6. Aggregates the results and saves them as a CSV file.\n'

In [138]:
import random
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [139]:
def scrape_bbc_search_page(url):
    """Scrape one BBC search results page and return its article links.

    The page is downloaded with a browser-like User-Agent, parsed with
    BeautifulSoup, and every result card (``div[data-testid="newport-card"]``)
    that has both a headline and a linked ``href`` contributes one record.

    Args:
        url: Address of the BBC search results page to download.

    Returns:
        A list of ``{'Title': str, 'URL': str}`` dicts in page order. An
        empty list is returned when the request fails (the error is printed
        instead of raised) or when no card matches.
    """
    base_url = "https://www.bbc.com"
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/114.0.0.0 Safari/537.36'
        )
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    results = []

    # Each search hit is rendered as a "newport-card" container.
    for container in soup.find_all('div', attrs={'data-testid': 'newport-card'}):
        headline = container.find('h2', attrs={'data-testid': 'card-headline'})
        link = container.find('a', attrs={'data-testid': 'internal-link'})
        if headline is None or link is None:
            continue

        href = link.get('href')
        if not href:
            # A missing or empty href cannot be turned into an article URL.
            continue

        results.append({
            'Title': headline.get_text(strip=True),
            # urljoin resolves "/path", "path" and protocol-relative
            # "//host/path" hrefs against the BBC base, and leaves absolute
            # URLs untouched (plain concatenation mangled "//host/path").
            'URL': urljoin(base_url, href),
        })

    return results

In [140]:
def main(input_csv='BBC_Drug_Trafficking_URLs.csv',
         output_csv='BBC_Drug_Trafficking.csv'):
    """Scrape every search page listed in a CSV and save the found articles.

    Args:
        input_csv: Path of a CSV file with a column named ``URL`` holding the
            BBC search result page addresses. Rows whose ``URL`` cell is
            missing or blank are skipped.
        output_csv: Path of the CSV file the scraped ``Title``/``URL``
            records are written to (only written when something was found).

    Returns:
        None. Progress and errors are reported with ``print``; no exception
        is propagated to the caller.
    """
    try:
        try:
            urls_df = pd.read_csv(input_csv)
        except FileNotFoundError:
            # Handled here so a failure to write the output file later is not
            # misreported as a missing input file.
            print(f"Error: {input_csv} file not found")
            return

        if 'URL' not in urls_df.columns:
            print("Error: CSV file must contain a column named 'URL'")
            return

        # Drop missing cells (NaN would otherwise reach the scraper as a
        # float) and blank strings before scraping.
        urls = [str(value).strip() for value in urls_df['URL'].dropna()]
        urls = [url for url in urls if url]

        all_results = []
        for position, url in enumerate(urls):
            if position:
                # Small random delay between requests to be polite; there is
                # nothing to wait for before the first request.
                time.sleep(random.uniform(1, 3))
            print(f"Scraping: {url}")
            all_results.extend(scrape_bbc_search_page(url))

        if all_results:
            pd.DataFrame(all_results).to_csv(output_csv, index=False)
            print(f"Scraped {len(all_results)} articles. Results saved to {output_csv}")
        else:
            print("No articles found.")

    except Exception as e:
        # Best-effort notebook entry point: report the problem, don't raise.
        print(f"An error occurred: {e}")

In [141]:
# Entry point: run the scraper when this code is executed directly
# (the guard is true when the cell is run, so main() executes).
if __name__ == "__main__":
    main()

Scraping: https://www.bbc.com/search?q=%22Organised+Crime%22%2C+%22Drug+Trafficking%22&edgeauth=eyJhbGciOiAiSFMyNTYiLCAidHlwIjogIkpXVCJ9.eyJrZXkiOiAiZmFzdGx5LXVyaS10b2tlbi0xIiwiZXhwIjogMTc0MDc1NDQ2OCwibmJmIjogMTc0MDc1NDEwOCwicmVxdWVzdHVyaSI6ICIlMkZzZWFyY2glM0ZxJTNEJTI1MjJPcmdhbmlzZWQlMkJDcmltZSUyNTIyJTI1MkMlMkIlMjUyMkRydWclMkJUcmFmZmlja2luZyUyNTIyIn0.5cpJuM_5cjw5dj0guQMrMeccN8H3jEXb8iwavtph2kQ
Scraping: https://www.bbc.com/search?q=%22Organised+Crime%22%2C+%22Drug+Trafficking%22&page=1&edgeauth=eyJhbGciOiAiSFMyNTYiLCAidHlwIjogIkpXVCJ9.eyJrZXkiOiAiZmFzdGx5LXVyaS10b2tlbi0xIiwiZXhwIjogMTc0MDgxMDg3NSwibmJmIjogMTc0MDgxMDUxNSwicmVxdWVzdHVyaSI6ICIlMkZzZWFyY2glM0ZxJTNEJTI1MjJPcmdhbmlzZWQlMkJDcmltZSUyNTIyJTI1MkMlMkIlMjUyMkRydWclMkJUcmFmZmlja2luZyUyNTIyJTI2cGFnZSUzRDEifQ.KjTLaeXNHo-67dujk2_aRBJh9dlH0AukJ6McKsFZuUs
Scraping: https://www.bbc.com/search?q=%22Organised+Crime%22%2C+%22Drug+Trafficking%22&page=2&edgeauth=eyJhbGciOiAiSFMyNTYiLCAidHlwIjogIkpXVCJ9.eyJrZXkiOiAiZmFzdGx5LXVyaS10b2tlbi0xIiwi

In [142]:
pwd

'C:\\Users\\Bertrand Tan\\Documents\\Python Scripts\\Web_Scraping\\01 - Arachne'