In [None]:
import os
import re
import time
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter, Retry

# Base URL of the Bank of England website.
base_url = "https://www.bankofengland.co.uk"

# Sitemap URL for speeches (the page whose links are scanned for PDFs).
sitemap_url = base_url + "/sitemap/speeches"

# Directory to save speeches; created up front so later writes cannot fail
# on a missing folder.
speeches_dir = "/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches"
os.makedirs(speeches_dir, exist_ok=True)

# Set up a requests session that transparently retries transient server
# errors on idempotent methods, for both http:// and https:// URLs.
session = requests.Session()
retries = Retry(
    total=5,  # Total number of retries
    backoff_factor=0.5,  # A backoff factor to apply between attempts
    status_forcelist=[500, 502, 503, 504],  # HTTP status codes to retry
    allowed_methods=["HEAD", "GET", "OPTIONS"],  # Methods to retry
)
adapter = HTTPAdapter(max_retries=retries)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Get the sitemap content; nothing below can work without it, so abort on
# any request failure (connection error, timeout, or non-2xx status).
try:
    response = session.get(sitemap_url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error fetching sitemap: {e}")
    # Raise SystemExit directly instead of calling exit(): exit() is a
    # convenience helper added by the site module for interactive use and is
    # not guaranteed to exist or to stop execution in every host.
    raise SystemExit(1) from e

# Parsed sitemap page; its <a href> elements are scanned for speech PDFs.
soup = BeautifulSoup(response.content, "html.parser")


# Function to parse date from string
def parse_date(date_str):
    """Parse a "day month year" string into a ``datetime``.

    Both full month names ("5 March 2020") and abbreviated ones
    ("5 Mar 2020") are accepted, because the regex used to find candidate
    dates captures any word as the month. Surrounding whitespace is ignored.

    Args:
        date_str: Candidate date text, e.g. ``"12 October 2015"``.

    Returns:
        The parsed ``datetime`` (midnight, naive), or ``None`` when
        ``date_str`` is not a string or does not describe a real date.
    """
    # A non-string (e.g. None from a failed lookup) would make strptime raise
    # TypeError; treat it as "no date" like any other unparseable value.
    if not isinstance(date_str, str):
        return None

    candidate = date_str.strip()
    for fmt in ("%d %B %Y", "%d %b %Y"):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            continue
    return None


# Compile regex patterns
pdf_pattern = re.compile(r"\.pdf$")  # href must end in ".pdf"
date_pattern = re.compile(r"(\d{1,2} \w+ \d{4})")  # e.g. "12 October 2015"

# Manual download attempts per PDF. These are in addition to the automatic
# retries the session performs for 5xx responses.
max_attempts = 5

# Iterate over all links on the sitemap page and download every PDF that is
# not already present in speeches_dir.
for link in soup.find_all("a", href=True):
    href = link["href"]
    if not pdf_pattern.search(href):
        continue

    # Resolve the link against the page it came from. urljoin leaves absolute
    # URLs untouched and also handles relative hrefs without a leading slash
    # and protocol-relative ("//host/...") hrefs, which plain string
    # concatenation with base_url would turn into invalid URLs.
    pdf_url = urljoin(sitemap_url, href)

    # Local file name is the last path component of the link.
    filename = os.path.basename(href)

    # Skip files that a previous run already saved.
    file_path = os.path.join(speeches_dir, filename)
    if os.path.exists(file_path):
        print(f"Skipping {filename} (already downloaded)")
        continue

    # Extract a date from the link text, falling back to the file name.
    # NOTE: date_obj is not used to filter anything here; every matching PDF
    # is downloaded regardless of its date.
    text = link.get_text(strip=True)
    date_match = date_pattern.search(text) or date_pattern.search(filename)
    date_obj = parse_date(date_match.group(1)) if date_match else None

    # Attempt to download the PDF with retries
    for attempt in range(max_attempts):
        try:
            pdf_response = session.get(pdf_url, timeout=10)
            pdf_response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {filename}: {e}")
            if attempt < max_attempts - 1:
                wait_time = (attempt + 1) * 2  # Linear backoff: 2, 4, 6, 8 seconds
                print(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Failed to download {filename} after multiple attempts.")
        else:
            # Write to a temporary name and move it into place, so that an
            # interrupted write never leaves a truncated PDF under the final
            # name (which the exists-check above would then skip forever).
            partial_path = file_path + ".part"
            with open(partial_path, "wb") as f:
                f.write(pdf_response.content)
            os.replace(partial_path, file_path)
            print(f"Downloaded {filename}")
            break  # Break the retry loop if successful

In [None]:
import os
import re
import pandas as pd
from datetime import datetime

# Use PyMuPDF for better text extraction
import fitz

# Directory where PDFs are stored
# NOTE(review): this relative folder differs from the directory the download
# cell writes to — confirm the PDFs are actually copied/located here.
speeches_dir = "BoE_Speeches"

# List to store extracted data (one dict per successfully processed PDF)
data = []

# Regex patterns
date_pattern = re.compile(r"(\d{1,2} \w+ \d{4})")  # e.g. "12 October 2015"
speaker_pattern = re.compile(r"By ([A-Za-z ,\.]+)")  # text following "By "

# Date layouts tried for each regex match: full and abbreviated month names.
date_formats = ("%d %B %Y", "%d %b %Y")


def _first_valid_date(text):
    """Return the first date_pattern match in *text* that parses as a date.

    The pattern also matches non-dates such as "12 pages 2020", so every
    match is tried in order instead of trusting only the first one.

    Returns:
        A ``datetime`` for the first parseable match, or ``None`` if no
        match in *text* is a real date.
    """
    for match in date_pattern.finditer(text):
        for fmt in date_formats:
            try:
                return datetime.strptime(match.group(1), fmt)
            except ValueError:
                continue
    return None


# Iterate over PDFs in a stable (sorted) order so the CSV rows are
# reproducible between runs; os.listdir order is arbitrary.
for filename in sorted(os.listdir(speeches_dir)):
    if not filename.endswith(".pdf"):
        continue

    pdf_path = os.path.join(speeches_dir, filename)
    print(f"Processing {filename}...")

    # Best-effort text extraction: an unreadable PDF is reported and skipped
    # rather than aborting the whole run.
    try:
        with fitz.open(pdf_path) as doc:
            text_content = "".join(page.get_text() for page in doc)
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        continue

    # Extract date from text, falling back to the filename
    date_obj = _first_valid_date(text_content)
    if date_obj is None:
        date_obj = _first_valid_date(filename)
    if date_obj is None:
        print(f"Date not found for {filename}. Skipping.")
        continue

    # Extract speaker
    speaker_match = speaker_pattern.search(text_content)
    speaker = speaker_match.group(1).strip() if speaker_match else "Unknown"

    # Append to data list
    data.append(
        {
            "date": date_obj.strftime("%Y-%m-%d"),
            "group": speaker,
            "segment": text_content,
        }
    )

# Create DataFrame; naming the columns explicitly keeps the CSV header
# present even when no PDF produced a row.
df = pd.DataFrame(data, columns=["date", "group", "segment"])

# Save to CSV
df.to_csv("BoE_Speeches.csv", index=False)
print("CSV file has been created successfully.")