In [None]:
import requests
from bs4 import BeautifulSoup
import os
import re
import time
from datetime import datetime
from requests.adapters import HTTPAdapter, Retry

# Base URL
base_url = "https://www.bankofengland.co.uk"

# Sitemap URL for speeches
sitemap_url = base_url + "/sitemap/speeches"

# Directory to save speeches
speeches_dir = "/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches"
os.makedirs(speeches_dir, exist_ok=True)

# Set up a requests session with retries
session = requests.Session()
retries = Retry(
    total=1,  # Total number of retries
    backoff_factor=0.5,  # A backoff factor to apply between attempts
    status_forcelist=[500, 502, 503, 504],  # HTTP status codes to retry
    allowed_methods=["HEAD", "GET", "OPTIONS"],  # Methods to retry
)
adapter = HTTPAdapter(max_retries=retries)
session.mount("http://", adapter)
session.mount("https://", adapter)

# Get the sitemap content
try:
    response = session.get(sitemap_url, timeout=10)
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"Error fetching sitemap: {e}")
    exit(1)

soup = BeautifulSoup(response.content, "html.parser")


# Function to parse date from string
def parse_date(date_str):
    try:
        return datetime.strptime(date_str, "%d %B %Y")
    except ValueError:
        return None


# Compile regex patterns
pdf_pattern = re.compile(r"\.pdf$")
date_pattern = re.compile(r"(\d{1,2} \w+ \d{4})")

# Iterate over all speech links
for link in soup.find_all("a", href=True):
    href = link["href"]
    if pdf_pattern.search(href):
        # Construct full URL
        if href.startswith("http"):
            pdf_url = href
        else:
            pdf_url = base_url + href

        # Extract filename
        filename = os.path.basename(href)

        # Check if file already exists
        file_path = os.path.join(speeches_dir, filename)
        if os.path.exists(file_path):
            print(f"Skipping {filename} (already downloaded)")
            continue

        # Extract date from link text or filename
        text = link.get_text(strip=True)
        date_match = date_pattern.search(text)
        if date_match:
            date_str = date_match.group(1)
            date_obj = parse_date(date_str)
        else:
            # Try to extract date from filename
            date_match = date_pattern.search(filename)
            if date_match:
                date_str = date_match.group(1)
                date_obj = parse_date(date_str)
            else:
                date_obj = None

        # Filter speeches from the last 30 years
        # Attempt to download the PDF with retries
        for attempt in range(1):
            try:
                pdf_response = session.get(pdf_url, timeout=10)
                pdf_response.raise_for_status()
                with open(file_path, "wb") as f:
                    f.write(pdf_response.content)
                print(f"Downloaded {filename}")
                break  # Break the retry loop if successful
            except requests.exceptions.RequestException as e:
                print(f"Error downloading {filename}: {e}")
                if attempt < 4:
                    wait_time = (attempt + 1) * 2  # Exponential backoff
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    print(f"Failed to download {filename} after multiple attempts.")

In [15]:
import os
import re
import pandas as pd
from datetime import datetime
from pdfminer.high_level import extract_text

# Directory where PDFs are stored
speeches_dir = "/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches"

# List to store extracted data
data = []

# Regex patterns
date_pattern = re.compile(r"(\d{1,2} \w+ \d{4})")
speaker_pattern = re.compile(r"By ([A-Za-z ,\.]+)")

# Iterate over PDFs
for filename in os.listdir(speeches_dir):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(speeches_dir, filename)
        print(f"Processing {filename}...")
        try:
            text_content = extract_text(pdf_path)
            # Extract date from text or filename
            date_match = date_pattern.search(text_content)
            if date_match:
                date_str = date_match.group(1)
                date_obj = datetime.strptime(date_str, "%d %B %Y")
            else:
                # Try to extract date from filename
                date_match = date_pattern.search(filename)
                if date_match:
                    date_str = date_match.group(1)
                    date_obj = datetime.strptime(date_str, "%d %B %Y")
                else:
                    print(f"Date not found for {filename}. Skipping.")
                    continue

            # Extract speaker
            speaker_match = speaker_pattern.search(text_content)
            if speaker_match:
                speaker = speaker_match.group(1).strip()
            else:
                speaker = "Unknown"

            # Append to data list
            data.append(
                {
                    "date": date_obj.strftime("%Y-%m-%d"),
                    "group": speaker,
                    "segment": text_content,
                }
            )
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv(
    "/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches/boe_Speeches.csv",
    index=False,
)
print("CSV file has been created successfully.")

Processing the-structure-of-regulation-lessons-from-the-crisis-of-2007.pdf...
Processing monetary-policy-and-the-supply-side.pdf...
Processing why-prudential-regulation-matters-speech-by-andrew-bailey.pdf...
Processing getting-back-to-business-speech-by-andrew-sentence.pdf...
Processing foreign-exchange-as-a-business-in-the-21st-century.pdf...
Processing cbi-west-midlands-economic-dinner.pdf...
Processing regulatory-reform-its-possible-market-consequences-and-the-case-of-securities.pdf...
Processing household-indebtedness-the-exchange-rate-and-risks-to-the-uk-economy.pdf...
Processing the-puzzle-of-uk-business-investment.pdf...
Processing risk-uncertainty-and-monetary-policy-regimes.pdf...
Processing government-debt-and-unconventional-monetary-policy.pdf...
Processing what-a-difference-a-decade-makes.pdf...
Processing think-global-act-local.pdf...
Processing stuck.pdf...


The PDF <_io.BufferedReader name='/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches/control-rights-and-wrongs-speech-by-andrew-haldane.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Processing control-rights-and-wrongs-speech-by-andrew-haldane.pdf...
Processing the-uk-bank-resolution-regime.pdf...
Processing running-out-of-room-revisiting-the-3d-perspective-on-low-interest-rates-speech-by-gertjan-vlieghe.pdf...
Processing uk-monetary-policy-good-for-business.pdf...
Processing the-state-of-the-markets-four-issues.pdf...
Processing a-21st-century-approach-to-dealing-with-failed-banks.pdf...
Processing inflation-targeting-in-practice-the-uk-experience.pdf...
Processing mervyn-king-cbi-dinner.pdf...
Processing the-credit-crunch-and-the-uk-economy.pdf...
Processing uk-monetary-policy-in-a-changing-world.pdf...
Processing edward-george-st-pauls-cathedral-lecture.pdf...
Processing from-design-to-delivery-stability-in-the-new-retail-payments-infrastructure.pdf...
Processing ambidexterity.pdf...
Processing achieving-a-sustainable-recovery-where-next-for-business-investment.pdf...
Processing cbi-scotland-dinner.pdf...
Processing monetary-policy-as-the-output-gap-closes-spee

The PDF <_io.BufferedReader name='/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches/deepen-and-diversify-uk-financial-infrastucture-to-enable-small-business-growth-slides.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Processing scottish-engineering-senior-executive-dinner.pdf...
Processing deepen-and-diversify-uk-financial-infrastucture-to-enable-small-business-growth-slides.pdf...
Processing ideas-and-institutions-a-growth-story-speech-by-andy-haldane.pdf...
Processing the-uk-current-account-deficit-risky-or-risk-sharing.pdf...
Processing oil-price-falls-what-consequences-for-monetary-policy.pdf...
Processing in-giving-how-much-do-we-receive-the-social-value-of-volunteering.pdf...
Processing remarks-at-the-accounting-for-sustainability-summit-2018.pdf...
Processing speech-by-mervyn-king.pdf...
Processing pra-solvency-2-countdown-to-implementation-david-rule.pdf...
Processing policy-priorities-for-prudential-regulation-and-supervision.pdf...
Processing the-sustainable-development-goal-imperative.pdf...
Processing speech-slides-by-jonathan-haskel-on-uk-inflation.pdf...
Error processing speech-slides-by-jonathan-haskel-on-uk-inflation.pdf: time data '1 to 2024' does not match format '%d %B %Y'
Proces

The PDF <_io.BufferedReader name='/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches/the-yield-curve-and-qe-speech-by-gertjan-vlieghe.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Processing the-yield-curve-and-qe-speech-by-gertjan-vlieghe.pdf...
Processing remarks-given-as-a-discussant-of-passthrough-efficiency-slides.pdf...
Processing inflation-and-the-global-economy.pdf...
Processing global-economic-tsunamis-coincidence-common-shocks-or-contagion.pdf...
Processing britain-in-europe-at-the-borsen.pdf...
Processing monetary-policy-issues-past-present-future.pdf...
Processing why-are-interest-rates-low.pdf...
Processing outlier-or-laggard-divergence-and-convergence-in-the-uks-recent-inflation-performance.pdf...
Processing back-to-the-future-speech-by-dave-ramsden.pdf...
Processing 50-character-selection-and-future-forum-launch.pdf...
Processing promoting-financial-system-resilience-in-modern-global-capital-markets-some-issues.pdf...
Processing de-globalisation-and-inflation.pdf...
Processing banks-and-the-systemic-risk-theory-and-evidence.pdf...
Processing financial-crisis-and-g20-financial-regulatory-reform-speech-by-paul-tucker.pdf...
Processing sustaining-the

The PDF <_io.BufferedReader name='/Users/kylenabors/Documents/Database/Training Data/boe/boe_speeches/international-financial-law-reviews-regulatory-contribution-award.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


Processing international-financial-law-reviews-regulatory-contribution-award.pdf...
Processing mark-carney-speech-at-the-trades-union-congress-transcript.pdf...
Date not found for mark-carney-speech-at-the-trades-union-congress-transcript.pdf. Skipping.
Processing developments-in-uk-financial-markets.pdf...
Processing telling-tails-of-oil-and-global-inflation.pdf...
Processing managing-cyber-risk-the-global-banking-perspective.pdf...
Processing quantitative-tightening-speech-by-dave-ramsden.pdf...
Processing uk-inflation-since-the-pandemic-how-did-we-get-here-and-where-are-we-going-speech-by-jonathan-haskel.pdf...
Processing andrew-bailey-speech-at-the-lord-mayors-banquet-london.pdf...
Processing strengthening-regimes-for-controlling-liquidity-risk-some-lessons-from-the-recent-turmoil.pdf...
Processing remarks-given-by-mark-carney-governor-regarding-polymer-notes-and-the-review-of-the-banknote-charact.pdf...
Processing forcasting-future-banknote-demand.pdf...
Processing when-why-and-wh