In [4]:
import pandas as pd

In [5]:
# minutes = pd.read_csv('/Users/kylenabors/Documents/Database/Training Data/boe/minutes/mpc_minutes.txt', delimiter='\t')

In [6]:
import requests
from bs4 import BeautifulSoup
import os

# Base URL of the Bank of England website
base_url = "https://www.bankofengland.co.uk"

# URL of the sitemap containing the minutes
sitemap_url = base_url + "/sitemap/minutes"

# Create a folder to store the minutes
os.makedirs("BoE_Minutes", exist_ok=True)

# Get the sitemap content
response = requests.get(sitemap_url)
soup = BeautifulSoup(response.content, "html.parser")

# Find all links to PDF files
for link in soup.find_all("a", href=True):
    href = link["href"]
    if href.endswith(".pdf"):
        # Handle relative and absolute URLs
        if href.startswith("http"):
            file_url = href
        else:
            file_url = base_url + href
        file_name = os.path.basename(file_url)
        print(f"Downloading {file_name} from {file_url}...")
        file_response = requests.get(file_url)
        # Check if the request was successful
        if file_response.status_code == 200:
            with open(os.path.join("BoE_Minutes", file_name), "wb") as f:
                f.write(file_response.content)
        else:
            print(f"Failed to download {file_name}")

Downloading court-1694-1697-index.pdf from https://www.bankofengland.co.uk/-/media/boe/files/minutes/1600-1700/1694/court-1694-1697-index.pdf...
Downloading court-july-1694-march-1695.pdf from https://www.bankofengland.co.uk/-/media/boe/files/minutes/1600-1700/1694/court-july-1694-march-1695.pdf...
Downloading court-march-1695-april-1696.pdf from https://www.bankofengland.co.uk/-/media/boe/files/minutes/1600-1700/1695/court-march-1695-april-1696.pdf...
Downloading court-april-1695-july-1696.pdf from https://www.bankofengland.co.uk/-/media/boe/files/minutes/1600-1700/1696/court-april-1695-july-1696.pdf...
Downloading court-1697-1713-index.pdf from https://www.bankofengland.co.uk/-/media/boe/files/minutes/1600-1700/1697/court-1697-1713-index.pdf...
Downloading court-july-1697-august-1698.pdf from https://www.bankofengland.co.uk/-/media/boe/files/minutes/1600-1700/1697/court-july-1697-august-1698.pdf...
Downloading court-august-1698-april-1700.pdf from https://www.bankofengland.co.uk/-/me

In [16]:
pip install pycryptodome

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import re
import pandas as pd
import PyPDF2
from datetime import datetime

# Directory where the PDFs are stored
pdf_directory = "BoE_Minutes"

# List to store the data
data = []

# Regular expression pattern to extract date from filename
filename_date_pattern = re.compile(r"(\d{4})-(\d{2})-(\d{2})")

# Regular expression pattern to extract date from text content
date_pattern = re.compile(
    r"\b(0?[1-9]|[12][0-9]|3[01])\s+"
    r"(January|February|March|April|May|June|July|August|September|October|November|December)\s+"
    r"(\d{4})\b",
    re.IGNORECASE,
)

# Mapping of month names to numbers
month_mapping = {
    "January": "01",
    "February": "02",
    "March": "03",
    "April": "04",
    "May": "05",
    "June": "06",
    "July": "07",
    "August": "08",
    "September": "09",
    "October": "10",
    "November": "11",
    "December": "12",
}


# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            if reader.is_encrypted:
                try:
                    reader.decrypt("")
                except:
                    print(f"Cannot decrypt {pdf_path}. Skipping.")
                    return text
            for page in reader.pages:
                text += page.extract_text() or ""
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text


# Function to extract date from filename or PDF content
def extract_date(file_name, text_content):
    # Try to extract date from the filename
    filename_date_match = filename_date_pattern.search(file_name)
    if filename_date_match:
        date_str = "-".join(filename_date_match.groups())
        return date_str

    # If not found, try to extract date from the text content
    text_date_match = date_pattern.search(text_content)
    if text_date_match:
        day, month_str, year = text_date_match.groups()
        month_str = month_str.capitalize()
        if month_str in month_mapping:
            month = month_mapping[month_str]
            try:
                date_obj = datetime.strptime(
                    f"{year}-{month}-{int(day):02d}", "%Y-%m-%d"
                )
                date_str = date_obj.strftime("%Y-%m-%d")
                return date_str
            except ValueError as ve:
                print(f'Invalid date "{day} {month_str} {year}" in {file_name}: {ve}')
                return None

    # If date is not found, return None
    return None


# Get the current year
current_year = datetime.now().year

# Iterate over all PDF files in the directory
for file_name in os.listdir(pdf_directory):
    if file_name.endswith(".pdf"):
        # Filter PDFs from the last 20 years
        file_year_match = re.search(r"(\d{4})", file_name)
        if file_year_match:
            file_year = int(file_year_match.group(1))
            if file_year < current_year - 30:
                print(
                    f"Skipping {file_name} (year {file_year} is older than 30 years)."
                )
                continue
        else:
            # If year is not in filename, you might want to decide whether to skip or process
            print(f"Year not found in filename {file_name}. Skipping this file.")
            continue

        pdf_path = os.path.join(pdf_directory, file_name)
        print(f"Processing {file_name}...")
        # Extract text from the PDF
        text_content = extract_text_from_pdf(pdf_path)
        if not text_content.strip():
            print(f"No text extracted from {file_name}. Skipping this file.")
            continue
        # Extract date
        date_str = extract_date(file_name, text_content)
        if date_str:
            # Append to the data list
            data.append({"date": date_str, "text": text_content})
        else:
            print(f"Date not found for {file_name}. Skipping this file.")

# Create a DataFrame from the data list
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv("BoE_Minutes.csv", index=False)
print("CSV file has been created successfully.")

Skipping court-april-1903-april-1904-index.pdf (year 1903 is older than 20 years).
Processing rfr-march-2015.pdf...
Date not found for rfr-march-2015.pdf. Skipping this file.
Processing minutes-january-2013.pdf...
Processing court-november-2016.pdf...
Processing minutes-january-2007.pdf...
Skipping court-october-1852-march-1853.pdf (year 1852 is older than 20 years).
Processing wholesale-distribution-steering-group-minutes-november-2019.pdf...
Skipping court-may-1744-april-1746.pdf (year 1744 is older than 20 years).
Processing minutes-october-2006.pdf...
Processing court-jan-jun-1999.pdf...
Processing minutes-october-2012.pdf...
Skipping court-december-1898-april-1899.pdf (year 1898 is older than 20 years).
Skipping court-march-1882-march-1883-index.pdf (year 1882 is older than 20 years).
Processing minutes-december-2009.pdf...
Processing rfr-november-2019.pdf...
Date not found for rfr-november-2019.pdf. Skipping this file.
Skipping court-april-1843-april-1844-index.pdf (year 1843 is 

unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(895, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(895, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(895, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths

Date not found for meeting-slides-cbdc-engagement-forum-february-2022.pdf. Skipping this file.
Processing fxjsc-may-2015.pdf...
Skipping court-november-1832-april-1833.pdf (year 1832 is older than 20 years).
Skipping court-october-1864-march-1865.pdf (year 1864 is older than 20 years).
Skipping court-january-1769-october-1771.pdf (year 1769 is older than 20 years).
Skipping court-april-1848-april-1849-index.pdf (year 1848 is older than 20 years).
Processing court-2007.pdf...


unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(895, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(549, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(895, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(545, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(887, 0, 5487217360)]
unknown widths : 
[0, IndirectObject(891, 0, 5487217360)]
unknown widths

Skipping court-april-1873-april-1874-index.pdf (year 1873 is older than 20 years).
Processing meeting-slides-cbdc-technology-forum-january-2022.pdf...
Date not found for meeting-slides-cbdc-technology-forum-january-2022.pdf. Skipping this file.
Skipping court-october-1851-april-1852.pdf (year 1851 is older than 20 years).
Skipping court-april-1864-march-1865-index.pdf (year 1864 is older than 20 years).
Processing minutes-august-2001.pdf...
Processing minutes-august-2015.pdf...
Processing minutes-november-2002.pdf...
Skipping court-april-1842-september-1842.pdf (year 1842 is older than 20 years).
Skipping court-april-1854-october-1854.pdf (year 1854 is older than 20 years).
Skipping court-november-1854-april-1855.pdf (year 1854 is older than 20 years).
Processing post-trade-task-force-minutes-july-2020.pdf...
No text extracted from post-trade-task-force-minutes-july-2020.pdf. Skipping this file.
Skipping court-april-1922-april-1923-index.pdf (year 1922 is older than 20 years).
Processi

unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(918, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(918, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(557, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(918, 0, 5483657232)]


Skipping court-april-1941-april-1942.pdf (year 1941 is older than 20 years).
Processing standards-advisory-panel-april-2020.pdf...
Processing wholesale-distribution-steering-group-minutes-december-2019.pdf...
Processing court-2008-book1.pdf...


unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(918, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(910, 0, 5483657232)]
unknown widths : 
[0, IndirectObject(914, 0, 5483657232)]
unknown widths

Skipping court-april-1877-march-1878-index.pdf (year 1877 is older than 20 years).
Processing court-april-2020.pdf...
Skipping court-november-1818-october-1819.pdf (year 1818 is older than 20 years).
Processing court-jul-dec-1994.pdf...
Processing march-2019.pdf...
Processing fxjsc-september-2017.pdf...
Skipping court-march-1951-february-1952.pdf (year 1951 is older than 20 years).
Skipping court-april-1843-october-1843.pdf (year 1843 is older than 20 years).
Processing securities lending committee november 2018.pdf...
Processing minutes-april-2003.pdf...
Processing december-2016.pdf...
Processing minutes-april-2015.pdf...
Processing minutes-april-2001.pdf...
Skipping court-april-1704-january-1706.pdf (year 1704 is older than 20 years).
Processing fxjsc-september-2015.pdf...
Processing rfr-23-september-2015.pdf...
Skipping court-april-1871-september-1871.pdf (year 1871 is older than 20 years).
Processing court-february-2013.pdf...
Processing court-jul-dec-1996.pdf...
Skipping court-apr

unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(737, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(737, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(737, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(439, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(737, 0, 5350839120)]
unknown widths

Skipping court-april-1751-september-1757-index.pdf (year 1751 is older than 20 years).
Skipping court-april-1903-october-1903.pdf (year 1903 is older than 20 years).
Skipping court-september-1795-june-1798-index.pdf (year 1795 is older than 20 years).
Processing rfr-2-november-2016.pdf...
Processing court-2008-book2.pdf...


unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(448, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(737, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(448, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(448, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(452, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(737, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(741, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(452, 0, 5350839120)]
unknown widths : 
[0, IndirectObject(733, 0, 5350839120)]
unknown widths

Processing slides-cbdc-technology-forum-october-2024.pdf...
Date not found for slides-cbdc-technology-forum-october-2024.pdf. Skipping this file.
Skipping court-april-1822-april-1823-index.pdf (year 1822 is older than 20 years).
Processing monetary-policy-summary-and-minutes-february-2023.pdf...
Processing minutes-april-2000.pdf...
Processing december-2017.pdf...
Processing minutes-april-2014.pdf...
Processing rfr-jan-2018.pdf...
Date not found for rfr-jan-2018.pdf. Skipping this file.
Skipping court-march-1919-march-1920-index.pdf (year 1919 is older than 20 years).
Processing court-jan-may-1998.pdf...
Processing minutes-april-2004.pdf...
Processing court-february-2016.pdf...
Processing march-2022.pdf...
Skipping court-november-1827-march-1828.pdf (year 1827 is older than 20 years).
Processing fxjsc-june-2020.pdf...
Processing monetary-policy-summary-and-minutes-august-2023.pdf...
Skipping court-january-1723-october-1725.pdf (year 1723 is older than 20 years).
Processing rfr-may-2017.

unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(331, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]


Date not found for meeting-slides-cbdc-technology-forum-meeting-september-2021.pdf. Skipping this file.
Processing court-july-2015.pdf...
Processing securities lending committee september 2018.pdf...
Skipping court-april-1895-april-1896-index.pdf (year 1895 is older than 20 years).
Skipping court-november-1867-april-1868.pdf (year 1867 is older than 20 years).
Skipping court-april-1820-april-1821-index.pdf (year 1820 is older than 20 years).
Skipping court-october-1757-september-1763-index.pdf (year 1757 is older than 20 years).
Processing court-2009.pdf...


unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(551, 0, 5356034960)]
unknown widths : 
[0, IndirectObject(555, 0, 5356034960)]
unknown widths

Processing minutes-march-2011.pdf...
Skipping court-april-1932-march-1933-index.pdf (year 1932 is older than 20 years).
Processing minutes-march-2005.pdf...
Processing march13-2020.pdf...
Skipping court-october-1819-april-1820.pdf (year 1819 is older than 20 years).
Processing court-october-2020.pdf...
Skipping court-april-1815-october-1815.pdf (year 1815 is older than 20 years).
Skipping court-april-1806-april-1807.pdf (year 1806 is older than 20 years).
Processing court-december-2013.pdf...
Skipping court-april-1713-september-1715.pdf (year 1713 is older than 20 years).
Processing monetary-policy-summary-and-minutes-december-2022.pdf...
Processing february-2016.pdf...
Skipping court-october-1918-march-1919.pdf (year 1918 is older than 20 years).
Processing may-2019.pdf...
Processing wholesale-distribution-steering-group-minutes-december-2020.pdf...
Date not found for wholesale-distribution-steering-group-minutes-december-2020.pdf. Skipping this file.
Processing court-april-2019.pdf..