In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from urllib.parse import urljoin
import pdfkit  # For converting .htm to .pdf

# Directories
pdf_dir = "/Users/kylenabors/Documents/Database/Training Data/boj/boj_speeches/PDFs"
csv_path = "/Users/kylenabors/Documents/Database/Training Data/boj/boj_speeches/boj_speeches.csv"

# Create the PDF directory if it doesn't exist
if not os.path.exists(pdf_dir):
    os.makedirs(pdf_dir)

# Base URLs
current_base_url = "https://www.boj.or.jp"
archive_base_url = "https://www2.boj.or.jp"

# Define the range of years you want to scrape
start_year = 1999
end_year = 2024
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    " Chrome/98.0.4758.102 Safari/537.36"
}

speech_urls = []

# Generate per-year speech URLs
year_urls = []
for year in range(start_year, end_year + 1):
    year_url = (
        f"{current_base_url}/en/about/press/koen_{year}/index.htm"
        if year >= 2011
        else f"{archive_base_url}/archive/en/announcements/press/koen_{year}/index.htm"
    )
    year_urls.append((year, year_url))

print(f"Found {len(year_urls)} year URLs.")

# Step 2: Iterate over each year's page to collect speech URLs
for year, year_url in year_urls:
    print(f"Processing year URL: {year_url}")
    response = requests.get(year_url)
    if response.status_code != 200:
        print(f"Failed to retrieve {year_url}, status code {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    table = soup.find("table", class_="js-tbl") or soup.find("table")

    if not table:
        print(f"No speech table found on page {year_url}")
        continue

    rows = (
        table.find("tbody").find_all("tr")
        if table.find("tbody")
        else table.find_all("tr")
    )

    for row in rows:
        cols = row.find_all("td")
        if len(cols) >= 2:
            date_col = cols[0].get_text(strip=True)
            speaker_col = (
                cols[1].get_text(strip=True) if year >= 2011 and len(cols) >= 3 else ""
            )
            title_col = cols[2] if year >= 2011 else cols[1]

            link = title_col.find("a", href=True)
            if link:
                speech_url = urljoin(year_url, link["href"])
                title = link.get_text(strip=True)
                speech_urls.append(
                    {
                        "date": date_col,
                        "speaker": speaker_col,
                        "title": title,
                        "speech_url": speech_url,
                    }
                )

print(f"Total number of speeches found: {len(speech_urls)}")
speech_urls = [dict(t) for t in {tuple(d.items()) for d in speech_urls}]

if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
    existing_df = pd.read_csv(csv_path)
    existing_speech_urls = (
        set(existing_df["speech_url"]) if "speech_url" in existing_df.columns else set()
    )
else:
    existing_df = pd.DataFrame()
    existing_speech_urls = set()

speech_data = []

for idx, speech in enumerate(speech_urls):
    speech_url = speech["speech_url"]
    pdf_filename = speech_url.split("/")[-1].replace(".htm", ".pdf")
    pdf_path = os.path.join(pdf_dir, pdf_filename)

    # Check if the speech is marked as processed but the PDF is missing
    if speech_url in existing_speech_urls and not os.path.exists(pdf_path):
        print(f"PDF for speech at {speech_url} is missing. Re-downloading...")

    # If the speech is already processed and the PDF exists, skip it
    elif speech_url in existing_speech_urls and os.path.exists(pdf_path):
        print(f"Already processed speech at {speech_url}, skipping.")
        continue

    print(f"Processing speech {idx + 1}/{len(speech_urls)}: {speech_url}")

    response = requests.get(speech_url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve {speech_url}, status code {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    pdf_links = soup.find_all("a", href=re.compile(r"\.pdf$")) + soup.find_all(
        "a", href=re.compile(r"\.htm$")
    )

    if pdf_links:
        pdf_link_tag = next(
            (link for link in pdf_links if "Full Text" in link.get_text(strip=True)),
            pdf_links[0],
        )
        pdf_url = urljoin(
            archive_base_url if speech_url.startswith(archive_base_url) else speech_url,
            pdf_link_tag["href"],
        )

        # Download the PDF or convert HTML to PDF if needed
        try:
            if pdf_url.endswith(".htm"):
                pdfkit.from_url(pdf_url, pdf_path)
                print(f"Converted HTML to PDF: {pdf_filename}")
            else:
                pdf_response = requests.get(pdf_url, headers=headers)
                pdf_response.raise_for_status()
                with open(pdf_path, "wb") as f:
                    f.write(pdf_response.content)
                print(f"Downloaded PDF: {pdf_filename}")
        except Exception as e:
            print(f"Error downloading or converting {pdf_filename}: {e}")
            continue

    else:
        print(
            f"No PDF links found for speech at {speech_url}. Attempting HTML-to-PDF conversion."
        )
        try:
            pdfkit.from_url(speech_url, pdf_path)
            print(f"Converted speech page to PDF: {pdf_filename}")
        except Exception as e:
            print(f"Failed to convert speech page to PDF: {e}")
            continue

    # Extract text from the PDF
    try:
        with open(pdf_path, "rb") as f:
            reader = PdfReader(f)
            text = "".join([page.extract_text() or "" for page in reader.pages])
    except Exception as e:
        print(f"Error reading PDF {pdf_filename}: {e}")
        text = ""

    # Append data
    speech_data.append({**speech, "text": text})

# Combine new data with existing data and save to CSV
new_df = pd.DataFrame(speech_data)
combined_df = (
    pd.concat([existing_df, new_df], ignore_index=True)
    if not existing_df.empty
    else new_df
)
combined_df.to_csv(csv_path, index=False)
print(f"CSV file updated at {csv_path}")

Found 26 year URLs.
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_1999/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2000/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2001/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2002/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2003/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2004/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2005/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2006/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2007/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2008/index.htm
Processing year URL: https://w

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No PDF links found for speech at https://www2.boj.or.jp/archive/en/announcements/press/koen_2007/ko0705d.pdf. Attempting HTML-to-PDF conversion.
Failed to convert speech page to PDF: wkhtmltopdf exited with non-zero code 1. error:
Exit with code 1, due to unknown error.

Processing speech 264/729: https://www.boj.or.jp/en/about/press/koen_2020/ko201007a.htm
Downloaded PDF: ko201007a.pdf
Processing speech 265/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_2006/ko0609a.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No PDF links found for speech at https://www2.boj.or.jp/archive/en/announcements/press/koen_2006/ko0609a.pdf. Attempting HTML-to-PDF conversion.
Failed to convert speech page to PDF: wkhtmltopdf exited with non-zero code 1. error:
Exit with code 1, due to unknown error.

Processing speech 266/729: https://www.boj.or.jp/en/about/press/koen_2016/ko160330a.htm
Downloaded PDF: ko160330a.pdf
Processing speech 267/729: https://www.boj.or.jp/en/about/press/koen_2021/ko211115a.htm
Downloaded PDF: ko211115a.pdf
Processing speech 268/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_2008/ko0806a.htm
Converted HTML to PDF: ko0806a.pdf
Processing speech 269/729: https://www.boj.or.jp/en/about/press/koen_2018/ko181116a.htm
Downloaded PDF: ko181116a.pdf
Processing speech 270/729: https://www.boj.or.jp/en/about/press/koen_2018/ko181016a.htm
Downloaded PDF: ko181016a.pdf
Processing speech 271/729: https://www.boj.or.jp/en/about/press/koen_2011/ko110325b.htm
Downloaded PDF: ko110325b.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No PDF links found for speech at https://www2.boj.or.jp/archive/en/announcements/press/koen_2006/ko0611b.pdf. Attempting HTML-to-PDF conversion.
Failed to convert speech page to PDF: wkhtmltopdf exited with non-zero code 1. error:
Exit with code 1, due to unknown error.

Processing speech 375/729: https://www.boj.or.jp/en/about/press/koen_2011/ko110526a.htm
Downloaded PDF: ko110526a.pdf
Processing speech 376/729: https://www.boj.or.jp/en/about/press/koen_2014/ko140724a.htm
Downloaded PDF: ko140724a.pdf
Processing speech 377/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_1999/ko9908a.htm
Converted HTML to PDF: ko9908a.pdf
Processing speech 378/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_2001/ko0105a.htm
Converted HTML to PDF: ko0105a.pdf
Processing speech 379/729: https://www.boj.or.jp/en/about/press/koen_2024/ko240924a.htm
Downloaded PDF: ko240924a.pdf
Processing speech 380/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_2005/ko0505a

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No PDF links found for speech at https://www2.boj.or.jp/archive/en/announcements/press/koen_2008/ko0802c.pdf. Attempting HTML-to-PDF conversion.
Failed to convert speech page to PDF: wkhtmltopdf exited with non-zero code 1. error:
Exit with code 1, due to unknown error.

Processing speech 435/729: https://www.boj.or.jp/en/about/press/koen_2014/ko140526a.htm
Downloaded PDF: ko140526a.pdf
Processing speech 436/729: https://www.boj.or.jp/en/about/press/koen_2022/ko220601a.htm
Downloaded PDF: ko220601a.pdf
Processing speech 437/729: https://www.boj.or.jp/en/about/press/koen_2020/ko200514a.htm
Downloaded PDF: ko200514a.pdf
Processing speech 438/729: https://www.boj.or.jp/en/about/press/koen_2023/ko230301a.htm
Downloaded PDF: ko230301a.pdf
Processing speech 439/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_2004/ko0412b.htm
Converted HTML to PDF: ko0412b.pdf
Processing speech 440/729: https://www.boj.or.jp/en/about/press/koen_2015/ko150721a.htm
Downloaded PDF: ko150721a.pdf


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No PDF links found for speech at https://www2.boj.or.jp/archive/en/announcements/press/koen_2007/ko0704b.pdf. Attempting HTML-to-PDF conversion.
Failed to convert speech page to PDF: wkhtmltopdf exited with non-zero code 1. error:
Exit with code 1, due to unknown error.

Processing speech 444/729: https://www.boj.or.jp/en/about/press/koen_2023/ko230308a.htm
Downloaded PDF: ko230308a.pdf
Processing speech 445/729: https://www.boj.or.jp/en/about/press/koen_2012/ko121129a.htm
Downloaded PDF: ko121129a.pdf
Processing speech 446/729: https://www.boj.or.jp/en/about/press/koen_2011/ko110930a.htm
Downloaded PDF: ko110930a.pdf
Processing speech 447/729: https://www.boj.or.jp/en/about/press/koen_2011/ko111031a.htm
Downloaded PDF: ko111031a.pdf
Processing speech 448/729: https://www.boj.or.jp/en/about/press/koen_2014/ko140929a.htm
Downloaded PDF: ko140929a.pdf
Processing speech 449/729: https://www.boj.or.jp/en/about/press/koen_2022/ko221226a.htm
Downloaded PDF: ko221226a.pdf
Processing speech 45

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


No PDF links found for speech at https://www2.boj.or.jp/archive/en/announcements/press/koen_2007/ko0704a.pdf. Attempting HTML-to-PDF conversion.
Failed to convert speech page to PDF: wkhtmltopdf exited with non-zero code 1. error:
Exit with code 1, due to unknown error.

Processing speech 523/729: https://www.boj.or.jp/en/about/press/koen_2015/ko150123a.htm
Downloaded PDF: ko150123a.pdf
Processing speech 524/729: https://www.boj.or.jp/en/about/press/koen_2020/ko200306a.htm
Downloaded PDF: ko200306a.pdf
Processing speech 525/729: https://www.boj.or.jp/en/about/press/koen_2017/ko171226a.htm
Downloaded PDF: ko171226a.pdf
Processing speech 526/729: https://www2.boj.or.jp/archive/en/announcements/press/koen_2009/ko0905e.htm
Error downloading or converting ko0905e.pdf: 403 Client Error: Forbidden for url: https://www2.boj.or.jp/data/ko0905e.pdf
Processing speech 527/729: https://www.boj.or.jp/en/about/press/koen_2011/ko110720a.htm
Downloaded PDF: ko110720a.pdf
Processing speech 528/729: http