In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from urllib.parse import urljoin
import pdfkit  # For converting .htm to .pdf

# Output locations: downloaded/converted PDFs and the aggregated speech CSV.
# NOTE(review): the two paths differ in case ("japan" vs "Japan"). They resolve
# to the same folder only on a case-insensitive filesystem - confirm which
# spelling is intended.
pdf_dir = "/Users/kylenabors/Documents/Database/Training Data/japan/japan_speeches/PDFs"
csv_path = "/Users/kylenabors/Documents/Database/Training Data/Japan/japan_speeches/japan_speeches.csv"

# Create the PDF directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(pdf_dir, exist_ok=True)

# Base URLs: the live site and the archive site (used below for years before 2011)
current_base_url = "https://www.boj.or.jp"
archive_base_url = "https://www2.boj.or.jp"

# Define the range of years you want to scrape (both bounds inclusive)
start_year = 1999  # Adjust the start year as needed
end_year = 2024  # Adjust the end year as needed

# Browser-like User-Agent sent with the HTTP requests made by this script
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    " Chrome/98.0.4758.102 Safari/537.36"
}

# Accumulates one dict per speech (date, speaker, title, speech_url)
speech_urls = []

# Build the (year, index-page URL) pairs to scrape:
#   * 2011 and later  -> index page on the current site
#   * 1999 up to 2010 -> index page on the archive site
#   * before 1999     -> skipped (no URL is generated)
year_urls = [
    (
        year,
        f"{current_base_url}/en/about/press/koen_{year}/index.htm"
        if year >= 2011
        else f"{archive_base_url}/archive/en/announcements/press/koen_{year}/index.htm",
    )
    for year in range(start_year, end_year + 1)
    if year >= 1999
]

print(f"Found {len(year_urls)} year URLs.")

# Step 2: Iterate over each year's page to collect speech URLs.
# Each table row that contains a link becomes one record in `speech_urls`.
request_timeout = 30  # seconds; requests waits indefinitely when no timeout is given

for year, year_url in year_urls:
    print(f"Processing year URL: {year_url}")
    try:
        # Send the same browser-like headers used for the speech pages so
        # both kinds of request are treated alike by the server.
        response = requests.get(year_url, headers=headers, timeout=request_timeout)
    except requests.RequestException as e:
        # A single unreachable index page must not abort the whole scrape.
        print(f"Failed to retrieve {year_url}: {e}")
        continue
    if response.status_code != 200:
        print(f"Failed to retrieve {year_url}, status code {response.status_code}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the speech table; fall back to the first table on the page
    # because archive pages might not use the same class.
    table = soup.find("table", class_="js-tbl")
    if table is None:
        table = soup.find("table")
    if table is None:
        print(f"No speech table found on page {year_url}")
        continue

    # Extract rows from the table (archive pages might have no <tbody>)
    tbody = table.find("tbody")
    rows = (table if tbody is None else tbody).find_all("tr")

    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 2:
            continue  # header/spacer rows carry no speech entry

        date_col = cols[0].get_text(strip=True)
        # Current-site pages (2011+) with three or more cells are read as
        # date | speaker | title; everything else as date | title.
        if year >= 2011 and len(cols) >= 3:
            speaker_col = cols[1].get_text(strip=True)
            title_col = cols[2]
        else:
            speaker_col = ""
            title_col = cols[1]

        # Extract the link to the speech page
        link = title_col.find("a", href=True)
        if link is None:
            continue
        speech_urls.append(
            {
                "date": date_col,
                "speaker": speaker_col,
                "title": link.get_text(strip=True),
                # Resolve relative hrefs against the index page URL
                "speech_url": urljoin(year_url, link["href"]),
            }
        )

    print(f"Processed speeches from {year_url}")

print(f"Total number of speeches found: {len(speech_urls)}")

# Remove exact duplicate records while keeping first-seen order.
# dict.fromkeys preserves insertion order, so the processing order (and the
# resulting CSV row order) is the same on every run; a plain set would
# scramble it.
unique_records = dict.fromkeys(tuple(d.items()) for d in speech_urls)
speech_urls = [dict(record) for record in unique_records]

# Read existing CSV data (if any) so already-processed speeches can be skipped
if os.path.exists(csv_path) and os.path.getsize(csv_path) > 0:
    existing_df = pd.read_csv(csv_path)
    if "speech_url" in existing_df.columns:
        existing_speech_urls = set(existing_df["speech_url"])
    else:
        existing_speech_urls = set()
else:
    # No usable CSV yet: start from an empty frame and an empty "seen" set
    existing_df = pd.DataFrame()
    existing_speech_urls = set()

# Step 3: For every speech not already in the CSV, fetch its page, obtain a
# PDF of the text (downloaded directly, or rendered from an .htm page with
# pdfkit), extract the text and queue a row for the CSV.
speech_data = []
request_timeout = 30  # seconds; requests waits indefinitely when no timeout is given

for idx, speech in enumerate(speech_urls):
    speech_url = speech["speech_url"]

    # Skip if speech already processed
    if speech_url in existing_speech_urls:
        print(f"Already processed speech at {speech_url}, skipping.")
        continue

    print(f"Processing speech {idx + 1}/{len(speech_urls)}: {speech_url}")

    try:
        response = requests.get(speech_url, headers=headers, timeout=request_timeout)
    except requests.RequestException as e:
        # One unreachable page must not abort the run and lose collected rows.
        print(f"Failed to retrieve {speech_url}: {e}")
        continue
    if response.status_code != 200:
        print(f"Failed to retrieve {speech_url}, status code {response.status_code}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")

    # Candidate document links on the page: PDFs first, then .htm pages
    pdf_links = soup.find_all("a", href=re.compile(r"\.pdf$"))
    htm_links = soup.find_all("a", href=re.compile(r"\.htm$"))
    pdf_links.extend(htm_links)

    if pdf_links:
        # Prefer a link whose text mentions "full text" (any capitalisation)
        pdf_link_tag = next(
            (
                link
                for link in pdf_links
                if "full text" in link.get_text(strip=True).lower()
            ),
            None,
        )
        # If no 'Full Text' link is found, use the first candidate link
        if pdf_link_tag is None:
            pdf_link_tag = pdf_links[0]
            print(f"Using first available PDF link for speech at {speech_url}")

        # Resolve the href against the page it was found on. This is correct
        # for absolute, root-relative and document-relative hrefs alike;
        # joining against the bare site root breaks the document-relative case.
        pdf_url = urljoin(speech_url, pdf_link_tag["href"])
        print(f"Found PDF URL: {pdf_url}")  # Logging the PDF URL

        is_html_source = pdf_url.endswith(".htm")
        pdf_filename = pdf_url.split("/")[-1]
        if is_html_source:
            # The converted file is a PDF; name it so, and keep the ".htm"
            # stem so it cannot collide with a real PDF of the same name.
            pdf_filename += ".pdf"
        pdf_path = os.path.join(pdf_dir, pdf_filename)

        # Fetch the PDF if not already on disk
        if not os.path.exists(pdf_path):
            try:
                if is_html_source:
                    # Render the HTML page to PDF. It must NOT also be
                    # downloaded raw: that would overwrite the rendered PDF
                    # with HTML bytes that PdfReader cannot parse.
                    pdfkit.from_url(pdf_url, pdf_path)
                    print(f"Converted HTML page to PDF: {pdf_filename}")
                else:
                    pdf_response = requests.get(
                        pdf_url, headers=headers, timeout=request_timeout
                    )
                    pdf_response.raise_for_status()  # Check for HTTP errors
                    with open(pdf_path, "wb") as f:
                        f.write(pdf_response.content)
                    print(f"Downloaded PDF: {pdf_filename}")
            except Exception as e:
                # Deliberately broad: this is a per-speech best-effort
                # boundary, and one bad document must not stop the scrape.
                print(f"Error downloading PDF {pdf_filename}: {e}")
                # Drop any partial output so the next run retries it instead
                # of treating a truncated file as "already downloaded".
                if os.path.exists(pdf_path):
                    os.remove(pdf_path)
                continue  # Skip to the next speech
        else:
            print(f"PDF {pdf_filename} already downloaded.")

        # Extract text from the PDF; an unreadable PDF yields an empty text
        try:
            with open(pdf_path, "rb") as f:
                reader = PdfReader(f)
                # extract_text() may return nothing for a page; treat as ""
                text = "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            print(f"Error reading PDF {pdf_filename}: {e}")
            text = ""
    else:
        print(f"No PDF links found for speech at {speech_url}")
        # Attempt to extract text from the web page
        content_div = soup.find("div", id="contents")
        if content_div is None:
            content_div = soup.find("div", id="content")  # Try alternative IDs
        if content_div is not None:
            # Remove unwanted elements
            for unwanted in content_div(
                ["script", "style", "noscript", "header", "footer"]
            ):
                unwanted.decompose()
            text = content_div.get_text(separator="\n", strip=True)
        else:
            text = ""
            print(f"Could not extract text from the web page at {speech_url}")

    # Append data
    speech_data.append(
        {
            "date": speech["date"],
            "speaker": speech["speaker"],
            "title": speech["title"],
            "speech_url": speech_url,
            "text": text,
        }
    )

# Merge the rows scraped in this run with the rows already on disk (if any)
new_df = pd.DataFrame(speech_data)
combined_df = (
    new_df
    if existing_df.empty
    else pd.concat([existing_df, new_df], ignore_index=True)
)

# Persist the merged table, overwriting the previous CSV
combined_df.to_csv(csv_path, index=False)
print(f"CSV file updated at {csv_path}")

Found 26 year URLs.
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_1999/index.htm
Processed speeches from https://www2.boj.or.jp/archive/en/announcements/press/koen_1999/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2000/index.htm
Processed speeches from https://www2.boj.or.jp/archive/en/announcements/press/koen_2000/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2001/index.htm
Processed speeches from https://www2.boj.or.jp/archive/en/announcements/press/koen_2001/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2002/index.htm
Processed speeches from https://www2.boj.or.jp/archive/en/announcements/press/koen_2002/index.htm
Processing year URL: https://www2.boj.or.jp/archive/en/announcements/press/koen_2003/index.htm
Processed speeches from https://www2.boj.or.jp/archive/en/announcements/press/koen_2003/index.htm
Processing year