In [1]:
import requests
from bs4 import BeautifulSoup
import os
import re
from PyPDF2 import PdfReader
import pandas as pd
import pdfkit  # For converting .htm to .pdf

# Directories
# Folder that receives the downloaded / converted PDF files.
pdf_dir = "/Users/kylenabors/Documents/Database/Training Data/boj/boj_minutes/PDFs"
# CSV that accumulates the extracted text, one (date, text) row per document.
csv_path = (
    "/Users/kylenabors/Documents/Database/Training Data/boj/boj_minutes/boj_minutes.csv"
)

# Create PDF directory (and any missing parents) if it doesn't exist.
# exist_ok=True replaces the separate os.path.exists() check, which left a
# window between the check and the creation.
os.makedirs(pdf_dir, exist_ok=True)

# Base URLs
base_url = "https://www.boj.or.jp"
main_url = f"{base_url}/en/mopo/mpmsche_minu/minu_all/index.htm"

# Get list of years: fetch the overall index page and keep every link that
# points at a per-year index (".../minu_YYYY/index.htm").
# The timeout keeps a stalled connection from hanging the script, and
# raise_for_status() stops us from parsing an HTTP error page as the index.
response = requests.get(main_url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
year_links = soup.find_all(
    "a", href=re.compile(r"/en/mopo/mpmsche_minu/minu_\d{4}/index\.htm")
)

# dict.fromkeys() drops repeated links while preserving page order, so each
# year page is fetched only once later on.
years = list(dict.fromkeys(base_url + link["href"] for link in year_links))

# Get PDF/HTM links for each year
# pdf_links: URLs fetched directly as PDF files.
# htm_links: HTML pages (years up to and including 2006) rendered to PDF later.
pdf_links = []
htm_links = []

# Compiled once here instead of on every link inside the loops.
year_pattern = re.compile(r"minu_(\d{4})")
pdf_pattern = re.compile(r"g\d{6}\.pdf")
htm_pattern = re.compile(r"g\d{6}\.htm")

for year_url in years:
    # Extract the year from the URL. Without a year we cannot decide how to
    # treat ".htm" links, so skip the page rather than reuse a stale value.
    year_match = year_pattern.search(year_url)
    if year_match is None:
        print(f"Could not determine year for {year_url}, skipping.")
        continue
    year = int(year_match.group(1))

    try:
        response = requests.get(year_url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as e:
        # One unreachable year page should not abort the whole collection.
        print(f"Error fetching {year_url}: {e}")
        continue
    soup = BeautifulSoup(response.content, "html.parser")

    for link in soup.find_all("a", href=True):
        href = link["href"]
        if pdf_pattern.search(href):
            pdf_links.append(base_url + href)
        elif htm_pattern.search(href):
            if year > 2006:
                # After 2006 the ".htm" link is rewritten to the matching
                # ".pdf" URL and downloaded as a PDF.
                pdf_links.append(base_url + href.replace(".htm", ".pdf"))
            else:
                htm_links.append(base_url + href)

# Render the HTML-only minutes pages to PDF files in pdf_dir.
for url in htm_links:
    filename = url.split("/")[-1].replace(".htm", ".pdf")
    pdf_path = os.path.join(pdf_dir, filename)

    # Skip already downloaded files
    if os.path.exists(pdf_path):
        print(f"Skipping {filename} (already downloaded).")
        continue

    print(url)
    try:
        pdfkit.from_url(url, pdf_path)
    except OSError as e:
        # pdfkit signals wkhtmltopdf failures with IOError/OSError; report the
        # page and move on so one bad conversion does not end the whole run.
        print(f"Error converting {url} to PDF: {e}")
        continue
    print(f"Downloaded {filename} as PDF.")

# Download and process files
# pdf_links only ever holds ".pdf" URLs: when the links are collected, ".htm"
# hrefs are either rewritten to ".pdf" or routed to htm_links. Every entry is
# therefore fetched as a plain binary download; the former ".htm" branch here
# could never run (and passed the whole htm_links list to pdfkit).
for url in pdf_links:
    filename = url.split("/")[-1]
    pdf_path = os.path.join(pdf_dir, filename)

    # Skip already downloaded files
    if os.path.exists(pdf_path):
        print(f"Skipping {filename} (already downloaded).")
        continue

    try:
        response = requests.get(url, timeout=60)
        # Some URLs are guesses (".htm" swapped for ".pdf"). Without this
        # check an HTTP error page would be saved under a ".pdf" name and
        # then skipped as "already downloaded" on every later run.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading {filename}: {e}")
        continue

    with open(pdf_path, "wb") as f:
        f.write(response.content)
    print(f"Downloaded {filename} as PDF.")

# Load whatever was saved by a previous run; start from an empty frame when
# the CSV has not been created yet.
existing_df = pd.read_csv(csv_path) if os.path.exists(csv_path) else pd.DataFrame()

# Dates already present in the CSV, used to avoid re-extracting those PDFs.
# An empty frame (or a CSV without a "date" column) yields an empty set.
existing_dates = (
    set(pd.to_datetime(existing_df["date"]).dt.date)
    if "date" in existing_df.columns
    else set()
)

# Walk the PDF folder, derive each document's date from its file name
# (gYYMMDD.pdf) and pull the text out of every page.
pdf_texts = []
for pdf_file in os.listdir(pdf_dir):
    if not pdf_file.endswith(".pdf"):
        continue

    # Files that do not follow the gYYMMDD.pdf naming get no date.
    name_match = re.match(r"g(\d{6})\.pdf", pdf_file)
    date = (
        pd.to_datetime(name_match.group(1), format="%y%m%d").date()
        if name_match
        else None
    )

    # Nothing to do when this date is already stored in the CSV.
    if date in existing_dates:
        print(f"Data for date {date} already in CSV, skipping.")
        continue

    pdf_path = os.path.join(pdf_dir, pdf_file)
    try:
        with open(pdf_path, "rb") as handle:
            pages = PdfReader(handle).pages
            text = "".join(page.extract_text() for page in pages)
            pdf_texts.append({"date": date, "text": text})
    except Exception as e:
        # An unreadable PDF is reported and skipped; the rest still load.
        print(f"Error reading {pdf_file}: {e}")

# Combine new data with existing data and save to CSV
# Naming the columns explicitly means that even a run with no rows at all
# writes a "date,text" header. A completely empty CSV would make
# pd.read_csv() raise EmptyDataError when the script is run again.
new_df = pd.DataFrame(pdf_texts, columns=["date", "text"])

# Only non-empty frames take part in the concat; with nothing to merge the
# (possibly header-only) new frame is written as is.
frames = [frame for frame in (existing_df, new_df) if not frame.empty]
combined_df = pd.concat(frames, ignore_index=True) if frames else new_df

combined_df.to_csv(csv_path, index=False)
print(f"CSV file updated at {csv_path}")

https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2006/g060519.htm
Downloaded g060519.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2006/g060428.htm
Downloaded g060428.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2006/g060411.htm
Downloaded g060411.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2006/g060309.htm
Downloaded g060309.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2006/g060209.htm
Downloaded g060209.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2006/g060120.htm
Downloaded g060120.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2005/g051216.htm
Downloaded g051216.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2005/g051118.htm
Downloaded g051118.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2005/g051031.htm
Downloaded g051031.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_minu/minu_2005/g051012.htm
Downloaded g051012.pdf as PDF.
https://www.boj.or.jp/en/mopo/mpmsche_mi