In [5]:
import requests
from bs4 import BeautifulSoup
import os
import re
from PyPDF2 import PdfReader
import pandas as pd

# Directories
pdf_dir = "/Users/kylenabors/Documents/Database/Training Data/japan/japan_minutes/PDFs"
csv_path = "/Users/kylenabors/Documents/Database/Training Data/Japan/japan_minutes/japan_minutes.csv"

# Create PDF directory if it doesn't exist
if not os.path.exists(pdf_dir):
    os.makedirs(pdf_dir)

# Base URLs
base_url = "https://www.boj.or.jp"
main_url = f"{base_url}/en/mopo/mpmsche_minu/minu_all/index.htm"

# Get list of years
response = requests.get(main_url)
soup = BeautifulSoup(response.content, "html.parser")
year_links = soup.find_all(
    "a", href=re.compile(r"/en/mopo/mpmsche_minu/minu_\d{4}/index\.htm")
)

years = []
for link in year_links:
    year_url = base_url + link["href"]
    years.append(year_url)

# Get PDF links for each year
pdf_links = []
for year_url in years:
    response = requests.get(year_url)
    soup = BeautifulSoup(response.content, "html.parser")
    for link in soup.find_all(
        "a", href=re.compile(r"/en/mopo/mpmsche_minu/minu_\d{4}/g\d{6}\.pdf")
    ):
        pdf_link = base_url + link["href"]
        pdf_links.append(pdf_link)

# Download PDFs (only new files)
for pdf_url in pdf_links:
    pdf_filename = pdf_url.split("/")[-1]
    pdf_path = os.path.join(pdf_dir, pdf_filename)
    if os.path.exists(pdf_path):
        print(f"Skipping already downloaded file {pdf_filename}")
        continue
    response = requests.get(pdf_url)
    with open(pdf_path, "wb") as f:
        f.write(response.content)
    print(f"Downloaded {pdf_filename}")

# Read existing CSV data (if any)
if os.path.exists(csv_path):
    existing_df = pd.read_csv(csv_path)
    if "date" in existing_df.columns:
        existing_dates = set(pd.to_datetime(existing_df["date"]).dt.date)
    else:
        existing_dates = set()
else:
    existing_df = pd.DataFrame()
    existing_dates = set()

# Extract text from PDFs and extract date from filename
pdf_texts = []
for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        # Extract date from filename, format: 'gYYMMDD.pdf'
        date_match = re.match(r"g(\d{6})\.pdf", pdf_file)
        if date_match:
            date_str = date_match.group(1)
            # Parse date
            date = pd.to_datetime(date_str, format="%y%m%d").date()
        else:
            date = None  # Handle cases where the date isn't in the filename

        # Skip if date is already in existing CSV
        if date in existing_dates:
            print(f"Data for date {date} already in CSV, skipping.")
            continue

        try:
            with open(pdf_path, "rb") as f:
                reader = PdfReader(f)
                text = ""
                for page in reader.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text
                pdf_texts.append({"date": date, "text": text})
        except Exception as e:
            print(f"Error reading {pdf_file}: {e}")

# Combine new data with existing data and save to CSV
new_df = pd.DataFrame(pdf_texts)
if not existing_df.empty:
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
else:
    combined_df = new_df

# Save to CSV
combined_df.to_csv(csv_path, index=False)
print(f"CSV file updated at {csv_path}")

Skipping already downloaded file g240731.pdf
Skipping already downloaded file g240614.pdf
Skipping already downloaded file g240426.pdf
Skipping already downloaded file g240319.pdf
Skipping already downloaded file g240123.pdf
Skipping already downloaded file g240731.pdf
Skipping already downloaded file g240614.pdf
Skipping already downloaded file g240426.pdf
Skipping already downloaded file g240319.pdf
Skipping already downloaded file g240123.pdf
Skipping already downloaded file g231219.pdf
Skipping already downloaded file g171221.pdf
Skipping already downloaded file g171031.pdf
Skipping already downloaded file g170921.pdf
Skipping already downloaded file g170720.pdf
Skipping already downloaded file g170616.pdf
Skipping already downloaded file g170427.pdf
Skipping already downloaded file g170316.pdf
Skipping already downloaded file g170131.pdf
Skipping already downloaded file g161220.pdf
Skipping already downloaded file g161101.pdf
Skipping already downloaded file g160921.pdf
Skipping a