In [1]:
import os, re, time, requests
from urllib.parse import urldefrag, urljoin, urlsplit
from bs4 import BeautifulSoup
from tqdm import tqdm

# ---------- Settings ----------
# Root directory for downloads; the crawl creates one sub-folder per year in it.
SAVE_ROOT = "fomc"
# Inclusive bounds of the years whose index pages are visited.
FIRST_YEAR = 1936
LAST_YEAR = 2019  # 5-year lag: update to datetime.date.today().year-6 each January
# Sent as the User-Agent header on every request made through the session.
USER_AGENT = "FOMC-histmin-transcripts/0.1 (research@example.com)"
# Pause applied after each file that is written to disk.
RATE_DELAY = 0.3  # seconds between file downloads


# ---------- Helper functions ----------
def fetch(url, session, tries=3, delay=1.0):
    """GET ``url`` through ``session``, retrying on failure.

    Args:
        url: Address to request.
        session: Object exposing ``get(url, timeout=...)``; the crawl passes
            a ``requests.Session``.
        tries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.

    Returns:
        The first response whose ``ok`` attribute is truthy, or ``None`` once
        every attempt has failed (a ``FAILED:`` line is printed in that case).
    """
    for attempt in range(tries):
        try:
            r = session.get(url, timeout=20)
        except requests.RequestException:
            # Timeouts, DNS errors and dropped connections are as transient
            # as a bad status code; retry instead of aborting the whole crawl.
            r = None
        if r is not None and r.ok:
            return r
        # Only pause when another attempt will actually follow.
        if attempt < tries - 1:
            time.sleep(delay)
    print("FAILED:", url)
    return None


def classify(url):
    """Map a document URL to its category.

    The check is case-insensitive. A URL containing ``histmin`` yields
    ``'historical_minutes'``; otherwise one containing ``transcript`` or a
    ``/meeting`` segment followed by eight digits yields ``'transcript'``.
    Anything else yields ``None``.
    """
    lowered = url.lower()

    # Historical minutes take precedence over the transcript patterns.
    if "histmin" in lowered:
        return "historical_minutes"

    looks_like_transcript = "transcript" in lowered or re.search(
        r"/meeting\d{8}", lowered
    )
    return "transcript" if looks_like_transcript else None


def meeting_date_from(url):
    """Return the first run of eight digits found in ``url``.

    Falls back to the placeholder ``'00000000'`` when the URL contains no
    such run.
    """
    found = re.search(r"(\d{8})", url)
    if found is None:
        return "00000000"
    return found.group(1)


# ---------- Main crawl ----------
# Walk every yearly index page, pick out the links that classify() recognises
# and store each one as <SAVE_ROOT>/<year>/<date>_<type><ext>.
os.makedirs(SAVE_ROOT, exist_ok=True)
with requests.Session() as S:
    S.headers["User-Agent"] = USER_AGENT

    for year in tqdm(range(FIRST_YEAR, LAST_YEAR + 1), desc="Year pages"):
        index_url = (
            f"https://www.federalreserve.gov/monetarypolicy/fomchistorical{year}.htm"
        )
        resp = fetch(index_url, S)
        if not resp:
            continue  # skip missing or error pages

        soup = BeautifulSoup(resp.text, "html.parser")
        for a in soup.find_all("a", href=True):
            # Resolve against the index page so that every relative form
            # ("/x", "x", "../x", "//host/x") becomes a valid absolute URL,
            # and drop the fragment, which is never sent to the server.
            href, _fragment = urldefrag(urljoin(index_url, a["href"]))
            if urlsplit(href).scheme not in ("http", "https"):
                continue  # mailto:, javascript:, ...
            if href == index_url:
                continue  # in-page anchor pointing back at the index itself
            doc_type = classify(href)
            if not doc_type:
                continue  # ignore everything else

            meeting_date = meeting_date_from(href)
            # Take the extension from the path only, so a query string cannot
            # leak into the file name.
            ext = os.path.splitext(urlsplit(href).path)[1] or ".pdf"
            fname = f"{meeting_date}_{doc_type}{ext}"
            outdir = os.path.join(SAVE_ROOT, str(year))
            os.makedirs(outdir, exist_ok=True)
            path = os.path.join(outdir, fname)

            # NOTE(review): two different links that share date, type and
            # extension map to the same name, so only the first one is kept.
            if os.path.exists(path):
                continue  # already downloaded

            file_resp = fetch(href, S)
            if not file_resp:
                continue
            # Write to a temporary name and rename afterwards: an interrupted
            # run must not leave a truncated file that the existence check
            # above would treat as already downloaded.
            tmp_path = path + ".part"
            with open(tmp_path, "wb") as f:
                f.write(file_resp.content)
            os.replace(tmp_path, path)
            time.sleep(RATE_DELAY)

Year pages: 100%|██████████| 84/84 [04:27<00:00,  3.19s/it]
