In [1]:
import time, os, re, pickle, argparse, shutil
from bs4 import BeautifulSoup
from datetime import datetime
from glob import glob
from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import requests
from daterangeparser import parse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Date window typed into the page's start/end date filter fields (MM/DD/YYYY text).
start_mmddyyyy = "01/01/1994"
end_mmddyyyy = "08/22/2023"

# Passed to prepare_resources_for_scraping, which does not read it.
chromedriver_filepath = ""
# Root folder for saved statements; the save loop creates one sub-folder per document year.
save_root_dir = './Statements'

# Page opened in the browser; it hosts the filter form and the paginated result list.
url = "https://www.federalreserve.gov/monetarypolicy/materials/"


def prepare_resources_for_scraping(chromedriver_filepath, url, start_mmddyyyy, end_mmddyyyy):
    """Open the materials page in Chrome, apply the filters and read the page count.

    Args:
        chromedriver_filepath: Kept for interface compatibility; this function
            does not read it.
        url: Address of the page that hosts the filter form.
        start_mmddyyyy: Text typed into the "startmodel" date field.
        end_mmddyyyy: Text typed into the "endmodel" date field.

    Returns:
        A ``(driver, pagination, largest_page)`` tuple: the live Chrome driver,
        the pagination element, and the largest page number as an int. The
        browser is left on the last result page. The caller owns the driver
        and should call ``driver.quit()`` when finished with it.

    Raises:
        Whatever Selenium or the page-number parsing raises during setup. The
        browser is closed before the error propagates so no Chrome process is
        left behind.
    """
    # No headless flag is set: Chrome opens a visible window by default, and
    # the old ``Options.headless`` setter is deprecated in Selenium 4.
    chrome_options = Options()
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        # Fixed pause to give the page time to render its filter form.
        time.sleep(5)

        # set start date
        start_date = driver.find_element(By.NAME, "startmodel")
        start_date.clear()
        start_date.send_keys(start_mmddyyyy)

        # set end date
        end_date = driver.find_element(By.NAME, "endmodel")
        end_date.clear()
        end_date.send_keys(end_mmddyyyy)

        # select policy statements
        statement_checkbox = driver.find_element(By.XPATH, "//label/input[contains(..,'Policy Statements')]")
        statement_checkbox.click()

        # apply filter
        submit = driver.find_element(By.CSS_SELECTOR, ".btn.btn-primary")
        submit.click()

        # get the page control row
        pagination = driver.find_element(By.CLASS_NAME, 'pagination')

        # go to the last page to find the largest page number
        last_page = pagination.find_element(By.LINK_TEXT, 'Last')
        last_page.click()
        pages = pagination.text.split('\n')
        # The third entry from the end is read as the page number; presumably
        # the final two entries are the 'Next' and 'Last' controls -- confirm
        # against the live page if the layout changes.
        largest_page = int(pages[-3])
    except Exception:
        # Do not leak the browser process when setup fails part-way.
        driver.quit()
        raise

    return driver, pagination, largest_page

def extract_meetingdate_documentdate_statementurl(soup):
    """Pull the meeting date, document date and HTML statement URL from one row.

    Args:
        soup: A parsed element exposing ``select``; the first ``strong`` holds
            the meeting date text, the first ``em`` the document date text, and
            the first ``a`` whose text contains ``HTML`` links to the statement.

    Returns:
        ``(meeting_date, document_date, statement_url)`` where the URL is
        absolute on ``https://www.federalreserve.gov/``.

    Raises:
        IndexError: If the row has no ``strong``, no ``em`` or no HTML link.
    """
    from urllib.parse import urljoin

    meeting_date = soup.select('strong')[0].text
    document_date = soup.select('em')[0].text

    html_links = [item for item in soup.select('a') if 'HTML' in item.text]
    if not html_links:
        raise IndexError('no HTML link found in this materials row')

    # urljoin handles root-relative, relative and already-absolute hrefs alike,
    # avoiding the doubled slash / mangled URL that plain formatting produces.
    statement_url = urljoin('https://www.federalreserve.gov/', html_links[0]['href'])
    return meeting_date, document_date, statement_url

def scrape_URLs_and_meeting_dates_and_document_dates(driver, pagination, largest_page):
    """Walk every result page and collect one record per 'Statement' row.

    Args:
        driver: Selenium driver whose current page shows the filtered results.
        pagination: The pagination element holding the 'First'/'Next' links.
        largest_page: Number of result pages to read.

    Returns:
        ``(statement_url_list, meeting_date_list, document_date_list)``, three
        parallel lists in the order the rows were encountered.
    """
    month_cell_selector = '.fomc-meeting__month.col-xs-5.col-sm-3.col-md-4'
    # One (meeting_date, document_date, statement_url) tuple per statement row.
    records = []

    # Rewind to the first page; the 'Next' link is looked up once and reused.
    pagination.find_element(By.LINK_TEXT, 'First').click()
    next_page = pagination.find_element(By.LINK_TEXT, 'Next')

    pages_done = 0
    while pages_done < largest_page:
        # Lookup result is discarded; the call raises if the panel is absent.
        driver.find_element(By.CSS_SELECTOR, ".panel.panel-default")
        page_soup = BeautifulSoup(driver.page_source, "html.parser")

        # The first matching row is skipped (presumably a header row -- confirm).
        for candidate in page_soup.select('div.row.fomc-meeting')[1:]:
            row_kind = candidate.select(month_cell_selector)[0].text.strip()
            if row_kind != 'Statement':
                continue
            # Extract statements written in HTML format
            records.append(extract_meetingdate_documentdate_statementurl(candidate))

        next_page.click()
        pages_done += 1

    meeting_date_list = [record[0] for record in records]
    document_date_list = [record[1] for record in records]
    statement_url_list = [record[2] for record in records]
    print('Number of URLs: {}'.format(len(statement_url_list)))

    return statement_url_list, meeting_date_list, document_date_list

def get_text_for_a_statement_from_2006_to_2022(soup):
    """Return the stripped text of the first ``div`` with class ``col-xs-12 col-sm-8 col-md-8``."""
    content_column = soup.find('div', class_='col-xs-12 col-sm-8 col-md-8')
    column_text = content_column.text
    return column_text.strip()

def get_text_for_a_statement_from_1996_to_2005(soup):
    """Return the stripped text of every ``table td`` cell, one cell per line."""
    table_cells = soup.select('table td')
    return '\n'.join(cell.text.strip() for cell in table_cells)

def get_text_for_a_statement_from_1994_to_1995(soup):
    """Return the stripped text of the first ``div`` whose id is ``content``."""
    content_div = soup.find('div', id="content")
    div_text = content_div.text
    return div_text.strip()

# Matches any run of one or more whitespace characters, not only repeated spaces.
doublespace_pattern = re.compile(r'\s+')


def remove_doublespaces(document):
    """Collapse each whitespace run in *document* to a single space and trim both ends."""
    single_spaced = doublespace_pattern.sub(' ', document)
    return single_spaced.strip()

# Boilerplate fragments stripped from every statement. The patterns ending in
# ``.*`` remove everything from the phrase up to the next newline; since
# remove_stop_phrases collapses whitespace after every substitution, the text
# has no newlines by the time they run and they cut through to the end.
stop_phrase_patterns = [
    # Month name is one letter followed by 2-8 lowercase letters. The class is
    # [A-Za-z] rather than [A-z], which would also match the punctuation
    # characters that sit between 'Z' and 'a' in ASCII.
    re.compile(r'Release Date: [A-Za-z][a-z]{2,8} \d{1,2}, \d{4}'),
    re.compile('For immediate release'),
    re.compile(r'Home \|.*'),
    re.compile(r'\d{4} Monetary policy'),
    re.compile('Implementation Note issued.*'),
    re.compile('Frequently Asked Questions.*'),
    re.compile('For media inquiries.*'),
    re.compile(r'\(\d{1,3} KB PDF\)'),
]


def remove_stop_phrases(document):
    """Remove every stop phrase from *document* and return the cleaned text.

    Each pattern in ``stop_phrase_patterns`` is applied in list order. A match
    is replaced by a space and the whitespace is re-collapsed straight away, so
    later patterns always see single-spaced, trimmed text.
    """
    for stop_phrase_pattern in stop_phrase_patterns:
        document = remove_doublespaces(stop_phrase_pattern.sub(' ', document))
    return document

def extract_begin_end_dates(date_range):
    """Turn a meeting-date string into a ``(begin, end)`` pair via ``parse``.

    Three input shapes are handled:
      * no ``-``: a single date, returned as both begin and end;
      * contains ``-`` and ``/``: five space-separated tokens once ``,`` is
        dropped and ``-``/``/`` become spaces, read as
        ``begin_month end_month begin_date end_date year`` and rebuilt as
        ``begin_month begin_date-end_month end_date, year`` before parsing;
      * contains ``-`` only: handed to ``parse`` unchanged.
    """
    if '-' not in date_range:
        only_day, _ = parse(date_range)
        return only_day, only_day

    if '/' not in date_range:
        return parse(date_range)

    # Drop commas and turn both separators into spaces in a single pass.
    separators = str.maketrans({',': None, '-': ' ', '/': ' '})
    tokens = date_range.translate(separators).split(' ')
    begin_month, end_month, begin_date, end_date, year = tokens
    rebuilt_range = f'{begin_month} {begin_date}-{end_month} {end_date}, {year}'
    return parse(rebuilt_range)

In [2]:
# Step 1: use the browser only to discover statement links and their dates.
driver, pagination, largest_page = prepare_resources_for_scraping(chromedriver_filepath, url, start_mmddyyyy, end_mmddyyyy)
try:
    statement_url_list, meeting_date_list, document_date_list = scrape_URLs_and_meeting_dates_and_document_dates(driver, pagination, largest_page)
finally:
    # The statements themselves are fetched with requests, so the browser can
    # be closed here -- also when link discovery fails.
    driver.quit()

# Step 2: download, clean and store every statement.
doc_count = 0
for statement_url, meeting_date, document_date in tqdm(zip(statement_url_list, meeting_date_list, document_date_list),
                                                       total=len(statement_url_list)):

    # Scrape statements; a timeout keeps one stalled request from hanging the
    # run, and an HTTP error page is reported instead of being parsed as text.
    statement_resp = requests.get(statement_url, timeout=30)
    statement_resp.raise_for_status()
    statement_soup = BeautifulSoup(statement_resp.content, 'lxml')

    document_datetime = datetime.strptime(document_date, "%B %d, %Y")
    document_date_yyyymmdd = document_datetime.strftime("%Y%m%d")
    year = document_datetime.year
    # Pick the extractor by document year.
    if year >= 2006:
        doc = get_text_for_a_statement_from_2006_to_2022(statement_soup)
    elif year >= 1996:
        doc = get_text_for_a_statement_from_1996_to_2005(statement_soup)
    else:
        doc = get_text_for_a_statement_from_1994_to_1995(statement_soup)

    # Clean
    doc = remove_doublespaces(doc)

    # Remove stop-phrases
    doc = remove_stop_phrases(doc)

    meeting_date_start, meeting_date_end = extract_begin_end_dates(meeting_date)
    meeting_date_start_string = meeting_date_start.strftime("%Y-%m-%d")
    meeting_date_end_string = meeting_date_end.strftime("%Y-%m-%d")

    # Save data
    save_dir = os.path.join(save_root_dir, document_date_yyyymmdd[:4])
    os.makedirs(save_dir, exist_ok=True)
    # NOTE: the filename depends only on the meeting dates and the upload date,
    # and the file is opened in "w" mode, so two statements sharing all three
    # values end up in one file (the later one wins). The summary below counts
    # files on disk, which can therefore be lower than doc_count.
    save_filepath = os.path.join(save_dir, 'MeetingDate={}-{}_UploadedOn={}.txt'
                                 .format(meeting_date_start_string, meeting_date_end_string, document_date_yyyymmdd))
    with open(save_filepath, "w", encoding='utf-8-sig') as file:
        file.write(doc)
    doc_count += 1

print('Saved {} unique documents under {}'.format(len(glob('{}/*/*.txt'.format(save_root_dir))), save_root_dir))


Number of URLs: 230


230it [02:11,  1.75it/s]

Saved 221 unique documents under ./Statements



