## Imports

In [1]:
# Imports
import io
import os
import re
import urllib2

import requests
import textract
from bs4 import BeautifulSoup

## Global Values

In [2]:
# Output location: statements are written to parent_folder + statement_folder
parent_folder = ""
statement_folder = "statements"

# First and last year (both inclusive) of the historical meetings to download
lower_bound_year, upper_bound_year = 2000, 2020

## Misc. methods

In [3]:
# Create txt-document and save a string in it
def save_content(content, folder, filename):
    """Write ``content`` to ``<folder>/<filename>.txt`` as UTF-8 text.

    Parameters
    ----------
    content : text, bytes or any other object
        Text is written unchanged. Bytes are decoded as UTF-8 (undecodable
        bytes are replaced). Any other object is converted to text first.
    folder : str
        Target directory. It is created when it does not exist yet. An empty
        string means the current working directory.
    filename : str
        Name of the file without the ``.txt`` extension.
    """
    # Normalise the payload to text up front instead of reacting to write
    # errors: on Python 3 the old fallbacks wrote the "b'...'" repr to disk.
    text_type = type(u"")  # ``unicode`` on Python 2, ``str`` on Python 3
    if isinstance(content, bytes):
        content = content.decode("utf-8", "replace")
    elif not isinstance(content, text_type):
        content = text_type(content)

    if folder and not os.path.isdir(folder):
        os.makedirs(folder)

    # Explicit encoding: the platform default may be unable to represent
    # characters such as dashes or typographic quotes.
    path = os.path.join(folder, filename + ".txt")
    with io.open(path, "w", encoding="utf-8") as file:
        file.write(content)

In [4]:
# Filters the content out of a statement-page
def filter_text(url, save = False):
    """Download a statement page and return its text content.

    Two page layouts are handled:

    * pages that contain an element with the CSS classes
      ``col-xs-12 col-sm-8 col-md-8``: the text of every ``<p>`` inside that
      element is used;
    * all other pages: the first ``<p>`` of the document is split into lines,
      boilerplate lines are dropped, and reading stops at the first line that
      matches ``"[0-9]{4} Monetary policy"``.

    Paragraphs/lines are separated by a blank line in the result.

    Parameters
    ----------
    url : str
        Address of the statement page.
    save : bool, optional
        When true, the text is also written with ``save_content`` to the
        folder formed from ``parent_folder`` and ``statement_folder``, in a
        file named ``<date>_statement``.

    Returns
    -------
    str
        The extracted text with surrounding whitespace removed. It is empty
        when nothing could be extracted.
    """
    # Raw string so the backslash reaches the regex engine untouched. Runs of
    # five or more digits skip a bare 4-digit year directory in the path.
    date = "".join(re.findall(r'\d\d\d\d\d+', url))
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    content = soup.find(class_="col-xs-12 col-sm-8 col-md-8")
    if content is not None:
        paragraphs = [p.get_text().strip() for p in content.find_all("p")]
        # Join with a blank line (as the fallback branch does) so the last
        # word of one paragraph is not fused with the first word of the next.
        text = "\n\n".join(paragraph for paragraph in paragraphs if paragraph)
    else:
        all_paragraphs = soup.find_all("p")
        # A page without any <p> yields empty text instead of an IndexError.
        if all_paragraphs:
            raw_lines = all_paragraphs[0].get_text().strip().split("\n")
        else:
            raw_lines = []

        kept_lines = []
        for raw_line in raw_lines:
            line = raw_line.strip()
            if re.search("For immediate release", line):
                continue
            if len(line) < 5:
                continue
            if re.search("[0-9]{4} Monetary policy", line):
                break
            kept_lines.append(line)
        text = "\n\n".join(kept_lines)

    text = text.strip()

    if save:
        filename = date + "_statement"
        # os.path.join inserts the separator that plain concatenation omitted
        # whenever parent_folder is non-empty and lacks a trailing slash.
        save_content(content = text, folder = os.path.join(parent_folder, statement_folder), filename = filename)

    return text

In [5]:
# NOTE: this cell redefines ``save_content`` from the "Misc. methods" section;
# after a full run this later definition is the one in effect.
def save_content(content, folder, filename):
    """Write ``content`` to ``<folder>/<filename>.txt`` as UTF-8 text.

    Parameters
    ----------
    content : text, bytes or any other object
        Text is written unchanged. Bytes are decoded as UTF-8 (undecodable
        bytes are replaced). Any other object is converted to text first.
    folder : str
        Target directory. It is created when it does not exist yet. An empty
        string means the current working directory.
    filename : str
        Name of the file without the ``.txt`` extension.
    """
    # Normalise the payload to text up front instead of reacting to write
    # errors: on Python 3 the old fallbacks wrote the "b'...'" repr to disk.
    text_type = type(u"")  # ``unicode`` on Python 2, ``str`` on Python 3
    if isinstance(content, bytes):
        content = content.decode("utf-8", "replace")
    elif not isinstance(content, text_type):
        content = text_type(content)

    if folder and not os.path.isdir(folder):
        os.makedirs(folder)

    # Explicit encoding: the platform default may be unable to represent
    # characters such as dashes or typographic quotes.
    path = os.path.join(folder, filename + ".txt")
    with io.open(path, "w", encoding="utf-8") as file:
        file.write(content)

## Retrieving content from current FOMC Meetings page (2013 - today)

In [6]:
def retrieve_current_statements(save = False):
    """Collect the statements linked from the current FOMC calendar page.

    Every anchor inside the calendar blocks (elements with the CSS classes
    ``col-xs-12 col-md-4 col-lg-2``) whose href contains ``a.htm`` is treated
    as a statement link and downloaded with ``filter_text``.

    Parameters
    ----------
    save : bool, optional
        Forwarded to ``filter_text``; when true each statement is also
        written to disk.

    Returns
    -------
    list of tuple
        One ``(date, url, article)`` tuple per statement, where ``date`` is
        the concatenation of all digit runs found in the URL.
    """
    base_url = "https://www.federalreserve.gov"
    current_statements = []
    # Get page content of current meetings page
    current_fomc_page = requests.get(base_url + "/monetarypolicy/fomccalendars.htm")
    current_fomc_soup = BeautifulSoup(current_fomc_page.content, 'html.parser')
    blocks = current_fomc_soup.find_all(class_="col-xs-12 col-md-4 col-lg-2")
    for block in blocks:
        for a in block.select("a"):
            # Anchors without an href cannot be followed; .get avoids a KeyError.
            href = a.get("href")
            # Test the href only, with the dot escaped, so that neither the
            # link text nor "a<any char>htm" can trigger a false match.
            if not href or not re.search(r"a\.htm", href):
                continue
            url = base_url + href
            date = "".join(re.findall(r'\d+', url))
            article = filter_text(url, save)
            current_statements.append((date, url, article))
    return current_statements

## Retrieving content from historical FOMC Meetings page (2000 - 2012)

In [7]:
def retrieve_historical_statements(save = False):
    """Collect the statements linked from the historical FOMC year pages.

    The historical overview page is scanned for links to per-year pages. Year
    pages whose year lies between the module-level ``lower_bound_year`` and
    ``upper_bound_year`` (both inclusive) are fetched, and every link that
    looks like a statement is downloaded with ``filter_text``.

    Parameters
    ----------
    save : bool, optional
        Forwarded to ``filter_text``; when true each statement is also
        written to disk.

    Returns
    -------
    list of tuple
        One ``(date, url, article)`` tuple per statement, where ``date`` is
        the concatenation of all digit runs found in the statement URL.
    """
    base_url = "https://www.federalreserve.gov"

    # Statement pages in the newer URL scheme; dots are escaped so they only
    # match a literal ".".
    statement_page_pattern = re.compile(
        r"https://www\.federalreserve\.gov/newsevents/press(releases)?/monetary(/)?[0-9]{8}a\.htm")
    # Statement pages in the older "boarddocs" scheme.
    boarddocs_page_pattern = re.compile(
        r"https://www\.federalreserve\.gov/boarddocs/press/monetary/[0-9]{4}/[0-9]{8}/default\.htm")
    # Directory-style boarddocs links. The alternation is grouped: without the
    # group, "|" split the whole expression and matched any
    # "/boarddocs/press/monetary" link, or "general/..." anywhere in the tag.
    boarddocs_dir_pattern = re.compile(
        r"/boarddocs/press/(monetary|general)/[0-9]{4}/[0-9]{8}/")

    # Get page content of historical meetings page
    historical_fomc_page = requests.get(base_url + "/monetarypolicy/fomc_historical_year.htm")
    historical_fomc_soup = BeautifulSoup(historical_fomc_page.content, 'html.parser')

    # Get url for each historical year
    year_urls = []
    for block in historical_fomc_soup.select(".panel-default"):
        for a in block.select("a"):
            href = a.get("href")
            if href:
                year_urls.append(base_url + href)

    # Collect the statement URLs of every year inside the configured range
    historical_statements_urls = []
    for year_url in year_urls:
        digits = "".join(re.findall(r'\d+', year_url))
        # Links without any digits carry no year; int("") would raise.
        if not digits:
            continue
        year = int(digits[0:4])
        if not (lower_bound_year <= year <= upper_bound_year):
            continue

        page = requests.get(year_url)
        soup = BeautifulSoup(page.content, "html.parser")
        for a in soup.select("a"):
            href = a.get("href")
            if not href:
                continue
            statement_url = base_url + href
            if statement_page_pattern.search(statement_url) or boarddocs_page_pattern.search(statement_url):
                historical_statements_urls.append(statement_url)
            elif ".htm" not in href and boarddocs_dir_pattern.search(href):
                historical_statements_urls.append(statement_url)

    # Structure of each tuple in the list: (date, url, content)
    # date format: yyyymmdd
    historical_statements = []
    for statement_url in historical_statements_urls:
        date = "".join(re.findall(r'\d+', statement_url))
        article = filter_text(statement_url, save)
        historical_statements.append((date, statement_url, article))
    return historical_statements

## main

In [8]:
# Scrape both sources, writing every statement to disk along the way
current_statements = retrieve_current_statements(save=True)
historical_statements = retrieve_historical_statements(save=True)

# Combined list: sorted historical tuples first, then sorted current tuples
statements = sorted(historical_statements)
statements.extend(sorted(current_statements))