## Imports

In [5]:
# Imports
import requests
import re
from bs4 import BeautifulSoup
import urllib2
import textract

## Global Values

In [6]:
# parent folder of the documents
parent_folder = ""
# folder to save html in
full_html_folder = "full_html"
# folder to save article-html in
minutes_html_folder = "minutes_html"
# folder to save minutes plain text in
text_folder = "plain_text"
# folder to save pdf files in
pdf_folder = "pdf_files"
# year from which on to save
lower_bound_year = 2009
# year up to which to save
upper_bound_year = 2009

## Misc. methods

In [62]:
# Filters the content out of a minutes-page (FOMC)
def filter_text(url, save = False):
    date = "".join(re.findall('\d\d\d\d\d+', url))
    if ".pdf" in url:
        filename_pdf = date + "_fomc_document"
        filename_text = date + "_fomc_text"
        save_pdf(url, pdf_folder, filename_pdf)
        text = textract.process(parent_folder + pdf_folder + "/" + filename_pdf + ".pdf")
        save_content(text, folder = parent_folder + text_folder, filename = filename_text)       
    else:  
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        content = soup.find(id = "article")
        if content is not None:
            text = content.get_text()
            content = content.prettify()

        else:
            content = soup.find(id = "leftText")
            if content is not None:
                text = content.get_text()
                content = content.prettify()

            else:
                content = soup.find(id = "content")
                if content is not None:
                    text = content.get_text()
                    content = content.prettify()

                else:
                    text = ""
                    content = ""
                    tables = soup.find_all("table")
                    for table in tables:
                        try:
                            text += table.get_text()
                            content += str(table)
                        except UnicodeEncodeError:
                            content += str(table)
        if date == "20080130":
            content = soup.find_all("p") 
            text = ''
            content = ''
            print("hallo")
            for string in content:
                text = text + "\n" + string.get_text()
                content = content + string
            
        if save == True:
            filename_full_html = date + "_minutes_full_html"
            filename_text_html = date + "_minutes_html"
            filename_text = date + "_minutes"
            save_content(content = soup.prettify(), folder = parent_folder + full_html_folder, filename = filename_full_html)
            save_content(content = content, folder = parent_folder + minutes_html_folder, filename = filename_text_html)
            save_content(content = text, folder = parent_folder + text_folder, filename = filename_text)

    return text

In [8]:
# Save soup-object in folder
def save_content(content, folder, filename):
    with open(folder + "/" + filename + ".txt", "w") as file:
        try:
            file.write(content)
        except UnicodeEncodeError:
            file.write(str(content.encode("utf-8")))
        except TypeError:
            file.write(str(content))

In [9]:
# Save pdf from url in folder
def save_pdf(url, folder, filename):
    response = urllib2.urlopen(url)
    with open(folder + "/" + filename + ".pdf", "wb") as file:
        file.write(response.read())

## Retrieving content from current FOMC Meetings page (2013 - today)

In [10]:
def retrieve_current_fomc_minutes(save = False):
    # Get page content of current meetings page
    current_fomc_page = requests.get("https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm")
    current_fomc_soup = BeautifulSoup(current_fomc_page.content, 'html.parser')
    current_fomc_blocks = current_fomc_soup.select(".fomc-meeting__minutes")
    # Get URL, Date and Minutes of each Meeting
    # Structure of each tuple in the list: (date, url, content) 
    # date format: yyyymmdd
    current_minutes_fomc = []
    for m in current_fomc_blocks:
        for a in m.select("a"):
            if(".htm" in a["href"]):
                url = "https://www.federalreserve.gov" + a["href"]
                date = "".join(re.findall('\d+', url))
                if int(date[0:4]) >= lower_bound_year and int(date[0:4]) <= upper_bound_year:
                    article = filter_text(url, save)
                    current_minutes_fomc.append((date, url, article))
    return current_minutes_fomc

## Retrieving content from historical FOMC Meetings page (1996 - 2012)

In [11]:
def retrieve_historical_fomc_minutes(save = False):
    # Get page content of historical meetings page
    historical_fomc_page = requests.get("https://www.federalreserve.gov/monetarypolicy/fomc_historical_year.htm")
    historical_fomc_soup = BeautifulSoup(historical_fomc_page.content, 'html.parser')
    # historical_archive_minutes_pages = historical_archive_soup.select(".fomc-meeting__minutes")
    
    # Get url for each historical year
    blocks = historical_fomc_soup.select(".panel-default")
    historical_minutes_urls = []
    for block in blocks:
        for a in block.select("a"):
            historical_minutes_urls.append("https://www.federalreserve.gov" + a["href"])
            
    # Get URL, Date and Minutes of each Meeting
    # Structure of each tuple in the list: (date, url, content) 
    # date format: yyyymmdd
    historical_minutes_fomc = []
    for url in historical_minutes_urls:
        date = "".join(re.findall('\d+', url))
        if int(date[0:4]) >= lower_bound_year and int(date[0:4]) <= upper_bound_year:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, "html.parser")
            blocks = soup.select(".panel-default")
            for block in blocks:
                for a in block.select("a"):
                        # urls are not consistent over the years
                    if ("minutes" in a["href"] or "Minutes" in a) and (".htm" in a["href"]) and ("#" not in a["href"]):
                        if("https://www.federalreserve.gov" in a["href"]):
                            url = a["href"]
                        else:
                            url= "https://www.federalreserve.gov" + a["href"]
                        # date is extracted from the url
                        date = "".join(re.findall("\d\d\d\d\d+", url))
                        article = filter_text(url, save)
                        historical_minutes_fomc.append((date, url, article))
                    elif ("fomcmoa" in a["href"]) and (".pdf" in a["href"]):
                        url = "https://www.federalreserve.gov" + a["href"]
                        filter_text(url, save)
    return historical_minutes_fomc

## main

In [19]:
current_minutes_fomc = retrieve_current_fomc_minutes(save = True)
historical_minutes_fomc = retrieve_historical_fomc_minutes(save = True)
minutes_fomc = sorted(historical_minutes_fomc) + sorted(current_minutes_fomc)

# Testing Area

In [63]:
page = requests.get("https://www.federalreserve.gov/monetarypolicy/fomcminutes20080130.htm")
filter_text("https://www.federalreserve.gov/monetarypolicy/fomcminutes20080130.htm", save = False)

hallo


''