# Imports

In [1]:
import re
from bs4 import BeautifulSoup
import os

# Global Values

In [2]:
# Root folder that all folders below are relative to; it is prepended to the
# folder names by plain string concatenation ("" = current working directory)
parent_folder = ""
# Folder the minutes HTML files are read from
minutes_html_folder = "minutes_html/"
# Folder the plain-text minutes are read from
text_folder = "plain_text/"
# Folder the pre-processed minutes are written to
pre_processed_folder = "pre_processed/"
# First year (inclusive) whose minutes are pre-processed
lower_bound_year = 1990
# Last year (inclusive) whose minutes are pre-processed
upper_bound_year = 2020

# Splitting algorithms for different years

In [3]:
# Blockquotes are not saved
def split_2009_until_today(url):
    """Split the HTML minutes of a 2009-or-later meeting into labelled paragraphs.

    The meeting date is taken from ``url`` (all runs of five or more digits,
    concatenated). After every ``<blockquote>`` has been removed, the
    paragraph-like tags inside the element with id ``article`` (2012 and
    later) or ``leftText`` (earlier years) are walked in document order.
    Bold headings switch the current section, and processing stops at the
    first heading that mentions a vote.

    Args:
        url: Path of the HTML file; it must contain the meeting date in the
            form YYYYMMDD.

    Returns:
        A list of ``(section, text)`` tuples in document order. ``section``
        is one of ``'Open Market'``, ``'Staff Review'``, ``'FOMC members'``
        or ``'Policy'``. Only paragraphs whose cleaned text is longer than
        200 and shorter than 10000 characters are included.
    """
    with open(url) as file:
        soup = BeautifulSoup(file, 'html.parser')
        for quote in soup.findAll('blockquote'):
            quote.extract()
    date = "".join(re.findall(r'\d\d\d\d\d+', url))
    year = int(date[:4])
    container_id = "article" if year >= 2012 else "leftText"
    # NOTE(review): re.compile("p") selects every tag whose *name contains*
    # a "p" (e.g. <span>), not only <p>. Kept as is because the existing
    # output depends on it; confirm whether plain "p" was intended.
    contents = soup.find(id=container_id).find_all(re.compile("p"))

    contents_cleaned = []
    section = ''
    # True for the paragraph in which a section heading was just found, so
    # that the heading itself can be cut off the paragraph text.
    section_set = False
    # The 2009-11-04 minutes contain the "Developments in Financial Markets"
    # heading twice; the first occurrence is skipped.
    help_20091104 = False

    for paragraph in contents:
        markup = str(paragraph)
        if date == "20090128":
            # Special layout: "Committee Policy Action" opens the members'
            # section, and the policy section is found via its first sentence.
            if re.search('<strong>', markup):
                heading = str(paragraph.find("strong"))
                if re.search('Open Market|Financial Market', heading, re.IGNORECASE):
                    section = 'Open Market'
                    section_set = True
                elif re.search('Staff Review|Inflation Analysis|Structural Unemployment', heading, re.IGNORECASE):
                    section = 'Staff Review'
                    section_set = True
                elif re.search('Committee Policy Action', heading, re.IGNORECASE):
                    section = 'FOMC members'
                    section_set = True
                elif re.search('Voting|Vote', heading, re.IGNORECASE):
                    break
            elif re.search("In their discussion of monetary policy for the intermeeting period", paragraph.get_text(), re.IGNORECASE):
                section = 'Policy'
                section_set = True

        elif date != "20091104":
            # Regular layout: headings are marked up with <strong>.
            if re.search('<strong>', markup):
                heading = str(paragraph.find("strong"))
                if re.search('Open Market|Financial Market', heading, re.IGNORECASE):
                    section = 'Open Market'
                    section_set = True
                elif re.search('Staff Review|Inflation Analysis|Structural Unemployment', heading, re.IGNORECASE):
                    section = 'Staff Review'
                    section_set = True
                elif re.search('Participants', heading, re.IGNORECASE):
                    section = 'FOMC members'
                    section_set = True
                elif re.search('Committee Policy Action', heading, re.IGNORECASE):
                    section = 'Policy'
                    section_set = True
                elif re.search('Voting|Vote', heading, re.IGNORECASE):
                    break

        else:
            # 2009-11-04: headings are marked up with <b> instead of <strong>.
            if re.search('<b>', markup):
                heading = str(paragraph.find("b"))
                if re.search('Developments in Financial Markets and the Federal Reserve', heading, re.IGNORECASE):
                    if not help_20091104:
                        help_20091104 = True
                        continue
                    section = 'Open Market'
                    section_set = True
                elif re.search('Staff Review|Inflation Analysis|Structural Unemployment', heading, re.IGNORECASE):
                    section = 'Staff Review'
                    section_set = True
                elif re.search('Participants', heading, re.IGNORECASE):
                    section = 'FOMC members'
                    section_set = True
                elif re.search('Committee Policy Action', heading, re.IGNORECASE):
                    section = 'Policy'
                    section_set = True
                elif re.search('Voting|Vote', heading, re.IGNORECASE):
                    break

        if section == '':
            continue

        string = paragraph.get_text().strip()
        # Line breaks become periods so that a heading on its own line ends
        # with a period and can be cut off below.
        string = re.sub('\n', '.', string)
        # NOTE(review): "(\w|\W)+" is greedy, so this replaces everything up
        # to the LAST " percent" in the paragraph. Kept unchanged.
        string = re.sub(r'range of 0 to (\w|\W)+ percent', 'range of 0 to 0.25 percent', string)
        # Two passes on purpose: removing " ." can expose another " .".
        string = re.sub(r' \.', '', string)
        string = re.sub(r' \.', '', string)
        string = re.sub(' +', ' ', string)

        if section_set:
            # Drop the heading, i.e. everything up to and including the first
            # period. The previous ''.join(split(".")[1:]) additionally
            # removed every later period of the paragraph (merging sentences
            # and turning "0.25" into "025").
            string = string.partition(".")[2]
            section_set = False

        if year <= 2011 and date != "20090128":
            # Up to 2011 the policy discussion has no heading of its own; it
            # starts inside the members' section.
            if re.search("discussion", string) and re.search("members", string) and section == "FOMC members":
                section = 'Policy'

        if 200 < len(string) < 10000:
            if re.search("In conducting operations pursuant to the authorization", string):
                # Start of the authorization text: leave the current section.
                section = ''
                continue
            contents_cleaned.append((section, string))
    return contents_cleaned

In [4]:
def split_2005_until_2008(url):
    """Split the HTML minutes of a 2005-2008 meeting into labelled paragraphs.

    The meeting date is taken from ``url`` (all runs of five or more digits,
    concatenated). ``<blockquote>`` elements are removed first. The sections
    are recognised by key phrases and must appear in the order Open Market,
    Staff Review, FOMC members, Policy; a paragraph containing
    ``'Notation Vote'`` ends the Policy section.

    Args:
        url: Path of the HTML file; it must contain the meeting date in the
            form YYYYMMDD.

    Returns:
        A list of ``(section, text)`` tuples in document order, where
        ``text`` is the cleaned paragraph encoded as UTF-8 ``bytes``. Only
        paragraphs longer than 200 and shorter than 10000 characters are
        included. ``None`` is returned when the Policy section was never
        reached.
    """
    with open(url) as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
        for quote in soup.findAll('blockquote'):
            quote.extract()
    date = "".join(re.findall(r'\d\d\d\d\d+', url))

    # The 2007-12-11 minutes are processed line by line from the plain text
    # of the "leftText" element instead of paragraph tags.
    plain_lines = date == "20071211"
    if plain_lines:
        contents = soup.find(id="leftText").get_text().splitlines()
    elif date.startswith("2003"):
        contents = soup.find_all("table")[3].find_all("p")
    else:
        contents = soup.find_all("p")

    # Regex substitutions applied in this order to every kept paragraph.
    # " \." is listed twice on purpose: one pass can expose another " .".
    replacements = (
        (r'\x92', ''),
        (r'\x93', ''),
        (r'\x94', ''),
        (r'\xa0', ''),
        (r'\u2019', ''),
        (r'\u0155', 'a'),
        (r'\r', ''),
        ('\n', ' '),
        (r' \.', ''),
        (r' \.', ''),
        (' +', ' '),
    )

    contents_cleaned = []
    section = ''
    # Number of section starts found so far (0..4); it selects which key
    # phrase is searched for next.
    number_sections_set = 0
    for p in contents:
        text = p.strip() if plain_lines else p.get_text().strip()

        if number_sections_set == 0:
            if date == "20070628" and "The information  reviewed at the June meeting" in text:
                section = 'Open Market'
                number_sections_set = 1
            elif text.startswith(("The manager of the System Open Market Account",
                                  "The Manager of the System Open Market Account")):
                section = 'Open Market'
                number_sections_set = 1

        elif number_sections_set == 1:
            if 'forecast' in text and 'staff' in text:
                section = 'Staff Review'
                number_sections_set = 2

        elif number_sections_set == 2:
            # NOTE(review): the original read "a and b or c"; the parentheses
            # only make the existing evaluation order explicit. Confirm
            # whether "a and (b or c)" was intended.
            if ('participants' in text and 'economic situation' in text) or 'economic outlook' in text:
                section = 'FOMC members'
                number_sections_set = 3

        elif number_sections_set == 3:
            if date == "20050202":
                # This meeting uses its own key phrase exclusively.
                if 'Committee' in text and 'favored raising' in text:
                    section = 'Policy'
                    number_sections_set = 4
            elif 'Committee' in text and 'monetary policy' in text:
                section = 'Policy'
                number_sections_set = 4

        elif number_sections_set == 4:
            if 'Notation Vote' in text:
                section = ''

        if section != '' and 200 < len(text) < 10000:
            for pattern, replacement in replacements:
                text = re.sub(pattern, replacement, text)
            # The result is stored as UTF-8 bytes (the caller writes the
            # files in binary mode).
            text = text.encode('utf-8').strip()
            contents_cleaned.append((section, text))

    if number_sections_set < 4:
        # Not every section was found.
        return None
    return contents_cleaned

In [5]:
def split_1999_until_2004(url):
    """Split the plain-text minutes of a 1999-2004 meeting into labelled paragraphs.

    Every line of the file is treated as one paragraph. The meeting date is
    taken from ``url`` (all runs of five or more digits, concatenated). The
    sections are recognised by key phrases and must appear in the order
    Open Market, Staff Review, FOMC members, Policy. A paragraph containing
    ``'Notation Vote'`` ends the Policy section, and a paragraph mentioning
    "Footnotes" (case-insensitive) leaves the current section.

    Args:
        url: Path of the text file; it must contain the meeting date in the
            form YYYYMMDD.

    Returns:
        A list of ``(section, text)`` tuples in document order. Only
        paragraphs longer than 200 and shorter than 10000 characters (and
        longer than 100 characters after cleaning) are included. When the
        Policy section was never reached a message is printed, and the
        (possibly incomplete) list is still returned.
    """
    date = "".join(re.findall(r'\d\d\d\d\d+', url))
    with open(url) as file:
        contents = file.readlines()

    # For files dated 1998 or earlier the tab-separated chunks of the file
    # are processed in addition to its lines.
    if int(date) <= 19989999:
        with open(url) as file:
            for chunk in file.read().split("\t"):
                if 300 < len(chunk) < 10000:
                    contents.append(chunk)

    # Regex substitutions applied in this order to every kept paragraph.
    # " \." is listed twice on purpose: one pass can expose another " .".
    replacements = (
        (r'\x92', ''),
        (r'\x93', ''),
        (r'\x94', ''),
        (r'\xa0', ''),
        (r'\u2019', ''),
        (r'\u0155', 'a'),
        # NOTE(review): this matches the literal text "0xc5", not the
        # character U+00C5; presumably '\xc5' was meant - confirm.
        (r'0xc5', ' '),
        (r'\r', ''),
        ('ź', '0.25'),
        ('˝', '0.5'),
        ('ž', '.75'),
        ('¾', '0.75'),
        ('’', "'"),
        ('é', 'e'),
        ('“', '"'),
        ('”', '"'),
        (r'for the System\.s account', "for the System's account"),
        ('\n', ' '),
        (r' \.', ''),
        (r' \.', ''),
        (' +', ' '),
        ('<.+>', ''),
    )

    contents_cleaned = []
    section = ''
    # Number of section starts found so far; it selects which key phrase is
    # searched for next.
    number_sections_set = 0

    for text in contents:
        # Lines are bytes only on Python 2; calling .decode() on a Python 3
        # str raised AttributeError.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        text = text.strip()

        if number_sections_set == 0:
            if date == "20040128":
                found = text.startswith("The Manager of the System Open Market Account reported")
            elif date == "20001219":
                found = text.startswith("The Manager of the System Open Market Account also reported")
            else:
                found = re.search("The Manager of the System Open Market Account reported", text, re.IGNORECASE)
            if found:
                section = 'Open Market'
                number_sections_set = 1

        elif number_sections_set == 1:
            if date == "20040128":
                if re.search("The staff forecast prepared for this meeting", text, re.IGNORECASE):
                    section = 'Staff Review'
                    number_sections_set = 2
            elif re.search("forecast", text, re.IGNORECASE) and re.search("staff", text, re.IGNORECASE):
                section = 'Staff Review'
                # For these two meetings the search for the members' section
                # is skipped.
                if date in ("20060629", "20060131"):
                    number_sections_set = 3
                else:
                    number_sections_set = 2

        elif number_sections_set == 2:
            if re.search("member", text, re.IGNORECASE) and (
                    re.search("economic", text, re.IGNORECASE)
                    or re.search("discussion", text, re.IGNORECASE)):
                section = 'FOMC members'
                number_sections_set = 3

        elif number_sections_set == 3:
            if (re.search("Committee", text, re.IGNORECASE)
                    and re.search("policy", text, re.IGNORECASE)
                    and re.search("period", text, re.IGNORECASE)):
                # NOTE(review): the original additionally tested for
                # raise/raising/tighten, maintain/unchanged and "lower", but
                # the last test was the always-true tuple
                # ("lower", text, re.IGNORECASE), so every paragraph reaching
                # this point was accepted. That effective behaviour is kept;
                # a narrower keyword filter would have to be re-validated
                # against the minutes.
                section = 'Policy'
                number_sections_set = 4
            elif date == "20041110":
                if (re.search("Committee", text, re.IGNORECASE)
                        and re.search("policy", text, re.IGNORECASE)
                        and re.search("raising", text, re.IGNORECASE)):
                    section = 'Policy'
                    number_sections_set = 4
            elif date == "19990518":
                if (re.search("members", text, re.IGNORECASE)
                        and re.search("policy", text, re.IGNORECASE)
                        and re.search("period", text, re.IGNORECASE)):
                    section = 'Policy'
                    number_sections_set = 4

        elif number_sections_set == 4:
            if 'Notation Vote' in text:
                section = ''
                number_sections_set = 5

        # re.search() returns a match object or None, never True, so the
        # former "== True" comparison could not succeed.
        if re.search("Footnotes", text, re.IGNORECASE):
            section = ''

        if section != '' and 200 < len(text) < 10000:
            for pattern, replacement in replacements:
                text = re.sub(pattern, replacement, text)
            if len(text) > 100:
                contents_cleaned.append((section, text))

    if number_sections_set < 4:
        print("__________ Nicht alle Sektionen gesetzt, Fehler bei Sektion " + str((number_sections_set + 1)) + "__________")
    return contents_cleaned

In [6]:
def split_1993_until_1998(url):
    """Split the HTML minutes of a 1993-1998 meeting into labelled paragraphs.

    The meeting date is taken from ``url`` (all runs of five or more digits,
    concatenated). ``<blockquote>`` elements are removed, the remaining
    document is serialised and cut at every ``<p>``. The sections are
    recognised by key phrases and must appear in the order Open Market,
    Staff Review, FOMC members, Policy.

    Args:
        url: Path of the HTML file; it must contain the meeting date in the
            form YYYYMMDD.

    Returns:
        A list of ``(section, text)`` tuples in document order, or ``None``
        when the Policy section was never reached. Only chunks longer than
        200 and shorter than 10000 characters (and still longer than 200
        characters after cleaning) are included.
    """
    date = "".join(re.findall(r'\d\d\d\d\d+', url))
    with open(url) as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
        for quote in soup.findAll('blockquote'):
            quote.extract()
    chunks = str(soup).split("<p>")

    # Regex substitutions applied in this order to every kept chunk. The
    # patterns r'\\r' and r'\\.' target backslash escapes, see below.
    replacements = (
        (r'\x92', ''),
        (r'\x93', ''),
        (r'\x94', ''),
        (r'\xa0', ''),
        (r'\u2019', ''),
        (r'\\r', ' '),
        (r'\\.', ''),
        (r'<.+>', ' '),
        (r' +', ' '),
    )

    contents_cleaned = []
    section = ''
    # Number of section starts found so far (0..4); it selects which key
    # phrase is searched for next.
    number_sections_set = 0
    for chunk in chunks:
        # NOTE(review): the original split every chunk at "<p>" a second
        # time, which always yields a one-element list, and then worked on
        # str() of that list. The text is therefore the list's repr: it is
        # wrapped in brackets and quotes, and line breaks appear as
        # backslash escapes. The cleaning patterns rely on that form, so it
        # is kept; cleaning the plain chunk instead would change the output
        # and needs re-validation.
        text = str([chunk]).strip()

        if number_sections_set == 0:
            if re.search("The Manager of the System Open Market Account", text, re.IGNORECASE) or (
                    int(date[0:4]) in (1993, 1994)
                    and re.search("The Manager for|The deputy Manager for", text, re.IGNORECASE)):
                section = 'Open Market'
                number_sections_set = 1

        elif number_sections_set == 1:
            if ('forecast' in text or 'projection' in text) and 'staff' in text:
                section = 'Staff Review'
                number_sections_set = 2

        elif number_sections_set == 2:
            if ('Committee' in text or 'members' in text) and 'economic' in text and 'discussion' in text:
                section = 'FOMC members'
                number_sections_set = 3

        elif number_sections_set == 3:
            if (re.search("Committee", text, re.IGNORECASE)
                    and re.search("policy", text, re.IGNORECASE)
                    and re.search("period", text, re.IGNORECASE)):
                # NOTE(review): the original additionally tested for
                # raise/raising/tighten, maintain/unchanged and "lower", but
                # the last test was the always-true tuple
                # ("lower", text, re.IGNORECASE), so every chunk reaching
                # this point was accepted. That effective behaviour is kept;
                # a narrower keyword filter would have to be re-validated
                # against the minutes.
                section = 'Policy'
                number_sections_set = 4

        if section != '' and 200 < len(text) < 10000:
            for pattern, replacement in replacements:
                text = re.sub(pattern, replacement, text)
            if len(text) > 200:
                contents_cleaned.append((section, text))

    if number_sections_set < 4:
        # Not every section was found.
        return None
    return contents_cleaned

In [7]:
def split_1992_and_older(url):
    """Split the plain-text minutes of a meeting up to 1992 into labelled chunks.

    The file is first regrouped into chunks: collection starts at the first
    line beginning with "The information reviewed", a new chunk starts at
    certain opening phrases, and reading stops at the line announcing the
    next meeting. The chunks are then assigned to the sections Open Market,
    FOMC members and Policy (there is no Staff Review section here).

    Args:
        url: Path of the text file; it must contain the meeting date in the
            form YYYYMMDD.

    Returns:
        A list of ``(section, text)`` tuples in document order, or ``None``
        when the Policy section was never reached. Only chunks longer than
        200 and shorter than 10000 characters (and still longer than 200
        characters after cleaning) are included.
    """
    date = "".join(re.findall(r'\d\d\d\d\d+', url))
    with open(url) as file:
        lines = file.readlines()

    # --- Pass 1: regroup the lines into chunks -----------------------------
    chunks = []
    current = ''
    started = False
    for line in lines:
        stripped = line.strip()
        if stripped.startswith("The information reviewed") and not started:
            started = True
            current = current + line + " "
        elif started and stripped.startswith(("Most interest rates", "Short-term interest rates")):
            chunks.append(current.strip())
            current = line + " "
        elif stripped.startswith(("Growth of M2 and M3 ", "M2 grew at ")):
            # This boundary is honoured even before collection has started.
            chunks.append(current.strip())
            current = line + " "
        elif started and stripped.startswith(("The Federal Open Market Committee",
                                              "In the implementation of policy")):
            chunks.append(current.strip())
            current = line + " "
        elif re.search("It was agreed that the next meeting", line, re.IGNORECASE):
            break
        elif started:
            # Skip very short lines and lines starting with a dash.
            if len(line) > 4 and not stripped.startswith("-"):
                current = current + line + " "
    chunks.append(current)

    # --- Pass 2: label and clean the chunks --------------------------------
    # Regex substitutions applied in this order to every kept chunk.
    replacements = (
        (r'\x92', ''),
        (r'\x93', ''),
        (r'\x94', ''),
        (r'\xa0', ''),
        ('\u2019', ''),
        (r'\r', ' '),
        (r'\n', ' '),
        (r'\\.', ''),
        (r'<.+>', ' '),
        (r' +', ' '),
    )

    contents_cleaned = []
    section = ''
    # Progress marker shared with the other splitters: 0 = nothing found,
    # 2 = Open Market found, 3 = FOMC members found, 4 = Policy found.
    number_sections_set = 0
    for chunk in chunks:
        text = str(chunk).strip()

        if number_sections_set == 0:
            if re.search("The information reviewed at this meeting", text, re.IGNORECASE) or (
                    int(date[0:4]) in (1993, 1994)
                    and re.search("The Manager for|The deputy Manager for", text, re.IGNORECASE)):
                section = 'Open Market'
                number_sections_set = 2

        elif number_sections_set == 2:
            if 'Federal Open Market Committee' in text:
                section = 'FOMC members'
                number_sections_set = 3

        elif number_sections_set == 3:
            if re.search("In the implementation of policy", text, re.IGNORECASE):
                section = 'Policy'
                number_sections_set = 4

        if section != '' and 200 < len(text) < 10000:
            for pattern, replacement in replacements:
                text = re.sub(pattern, replacement, text)
            if len(text) > 200:
                contents_cleaned.append((section, text))

    if number_sections_set < 4:
        # Not every section was found.
        return None
    return contents_cleaned

## Evaluation

In [9]:
# Driver: run the matching splitter on every minutes file whose name starts
# with a year between lower_bound_year and upper_bound_year and write each
# returned paragraph to
#   <pre_processed_folder>/<date>/<date>_S<section number>_<running number>.txt

# Section label returned by the splitters -> number used in the file name.
section_numbers = {"Open Market": 1, "Staff Review": 2, "FOMC members": 3, "Policy": 4}

files = os.listdir(parent_folder + text_folder)
for f in files:
    for r in range(lower_bound_year, upper_bound_year + 1):
        if not f.startswith(str(r)):
            continue

        date = "".join(re.findall(r'\d\d\d\d\d+', f))
        year = int(f[0:4])
        html_path = parent_folder + minutes_html_folder + f[0:8] + "_minutes_html.txt"

        # Pick the splitter by year. The 2006-06-29 minutes are handled by
        # the 1999-2004 splitter (string comparison, so a file name without
        # eight leading digits no longer raises ValueError).
        if year >= 2009:
            x = split_2009_until_today(html_path)
        elif 1999 <= year <= 2004 or f[0:8] == "20060629":
            x = split_1999_until_2004(parent_folder + text_folder + f)
        elif 2005 <= year <= 2008:
            x = split_2005_until_2008(html_path)
        elif 1993 <= year <= 1998:
            x = split_1993_until_1998(html_path)
        else:
            # 1992 and older
            x = split_1992_and_older(parent_folder + text_folder + f[0:8] + "_minutes.txt")

        # Number of files written per section number for this meeting.
        counts = {number: 0 for number in section_numbers.values()}

        if x is None:
            # The splitter could not find all sections.
            string = f + ": [error]"
        else:
            # Start from an empty output folder for this meeting. makedirs
            # also creates pre_processed_folder itself when it is missing
            # (the unused os.listdir() on it used to raise in that case).
            out_dir = parent_folder + pre_processed_folder + date
            if os.path.exists(out_dir):
                # Distinct loop names: the original reused "f" and "files"
                # here and clobbered the outer loop's variables.
                for old_name in os.listdir(out_dir):
                    os.remove(out_dir + "/" + old_name)
            else:
                os.makedirs(out_dir)

            for section, text in x:
                number = section_numbers.get(section)
                if number is None:
                    # Unknown labels are ignored, as before.
                    continue
                counts[number] += 1
                file_name = "%s_S%d_%02d.txt" % (date, number, counts[number])
                # The files are opened in binary mode: bytes are written as
                # they are, text is encoded as UTF-8 first. Writing str
                # directly raised TypeError on Python 3, and so did the
                # former fallback handler.
                data = text if isinstance(text, bytes) else text.encode("utf-8")
                with open(out_dir + "/" + file_name, "wb") as file:
                    file.write(data)

            string = f + ": [" + ",".join(str(counts[n]) for n in (1, 2, 3, 4)) + "]"

        # The summary used to be built but never shown, so files that failed
        # to split went unnoticed.
        print(string)