In [1]:
# Authors: Ben Elenbaas, Jack Klingenberg
# Description: Jupyter notebook that loads text data and creates plots for the presentation

from bs4 import BeautifulSoup
import re
import glob
import os
import requests
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path
import pandas as pd
import glob
from nltk.corpus import stopwords
import altair as alt
import numpy as np

In [2]:
#function to get the meeting dates embedded in the links listed in data/links.txt (only the dates are returned)

def get_links_and_dates(links_txt_path='data/links.txt'):
    """Return the date token embedded in each FOMC-minutes link of a links file.

    Every line of the file is stripped of surrounding whitespace, then the
    leading ``https://www.federalreserve.gov/monetarypolicy/fomcminutes`` and
    the trailing ``.htm`` are removed. Despite the function's name, only the
    dates are returned.

    Parameters
    ----------
    links_txt_path : str or path-like, default 'data/links.txt'
        Text file holding one link per line.

    Returns
    -------
    list of str
        One entry per line of the file, in file order. Blank lines yield an
        empty string, so the result stays aligned with the file's lines.
    """
    url_prefix = 'https://www.federalreserve.gov/monetarypolicy/fomcminutes'

    with open(links_txt_path, 'r') as links_txt:
        links = links_txt.readlines()

    dates = []
    for link in links:
        # Strip first so the suffix is removed even when the last line of the
        # file has no trailing newline.
        file_date = link.strip()
        if file_date.startswith(url_prefix):
            file_date = file_date[len(url_prefix):]
        # Escaped dot and end anchor: only a literal ".htm" suffix is removed.
        file_date = re.sub(r'\.htm$', '', file_date)
        dates.append(file_date)

    return dates

In [3]:
#function to parse an already-opened HTML file and return its paragraph text as a one-item list

def webscrape_link(file):
    """Extract the paragraph text from an open HTML file.

    Parameters
    ----------
    file : file-like object
        Object whose ``read()`` returns the HTML markup to parse.

    Returns
    -------
    list of str
        A single-element list. The element is the text of every ``<p>`` tag,
        each preceded by one space, with every run of whitespace collapsed to
        a single space.
    """
    soup = BeautifulSoup(file.read(), "html.parser")

    # Prefix each paragraph with a space and join once, instead of growing the
    # string by repeated concatenation inside a loop.
    text_string = "".join(" " + info.get_text() for info in soup.find_all("p"))

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    # \s also matches "\n", so no separate newline replacement is needed.
    text_string = re.sub(r"\s+", " ", text_string)

    return [text_string]

In [4]:
#read the links, one stripped entry per line of the file
with open('data/links.txt', 'r') as links_file:
    links = [line.strip() for line in links_file]

#get the dates (one per line of data/links.txt, so they pair up with `links`)
dates = get_links_and_dates()

#make sure the output folder exists before writing into it
Path('alltxtfiles').mkdir(parents=True, exist_ok=True)

#write text to 'alltxtfiles'
for link, date in zip(links, dates):
    if not link:
        # blank line in links.txt: there is nothing to download
        continue

    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an HTTP error page from being saved as minutes text.
    page = requests.get(link, timeout=30)
    page.raise_for_status()
    newsoup = BeautifulSoup(page.content, "html.parser")

    # text of every <p> tag, each preceded by a space
    mystring = "".join(" " + info.get_text() for info in newsoup.find_all("p"))

    # Collapse whitespace first (\s also covers "\n"), then blank out bracketed
    # spans such as "[...]". Raw strings avoid invalid-escape warnings.
    mystring = re.sub(r"\s+", " ", mystring)
    mystring = re.sub(r"\[.*?\]", " ", mystring)

    filename = date + '.txt'

    # Explicit UTF-8 so the saved text does not depend on the platform's
    # default encoding.
    with open('alltxtfiles/' + filename, 'w', encoding='utf-8') as out_file:
        out_file.write(mystring)

In [6]:
# glob returns paths in arbitrary, OS-dependent order; sorting makes the row
# order of everything built from this list reproducible.
# NOTE(review): the scraping cell writes to 'alltxtfiles/', but this reads
# 'txtfilesbyyear/'; the step that produces that folder is not in this
# notebook - confirm where it comes from.
text_files = sorted(glob.glob('txtfilesbyyear/*.txt'))

# file name without directory or extension; used below as the document label
file_names = [Path(text).stem for text in text_files]

In [7]:
# Build the stop list: NLTK's English stop words followed by extra tokens
# (punctuation, boilerplate vocabulary, names, months) added by hand.

stoplist = stopwords.words("english")
stoplist += [
    ".", ",", "?", "could", "would", "“", "”", "longer",
    "developments", "system", "real", "poole", "yields", "banks", "credit", "taf",
    "’", ";", "!", 's', 'meeting', 'markets', 'financial', 'participants',
    'prices', 'economic', 'conditions', 'committee', 'core', 'outlook', 'housing', "'s",
    'spending', 'committee', 'remained', 'rate', 'likely', 'also', 'expected', 'consumer',
    'quarter', 'data', 'market', 'generally', 'higher', 'period', 'price', 'generally',
    'committee', 'june', "''", 'continued', 'progress', 'economy', 'bank', 'reserve',
    'june', 'committee\\', 'reserve', ')', 'board', 'division', 'federal', '``',
    '2', 'goals', 'monetary', 'supply', 'policy', 'u.s.', '(', 'noted',
    '2022', 'balance', 'percent', 'range', 'treasury', 'activity', 'stance', 'intermeeting',
    'recent', 'year', 'time', 'businesses', 'governors', 'appropriate', 'operations', 'support',
    'agreed', 'members', '2020', 'director', 'term', 'staff', 'months', '&',
    'united', 'states', 'new', 'york', 'achilles', 'sangster', 'annette', 'samuel',
    'min', 'michele', 'sylvain', '2-', '5-', 'seth', 'boards', 'met',
    'wright', 'met', 'national', 'glenn', 'weekend', 'andrea', 'laura', 'naureen',
    '737', 'andrew', 'marc', 'ron', 'sally', '1-1/2', '10-', 'ivan',
    'april', 'may', 'june', 'july', 'january', 'february', 'march', 'august',
    'september', 'october', 'november', 'december', 'purchase', 'year', 'level', 'net',
    'firm', 'still', 'judged', 'many', 'observed', 'sector', 'well',
]

# input='filename' makes the vectorizer open each path in text_files itself
tfidf_vectorizer = TfidfVectorizer(stop_words=stoplist, input='filename')
tfidf_vector = tfidf_vectorizer.fit_transform(text_files)



In [8]:
#tf-idf (condensed pandas dataframe)

# show every row and column when a frame is displayed
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Build the long-format table in one chain: a wide matrix (one row per file
# name, one column per vocabulary term) is stacked into one row per
# (document, term) pair. The stack is computed once rather than being
# evaluated and discarded first, and the rename lists only the columns that
# reset_index() actually produces.
tfidf_df = (
    pd.DataFrame(
        tfidf_vector.toarray(),
        index=file_names,
        columns=tfidf_vectorizer.get_feature_names_out(),
    )
    .stack()
    .reset_index()
    .rename(columns={0: 'tfidf', 'level_0': 'document', 'level_1': 'term'})
)

# ten highest-scoring terms of each document
tfidf_df.sort_values(by=['document','tfidf'], ascending=[True,False]).groupby(['document']).head(10)

Unnamed: 0,document,term,tfidf
52473,2007,inflation,0.399868
52058,2007,growth,0.3463
54272,2007,pressures,0.146636
55054,2007,risks,0.122287
50423,2007,correction,0.121503
51280,2007,energy,0.115253
48777,2007,2007,0.114675
53459,2007,mr,0.097536
49792,2007,business,0.092527
51801,2007,foreign,0.090904


In [73]:
# Input for the cells below (TF-IDF analysis for pos/neg words):
# keeps the highest-scoring terms of each document - adjust head(x) as needed

top_tfidf = (
    tfidf_df
    .sort_values(by=['document', 'tfidf'], ascending=[True, False])
    .groupby(['document'])
    .head(5)
)

In [11]:
# rows of top_tfidf whose term contains 'risk'
top_tfidf.loc[lambda frame: frame['term'].str.contains('risk')]

Unnamed: 0,document,term,tfidf
55054,2007,risks,0.122287


In [None]:
# rows of top_tfidf whose term contains 'decline'
top_tfidf.loc[lambda frame: frame['term'].str.contains('decline')]

In [None]:
# rows of top_tfidf whose term contains 'growth'
top_tfidf.loc[lambda frame: frame['term'].str.contains('growth')]

Unnamed: 0,document,term,tfidf
52058,2007,growth,0.3463
92653,2008,growth,0.256168
35820,2010,growth,0.139926
43939,2011,growth,0.141428
68296,2012,growth,0.176273
27701,2014,growth,0.171327
19582,2015,growth,0.150506
3344,2016,growth,0.179316
133248,2018,growth,0.195025
125129,2019,growth,0.202284


In [None]:
#TF-IDF heat map

# Add a tiny jitter (< 0.0001) to break ties in term ranking. The generator is
# seeded so the tie-breaking, and therefore the chart, is the same on every run.
top_tfidf_plusRand = top_tfidf.copy()
top_tfidf_plusRand['tfidf'] = (
    top_tfidf_plusRand['tfidf']
    + np.random.default_rng(0).random(len(top_tfidf_plusRand)) * 0.0001
)

# base for all visualizations, with rank calculation:
# terms are ranked within each document by descending tfidf
base = alt.Chart(top_tfidf_plusRand).encode(
    x = 'rank:O',
    y = 'document:N'
).transform_window(
    rank = "rank()",
    sort = [alt.SortField("tfidf", order="descending")],
    groupby = ["document"],
)

# heatmap specification
heatmap = base.mark_rect().encode(
    color = 'tfidf:Q'
)

# text labels: white where tfidf >= 0.23, black otherwise
text = base.mark_text(baseline='middle').encode(
    text = 'term:N',
    color = alt.condition(alt.datum.tfidf >= 0.23, alt.value('white'), alt.value('black'))
)

# display the two superimposed layers (heatmap + text labels)
(heatmap + text).properties(width = 600)