In [1]:
# Imports
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities 
import re
import os



In [2]:
# parent folder of the documents
parent_folder = ""
# folder to read minutes plain text from
text_folder = "plain_text/"
# folder to save pre-processed minutes in
pre_processed_folder = "pre_processed/"

In [3]:
# download nltk data
nltk.download('punkt')
nltk.download('english')
nltk.download('wordnet')
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lb4653\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading english: Package 'english' not found in
[nltk_data]     index
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lb4653\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lb4653\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
def tokenize_and_stem(text):
    text = text.lower()
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    #stemmer = SnowballStemmer("english")
    #stemmer = PorterStemmer()
    wordnet_lemmatizer = WordNetLemmatizer()
    pos = nltk.pos_tag(tokens)
    for token in pos:
        if re.search('[a-zA-Z]', token[0]):
            if token[1].startswith('V'):
                filtered_tokens.append(wordnet_lemmatizer.lemmatize(token[0], 'v'))
            elif token[1].startswith('A'):
                filtered_tokens.append(wordnet_lemmatizer.lemmatize(token[0], 'a'))
            elif token[1].startswith('R'):
                filtered_tokens.append(wordnet_lemmatizer.lemmatize(token[0], 's'))
            else:
                filtered_tokens.append(wordnet_lemmatizer.lemmatize(token[0], 'n'))
    return filtered_tokens

In [6]:
def remove_stopwords(tokens):
    filtered_words = [word for word in tokens if word.lower() not in stop_words]
    return filtered_words

In [7]:
from collections import Counter
def get_most_common_words():
    word_list = []
    folders = os.listdir(parent_folder + pre_processed_folder)
    for folder in folders:
        files = os.listdir(parent_folder + pre_processed_folder + folder)
        for file in files:
            file_object = open(parent_folder + pre_processed_folder + folder + "/"+ file, "r", errors='ignore')
            content = file_object.read()
            stems = tokenize_and_stem(content)
            words = remove_stopwords(stems)
            for word in words:
                word_list.append(word)
    counts = Counter(word_list)
    most_common_words = []
    for word in counts.most_common(100):
        most_common_words.append(word[0])
    return most_common_words

In [8]:
most_common_words = get_most_common_words()

In [9]:
import datetime
re_full_stop = re.compile('\.+$')
re_hyphen = re.compile('^-+.*|-+$')
re_numbers = re.compile('.*[0-9]+.*')
folders = os.listdir(parent_folder + pre_processed_folder)
texts = []
for folder in folders:
    files = os.listdir(parent_folder + pre_processed_folder + folder)
    for file in files:
        date = datetime.datetime(int(file[:4]), int(file[4:6]), int(file[6:8]))
        if date >= datetime.datetime(1990, 1, 1) and date <= datetime.datetime(2020, 12, 31):
            word_list = []
            file_object = open(parent_folder + pre_processed_folder + folder + "/"+ file, "r", errors='ignore')
            content = file_object.read()
            stems = tokenize_and_stem(content)
            words = remove_stopwords(stems)
            for word in words:
                # Removing hyphens at the beginning and full stops at the end of words
                word = re.sub(re_full_stop, '', word)
                word = re.sub(re_hyphen, '', word)
                # No words containing numbers
                if not re.search(re_numbers, word):
                    if len(word) > 2 and word not in most_common_words:
                        word_list.append(word)
            texts.append(word_list)

In [13]:
# passes: Number of passes through the entire corpus
# chunksize: Number of documents to load into memory at a time and process E step of EM.
# update_every: number of chunks to process prior to moving onto the M step of EM.
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.4)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, num_topics=8, 
                            id2word=dictionary, 
                            passes=200, 
                            chunksize=5000,
                            update_every = 1)
lda.show_topics()

[(0,
  '0.031*"export" + 0.030*"dollar" + 0.028*"u.s" + 0.020*"import" + 0.015*"value" + 0.015*"trade" + 0.013*"united" + 0.013*"currency" + 0.012*"country" + 0.012*"deficit"'),
 (1,
  '0.009*"cost" + 0.008*"many" + 0.007*"contact" + 0.007*"current" + 0.007*"factor" + 0.007*"aggregate" + 0.007*"relatively" + 0.006*"productivity" + 0.006*"wage" + 0.006*"firm"'),
 (2,
  '0.014*"agree" + 0.012*"objective" + 0.011*"target" + 0.011*"appropriate" + 0.009*"judge" + 0.009*"several" + 0.008*"downside" + 0.007*"saw" + 0.007*"asset" + 0.007*"toward"'),
 (3,
  '0.035*"system" + 0.028*"open" + 0.025*"operation" + 0.023*"transaction" + 0.022*"account" + 0.021*"vote" + 0.019*"agency" + 0.019*"manager" + 0.013*"discussion" + 0.013*"facility"'),
 (4,
  '0.023*"core" + 0.022*"forecast" + 0.021*"gdp" + 0.017*"pce" + 0.015*"compensation" + 0.014*"projection" + 0.014*"longer-run" + 0.013*"project" + 0.013*"end" + 0.012*"food"'),
 (5,
  '0.031*"loan" + 0.023*"home" + 0.021*"mortgage" + 0.021*"household" + 0

In [14]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=5, no_above=0.4)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, num_topics=8, 
                            id2word=dictionary, 
                            passes=200, 
                            chunksize=5000,
                            update_every = 1)
lda.show_topics()

[(0,
  '0.033*"loan" + 0.025*"yield" + 0.023*"treasury" + 0.018*"bond" + 0.018*"spread" + 0.014*"commercial" + 0.014*"issuance" + 0.012*"corporate" + 0.012*"equity" + 0.011*"net"'),
 (1,
  '0.018*"equipment" + 0.013*"average" + 0.012*"construction" + 0.012*"capital" + 0.011*"payroll" + 0.011*"end" + 0.010*"worker" + 0.010*"nominal" + 0.009*"july" + 0.009*"october"'),
 (2,
  '0.049*"system" + 0.036*"open" + 0.033*"operation" + 0.030*"transaction" + 0.028*"vote" + 0.027*"account" + 0.025*"manager" + 0.017*"facility" + 0.014*"discussion" + 0.014*"rrp"'),
 (3,
  '0.022*"forecast" + 0.021*"projection" + 0.019*"longer-run" + 0.019*"gdp" + 0.016*"objective" + 0.014*"core" + 0.014*"run" + 0.013*"anticipate" + 0.013*"project" + 0.011*"medium"'),
 (4,
  '0.027*"export" + 0.027*"dollar" + 0.026*"u.s" + 0.017*"import" + 0.013*"trade" + 0.012*"value" + 0.012*"currency" + 0.012*"united" + 0.010*"emerge" + 0.010*"country"'),
 (5,
  '0.017*"target" + 0.013*"agree" + 0.012*"appropriate" + 0.011*"mainta

In [10]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.6)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, num_topics=8, 
                            id2word=dictionary, 
                            passes=200, 
                            chunksize=5000,
                            update_every = 1)
lda.show_topics()

[(0,
  '0.039*"loan" + 0.018*"bond" + 0.017*"commercial" + 0.017*"mortgage" + 0.016*"issuance" + 0.015*"spread" + 0.011*"corporate" + 0.011*"standard" + 0.011*"firm" + 0.010*"participation"'),
 (1,
  '0.024*"projection" + 0.024*"forecast" + 0.022*"gdp" + 0.021*"longer-run" + 0.015*"objective" + 0.014*"run" + 0.014*"project" + 0.012*"anticipate" + 0.011*"since" + 0.010*"system"'),
 (2,
  '0.038*"core" + 0.027*"index" + 0.027*"end" + 0.025*"food" + 0.024*"pce" + 0.021*"survey" + 0.020*"compensation" + 0.018*"rrp" + 0.018*"discussion" + 0.017*"york"'),
 (3,
  '0.009*"many" + 0.009*"factor" + 0.008*"several" + 0.008*"number" + 0.008*"cost" + 0.007*"household" + 0.007*"contact" + 0.007*"high" + 0.007*"concern" + 0.006*"uncertainty"'),
 (4,
  '0.032*"u.s" + 0.030*"export" + 0.028*"dollar" + 0.021*"import" + 0.015*"trade" + 0.013*"value" + 0.013*"united" + 0.012*"currency" + 0.011*"emerge" + 0.011*"deficit"'),
 (5,
  '0.014*"agree" + 0.013*"target" + 0.012*"objective" + 0.012*"appropriate" + 

In [11]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.6)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, num_topics=8, 
                            id2word=dictionary, 
                            passes=200, 
                            chunksize=5000,
                            update_every = 1)
lda.show_topics()

[(0,
  '0.031*"system" + 0.024*"open" + 0.021*"operation" + 0.019*"transaction" + 0.018*"account" + 0.017*"debt" + 0.017*"manager" + 0.016*"agency" + 0.016*"vote" + 0.013*"treasury"'),
 (1,
  '0.038*"dollar" + 0.027*"u.s" + 0.018*"currency" + 0.017*"united" + 0.016*"emerge" + 0.016*"country" + 0.014*"japan" + 0.013*"euro" + 0.013*"export" + 0.012*"european"'),
 (2,
  '0.022*"equipment" + 0.022*"manufacturing" + 0.019*"vehicle" + 0.018*"motor" + 0.016*"capital" + 0.016*"industrial" + 0.016*"output" + 0.012*"third" + 0.012*"order" + 0.012*"fourth"'),
 (3,
  '0.009*"many" + 0.009*"household" + 0.008*"factor" + 0.008*"cost" + 0.007*"contact" + 0.007*"number" + 0.007*"several" + 0.007*"high" + 0.007*"relatively" + 0.006*"firm"'),
 (4,
  '0.012*"agree" + 0.011*"target" + 0.011*"objective" + 0.011*"appropriate" + 0.009*"toward" + 0.008*"consistent" + 0.007*"stability" + 0.007*"judge" + 0.007*"assessment" + 0.007*"information"'),
 (5,
  '0.036*"loan" + 0.023*"home" + 0.020*"mortgage" + 0.015*"