In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gensim
import gensim.corpora as corpora
from gensim import models
from mpl_toolkits import mplot3d
from pprint import pprint
from collections import Counter
from itertools import chain
import time

In [16]:
from functions import apply_doc2bow

In [17]:
# Load the pre-processed FOMC minutes into a DataFrame.
# Judging by the preview below, rows are indexed by date and carry the tokenised
# paragraphs, per-paragraph lengths, the joined text and its total length
# (presumably one row per meeting -- confirm against the preprocessing step).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
fomc = pd.read_pickle('../data/fomc_data.pkl')
fomc.head(3)

Unnamed: 0,minutes_paragraphs,paragraphs_length,minutes_text,text_length
1993-02-03,"[[meeting, federal, open, market, committee, h...","[14, 15, 24, 29, 12, 32, 37, 32, 14, 16, 82, 5...",meeting federal open market committee hold off...,4439
1993-03-23,"[[meeting, federal, open, market, committee, h...","[12, 13, 64, 23, 24, 28, 60, 51, 64, 56, 100, ...",meeting federal open market committee hold off...,2790
1993-05-18,"[[meeting, federal, open, market, committee, h...","[12, 26, 19, 25, 27, 62, 46, 54, 37, 89, 56, 6...",meeting federal open market committee hold off...,2355


In [19]:
# Flatten the per-row lists of paragraphs into one flat list, so that every
# paragraph (a list of tokens) is treated as a separate document.
fomcminute_full_list = [
    paragraph
    for row_paragraphs in fomc['minutes_paragraphs']
    for paragraph in row_paragraphs
]

# Build the token <-> integer id mapping from all paragraphs.
ID2word = corpora.Dictionary(fomcminute_full_list)

# Bag-of-words encoding of every paragraph.
# (If a 'doc2bow' column has been added to `fomc`, the same corpus can be built
#  with list(chain.from_iterable(fomc['doc2bow'])) instead.)
corpus = list(map(ID2word.doc2bow, fomcminute_full_list))

# Fit the TF-IDF model on the bag-of-words corpus ...
TFIDF = models.TfidfModel(corpus)
# ... and apply it to that same corpus.
trans_TFIDF = TFIDF[corpus]


In [22]:
def apply_doc2bow(x, dictionary=None):
    """
    Convert a list of tokenised documents into Bag-of-Words (BoW) vectors.

    Each document in `x` is passed to the `doc2bow()` method of a Gensim-style
    dictionary and the results are collected in a list, in the same order as
    the input documents.

    Parameters:
        x (list): A list of documents, where each document is a list of tokens.
        dictionary (optional): Object exposing `doc2bow(document)`, e.g. a
            `gensim.corpora.Dictionary`. When omitted (or None) the
            notebook-level `ID2word` dictionary is used, which keeps the
            one-argument call `Series.apply(apply_doc2bow)` working unchanged.

    Returns:
        list: One BoW representation per input document. With a Gensim
              dictionary each representation is a list of
              (word_id, word_frequency) tuples. An empty `x` yields `[]`.

    Example:
        With a dictionary that maps apple -> 0, banana -> 1, orange -> 2 and
        grape -> 3, the input
        [
            ['apple', 'orange', 'banana'],
            ['apple', 'apple', 'grape', 'grape'],
            ['orange', 'orange', 'orange', 'apple']
        ]

        is converted to
        [
            [(0, 1), (1, 1), (2, 1)],
            [(0, 2), (3, 2)],
            [(0, 1), (2, 3)]
        ]

    Note:
        The same dictionary should be used for every step of the pipeline so
        that word ids stay consistent between the BoW corpus, the TF-IDF model
        and the LDA model.
        NOTE(review): this definition shadows the `apply_doc2bow` imported from
        `functions` earlier in the notebook -- keep only one of the two.
    """
    if dictionary is None:
        # Backward-compatible default: the dictionary built earlier in the notebook.
        dictionary = ID2word

    return [dictionary.doc2bow(document) for document in x]

In [23]:
# Encode the paragraphs of every row as bag-of-words vectors and store the result
# in a new 'doc2bow' column: each cell holds one BoW list per paragraph of that row.
fomc['doc2bow'] = fomc['minutes_paragraphs'].apply(apply_doc2bow)
fomc.head(3)

Unnamed: 0,minutes_paragraphs,paragraphs_length,minutes_text,text_length,doc2bow
1993-02-03,"[[meeting, federal, open, market, committee, h...","[14, 15, 24, 29, 12, 32, 37, 32, 14, 16, 82, 5...",meeting federal open market committee hold off...,4439,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ..."
1993-03-23,"[[meeting, federal, open, market, committee, h...","[12, 13, 64, 23, 24, 28, 60, 51, 64, 56, 100, ...",meeting federal open market committee hold off...,2790,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ..."
1993-05-18,"[[meeting, federal, open, market, committee, h...","[12, 26, 19, 25, 27, 62, 46, 54, 37, 89, 56, 6...",meeting federal open market committee hold off...,2355,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ..."


In [24]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; unlike time.time() it is not affected by system clock adjustments.
start_time = time.perf_counter()

SEED = 130      # Random seed passed to the model as `random_state`
NUM_topics = 6  # Number of topics to extract
ALPHA = 0.15    # Prior on the per-document topic distribution
ETA = 1.25      # Prior on the per-topic word distribution
PASSES = 100    # Number of passes over the corpus during training

# Train the LDA model on the TF-IDF-weighted corpus
lda_model = gensim.models.LdaMulticore(
    corpus=trans_TFIDF,
    num_topics=NUM_topics,
    id2word=ID2word,
    random_state=SEED,
    alpha=ALPHA,
    eta=ETA,
    passes=PASSES,
)

# Print the ten highest-weighted words of each topic learned from the training corpus
pprint(lda_model.print_topics(num_words=10))

end_time = time.perf_counter()
execution_time = end_time - start_time  # covers both training and printing, in seconds
print('\n')
print("Execution time:", execution_time, "seconds")

[(0,
  '0.000*"memory" + 0.000*"miss" + 0.000*"racial" + 0.000*"intellectual" + '
  '0.000*"leadership" + 0.000*"friend" + 0.000*"intellect" + 0.000*"colleague" '
  '+ 0.000*"work" + 0.000*"generous"'),
 (1,
  '0.012*"transaction" + 0.012*"system" + 0.010*"open" + 0.010*"vote" + '
  '0.010*"foreign" + 0.009*"operation" + 0.009*"account" + 0.009*"currency" + '
  '0.009*"committee" + 0.008*"security"'),
 (2,
  '0.009*"inflation" + 0.008*"participant" + 0.007*"policy" + 0.005*"economic" '
  '+ 0.005*"risk" + 0.005*"member" + 0.005*"committee" + 0.005*"fund" + '
  '0.005*"rate" + 0.004*"financial"'),
 (3,
  '0.007*"reserve" + 0.005*"seek" + 0.005*"sustainable" + 0.004*"stability" + '
  '0.004*"growth" + 0.004*"acceptable" + 0.004*"restraint" + 0.004*"objective" '
  '+ 0.003*"percent" + 0.003*"committee"'),
 (4,
  '0.007*"quarter" + 0.007*"consumer" + 0.006*"price" + 0.005*"spending" + '
  '0.005*"business" + 0.005*"month" + 0.005*"year" + 0.004*"growth" + '
  '0.004*"sale" + 0.004*"real"')

In [25]:
def topic_prob(x, model=None, tfidf=None, dictionary=None):
    """
    Get the topic probabilities of each document in a list using a trained LDA model.

    Every document (a list of tokens) goes through the same three steps that
    were used to prepare the training corpus:

        1. `dictionary.doc2bow(document)`  -> bag-of-words vector
        2. `tfidf[bow]`                    -> TF-IDF-weighted vector
        3. `model.get_document_topics(weighted, minimum_probability=0)`

    `minimum_probability=0` asks the model not to drop topics for falling below
    a probability threshold, so every document is expected to report a value
    for every topic.

    Parameters:
        x (list): A list of documents, where each document is a list of tokens.
        model (optional): Trained LDA model exposing `get_document_topics`.
            Defaults to the notebook-level `lda_model`.
        tfidf (optional): Fitted TF-IDF model supporting `tfidf[bow]`.
            Defaults to the notebook-level `TFIDF`.
        dictionary (optional): Dictionary exposing `doc2bow`.
            Defaults to the notebook-level `ID2word`.

    Returns:
        list: One topic distribution per input document, in input order. With
              a Gensim LDA model each distribution is a list of
              (topic_id, probability) tuples. An empty `x` yields `[]`.

    Example:
        For the three documents
        [
            ['apple', 'orange', 'banana'],
            ['apple', 'apple', 'grape', 'grape'],
            ['orange', 'orange', 'orange', 'apple']
        ]

        and an LDA model with 3 topics, the result has the following shape
        (the probabilities are illustrative only):
        [
            [(0, 0.15), (1, 0.8), (2, 0.05)],
            [(0, 0.4), (1, 0.1), (2, 0.5)],
            [(0, 0.05), (1, 0.9), (2, 0.05)]
        ]

    Note:
        The dictionary and TF-IDF model must be the ones the LDA model was
        trained with; otherwise word ids and weights will not match what the
        model has learned.
    """
    # Fall back to the objects created earlier in the notebook, so the
    # one-argument call used with `Series.apply(topic_prob)` keeps working.
    model = lda_model if model is None else model
    tfidf = TFIDF if tfidf is None else tfidf
    dictionary = ID2word if dictionary is None else dictionary

    return [
        model.get_document_topics(tfidf[dictionary.doc2bow(document)], minimum_probability=0)
        for document in x
    ]


In [26]:
# Compute the topic distribution of every paragraph of every row and store it in a new
# 'topic_prob' column: each cell holds one list of (topic_id, probability) pairs per paragraph.
fomc['topic_prob'] = fomc['minutes_paragraphs'].apply(topic_prob)
fomc.head(3)

Unnamed: 0,minutes_paragraphs,paragraphs_length,minutes_text,text_length,doc2bow,topic_prob
1993-02-03,"[[meeting, federal, open, market, committee, h...","[14, 15, 24, 29, 12, 32, 37, 32, 14, 16, 82, 5...",meeting federal open market committee hold off...,4439,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[(0, 0.0355354), (1, 0.035726767), (2, 0.0356..."
1993-03-23,"[[meeting, federal, open, market, committee, h...","[12, 13, 64, 23, 24, 28, 60, 51, 64, 56, 100, ...",meeting federal open market committee hold off...,2790,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ...","[[(0, 0.037205704), (1, 0.03742855), (2, 0.037..."
1993-05-18,"[[meeting, federal, open, market, committee, h...","[12, 26, 19, 25, 27, 62, 46, 54, 37, 89, 56, 6...",meeting federal open market committee hold off...,2355,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ...","[[(0, 0.037205704), (1, 0.037428267), (2, 0.03..."


In [27]:
dic = {}
for index in fomc.index:
    sum_dict = {}
    for i in range(len(fomc.loc[index, 'topic_prob'])):
        doc_length = fomc.loc[index, 'text_length']
        para_length = fomc.loc[index, 'paragraphs_length'][i]
        for tup in fomc.loc[index, 'topic_prob'][i]:
    #         print(tup[1]/fomc.loc[index, 'text_length']*fomc.loc[index, 'paragraphs_length'][i])
            # for tup in flattened_list:
            idx, val = tup
            if idx not in sum_dict:
                sum_dict[idx] = val/doc_length*para_length
            else:
                sum_dict[idx] += val/doc_length*para_length
    dic[index]=sum_dict

fomc['topic_score'] = dic

In [28]:
# Preview the first three rows to check the newly added 'topic_score' column.
fomc.head(3)

Unnamed: 0,minutes_paragraphs,paragraphs_length,minutes_text,text_length,doc2bow,topic_prob,topic_score
1993-02-03,"[[meeting, federal, open, market, committee, h...","[14, 15, 24, 29, 12, 32, 37, 32, 14, 16, 82, 5...",meeting federal open market committee hold off...,4439,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[(0, 0.0355354), (1, 0.035726767), (2, 0.0356...","{0: 0.021366666452961276, 1: 0.249446655157428..."
1993-03-23,"[[meeting, federal, open, market, committee, h...","[12, 13, 64, 23, 24, 28, 60, 51, 64, 56, 100, ...",meeting federal open market committee hold off...,2790,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ...","[[(0, 0.037205704), (1, 0.03742855), (2, 0.037...","{0: 0.019202715935899905, 1: 0.071071036395326..."
1993-05-18,"[[meeting, federal, open, market, committee, h...","[12, 26, 19, 25, 27, 62, 46, 54, 37, 89, 56, 6...",meeting federal open market committee hold off...,2355,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ...","[[(0, 0.037205704), (1, 0.037428267), (2, 0.03...","{0: 0.02022220298228687, 1: 0.0590723693616008..."


In [29]:
# Turn {row label: {topic_id: score}} into a frame with one row per document
# and one column per topic id.
topics = pd.DataFrame(dic).transpose()

# Derive the 1-based column labels from the topic ids instead of hard-coding six
# names, so the cell does not fail with a length mismatch when the number of
# topics changes. With topic ids 0..5 this yields 'topic 1' ... 'topic 6'.
topics.columns = ['topic {}'.format(topic_id + 1) for topic_id in topics.columns]
topics.head()

Unnamed: 0,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6
1993-02-03,0.021367,0.249447,0.285851,0.090775,0.317266,0.035295
1993-03-23,0.019203,0.071071,0.389098,0.092606,0.405476,0.022546
1993-05-18,0.020222,0.059072,0.359659,0.10695,0.429912,0.024184
1993-07-07,0.02019,0.057558,0.407705,0.130867,0.360023,0.023656
1993-08-17,0.020382,0.057332,0.281115,0.125689,0.484803,0.030679


In [30]:
# Append the per-topic score columns to the main frame (rows are aligned on the index).
# Topic columns left over from a previous run of this cell are dropped first, so
# re-running the cell does not append duplicate columns; on a first run the drop
# is a no-op because of errors='ignore'.
fomc = pd.concat([fomc.drop(columns=topics.columns, errors='ignore'), topics], axis=1)
fomc.head(3)

Unnamed: 0,minutes_paragraphs,paragraphs_length,minutes_text,text_length,doc2bow,topic_prob,topic_score,topic 1,topic 2,topic 3,topic 4,topic 5,topic 6
1993-02-03,"[[meeting, federal, open, market, committee, h...","[14, 15, 24, 29, 12, 32, 37, 32, 14, 16, 82, 5...",meeting federal open market committee hold off...,4439,"[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, ...","[[(0, 0.0355354), (1, 0.035726767), (2, 0.0356...","{0: 0.021366666452961276, 1: 0.249446655157428...",0.021367,0.249447,0.285851,0.090775,0.317266,0.035295
1993-03-23,"[[meeting, federal, open, market, committee, h...","[12, 13, 64, 23, 24, 28, 60, 51, 64, 56, 100, ...",meeting federal open market committee hold off...,2790,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ...","[[(0, 0.037205704), (1, 0.03742855), (2, 0.037...","{0: 0.019202715935899905, 1: 0.071071036395326...",0.019203,0.071071,0.389098,0.092606,0.405476,0.022546
1993-05-18,"[[meeting, federal, open, market, committee, h...","[12, 26, 19, 25, 27, 62, 46, 54, 37, 89, 56, 6...",meeting federal open market committee hold off...,2355,"[[(0, 1), (1, 1), (2, 1), (4, 1), (5, 1), (6, ...","[[(0, 0.037205704), (1, 0.037428267), (2, 0.03...","{0: 0.02022220298228687, 1: 0.0590723693616008...",0.020222,0.059072,0.359659,0.10695,0.429912,0.024184


In [31]:
# Write the enriched frame (original columns plus the bag-of-words, topic probability,
# topic score and per-topic columns added above) to disk as a pickle file.
fomc.to_pickle('../data/fomc_topic_modeling.pkl')