In [None]:
#Base Libraries
import os 
import sys
import json
import csv

#Core Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pygwalker as pyg
import datetime as dt
#Model Libraries
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from sentence_transformers import SentenceTransformer

#ML Libraries
import torch 


In [None]:

#Importing Configs
# Define the path where config.py is located
os.chdir('/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling')
config_file_path = os.getcwd()
print(config_file_path)

# Add this path to the sys.path
sys.path.append(config_file_path)

import config

#Configs
database_file = config.database
database_folder = config.database_folder
bert_models = config.bert_models
bert_models_local = config.bert_models_local

In [None]:
df = pd.read_csv("/Users/kylenabors/Documents/Database/Training Data/Fed/Mass Fed Data/fomc_documents.csv")

# Specify the year and month you want to start and end processing files from
start_year_month_day = '2006-12-31'
end_year_month_day = '2023-12-31'
df = df[df['meeting_date'] >= start_year_month_day]
df = df[df['meeting_date'] <= end_year_month_day]

docs = df["text"].to_list()
timestamps = df['meeting_date'].to_list()
type = df['document_kind'].to_list()

#BERT Models
topic_model_BB = torch.load(f"{bert_models_local}/topic_model_fed_BB.pt")
topic_model_all = torch.load(f"{bert_models_local}/topic_model_fed_all.pt")

In [None]:
topics_over_time = topic_model_all.topics_over_time(docs, timestamps, nr_bins=100)

df_tot = pd.DataFrame(topics_over_time, columns=['Topic', 'Words', 'Frequency', 'Timestamp'])
df_tot['Timestamp'] = pd.to_datetime(df_tot['Timestamp'], format='%Y-%m-%d').dt.to_period('D')
df_tot.to_csv(f"{bert_models_local}/tot.csv", index=True)

In [None]:
topics_per_class = topic_model_all.topics_per_class(docs, classes = type)
print(topics_per_class)

df_tpc = pd.DataFrame(topics_per_class, columns=['Topic', 'Words', 'Frequency', 'Class'])
df_tpc.to_csv(f"{bert_models_local}/tpc.csv", index=True)

In [23]:
print(topics_over_time)

     Topic                                              Words  Frequency  \
0        0       would, chairman, inflation, forecast, policy          4   
1        1                      percent, 2007, q4, 2005, 2000          4   
2        2  committee, inflation, moderate, voting, pressures          2   
3        3        funds, rate, federal, inflation, securities          2   
4        5      open, action, confirmation, market, committee          1   
..     ...                                                ...        ...   
813      0   inflation, monetary, market, committee, remained          1   
814      6       activity, demand, continued, district, sales          2   
815      7     reserve, federal, agency, transactions, coupon          2   
816     14             powell, think, inflation, going, smith          2   
817     21      committee, email, monetary, inflation, policy          2   

              Timestamp  
0   2007-01-24 23:31:12  
1   2007-01-24 23:31:12  
2   2007-

In [29]:
categories_all = df_tpc['Class'].to_list()
categories = list(set(categories_all))
print(categories)
topics = topic_model_all.get_topics()
print(topics)

['statement_on_longer_run_goals_and_monetary_policy_strategy', 'agenda', 'tealbook_b', 'greenbook_part_two', 'memos', 'minutes', 'bluebook', 'addendum_to_the_policy_normalization_principles_and_plans', 'statement', 'sep_participant_key', 'presentation_materials', 'tealbook_a', 'transcript', 'accessible_material', 'related_current_faqs', 'policy_normalization_principles_and_plans', 'accessible_version', 'press_conference', 'sep_individual_projections', 'greenbook_part_one', 'greenbook_supplement', 'implementation_note', 'beige_book']
{-1: [('tealbook', 0.019951591913217336), ('percent', 0.017696477692526986), ('2011', 0.013827413085096538), ('rate', 0.01343436351637362), ('gdp', 0.012961347400814734), ('inflation', 0.011252880681693455), ('market', 0.0100266714179151), ('economic', 0.010010954127241518), ('treasury', 0.00946476306272693), ('prices', 0.009248623903862895)], 0: [('inflation', 0.012970657793834538), ('chairman', 0.012690541899607553), ('policy', 0.011732607014438274), ('ra

In [30]:

TEST = pd.DataFrame(columns=['Topic', 'Words', 'Frequency', 'Timestamp', 'Class'])
for category in categories:
    documents = pd.DataFrame({"Document": docs,
                          "ID": range(len(docs)),
                          "Topic": topics,
                          "Category": categories_all})
    
    subset = documents.loc[documents.Category == f'{category}', :]
    subset_labels = sorted(list(subset.Topic.unique()))
    # First, we group the documents per topic
    documents_per_topic = subset.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})

    # Then we calculate the c-TF-IDF representation but we do not fit this method 
    # as it was already fitted on the entire dataset
    topic_model_all.c_tf_idf, words = topic_model_all._c_tf_idf(documents_per_topic, fit=False)

    # Lastly, we extract the words per topic based on the subset_labels,
    # and we update the topic size to correspond with the subset
    topic_model_all.topics = topic_model_all._extract_words_per_topic(words, labels=subset_labels)
    topic_model_all._create_topic_vectors()
    topic_model_all.topic_names = {key: f"{key}_" + "_".join([word[0] for word in values[:4]])
                        for key, values in
                        topic_model_all.topics.items()}
    topic_model._update_topic_size(subset)

ValueError: All arrays must be of the same length

In [26]:

TEST = pd.DataFrame(columns=['Topic', 'Words', 'Frequency', 'Timestamp', 'Class'])
for category in categories:
    
    sub_df = df[df['document_kind'] == f'{category}']
    # Specify the year and month you want to start and end processing files from
    start_year_month_day = '2006-12-31'
    end_year_month_day = '2023-12-31'
    sub_df = sub_df[sub_df['meeting_date'] >= start_year_month_day]
    sub_df = sub_df[sub_df['meeting_date'] <= end_year_month_day]

    sub_docs = sub_df["text"].to_list()
    sub_timestamps = sub_df['meeting_date'].to_list()
    sub_type = sub_df['document_kind'].to_list()
    topic_model_all.update_topics(sub_docs, sub_timestamps, sub_type)
    topics_over_time = topic_model_all.topics_over_time(sub_docs, sub_timestamps, nr_bins=100)


ValueError: All arrays must be of the same length

In [21]:

      
pivot_df_tot = df_tot.pivot(index='Timestamp', columns='Topic', values='Frequency')

pivot_df_tot = pivot_df_tot.fillna(0)
print(pivot_df_tot.head())

pivot_df_tot.to_csv(f"{bert_models_local}/pivot_df_tot.csv", index=True)


Topic       -1    0    1    2    3    4    5    6    7    8   ...   21   22  \
Timestamp                                                     ...             
2007-01-24  0.0  4.0  4.0  2.0  2.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  1.0   
2007-04-01  1.0  2.0  2.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0   
2007-05-31  0.0  2.0  2.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0   
2007-07-30  0.0  6.0  4.0  4.0  2.0  0.0  2.0  0.0  0.0  0.0  ...  0.0  0.0   
2007-09-28  0.0  2.0  2.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  ...  0.0  0.0   

Topic        23   24   25   26   27   28   29   30  
Timestamp                                           
2007-01-24  2.0  0.0  0.0  0.0  0.0  0.0  0.0  2.0  
2007-04-01  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
2007-05-31  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  
2007-07-30  2.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  
2007-09-28  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  

[5 rows x 32 columns]


In [None]:
gwalker = pyg.walk(df_tot)

In [None]:
gwalker = pyg.walk(df_tpc)