In [1]:
#Base Libraries
import os 
import sys
import json
import csv

#Core Libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pygwalker as pyg
import datetime as dt
#Model Libraries
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
#import umap.plot
from sentence_transformers import SentenceTransformer

#ML Libraries
import torch 


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:

#Importing Configs
# Switch into the repository that holds config.py and remember that location.
# NOTE(review): the repository path is hardcoded to a single machine; consider
# making it configurable (environment variable or a path relative to the notebook).
os.chdir('/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling')
config_file_path = os.getcwd()
print(config_file_path)

# Make config.py importable. The membership check keeps sys.path from growing
# by one duplicate entry every time this cell is re-run.
if config_file_path not in sys.path:
    sys.path.append(config_file_path)

import config

#Configs
# Settings read from config.py.
database_file = config.database
database_folder = config.database_folder
bert_models = config.bert_models
bert_models_local = config.bert_models_local

/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling
/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling
/Users/kylenabors/Documents


In [3]:
# Choose the issuing body and the document collection to analyse.
Body = 'Fed'
Model = 'Monetary Policy Report'
Model_Subfolder = f'/{Body} Texts/{Model}'
Model_Folder = config.texts + Model_Subfolder
print(Model_Folder)

#-------------------------------------------------------------------------------------------------------------------
# Read the text segments and pull out the columns used by the cells below.
df = pd.read_csv(f"{Model_Folder}/{Model}_texts.csv")
docs = df["segment"].tolist()
timestamps = df['date'].tolist()
# NOTE(review): `type` shadows the Python builtin. The name is kept because a
# later cell passes it to topics_per_class(classes=type).
type = df['type'].tolist()

#Initial Model
#topic_model = torch.load(f"{bert_models_local}/topic_model_{Model}.pt")

#Edited Model
# NOTE(review): torch.load unpickles the file, so only load model files you trust.
edited_model_path = f"{bert_models_local}/topic_model_{Model}_edited.pt"
topic_model_policy = torch.load(edited_model_path)
print("Done Loading Model")

# Give the first five topics human-readable names; the remaining topics keep
# their existing labels (see the displayed custom_labels_ list).
policy_topic_labels = {0: "Inflation", 1: "Bank", 2: "Employment", 3: "Spending", 4: "Uncertainty"}
topic_model_policy.set_topic_labels(policy_topic_labels)
topic_model_policy.custom_labels_


/Users/kylenabors/Documents/Database/Models/Texts/Fed Texts/Monetary Policy Report
Thu Sep 21 13:53:55 2023 Building and compiling search function
Done Loading Model


['-1, credit, unemployme',
 'Inflation',
 'Bank',
 'Employment',
 'Spending',
 'Uncertainty',
 '5, bureau, department',
 '6, fiscal, tax',
 '7, dow, indices',
 '8, data, weekly',
 '9, housing, mortgage',
 '10, oil, crude',
 '11, historical, low',
 '12, committee, achieve',
 '13, summary, gap',
 '14, japan, canada',
 '15, saving, nonfederal',
 '16, pandemic, population',
 '17, number, range',
 '18, percentage, net',
 '19, survey, loan',
 '20, table, gap',
 '21, repurchase, reverse',
 '22, recession, shaded',
 '23, delinquenc, insurance',
 '24, quarter, conjunctio',
 '25, census, population',
 '26, yield, lynch',
 '27, paper, unsecured',
 '28, michigan, conference',
 '29, equity, eme',
 '30, receipts, trust',
 '31, index, change',
 '32, uncertain, informed',
 '33, middle, median',
 '34, preceding, period',
 '35, speaker, senate',
 '36, likelihood, potential',
 '37, reaffirm, organizati',
 '38, swap, draw',
 '39, nonprofit, nonfarm',
 '40, foreign, internatio',
 '41, interval, confidence'

In [4]:
# Topic frequencies over time, with the timestamps grouped into at most 100 bins.
topics_over_time = topic_model_policy.topics_over_time(docs, timestamps, nr_bins=100)

# Keep the four columns of interest and reduce each timestamp to a daily period.
tot_columns = ['Topic', 'Words', 'Frequency', 'Timestamp']
df_tot = pd.DataFrame(topics_over_time, columns=tot_columns).assign(
    Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], format='%Y-%m-%d').dt.to_period('D')
)

# Persist the table (row index included) next to the local models.
df_tot.to_csv(f"{bert_models_local}/tot.csv", index=True)

34it [01:10,  2.06s/it]


In [5]:

# Ids of the five labelled topics (Inflation, Bank, Employment, Spending, Uncertainty).
a = [0, 1, 2, 3, 4]

# Drop the keyword column and keep only the labelled topics.
# errors='ignore' plus reassignment (rather than inplace=True) lets this cell be
# re-run without a KeyError once 'Words' has already been removed.
df_tot = df_tot.drop(columns=['Words'], errors='ignore')
df_tot = df_tot[df_tot['Topic'].isin(a)]
print(df_tot.head())

# One row per timestamp, one column per topic, cell value = frequency.
df_tpt = df_tot.pivot_table(index='Timestamp', columns='Topic', values='Frequency')

# Replace the numeric topic ids with their readable labels.
df_tpt = df_tpt.rename(columns={0: "Inflation", 1: "Bank", 2: "Employment", 3: "Spending", 4: "Uncertainty"})
print(df_tpt.head())

   Topic  Frequency   Timestamp
1      0         46  2007-02-08
2      1         68  2007-02-08
3      2         27  2007-02-08
4      3         72  2007-02-08
5      4          2  2007-02-08
Topic       Inflation   Bank  Employment  Spending  Uncertainty
Timestamp                                                      
2007-02-08       46.0   68.0        27.0      72.0          2.0
2007-06-13       65.0   58.0        25.0      75.0          2.0
2008-02-06       92.0  104.0        22.0      73.0         13.0
2008-06-05       88.0   98.0        32.0      59.0         21.0
2009-01-29      137.0  112.0        29.0      40.0         21.0


In [6]:
# Turn the 'Timestamp' index into a regular column.
# The guard makes the cell idempotent: an unconditional reset_index would insert
# a stray 'index' column on a second run and raise on a third.
if 'Timestamp' not in df_tpt.columns:
    df_tpt = df_tpt.reset_index()
print(df_tpt.head())

Topic   Timestamp  Inflation   Bank  Employment  Spending  Uncertainty
0      2007-02-08       46.0   68.0        27.0      72.0          2.0
1      2007-06-13       65.0   58.0        25.0      75.0          2.0
2      2008-02-06       92.0  104.0        22.0      73.0         13.0
3      2008-06-05       88.0   98.0        32.0      59.0         21.0
4      2009-01-29      137.0  112.0        29.0      40.0         21.0


In [7]:
# Peek at the workbook's default sheet (no sheet_name given).
# NOTE(review): absolute, machine-specific path.
fed_funds_workbook = '/Users/kylenabors/Documents/Database/Other Data/FedFundsRate.xlsx'
funds = pd.read_excel(fed_funds_workbook)
print(funds.head())

                 Data List: FedFundsRate
0               Data Updated: 2023-07-03
1                                    NaN
2   FRED (Federal Reserve Economic Data)
3      Link: https://fred.stlouisfed.org
4  Help: https://fredhelp.stlouisfed.org


In [8]:
# Build a monthly-keyed copy of the topic table.
#
# df_tpt['Timestamp'] starts out as a daily Period column and is converted to
# real timestamps here. The dtype check makes the cell safe to re-run: on a
# second run the column is already datetime, where `.dt.to_timestamp` does not
# exist and the unguarded call would raise.
report_dates = df_tpt['Timestamp']
if isinstance(report_dates.dtype, pd.PeriodDtype):
    report_dates = report_dates.dt.to_timestamp(freq='D')
    df_tpt['Timestamp'] = report_dates

# Independent copy whose key is the calendar month of each report date.
df_tpt_m = df_tpt.copy(deep=True)
df_tpt_m['Timestamp'] = report_dates.dt.to_period('M')
print(df_tpt_m.head())

Topic Timestamp  Inflation   Bank  Employment  Spending  Uncertainty
0       2007-02       46.0   68.0        27.0      72.0          2.0
1       2007-06       65.0   58.0        25.0      75.0          2.0
2       2008-02       92.0  104.0        22.0      73.0         13.0
3       2008-06       88.0   98.0        32.0      59.0         21.0
4       2009-01      137.0  112.0        29.0      40.0         21.0


In [9]:
# Load the US energy CPI series.
# NOTE(review): absolute, machine-specific path.
energy_cpi_csv = '/Users/kylenabors/Documents/Database/Other Data/Energy Prices/US Energy CPI.csv'
energy = pd.read_csv(energy_cpi_csv)
print(energy.head())
print(df_tpt_m.head())

# Express the CPI dates as monthly periods so they line up with df_tpt_m['Timestamp'].
energy_month = pd.to_datetime(energy["DATE"]).dt.to_period('M')
energy["DATE"] = energy_month
print(energy.head())

# Preview only: the joined frame is displayed but not stored, so df_tpt_m is unchanged.
pd.merge(df_tpt_m, energy, how='left', left_on='Timestamp', right_on='DATE')

         DATE  CPIENGSL
0  2000-01-01     115.0
1  2000-02-01     118.8
2  2000-03-01     124.3
3  2000-04-01     120.9
4  2000-05-01     120.0
Topic Timestamp  Inflation   Bank  Employment  Spending  Uncertainty
0       2007-02       46.0   68.0        27.0      72.0          2.0
1       2007-06       65.0   58.0        25.0      75.0          2.0
2       2008-02       92.0  104.0        22.0      73.0         13.0
3       2008-06       88.0   98.0        32.0      59.0         21.0
4       2009-01      137.0  112.0        29.0      40.0         21.0
      DATE  CPIENGSL
0  2000-01     115.0
1  2000-02     118.8
2  2000-03     124.3
3  2000-04     120.9
4  2000-05     120.0


Unnamed: 0,Timestamp,Inflation,Bank,Employment,Spending,Uncertainty,DATE,CPIENGSL
0,2007-02,46.0,68.0,27.0,72.0,2.0,2007-02,192.31
1,2007-06,65.0,58.0,25.0,75.0,2.0,2007-06,209.799
2,2008-02,92.0,104.0,22.0,73.0,13.0,2008-02,229.731
3,2008-06,88.0,98.0,32.0,59.0,21.0,2008-06,262.081
4,2009-01,137.0,112.0,29.0,40.0,21.0,2009-01,178.661
5,2009-05,126.0,109.0,43.0,36.0,16.0,2009-05,179.83
6,2010-01,137.0,96.0,26.0,51.0,17.0,2010-01,212.807
7,2010-07,102.0,83.0,26.0,48.0,16.0,2010-07,206.877
8,2011-01,15.0,18.0,3.0,6.0,3.0,2011-01,229.258
9,2011-05,17.0,11.0,2.0,6.0,2.0,2011-05,250.744


In [10]:
# Read the 'Monthly' sheet of the fed funds workbook.
# NOTE(review): absolute, machine-specific path.
funds = pd.read_excel(
    '/Users/kylenabors/Documents/Database/Other Data/FedFundsRate.xlsx',
    sheet_name='Monthly',
)

# Convert the join key to monthly periods to match df_tpt_m['Timestamp'].
funds_month = funds['Date Adjusted'].dt.to_period(freq='M')
funds['Date Adjusted'] = funds_month

# Left join keeps every topic row.
# NOTE(review): df_tpt_m is overwritten with its own merge result, so running
# this cell twice merges the funds columns a second time.
df_tpt_m = pd.merge(df_tpt_m, funds, how='left', left_on='Timestamp', right_on='Date Adjusted')

In [11]:
# Remove the date columns that are redundant next to 'Timestamp'
# ('Date Adjusted' is the merge key; 'DATE' presumably comes from the funds sheet - verify).
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# re-runnable: a second run no longer raises KeyError for already-dropped columns.
df_tpt_m = df_tpt_m.drop(columns=['DATE', 'Date Adjusted'], errors='ignore')

In [12]:

# Row-over-row changes, computed on an independent copy so df_tpt_m keeps its levels.
# Rows are individual reports, so each difference is relative to the previous
# report rather than the previous calendar month.
change_df_tpt_m = df_tpt_m.copy(deep=True)

# Topic frequencies are replaced by their differences ...
for topic_column in ('Inflation', 'Bank', 'Employment', 'Spending', 'Uncertainty'):
    change_df_tpt_m[topic_column] = change_df_tpt_m[topic_column].diff()

# ... while the funds rate keeps its level and gets a separate change column.
change_df_tpt_m['FEDFUNDS_Change'] = change_df_tpt_m['FEDFUNDS'].diff()
print(change_df_tpt_m.head())


  Timestamp  Inflation  Bank  Employment  Spending  Uncertainty  FEDFUNDS  \
0   2007-02        NaN   NaN         NaN       NaN          NaN      5.26   
1   2007-06       19.0 -10.0        -2.0       3.0          0.0      5.25   
2   2008-02       27.0  46.0        -3.0      -2.0         11.0      2.98   
3   2008-06       -4.0  -6.0        10.0     -14.0          8.0      2.00   
4   2009-01       49.0  14.0        -3.0     -19.0          0.0      0.15   

   FEDFUNDS_Change  
0              NaN  
1            -0.01  
2            -2.27  
3            -0.98  
4            -1.85  


In [13]:
# Topic frequencies broken down by the document class stored in `type`.
topics_per_class = topic_model_policy.topics_per_class(docs, classes=type)
print(topics_per_class)

# Save topic-terms barcharts as HTML file
barchart_figure = topic_model_policy.visualize_barchart(
    top_n_topics=100,
    n_words=8,
    custom_labels=True,
)
barchart_figure.write_html(f"{bert_models}/barchart.html")

1it [00:03,  3.17s/it]


    Topic                                              Words  Frequency  \
0      -1  credit, unemployment, interest, mortgage, trea...       5162   
1       0        inflation, monetary, funds, reserve, target       4063   
2       1        treasury, bond, equity, liquidity, leverage       2715   
3       2  labor, unemployment, employment, productivity,...       1370   
4       3       domestic, spending, consumer, growth, income       1256   
5       4  uncertainty, broadly, forecast, confidence, we...        688   
6       5  bureau, department, statistics, confidence, nu...        605   
7       6               fiscal, tax, receipts, debt, deficit        560   
8       7             dow, indices, exchange, foreign, index        401   
9       8        data, weekly, seasonally, quarterly, series        389   
10      9         housing, mortgage, property, index, wealth        364   
11     10                oil, crude, barrel, brent, drilling        307   
12     11            hist

In [14]:

# Keep the four columns of interest from the per-class table and write them to CSV.
tpc_columns = ['Topic', 'Words', 'Frequency', 'Class']
df_tpc = pd.DataFrame(topics_per_class, columns=tpc_columns)
df_tpc.to_csv(f"{bert_models_local}/tpc.csv", index=True)

In [15]:
# Show the unfiltered topics-over-time table (every topic, including outlier topic -1).
print(topics_over_time)

      Topic                                              Words  Frequency  \
0        -1             quarters, pace, nominal, housing, rise        146   
1         0           congress, core, monetary, reserve, index         46   
2         1            treasury, corporate, bond, credit, debt         68   
3         2  quarters, labor, productivity, compensation, e...         27   
4         3       equipment, consumer, spending, trucks, goods         72   
...     ...                                                ...        ...   
1453     52      submit, participant, funds, execution, signal          1   
1454     55                shaded, key, top, sum, transparency          3   
1455     56            spline, premium, basis, june, plication          1   
1456     57  truncation, accommodation, negative, likelihoo...          2   
1457     58      ice, accommodation, data, municipal, monetary          1   

                   Timestamp  
0    2007-02-08 00:48:57.600  
1    2007-02-

In [16]:
# Class label of every row in the per-class table, then the distinct labels.
categories_all = df_tpc['Class'].tolist()
categories = [*set(categories_all)]
print(categories)

# Mapping of topic id -> list of (word, score) pairs from the fitted model.
topics = topic_model_policy.get_topics()
print(topics)

['Monetary Policy Report']
{-1: [('credit', 0.008274627330106829), ('unemployment', 0.006990150427370995), ('interest', 0.006903574921544661), ('mortgage', 0.006481420471296709), ('treasury', 0.006365028988668769), ('debt', 0.006292807761505489), ('pace', 0.00624982590622122), ('funds', 0.00620222935244625), ('market', 0.0059237125172004945), ('increase', 0.005802991676597589)], 0: [('inflation', 0.014334688814011192), ('monetary', 0.012721615077305687), ('funds', 0.012509374457514886), ('reserve', 0.01200128233719165), ('target', 0.010021794846927012), ('balance', 0.00966137266462636), ('interest', 0.008081367753120251), ('bank', 0.007200492121108862), ('food', 0.007077615983168302), ('consumption', 0.006918719618350988)], 1: [('treasury', 0.02075290827982679), ('bond', 0.020346671131809952), ('equity', 0.015911988699116482), ('liquidity', 0.015753480748314053), ('leverage', 0.01193931650838103), ('debt', 0.01181041046280381), ('capital', 0.010861553402113044), ('banking', 0.010130715

In [17]:

# Wide table: one row per timestamp, one column per topic id, gaps filled with 0.
pivot_df_tot = (
    df_tot
    .pivot(index='Timestamp', columns='Topic', values='Frequency')
    .fillna(0)
)
print(pivot_df_tot.head())

# Persist the wide table, keeping the Timestamp index in the file.
pivot_df_tot.to_csv(f"{bert_models_local}/pivot_df_tot.csv", index=True)

Topic           0      1     2     3     4
Timestamp                                 
2007-02-08   46.0   68.0  27.0  72.0   2.0
2007-06-13   65.0   58.0  25.0  75.0   2.0
2008-02-06   92.0  104.0  22.0  73.0  13.0
2008-06-05   88.0   98.0  32.0  59.0  21.0
2009-01-29  137.0  112.0  29.0  40.0  21.0


In [18]:
# Export the merged monthly levels and their differenced counterpart.
for export_frame, export_path in (
    (df_tpt_m, f"{bert_models_local}/tpt monthly merged.csv"),
    (change_df_tpt_m, f"{bert_models_local}/tpt change monthly merged.csv"),
):
    export_frame.to_csv(export_path, index=True)

In [19]:
# Open the merged monthly table in PyGWalker for interactive exploration.
gwalker = pyg.walk(df_tpt_m)

Box(children=(HTML(value='<div id="ifr-pyg-0" style="height: auto">\n    <head>\n        <meta http-equiv="Con…

In [21]:
# Open the differenced table in PyGWalker; this rebinds `gwalker` to the new walker.
gwalker = pyg.walk(change_df_tpt_m)

Box(children=(HTML(value='<div id="ifr-pyg-2" style="height: auto">\n    <head>\n        <meta http-equiv="Con…