In [10]:
import os 
import sys
import pandas as pd
import numpy as np

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration
from bertopic.vectorizers import ClassTfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import torch 
import nltk
import spacy

In [11]:
# NLTK English stopwords
nlp = spacy.load("en_core_web_lg")
nltk.download('stopwords') 
stopwords_en = nltk.corpus.stopwords.words('english')
stopwords_sp = nltk.corpus.stopwords.words('spanish')
stopwords_fr = nltk.corpus.stopwords.words('french')
stopwords_it = nltk.corpus.stopwords.words('italian')
stopwords = stopwords_en + stopwords_sp + stopwords_fr + stopwords_it


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kylenabors/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:

#Importing Configs
# Define the path where config.py is located
os.chdir('/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling')
config_file_path = os.getcwd()
print(config_file_path)

# Add this path to the sys.path
sys.path.append(config_file_path)

import config

#Variables, Paramaters, and Pathnames needed for this script
database_file = config.database
database_folder = config.database_folder
bert_models = config.bert_models
bert_models_local = config.bert_models_local
keywords = config.keywords
Model_Folder_Texts = config.texts

#ECB THEN FED!!!!!!!!!!

/Users/kylenabors/Documents/GitHub/Finance-ML-Modeling


In [13]:
Body = config.Body
Model = config.Model
Model_Subfolder = f'/{Body} Texts/{Model}'

Model_Folder = Model_Folder_Texts + Model_Subfolder

df = pd.read_csv(f"{Model_Folder}/{Model}_texts.csv")  
docs = df["segment"].to_list()
timestamps = df['date'].to_list()
type = df['type'].to_list()

In [14]:

Body_2 = config.Body_2
Model_2 = config.Model_2
Model_Subfolder_2 = f'/{Body_2} Texts/{Model_2}'
Model_Folder_2 = Model_Folder_Texts + Model_Subfolder_2

df_2 = pd.read_csv(f"{Model_Folder_2}/{Model_2}_texts.csv")  
docs_2 = df_2["segment"].to_list()
timestamps_2 = df_2['date'].to_list()
type_2 = df_2['type'].to_list()

In [15]:
Body_Cross = Body + '_' + Body_2
Model_Cross = Model + '_' + Model_2

Model_Subfolder_Cross = f'/{Body_Cross} Texts/{Model_Cross}'
Model_Folder_Cross = Model_Folder_Texts + Model_Subfolder_Cross

In [16]:
df_all = pd.concat([df, df_2])
print(df_all.columns)
docs_all = df_all["segment"].to_list()
timestamps_all = df_all['date'].to_list()
type_all = df_all['type'].to_list()

Index(['Unnamed: 0', 'date', 'type', 'segment'], dtype='object')


In [17]:
# Embedding
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
#embeddings = embedding_model.encode(docs, batch_size=20, show_progress_bar=True)

#Reduce Dimensionality
umap_model = UMAP(n_neighbors=5, 
                  n_components=2, 
                  metric='cosine', 
                  n_epochs=500,
                  min_dist=0.0, 
                  target_metric_kwds=keywords, 
                  target_weight=0.95, 
                  verbose=True)

# Clustering model
cluster_model = HDBSCAN(min_cluster_size = 10, 
                        min_samples=10,
                        metric = 'euclidean', 
                        cluster_selection_method = 'eom', 
                        prediction_data = True)

#Representation model
representation_model = MaximalMarginalRelevance(diversity=0.4)

#Create UMAP model
vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1, 3))

ctfidf_model = ClassTfidfTransformer(bm25_weighting=True)

print("Done Preprocessing Data")

Done Preprocessing Data


In [18]:
topic_model = BERTopic(language= 'english',
                       min_topic_size=15,
                       n_gram_range=(1, 3),
                       nr_topics = 64,
                       embedding_model=embedding_model,
                       umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model,
                       verbose=True
                       ).fit(docs)

Batches:   0%|          | 0/7069 [00:00<?, ?it/s]

2023-10-11 12:38:18,208 - BERTopic - Transformed documents to Embeddings


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_epochs=500, n_neighbors=5, target_metric_kwds=['interest', 'inflation', 'unemployment', 'credit', 'market', 'trade', 'energy'], target_weight=0.95, verbose=True)
Wed Oct 11 12:38:18 2023 Construct fuzzy simplicial set
Wed Oct 11 12:38:18 2023 Finding Nearest Neighbors
Wed Oct 11 12:38:18 2023 Building RP forest with 29 trees
Wed Oct 11 12:38:23 2023 NN descent for 18 iterations
	 1  /  18
	 2  /  18
	 3  /  18
	 4  /  18
	 5  /  18
	 6  /  18
	Stopping threshold met -- exiting after 6 iterations
Wed Oct 11 12:40:06 2023 Finished Nearest Neighbor Search
Wed Oct 11 12:40:07 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Wed Oct 11 12:41:06 2023 Finished embedding


2023-10-11 12:41:07,903 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-10-11 12:41:18,188 - BERTopic - Clustered reduced embeddings
2023-10-11 12:50:50,345 - BERTopic - Reduced number of topics from 3288 to 64


In [19]:
torch.save(topic_model, f"{Model_Folder_Cross}/{Model_Cross}_topic_model_{Body}.pt")

Wed Oct 11 12:51:02 2023 Worst tree score: 0.17427518
Wed Oct 11 12:51:02 2023 Mean tree score: 0.17887270
Wed Oct 11 12:51:02 2023 Best tree score: 0.18698993
Wed Oct 11 12:51:08 2023 Forward diversification reduced edges from 1130970 to 779522
Wed Oct 11 12:51:09 2023 Reverse diversification reduced edges from 779522 to 779522


  self._set_arrayXarray(i, j, x)


Wed Oct 11 12:51:10 2023 Degree pruning reduced edges from 846000 to 822441
Wed Oct 11 12:51:10 2023 Resorting data and graph based on tree order
Wed Oct 11 12:51:10 2023 Building and compiling search function


In [20]:
topic_model_2 = BERTopic(language= 'english',
                       min_topic_size=15,
                       n_gram_range=(1, 3),
                       nr_topics = 64,
                       embedding_model=embedding_model,
                       umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model,
                       verbose=True
                       ).fit(docs_2)

Batches:   0%|          | 0/538 [00:00<?, ?it/s]

2023-10-11 12:53:44,005 - BERTopic - Transformed documents to Embeddings


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_epochs=500, n_neighbors=5, target_metric_kwds=['interest', 'inflation', 'unemployment', 'credit', 'market', 'trade', 'energy'], target_weight=0.95, verbose=True)
Wed Oct 11 12:53:44 2023 Construct fuzzy simplicial set
Wed Oct 11 12:53:44 2023 Finding Nearest Neighbors
Wed Oct 11 12:53:44 2023 Building RP forest with 12 trees
Wed Oct 11 12:53:44 2023 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	 4  /  14
	 5  /  14
	 6  /  14
	 7  /  14
	Stopping threshold met -- exiting after 7 iterations
Wed Oct 11 12:53:44 2023 Finished Nearest Neighbor Search
Wed Oct 11 12:53:44 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

2023-10-11 12:53:48,491 - BERTopic - Reduced dimensionality


Wed Oct 11 12:53:48 2023 Finished embedding
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disab

2023-10-11 12:53:49,380 - BERTopic - Clustered reduced embeddings
2023-10-11 12:55:05,166 - BERTopic - Reduced number of topics from 408 to 64


In [21]:
torch.save(topic_model_2, f"{Model_Folder_Cross}/{Model_Cross}_topic_model_{Body_2}.pt")

Wed Oct 11 12:55:06 2023 Worst tree score: 0.27795861
Wed Oct 11 12:55:06 2023 Mean tree score: 0.28808223
Wed Oct 11 12:55:06 2023 Best tree score: 0.29394327
Wed Oct 11 12:55:06 2023 Forward diversification reduced edges from 86020 to 60173
Wed Oct 11 12:55:06 2023 Reverse diversification reduced edges from 60173 to 60173
Wed Oct 11 12:55:06 2023 Degree pruning reduced edges from 62742 to 62015
Wed Oct 11 12:55:06 2023 Resorting data and graph based on tree order
Wed Oct 11 12:55:06 2023 Building and compiling search function


  self._set_arrayXarray(i, j, x)


In [22]:
topic_model_all = BERTopic(language= 'english',
                       min_topic_size=15,
                       n_gram_range=(1, 3),
                       nr_topics = 64,
                       embedding_model=embedding_model,
                       umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model,
                       ctfidf_model=ctfidf_model,
                       representation_model=representation_model,
                       verbose=True
                       ).fit(docs_all)

Batches:   0%|          | 0/7607 [00:00<?, ?it/s]

2023-10-11 13:22:22,549 - BERTopic - Transformed documents to Embeddings


UMAP(angular_rp_forest=True, metric='cosine', min_dist=0.0, n_epochs=500, n_neighbors=5, target_metric_kwds=['interest', 'inflation', 'unemployment', 'credit', 'market', 'trade', 'energy'], target_weight=0.95, verbose=True)
Wed Oct 11 13:22:22 2023 Construct fuzzy simplicial set
Wed Oct 11 13:22:22 2023 Finding Nearest Neighbors
Wed Oct 11 13:22:22 2023 Building RP forest with 30 trees
Wed Oct 11 13:22:26 2023 NN descent for 18 iterations
	 1  /  18
	 2  /  18
	 3  /  18
	 4  /  18
	 5  /  18
	 6  /  18
	Stopping threshold met -- exiting after 6 iterations
Wed Oct 11 13:22:30 2023 Finished Nearest Neighbor Search
Wed Oct 11 13:22:30 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Wed Oct 11 13:23:38 2023 Finished embedding


2023-10-11 13:23:40,261 - BERTopic - Reduced dimensionality


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

2023-10-11 13:23:51,092 - BERTopic - Clustered reduced embeddings
2023-10-11 13:33:58,068 - BERTopic - Reduced number of topics from 3482 to 64


In [23]:
torch.save(topic_model_all, f"{Model_Folder_Cross}/{Model_Cross}_topic_model_{Body_Cross}.pt")

Wed Oct 11 13:34:10 2023 Worst tree score: 0.17338680
Wed Oct 11 13:34:10 2023 Mean tree score: 0.17777276
Wed Oct 11 13:34:10 2023 Best tree score: 0.18711740
Wed Oct 11 13:34:13 2023 Forward diversification reduced edges from 1216990 to 839987
Wed Oct 11 13:34:13 2023 Reverse diversification reduced edges from 839987 to 839987
Wed Oct 11 13:34:13 2023 Degree pruning reduced edges from 912096 to 886454
Wed Oct 11 13:34:13 2023 Resorting data and graph based on tree order


  self._set_arrayXarray(i, j, x)


Wed Oct 11 13:34:14 2023 Building and compiling search function


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

sim_matrix = cosine_similarity(topic_model.topic_embeddings_, topic_model_2.topic_embeddings_)

In [25]:
topic = 10

In [26]:
topic_model.get_topic(topic)

[('latest observations', 0.021749176058509127),
 ('2022', 0.020929463956313846),
 ('2019 op', 0.018192814413071016),
 ('2018 op', 0.01752455174169975),
 ('data latest', 0.016899166206944143),
 ('2007', 0.015888847520837586),
 ('2020 op', 0.015857421193154177),
 ('annual data', 0.014986541819297362),
 ('2016', 0.014315126423998183),
 ('annual', 0.0138637516022952)]

In [27]:
most_similar_topic = np.argmax(sim_matrix[topic + 1])-1
topic_model_2.get_topic(most_similar_topic)

[('congressional budget', 0.05283517596589791),
 ('congressional budget office', 0.04601481068997554),
 ('budget office', 0.04601481068997554),
 ('budget economic outlook', 0.03354510962831647),
 ('87th annual report', 0.02494917668440108),
 ('see congressional budget', 0.02494917668440108),
 ('text see congressional', 0.020470930118616216),
 ('budget office 2021', 0.020470930118616216),
 ('2017a 2017b', 0.019959341347520865),
 ('act 2021', 0.01953345457973714)]

In [28]:
similar_topics, similarity = topic_model_all.find_topics("interest", top_n=8)
print(topic_model.get_topic(similar_topics[1]))

topics_per_class = topic_model_all.topics_per_class(docs_all, classes=type_all)
topic_model_all.visualize_topics_per_class(topics_per_class, top_n_topics=32).write_html(f"{bert_models}/topics_per_class.html")

#topics_over_time = topic_model.topics_over_time(docs, timestamps, nr_bins=200)
#save topics over time graph as HTML file
#topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=32).write_html(f"{bert_models}/topics_over_time.html")

# Save intertopic distance map as HTML file
topic_model_all.visualize_topics().write_html(f"{bert_models}/intertopic_dist_map.html")

# Save topic-terms barcharts as HTML file
topic_model_all.visualize_barchart(top_n_topics = 32, n_words=8).write_html(f"{bert_models}/barchart.html")

# Save documents projection as HTML file
topic_model_all.visualize_documents(docs_all).write_html(f"{bert_models}/projections.html")

# Save topics dendrogram as HTML file
topic_model_all.visualize_hierarchy().write_html(f"{bert_models}/hieararchy.html")

print("All Visuals Done")

[('see next slide', 0.07870197231702752), ('side', 0.044569489778491525), ('chart', 0.03558451896945302), ('slide see righthand', 0.03512650857990999), ('markets report european', 0.03512650857990999), ('london economics', 0.032670379251786334), ('financial markets report', 0.03161385772191899), ('macroeconomic impact integration', 0.03161385772191899), ('quantification macroeconomic', 0.03161385772191899), ('quantification macroeconomic impact', 0.03161385772191899)]


50it [05:57,  7.14s/it]


All Visuals Done
