In [3]:
import bertopic
import gc
import torch
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from umap import UMAP

def test_model1(dataset, seed, model_file_path, min_topic_size, num_topics, 
               csv_file_path, bertopic_labels_csv_file_path):
    """Fit a BERTopic model on ``dataset``, write two topic reports and save the model.

    Args:
        dataset: Documents handed unchanged to ``BERTopic.fit_transform``.
        seed: ``random_state`` given to UMAP so the dimensionality reduction is seeded.
        model_file_path: Destination passed to ``BERTopic.save`` (safetensors serialization).
        min_topic_size: Used as HDBSCAN's ``min_cluster_size``.
        num_topics: Passed to BERTopic as ``nr_topics``.
        csv_file_path: Path of the keyword report. The content is free-form text
            (one "Topic N" section per topic), not comma-separated values.
        bertopic_labels_csv_file_path: Path of the second report, which repeats the
            keywords and adds the label produced by ``generate_topic_labels``.

    Returns:
        None. The results are the two report files and the saved model.
    """
    import os

    # Create the report directories before the (expensive) fit: open(..., "w") does not
    # create missing parent directories, and failing after fitting would waste the run.
    for report_path in (csv_file_path, bertopic_labels_csv_file_path):
        report_dir = os.path.dirname(report_path)
        if report_dir:
            os.makedirs(report_dir, exist_ok=True)

    print("Testing")

    # UMAP and HDBSCAN are built explicitly (instead of letting BERTopic create them)
    # so that UMAP's random_state can be set to the given seed.
    embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=seed)

    # HDBSCAN's minimum cluster size plays the role of the minimum topic size.
    hdbscan_model = HDBSCAN(min_cluster_size=min_topic_size, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

    topic_model = bertopic.BERTopic(nr_topics=num_topics, top_n_words=15, umap_model=umap_model, calculate_probabilities=False,
                                    hdbscan_model=hdbscan_model, embedding_model=embedding_model)

    topic_model.fit_transform(dataset)

    print("Model done fitting")

    # get_topics() maps topic id -> [(keyword, score), ...]; keep only the keywords.
    keywords_representation = {
        topic: [keyword for keyword, _ in word_scores]
        for topic, word_scores in topic_model.get_topics().items()
    }

    bertopic_labels = topic_model.generate_topic_labels(nr_words=3)

    # Report 1: keywords per topic.
    with open(csv_file_path, "w", encoding='utf-8') as file:
        for topic, keywords in keywords_representation.items():
            file.write(f"Topic {topic}: \n")
            file.write(f"Topic Keywords: {keywords} \n")
            file.write("----------------------------------------------------------------------------------------------------------------------------")
            file.write("\n")

    # Report 2: keywords plus the BERTopic-generated label.
    with open(bertopic_labels_csv_file_path, "w", encoding='utf-8') as file:
        for label in bertopic_labels:
            # The topic id is parsed from the text before the first underscore of the label.
            topic = int(label.split('_')[0])
            file.write(f"\nTopic {topic}: \n")
            file.write(f"Topic Keywords: {keywords_representation[topic]}\n")
            file.write(f"BERTopic Generated Label: {label} \n")
            file.write("----------------------------------------------------------------------------------------------------------------------------")
            file.write("\n")

    topic_model.save(model_file_path, serialization="safetensors", save_ctfidf=True,
                         save_embedding_model=embedding_model)

    # Drop the local references to the large objects and force a collection pass.
    # `del dataset` only removes this function's reference; the caller's variable is unaffected.
    del topic_model
    del hdbscan_model
    del umap_model
    del dataset
    gc.collect()
    
# Report whether PyTorch can see a CUDA device from this kernel.
print(torch.cuda.is_available())

True


# Arxiv Parameter Tests

In [None]:
# Fit, report on and save a BERTopic model for the preprocessed arXiv abstracts.
from get_arxiv_abstract_data import get_preprocessed_data_from_csv

data = get_preprocessed_data_from_csv('./data/clean_arxiv_abstracts_dataset_final.csv')

# Run configuration; each of these values is embedded in the output file names below.
dataset_name = 'arxiv'
datasize = len(data)
min_topic_size = 300  # used by test_model1 as HDBSCAN's min_cluster_size
seed = 1  # used by test_model1 as UMAP's random_state
nr_topics = 26  # used by test_model1 as BERTopic's nr_topics
model_file_path = f'./models/arxiv/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_preprocessed_final'
csv_file_path = f'./data/results/arxiv/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_preprocessed_final.csv'
bertopic_labels_csv_file_path = f'./data/results/arxiv/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_with_bert_labels_preprocessed_final.csv'

test_model1(data, seed, model_file_path, min_topic_size, nr_topics, csv_file_path, bertopic_labels_csv_file_path)

# BBC News Parameter Tests

In [None]:
# Fit, report on and save a BERTopic model for the preprocessed BBC News articles.
from get_bbc_news_data import get_preprocessed_text

# NOTE(review): save_dataset=True presumably also writes the cleaned dataset to disk — confirm in get_bbc_news_data.
data = get_preprocessed_text(save_dataset=True)

# Run configuration; each of these values is embedded in the output file names below.
dataset_name = 'bbc_news'
datasize = len(data)
min_topic_size = 15  # used by test_model1 as HDBSCAN's min_cluster_size
seed = 1  # used by test_model1 as UMAP's random_state
nr_topics = 16  # used by test_model1 as BERTopic's nr_topics
model_file_path = f'./models/bbc_news/bertopic_preprocessed_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_final'
csv_file_path = f'./data/results/bbc_news/bertopic_preprocessed_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_final.csv'
bertopic_labels_csv_file_path = f'./data/results/bbc_news/bertopic_preprocessed_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_with_bert_labels_final.csv'

test_model1(data, seed, model_file_path, min_topic_size, nr_topics, csv_file_path, bertopic_labels_csv_file_path)

# Amazon Reviews Parameter Tests

In [1]:
# Build the preprocessed Amazon Electronics reviews sample from the raw JSONL dump.
from get_amazon_reviews_data import get_preprocessed_data

# Sampling configuration; all three values are embedded in the save file name.
fraction_to_sample = 0.05
seed = 1
chunk_size = 1_000_000

# NOTE(review): save_file has no extension, while the next cell reads '..._retrain.csv';
# presumably get_preprocessed_data appends '.csv' — confirm.
save_file = f'./data/clean_amazon_reviews_frac={fraction_to_sample}_seed={seed}_chunksize={chunk_size}_retrain'
# NOTE(review): the raw input is an absolute, machine-specific Windows path, unlike the
# relative './data/...' paths used elsewhere in this notebook.
data = get_preprocessed_data('D:\\topic_modeling_research\\data\\Electronics.jsonl', fraction_to_sample, chunk_size, seed, save_dataset=True, save_file_path=save_file)

In [2]:
from get_amazon_reviews_data import get_preprocessed_data_from_csv

# Reload the '_final' and '_retrain' exports of the Amazon sample and report
# whether the two loaded datasets compare equal.
data1 = get_preprocessed_data_from_csv('./data/clean_amazon_reviews_frac=0.05_seed=1_chunksize=1000000_final.csv')
data2 = get_preprocessed_data_from_csv('./data/clean_amazon_reviews_frac=0.05_seed=1_chunksize=1000000_retrain.csv')

print("They are the same" if data1 == data2 else "They are different")

They are the same


In [2]:
# Fit, report on and save a BERTopic model for the preprocessed Amazon reviews sample.
# NOTE(review): get_preprocessed_data is imported but not used in this cell.
from get_amazon_reviews_data import get_preprocessed_data, get_preprocessed_data_from_csv

# NOTE(review): fraction_to_sample and chunk_size are not used in this cell; the input
# file name below is a literal rather than being built from them.
fraction_to_sample = 0.05
seed = 1  # used by test_model1 as UMAP's random_state
chunk_size = 1_000_000

data = get_preprocessed_data_from_csv('./data/clean_amazon_reviews_frac=0.05_seed=1_chunksize=1000000_final.csv')
    
# Run configuration; each of these values is embedded in the output file names below.
dataset_name = 'amazon_reviews'
datasize = len(data)
min_topic_size = 400  # used by test_model1 as HDBSCAN's min_cluster_size
nr_topics = 21  # used by test_model1 as BERTopic's nr_topics
# Outputs of this run carry the '_retrain_2' suffix.
model_file_path = f'./models/amazon_reviews/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_preprocessed_retrain_2'
csv_file_path = f'./data/results/amazon_reviews/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_preprocessed_retrain_2.csv'
bertopic_labels_csv_file_path = f'./data/results/amazon_reviews/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_with_bert_labels_preprocessed_retrain_2.csv'
    
test_model1(data, seed, model_file_path, min_topic_size, nr_topics, csv_file_path, bertopic_labels_csv_file_path)

Testing
Model done fitting


# Newsgroup20 Parameter Tests

In [5]:
# Fit, report on and save a BERTopic model for the preprocessed Newsgroup20 documents.
from get_news_data import get_preprocessed_data_from_csv

data = get_preprocessed_data_from_csv('./data/clean_newsgroup20_final.csv')

# Run configuration; each of these values is embedded in the output file names below.
dataset_name = 'newsgroup20'
datasize = len(data)
min_topic_size = 25  # used by test_model1 as HDBSCAN's min_cluster_size
seed = 1  # used by test_model1 as UMAP's random_state
nr_topics = 26  # used by test_model1 as BERTopic's nr_topics
model_file_path = f'./models/newsgroup20/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_preprocessed_final'
csv_file_path = f'./data/results/newsgroup20/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_preprocessed_final.csv'
bertopic_labels_csv_file_path = f'./data/results/newsgroup20/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_with_bert_labels_preprocessed_final.csv'

test_model1(data, seed, model_file_path, min_topic_size, nr_topics, csv_file_path, bertopic_labels_csv_file_path)

Testing
Model done fitting


# WorldCup Tweets Parameter Tests

In [2]:
# Fit, report on and save a BERTopic model for the preprocessed World Cup tweets
# (min_topic_size=250, nr_topics=16).
from get_worldcup_data import get_preprocessed_data_from_csv

data = get_preprocessed_data_from_csv('./data/clean_worldcup_final.csv')

# Run configuration; each of these values is embedded in the output file names below.
dataset_name = 'worldcup_tweets'
datasize = len(data)
min_topic_size = 250  # used by test_model1 as HDBSCAN's min_cluster_size
seed = 1  # used by test_model1 as UMAP's random_state
nr_topics = 16  # used by test_model1 as BERTopic's nr_topics
# NOTE(review): the output directories are spelled 'worlcup' (no 'd'). The visualization
# cell loads the model from the same spelling, so renaming must be done in both places.
model_file_path = f'./models/worlcup/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_preprocessed_final'
csv_file_path = f'./data/results/worlcup/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_preprocessed_final.csv'
bertopic_labels_csv_file_path = f'./data/results/worlcup/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_with_bert_labels_preprocessed_final.csv'

test_model1(data, seed, model_file_path, min_topic_size, nr_topics, csv_file_path, bertopic_labels_csv_file_path)

Testing
Model done fitting


In [3]:
# Fit, report on and save a BERTopic model for the preprocessed World Cup tweets
# (min_topic_size=350, nr_topics=26).
from get_worldcup_data import get_preprocessed_data_from_csv

data = get_preprocessed_data_from_csv('./data/clean_worldcup_final.csv')

# Run configuration; each of these values is embedded in the output file names below.
dataset_name = 'worldcup_tweets'
datasize = len(data)
min_topic_size = 350  # used by test_model1 as HDBSCAN's min_cluster_size
seed = 1  # used by test_model1 as UMAP's random_state
nr_topics = 26  # used by test_model1 as BERTopic's nr_topics
# NOTE(review): the output directories are spelled 'worlcup' (no 'd'). The visualization
# cell loads the model from the same spelling, so renaming must be done in both places.
model_file_path = f'./models/worlcup/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_preprocessed_final'
csv_file_path = f'./data/results/worlcup/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_preprocessed_final.csv'
bertopic_labels_csv_file_path = f'./data/results/worlcup/bertopic_{dataset_name}_{datasize}_{seed}_{min_topic_size}_{nr_topics}_topic_labels_with_bert_labels_preprocessed_final.csv'

test_model1(data, seed, model_file_path, min_topic_size, nr_topics, csv_file_path, bertopic_labels_csv_file_path)

Testing
Model done fitting


# Visualizations For BERTopic

In [1]:
# Shared setup for the visualization cells: saved-model paths plus the embedding and
# 2-D UMAP models.
import bertopic
# NOTE(review): matplotlib.pyplot and importlib are not referenced by the visualization
# cells shown in this notebook.
import matplotlib.pyplot as plt

import importlib
from sentence_transformers import SentenceTransformer
from umap import UMAP

# Locations of the saved BERTopic models, one per dataset.
# NOTE(review): this Amazon path ends in '_retrain', whereas the Amazon training cell in
# this notebook saves to a path ending in '_retrain_2' — confirm which model is intended.
amazon_model_path = './models/amazon_reviews/bertopic_amazon_reviews_2043299_1_400_21_preprocessed_retrain'
arxiv_model_path = './models/arxiv/bertopic_arxiv_2521247_1_300_26_preprocessed_final'
bbc_news_model_path = './models/bbc_news/bertopic_preprocessed_bbc_news_2225_1_15_16_final'
newsgroup20_model_path = './models/newsgroup20/bertopic_newsgroup20_18811_1_25_26_preprocessed_final'

# Per-dataset record of loader module name, saved-model path and cleaned CSV path.
# NOTE(review): dataset_dict is not read by any of the visualization cells shown here.
dataset_dict = {
    'amazon_reviews_dataset': {'filename': 'get_amazon_reviews_data',
                               'model_path': amazon_model_path, 
                               'csv_file_path': './data/clean_amazon_reviews_frac=0.05_seed=1_chunksize=1000000_final.csv'},
    'bbc_news_dataset': {'filename': 'get_bbc_news_data',
                         'model_path': bbc_news_model_path,
                         'csv_file_path': './data/clean_bbc_news_dataset_final.csv'},
    'newsgroup20_dataset': {'filename': 'get_news_data',
                            'model_path': newsgroup20_model_path,
                            'csv_file_path': './data/clean_newsgroup20_final.csv'},
    'arxiv_dataset': {'filename': 'get_arxiv_abstract_data',
                      'model_path': arxiv_model_path,
                      'csv_file_path': './data/clean_arxiv_abstracts_dataset_final.csv'},
}

# Same sentence-transformers model name that the training function passes to BERTopic.
# sentence_model and umap_model are only referenced by the commented-out
# "visualize documents" code in the cells below.
embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(embedding_model)

# Two output components here (training used five), with a fixed random_state of 1.
umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=1)

In [2]:
# Load the preprocessed Newsgroup20 documents and the BERTopic model stored at newsgroup20_model_path.
from get_news_data import get_preprocessed_data_from_csv

# NOTE(review): `data` is only consumed by the commented-out "visualize documents" code in this cell.
data = get_preprocessed_data_from_csv('./data/clean_newsgroup20_final.csv')

topic_model = bertopic.BERTopic.load(newsgroup20_model_path)

# visualize topics
# fig = topic_model.visualize_topics()
# fig.update_layout(
#     # title=dict(
#     #     text="Newsgroup20: Topic Distribution",
#     #     font=dict(size=24),  # Larger font for title
#     #     x=0.5,  # Center the title
#     #     xanchor="center"
#     # ),
#     width=900,  # Adjusting width for better aspect ratio
#     height=700,  # Adjusting height to match
#     # xaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing x-axis range
#     # yaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing y-axis range
#     font=dict(size=18)  # Smaller general font size for journal readability
# )
# fig.write_image("./plots/newsgroup20/newsgroup20_visualize_topics_bigger.png")

# Word-weight bar charts: one subplot per topic for the 25 requested topics.
fig = topic_model.visualize_barchart(top_n_topics=25)
fig.update_layout(
    # title=dict(
    #     text="Newsgroup20: Top Words Per Topic",
    #     font=dict(size=24),  # Larger font for title
    #     x=0.5,  # Center the title
    #     xanchor="center"
    # ),
    width=1500,  # Figure width
    height=1000,  # Figure height
    font=dict(size=22) # Base font size for the whole figure
)

# Set tick and axis-title font sizes on the x-axis of each subplot.
for i in range(1, 26):  # xaxis1..xaxis25, matching top_n_topics=25; assumes all 25 subplots exist — TODO confirm
    fig['layout'][f'xaxis{i}']['tickfont']['size'] = 14
    fig['layout'][f'xaxis{i}']['title']['font']['size'] = 16

# Set the font size of every layout annotation (presumably the per-topic subplot titles — confirm).
for annotation in fig['layout']['annotations']:
    annotation['font']['size'] = 12  # Applied to every annotation in the figure


# Writes a static PNG; the target directory must already exist.
fig.write_image("./plots/newsgroup20/newsgroup20_wordweights_bigger.png")

# visualize documents
# embeddings = sentence_model.encode(data)
# reduced_embeddings = umap_model.fit_transform(embeddings)
#
# fig = topic_model.visualize_documents(
#     data,
#     reduced_embeddings=reduced_embeddings,
#     hide_document_hover=True,
#     hide_annotations=True
# )
# fig.update_layout(
#     # title=dict(
#     #     text="Newsgroup20: Document Clustering",
#     # ),
#     width=900,
#     height=700,
#     font=dict(size=20),
#     legend=dict(font=dict(size=16)),
# )
# fig.write_image("./plots/newsgroup20/newsgroup20_visualize_documents_bigger.png")
#
# del data
# del topic_model
# del embeddings
# del reduced_embeddings

In [3]:
# Load the preprocessed arXiv abstracts and the BERTopic model stored at arxiv_model_path.
from get_arxiv_abstract_data import get_preprocessed_data_from_csv

# NOTE(review): `data` is only consumed by the commented-out "visualize documents" code in this cell.
data = get_preprocessed_data_from_csv('./data/clean_arxiv_abstracts_dataset_final.csv')

topic_model = bertopic.BERTopic.load(arxiv_model_path)

# visualize topics
# fig = topic_model.visualize_topics()
# fig.update_layout(
#     # title=dict(
#     #     text="Arxiv Abstracts: Topic Distribution",
#     #     font=dict(size=24),  # Larger font for title
#     #     x=0.5,  # Center the title
#     #     xanchor="center"
#     # ),
#     width=900,  # Adjusting width for better aspect ratio
#     height=700,  # Adjusting height to match
#     # xaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing x-axis range
#     # yaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing y-axis range
#     font=dict(size=18) # Smaller general font size for journal readability
# )
# fig.write_image("./plots/arxiv/arxiv_visualize_topics_bigger.png")

# Word-weight bar charts: one subplot per topic for the 25 requested topics.
fig = topic_model.visualize_barchart(top_n_topics=25)
fig.update_layout(
    # title=dict(
    #     text="Arxiv Abstracts: Top Words Per Topic",
    #     font=dict(size=24),  # Larger font for title
    #     x=0.5,  # Center the title
    #     xanchor="center"
    # ),
    width=1500,  # Figure width
    height=1000,  # Figure height
    font=dict(size=22) # Base font size for the whole figure
)

# Set tick and axis-title font sizes on the x-axis of each subplot.
for i in range(1, 26):  # xaxis1..xaxis25, matching top_n_topics=25; assumes all 25 subplots exist — TODO confirm
    fig['layout'][f'xaxis{i}']['tickfont']['size'] = 14
    fig['layout'][f'xaxis{i}']['title']['font']['size'] = 16

# Set the font size of every layout annotation (presumably the per-topic subplot titles — confirm).
for annotation in fig['layout']['annotations']:
    annotation['font']['size'] = 12  # Applied to every annotation in the figure
    
# Writes a static PNG; the target directory must already exist.
fig.write_image("./plots/arxiv/arxiv_wordweights_bigger.png")

# visualize documents
# embeddings = sentence_model.encode(data)
# reduced_embeddings = umap_model.fit_transform(embeddings)
#
# fig = topic_model.visualize_documents(
#     data,
#     reduced_embeddings=reduced_embeddings,
#     hide_document_hover=True,
#     hide_annotations=True
# )
# fig.update_layout(
#     # title=dict(
#     #     text="Arxiv Abstracts: Document Clustering",
#     # ),
#     width=900,
#     height=700,
#     font=dict(size=20),
#     legend=dict(font=dict(size=16)),
# )
# fig.write_image("./plots/arxiv/arxiv_visualize_documents_bigger.png")
#
# del data
# del topic_model
# del embeddings
# del reduced_embeddings

In [4]:
# Load the preprocessed BBC News articles and the BERTopic model stored at bbc_news_model_path.
from get_bbc_news_data import get_preprocessed_data_from_csv

# NOTE(review): `data` is only consumed by the commented-out "visualize documents" code in this cell.
data = get_preprocessed_data_from_csv('./data/clean_bbc_news_dataset_final.csv')

topic_model = bertopic.BERTopic.load(bbc_news_model_path)

# visualize topics
# fig = topic_model.visualize_topics()
# fig.update_layout(
#     # title=dict(
#     #     text="BBC News Articles: Topic Distribution",
#     #     font=dict(size=24),  # Larger font for title
#     #     x=0.5,  # Center the title
#     #     xanchor="center"
#     # ),
#     width=900,  # Adjusting width for better aspect ratio
#     height=700,  # Adjusting height to match
#     # xaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing x-axis range
#     # yaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing y-axis range
#     font=dict(size=18)  # Smaller general font size for journal readability
# )
# fig.write_image("./plots/bbc_news/bbc_news_visualize_topics_bigger.png")


# Word-weight bar charts: one subplot per topic for the 15 requested topics.
fig = topic_model.visualize_barchart(top_n_topics=15)
fig.update_layout(
    # title=dict(
    #     text="BBC News Articles: Top Words Per Topic",
    #     font=dict(size=24),  # Larger font for title
    #     x=0.5,  # Center the title
    #     xanchor="center"
    # ),
    width=1500,  # Figure width
    height=1000,  # Figure height
    font=dict(size=22) # Base font size for the whole figure
)

# Set tick and axis-title font sizes on the x-axis of each subplot.
for i in range(1, 16):  # xaxis1..xaxis15, matching top_n_topics=15; assumes all 15 subplots exist — TODO confirm
    fig['layout'][f'xaxis{i}']['tickfont']['size'] = 14
    fig['layout'][f'xaxis{i}']['title']['font']['size'] = 16

# Set the font size of every layout annotation (presumably the per-topic subplot titles — confirm).
for annotation in fig['layout']['annotations']:
    annotation['font']['size'] = 12  # Applied to every annotation in the figure

# Writes a static PNG; the target directory must already exist.
fig.write_image("./plots/bbc_news/bbc_news_wordweights_bigger.png")

# visualize documents
# embeddings = sentence_model.encode(data)
# reduced_embeddings = umap_model.fit_transform(embeddings)
#
# fig = topic_model.visualize_documents(
#     data,
#     reduced_embeddings=reduced_embeddings,
#     hide_document_hover=True,
#     hide_annotations=True
# )
# fig.update_layout(
#     # title=dict(
#     #     text="BBC News Articles: Document Clustering",
#     # ),
#     width=900,
#     height=700,
#     font=dict(size=20),
#     legend=dict(font=dict(size=16)),
# )
# fig.write_image("./plots/bbc_news/bbc_news_visualize_documents_bigger.png")
#
# del data
# del topic_model
# del embeddings
# del reduced_embeddings

In [5]:
# Load the preprocessed Amazon reviews sample and the BERTopic model stored at amazon_model_path.
from get_amazon_reviews_data import get_preprocessed_data_from_csv

# NOTE(review): `data` is only consumed by the commented-out "visualize documents" code in this cell.
data = get_preprocessed_data_from_csv('./data/clean_amazon_reviews_frac=0.05_seed=1_chunksize=1000000_final.csv')

topic_model = bertopic.BERTopic.load(amazon_model_path)

# visualize topics
# fig = topic_model.visualize_topics()
# fig.update_layout(
#     # title=dict(
#     #     text="Amazon Electronics Reviews: Topic Distribution",
#     #     font=dict(size=24),  # Larger font for title
#     #     x=0.5,  # Center the title
#     #     xanchor="center"
#     # ),
#     width=900,  # Adjusting width for better aspect ratio
#     height=700,  # Adjusting height to match
#     # xaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing x-axis range
#     # yaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing y-axis range
#     font=dict(size=18)  # Smaller general font size for journal readability
# )
# fig.write_image("./plots/amazon_reviews/amazon_visualize_topics_bigger.png")

# Word-weight bar charts: one subplot per topic for the 20 requested topics.
fig = topic_model.visualize_barchart(top_n_topics=20)
fig.update_layout(
    # title=dict(
    #     text="Amazon Electronics Reviews: Top Words Per Topic",
    #     font=dict(size=24),  # Larger font for title
    #     x=0.5,  # Center the title
    #     xanchor="center"
    # ),
    width=1500,  # Figure width
    height=1000,  # Figure height
    font=dict(size=22) # Base font size for the whole figure
)

# Set tick and axis-title font sizes on the x-axis of each subplot.
for i in range(1, 21):  # xaxis1..xaxis20, matching top_n_topics=20; assumes all 20 subplots exist — TODO confirm
    fig['layout'][f'xaxis{i}']['tickfont']['size'] = 14
    fig['layout'][f'xaxis{i}']['title']['font']['size'] = 16

# Set the font size of every layout annotation (presumably the per-topic subplot titles — confirm).
for annotation in fig['layout']['annotations']:
    annotation['font']['size'] = 12  # Applied to every annotation in the figure
    
# Writes a static PNG; the target directory must already exist.
fig.write_image("./plots/amazon_reviews/amazon_wordweights_bigger.png")

# visualize documents
# embeddings = sentence_model.encode(data)
# reduced_embeddings = umap_model.fit_transform(embeddings)
#
# fig = topic_model.visualize_documents(
#     data,
#     reduced_embeddings=reduced_embeddings,
#     hide_document_hover=True,
#     hide_annotations=True
# )
# fig.update_layout(
#     # title=dict(
#     #     text="Amazon Electronic Reviews: Document Clustering",
#     # ),
#     width=900,
#     height=700,
#     font=dict(size=20),
#     legend=dict(font=dict(size=16)),
# )
# fig.write_image("./plots/amazon_reviews/amazon_visualize_documents_bigger.png")
#
# del data
# del topic_model
# del embeddings
# del reduced_embeddings

In [6]:
# Load the preprocessed World Cup tweets and the saved BERTopic model (350 / 26 run).
# The 'worlcup' directory spelling matches the path the training cell saves to.
world_cup_model_path = "./models/worlcup/bertopic_worldcup_tweets_2407246_1_350_26_preprocessed_final"

from get_worldcup_data import get_preprocessed_data_from_csv

# NOTE(review): absolute, machine-specific Windows path; the training cells read
# './data/clean_worldcup_final.csv' instead — confirm both point at the same file.
# `data` is only consumed by the commented-out "visualize documents" code in this cell.
data = get_preprocessed_data_from_csv('D:\\TopicMoldeing_Redo\\data\\clean_worldcup_final.csv')

topic_model = bertopic.BERTopic.load(world_cup_model_path)

# visualize topics
# fig = topic_model.visualize_topics()
# fig.update_layout(
#     # title=dict(
#     #     text="World Cup Tweets: Topic Distribution",
#     #     font=dict(size=24),  # Larger font for title
#     #     x=0.5,  # Center the title
#     #     xanchor="center"
#     # ),
#     width=900,  # Adjusting width for better aspect ratio
#     height=700,  # Adjusting height to match
#     # xaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing x-axis range
#     # yaxis=dict(range=[-20, 20], title_font=dict(size=14)),  # Narrowing y-axis range
#     font=dict(size=18) # Smaller general font size for journal readability
# )
# fig.write_image("./plots/worldcup/worldcup_visualize_topics_bigger.png")

# Word-weight bar charts for a hand-picked subset of 12 topic ids.
topics = [0, 1, 2, 3, 4, 11, 12, 15, 16, 18, 19, 22]
fig = topic_model.visualize_barchart(topics=topics)
fig.update_layout(
    # title=dict(
    #     text="World Cup Tweets: Top Words Per Topic",
    #     font=dict(size=24),  # Larger font for title
    #     x=0.5,  # Center the title
    #     xanchor="center"
    # ),
    width=1500,  # Figure width
    height=1000,  # Figure height
    font=dict(size=22) # Base font size for the whole figure
)

# Set tick and axis-title font sizes on the x-axis of each subplot.
for i in range(1, 13):  # xaxis1..xaxis12, one per entry in `topics`
    fig['layout'][f'xaxis{i}']['tickfont']['size'] = 14
    fig['layout'][f'xaxis{i}']['title']['font']['size'] = 16

# Set the font size of every layout annotation (presumably the per-topic subplot titles — confirm).
for annotation in fig['layout']['annotations']:
    annotation['font']['size'] = 12  # Applied to every annotation in the figure
    
# Overwrite the text of the first len(topics) annotations with consecutive labels
# "Topic 0" .. "Topic 11". The label uses the position `i`, not the model's topic id
# (`topic_num` is unused), so e.g. model topic 11 is shown as "Topic 5".
# NOTE(review): confirm this renumbering is intended and that annotation order matches `topics`.
for i, topic_num in enumerate(topics):
    custom_title = f"Topic {i}"
    fig['layout']['annotations'][i]['text'] = custom_title

# Writes a static PNG; the target directory must already exist.
fig.write_image("./plots/worldcup/worldcup_wordweights_bigger.png")

# visualize documents
# embeddings = sentence_model.encode(data)
# reduced_embeddings = umap_model.fit_transform(embeddings)
#
# fig = topic_model.visualize_documents(
#     data,
#     reduced_embeddings=reduced_embeddings,
#     hide_document_hover=True,
#     hide_annotations=True
# )
# fig.update_layout(
#     # title=dict(
#     #     text="World Cup Tweets: Document Clustering",
#     # ),
#     width=900,
#     height=700,
#     font=dict(size=20),
#     legend=dict(font=dict(size=16)),
# )
# fig.write_image("./plots/worldcup/worldcup_visualize_documents_bigger.png")
#
# del data
# del topic_model
# del embeddings
# del reduced_embeddings