In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
import matplotlib.pyplot as plt
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance
import sys
from utility import parse_config, seed_everything, custom_print
from preprocess_class import create_datasets
from model_base_class import BaseModel
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from transformers.pipelines import pipeline
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired

sys.path.append("../..")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tingy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
class BERTopic_model(BaseModel):
    """
    BERTopic model for topic modelling. BERTopic is modular and the final topic model is dependent on the submodels chosen for each part of the task
    The parts of the model that an be modified is as follows: 
    1. Document embedding, 2. Dimensionality Reduction, 3. Clustering, 4. Tokenizer, 5. Weighting scheme 6. Representation Tuning (optional)
    """
    def __init__(self, embedding_model = None, dim_reduction_model=None,
                 clustering_model = None, vectorizer_model=None, 
                 ctfidf_model=None,  representation_model=None,
                 min_topic_size = 10):
        """
        @param embedding_model: Model to transform document into matrix of embedding
        @param dim_reduction_model: Dimensionality reduction algorithm to use
        @param clustering_model: Clustering algorithm to use
        @param vectorizer_model: Tokenizer to use
        @param ctfidf_model: weighting scheme to use
        @param representation_model: optional model to use to finetune the representations calculated using ctfidf
        """
        self.topic_model = None
        self.embedding_model = embedding_model
        self.dim_reduction_model = dim_reduction_model
        self.clustering_model = clustering_model
        self.vectorizer_model = vectorizer_model
        self.ctfidf_model = ctfidf_model
        self.representation_model =representation_model
        self.min_topic_size = min_topic_size
    
    def train(self, dataset):
        """
        fit and transform the BERTopic model to the dataset
        @param dataset [Dataset]: Dataset for the model to be fit and transform on
        """
        self.topic_model = BERTopic(embedding_model=self.embedding_model, ctfidf_model=self.ctfidf_model,
                        vectorizer_model=self.vectorizer_model, 
                        min_topic_size= self.min_topic_size, 
                        representation_model=self.representation_model, 
                        umap_model = self.dim_reduction_model, 
                        hdbscan_model = self.clustering_model, 
                        nr_topics= 'auto',
                        calculate_probabilities=False, verbose=True)
        self.topic_model.fit_transform(dataset.text)

    def evaluate(self,dataset):
        """
        Evaluate performance of model using coherence_score. (Using normalise pointwise mutual information, range between -1 and 1, higher score is better)
        prints out coherence score and topic freqenucy
        @param dataset [Dataset]: Dataset to evaluate performance
        """
        c_score = self.get_coherence_score(dataset)
        print(f'Coherence score: {c_score}')
        print(f'Topic frequency:\n{self.topic_model.get_topic_info().head()}')
    
    def predict(self, dataset):
        '''
        Cluster the dataset into topics
        @param dataset Union[str,[Dataset]]: New dataset to predict
        @return prediction: Topic prediction for each document
        '''
        if type(dataset) == str:
            return self.topic_model.transform(dataset)
        else:
            return self.topic_model.transform(dataset.text)

    def get_coherence_score(self, dataset):
        """
        Evaluation metric for model
        @param dataset [Dataset]: Training dataset
        @return c_score [float]: coherence score
        """
        documents = pd.DataFrame({"Document": dataset.text,
                                "ID": range(len(dataset.text)),
                                "Topic": self.topic_model.topics_})
        documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
        cleaned_docs = self.topic_model._preprocess_text(documents_per_topic.Document.values)

        # Extract vectorizer and analyzer from BERTopic
        vectorizer = self.topic_model.vectorizer_model
        analyzer = vectorizer.build_analyzer()

        # Extract features for Topic Coherence evaluation
        tokens = [analyzer(doc) for doc in cleaned_docs]
        dictionary = corpora.Dictionary(tokens)
        corpus = [dictionary.doc2bow(token) for token in tokens]
        topic_words = [[words for words, _ in self.topic_model.get_topic(topic)] 
                    for topic in range(len(set(self.topic_model.topics_))-1)]

        # Evaluate
        cm = CoherenceModel(topics=topic_words, 
                                        texts=tokens, 
                                        corpus=corpus,
                                        dictionary=dictionary, 
                                        coherence='c_npmi', #'u_mass', 'c_v', 'c_uci', 'c_npmi'
                                        topn=5)
        return cm.get_coherence()

In [3]:
df = pd.read_csv('../../data/reviews.csv')
train, test = create_datasets(df)

In [None]:
#Hyperparameters
# train.modify_stop_words_list(exclude_words=[]) #can choose to use or not

# 1. Document embedding
embedding_model = "sentence-transformers/all-MiniLM-L12-v2"
embedding_model = pipeline("feature-extraction", model="bert-base-uncased")

# 2. Dimensionality Reduction
dim_reduction_model = PCA(n_components=5)
dim_reduction_model = UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False) #PCA or truncated SVD

# 3. Clustering
clustering_model = KMeans(n_clusters=10)
clustering_model = HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True) #k-mean, birch

# 4. Tokenizer
vectorizer_params = {'min_df':10, 'max_df':1.0, "ngram_range": (1,2)}
vectorizer_model = CountVectorizer(stop_words=train.stop_words_list, min_df = vectorizer_params['min_df'], max_df = vectorizer_params['min_df'],\
                                   ngram_range=vectorizer_params['ngram_range'])

# 5. Weighting scheme
ctfidf_params = {"bm25_weighting": True, "reduce_frequent_words": True}
ctfidf_model = ClassTfidfTransformer(bm25_weighting= ctfidf_params["bm25_weighting"], reduce_frequent_words= ctfidf_params["reduce_frequent_words"])

# 6. Representation Tuning (optional)
representation_model = MaximalMarginalRelevance(diversity=0.5) #0 least diverse and 1 most diverse

# 7. final hyperparams tuning
min_topic_size = 30

#call model
model = BERTopic_model(embedding_model = embedding_model,
                        dim_reduction_model=dim_reduction_model,
                        clustering_model = clustering_model,
                        vectorizer_model=vectorizer_model, 
                        ctfidf_model=ctfidf_model, 
                        representation_model=representation_model,
                        min_topic_size = min_topic_size)
model.train(train)
model.evaluate(train)

In [53]:
#BERT Base
#Hyperparameters
# train.modify_stop_words_list(exclude_words=[]) #can choose to use or not

# 1. Document embedding
embedding_model = pipeline("feature-extraction", model="bert-base-uncased")

# 2. Dimensionality Reduction
dim_reduction_model = None #UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False) #PCA or truncated SVD

# 3. Clustering
clustering_model = None #HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True) #k-mean, birch

# 4. Tokenizer
vectorizer_params = {'min_df':1, 'max_df':1.0, "ngram_range": (1,2)}
vectorizer_model = CountVectorizer(stop_words=train.stop_words_list, min_df = vectorizer_params['min_df'], max_df = vectorizer_params['min_df'],\
                                   ngram_range=vectorizer_params['ngram_range'])

# 5. Weighting scheme
ctfidf_params = {"bm25_weighting": True, "reduce_frequent_words": True}
ctfidf_model = ClassTfidfTransformer(bm25_weighting= ctfidf_params["bm25_weighting"], reduce_frequent_words= ctfidf_params["reduce_frequent_words"])

# 6. Representation Tuning (optional)
# representation_model = MaximalMarginalRelevance(diversity=0.5) #0 least diverse and 1 most diverse
representation_model = None

# 7. final hyperparams tuning
min_topic_size = 30

#call model
b_model = BERTopic_model(embedding_model = embedding_model,
                        dim_reduction_model=dim_reduction_model,
                        clustering_model = clustering_model,
                        vectorizer_model=vectorizer_model, 
                        ctfidf_model=ctfidf_model, 
                        representation_model=representation_model,
                        min_topic_size = min_topic_size)
b_model.train(train)
b_model.evaluate(train)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4355/4355 [14:06<00:00,  5.15it/s]
2023-03-29 18:37:30,029 - BERTopic - Transformed documents to Embeddings
2023-03-29 18:37:35,716 - BERT

-0.18674845340284274

In [54]:
print(b_model.topic_model.get_topic_info().head())
b_model.topic_model.visualize_documents(train.text)

   Topic  Count                                               Name
0     -1   1554        -1_xanthan gum_xanthan_crisps_black lentils
1      0   2688                         0_fed_breast_stool_felidae
2      1     64  1_bringing international_took parents_internat...
3      2     49     2_ghiradelli_buy actually_got seeds_able ignor


In [119]:
#Sentence transformers
#Hyperparameters
# train.modify_stop_words_list(exclude_words=[]) #can choose to use or not

# 1. Document embedding
embedding_model = "sentence-transformers/all-MiniLM-L12-v2"

# 2. Dimensionality Reduction
dim_reduction_model = None #UMAP(n_neighbors=15, n_components=10, metric='cosine', low_memory=False) #PCA or truncated SVD

# 3. Clustering
clustering_model = None #HDBSCAN(min_cluster_size=10, metric='euclidean', prediction_data=True) #k-mean, birch

# 4. Tokenizer
vectorizer_params = {'min_df':1, 'max_df':1, "ngram_range": (1,2)}
vectorizer_model = CountVectorizer(stop_words=train.stop_words_list, min_df = vectorizer_params['min_df'], max_df = vectorizer_params['min_df'],\
                                   ngram_range=vectorizer_params['ngram_range'])

# 5. Weighting scheme
ctfidf_params = {"bm25_weighting": True, "reduce_frequent_words": False} #might be why tea isnt appearing
ctfidf_model = ClassTfidfTransformer(bm25_weighting= ctfidf_params["bm25_weighting"], reduce_frequent_words= ctfidf_params["reduce_frequent_words"])

# 6. Representation Tuning (optional)
# representation_model = MaximalMarginalRelevance(diversity=0.5) #0 least diverse and 1 most diverse
# representation_model = None
representation_model = KeyBERTInspired()
# 7. final hyperparams tuning
min_topic_size = 50

#call model
miniLM_model = BERTopic_model(embedding_model = embedding_model,
                        dim_reduction_model=dim_reduction_model,
                        clustering_model = clustering_model,
                        vectorizer_model=vectorizer_model, 
                        ctfidf_model=ctfidf_model, 
                        representation_model=representation_model,
                        min_topic_size = min_topic_size)
miniLM_model.train(train)
miniLM_model.evaluate(train)

Batches:   0%|          | 0/137 [00:00<?, ?it/s]

2023-03-30 00:22:35,439 - BERTopic - Transformed documents to Embeddings
2023-03-30 00:22:40,105 - BERTopic - Reduced dimensionality
2023-03-30 00:22:40,309 - BERTopic - Clustered reduced embeddings
2023-03-30 00:23:08,449 - BERTopic - Reduced number of topics from 19 to 13


-0.27841022510088076

In [None]:
#Change how labels are generated
topic_labels = model.topic_model.generate_topic_labels(nr_words = 3, topic_prefix=False, word_length=15, separator='-')
model.topic_model.set_topic_labels(topic_labels)
model.topic_model.get_topic_info().head(15)

#Or to create for specific
# topic_model.set_topic_labels({0:"Cold Drink", 1:"Coffee beans"})
# topic_model.get_topic_info().head(10)

In [123]:
print(miniLM_model.topic_model.get_topic_info())
miniLM_model.topic_model.visualize_documents(train.text)

    Topic  Count                                               Name
0      -1   1426  -1_vanilla beans_popcorn tastes_coconut water_...
1       0   1245  0_flavor orange_strawberry flavor_tangerine fl...
2       1    291  1_quality tea_tasting tea_recommend tea_strong...
3       2    284  2_regular coffee_recommend coffee_fast coffee_...
4       3    261              3_food cats_food cat_cats eat_cat eat
5       4    165  4_healthy cereals_cereal snack_tasty cereal_gr...
6       5    113  5_sauce tasty_ranchero sauce_sauce veggies_faj...
7       6    111  6_chips flavor_chips tasty_tasting chips_love ...
8       7    110     7_diet bars_tasting bars_bars taste_taste bars
9       8    105  8_bisquick pancakes_gf bisquick_bisquick gf_pa...
10      9    101  9_noodles delicious_miracle noodles_noodles ta...
11     10     88  10_keurig coffee_keurig nice_different coffees...
12     11     55  11_crackers tasty_crackers delicious_crackers ...


In [62]:
miniLM_model.topic_model.visualize_documents(train.text)

In [86]:
miniLM_model.topic_model.reduce_topics(train.text, 6)

2023-03-29 22:13:45,797 - BERTopic - Reduced number of topics from 12 to 6


<bertopic._bertopic.BERTopic at 0x1f2563d43d0>

In [112]:
import os
curr_dir = os.getcwd()
config_path = os.path.join(curr_dir, 'bert_topic_config.yml')
config_file = parse_config(config_path)
# model_name = config_file['model']['model_name']

In [114]:
eval(config_file['model']['ngram_range'])

(1, 2)

In [108]:
SentenceTransformer('all-MiniLM-L12-v2')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [106]:
eval(config_file['model']['dim_reduction_model'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<transformers.pipelines.feature_extraction.FeatureExtractionPipeline at 0x1f25dda4fd0>

In [87]:
print(miniLM_model.topic_model.get_topic_info())


   Topic  Count                                             Name
0     -1   1367      -1_coconut water_truffle_sardines_lollipops
1      0   2386                 0_noodles_pet_tangerine_dog food
2      1    292           1_like tea_ice tea_loose tea_white tea
3      2    111  2_bars good_banana nut_bars ve_ingredients bars
4      3    107      3_bisquick_pancake mix_bread mix_shortening
5      4     92      4_keurig machine_keurig coffee_jet_jet fuel


In [77]:
# model.topic_model.reduce_topics(train.text, 10)
print(miniLM_model.topic_model.get_topic_info())

    Topic  Count                                               Name
0      -1   1367        -1_coconut water_truffle_sardines_lollipops
1       0   1609      0_carbonation_orange tangerine_noodle_lentils
2       1    292             1_like tea_ice tea_white tea_loose tea
3       2    263         2_dog food_science_science diet_grain free
4       3    169  3_granola clusters_breakfast cereal_tasty cere...
5       4    111  4_bars good_banana nut_bars taste_ingredients ...
6       5    110  5_salt vinegar_lime flavoring_bag chips_chips ...
7       6    107  6_cookies taste_breakfast cookie_oreo_chip cookie
8       7    107     7_bisquick_pancake mix_bread mix_free bisquick
9       8     92        8_keurig machine_keurig coffee_jet fuel_fog
10      9     73   9_beer nuts_like nuts_taste almonds_like cashews
11     10     55  10_crackers great_rice crackers_love crackers_...


In [51]:
print(model.topic_model.get_topic_info().head(10))
model.topic_model.visualize_documents(train.text)

   Topic  Count                        Name
0     -1    945            -1_the_and_to_it
1      0   3031             0_the_and_it_to
2      1    109      1_noodles_soup_the_and
3      2     85  2_popcorn_butter_peanut_it
4      3     68      3_dog_treats_the_these
5      4     35         4_syrup_maple_to_it
6      5     30     5_jerky_sticks_the_beef
7      6     26        6_bread_it_and_flour
8      7     15            7_mic_it_the_and
9      8     11        8_msg_the_yeast_this


In [35]:
new_topics = model.topic_model.reduce_outliers(train.text, model.topic_model.topics_, threshold = 0.3) #using this ends up bringing in stopwords for some reason

100%|██████████| 2/2 [00:00<00:00,  4.42it/s]


In [34]:
model.topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name
0,-1,1193,-1_shelf_minute_nicely_fun
1,0,1168,0_balance_body_lack_watching
2,1,281,1_sit_poor_medium_acid
3,2,252,2_overwhelming_comparison_set_christmas
4,3,181,3_watery_strange_99_selling


In [36]:
model.topic_model.update_topics(train.text, topics = new_topics)

In [37]:
model.topic_model.get_topic_info().head()

Unnamed: 0,Topic,Count,Name
0,-1,945,-1_and_the_to_it
1,0,1168,0_the_and_to_of
2,1,281,1_coffee_it_this_the
3,2,252,2_tea_it_this_teas
4,3,181,3_price_for_the_at


In [145]:
    curr_dir = os.getcwd()
    config_path = os.path.join(curr_dir, 'bert_topic_config.yml')
    config_file = parse_config(config_path)
    model_name = config_file['model']['model_name']
    embedding_model = eval(config_file['model']['embedding_model'])
    dim_reduction_model = eval(config_file['model']['dim_reduction_model'])
    clustering_model = eval(config_file['model']['clustering_model'])
    vectorizer_params = dict(config_file['model']['vectorizer_params'])    
    ctfidf_params = dict(config_file['model']['ctfidf_params'])
    representation_model = eval(config_file['model']['representation_model'])
    min_topic_size = int(config_file['model']['min_topic_size'])
    data_file = config_file['model']['data_folder']
    home_folder = os.path.abspath(os.path.join(os.getcwd(),'../..'))
    data_df = pd.read_csv(os.path.join(home_folder,data_file))
    # model_path = os.path.join(curr_dir, config_file['model']['model_path'])
    logging_path = os.path.join(curr_dir,config_file['model']['log_path'])
    image_path = os.path.join(curr_dir,config_file['model']['image_folder'])
    train, test = create_datasets(data_df)
    vectorizer_model = CountVectorizer(stop_words=train.stop_words_list, min_df = vectorizer_params['min_df'], max_df = vectorizer_params['min_df'],\
                                   ngram_range=(1,2))#vectorizer_params['ngram_range']) #Need fix
    ctfidf_model = ClassTfidfTransformer(bm25_weighting= ctfidf_params["bm25_weighting"], reduce_frequent_words= ctfidf_params["reduce_frequent_words"])        
    logger = open(os.path.join(curr_dir, logging_path), 'w')
    # custom_print(f'Device availiable: {device}', logger = logger)
    custom_print('Training model',logger = logger)
    # seed_everything()
    custom_print('---------------------------------\n',logger = logger)
    # custom_print("Hyperparameters:",logger = logger)
    custom_print(f"model name: {model_name}",logger = logger)
    # custom_print(f"Number of vectorizer_params: {vectorizer_params}",logger = logger)
    # custom_print(f"number of classes: {n_classes}",logger = logger)
    # custom_print(f"max length: {dim_reduction}",logger = logger)
    # custom_print(f"batch size: {clustering_model}",logger = logger)
    # custom_print(f"learning rate: {ctfidf_params}",logger = logger)
    model = BERTopic_model(embedding_model = embedding_model,
                        dim_reduction_model=dim_reduction_model,
                        clustering_model = clustering_model,
                        vectorizer_model=vectorizer_model, 
                        ctfidf_model=ctfidf_model, 
                        representation_model=representation_model,
                        min_topic_size = min_topic_size)
    model.train(train)
    custom_print(f'Coherence score: {model.evaluate(train)}', logger=logger)
    custom_print(f'{model.topic_model.get_topic_info()}', logger=logger)
    
    for i in range(len(model.topic_model.topic_labels_)-1):
        custom_print(f'Words and score of topic {i}:\n {model.topic_model.get_topic(i)}',
                     logger=logger)
    fig = model.topic_model.visualize_documents(train.text)
    fig.write_html(f"{image_path}doc_viz.html")
    logger.close()

Training model
---------------------------------

model name: sentence-transformers


Batches:   0%|          | 0/137 [00:00<?, ?it/s]

2023-03-30 02:38:22,840 - BERTopic - Transformed documents to Embeddings
2023-03-30 02:38:27,383 - BERTopic - Reduced dimensionality
2023-03-30 02:38:27,557 - BERTopic - Clustered reduced embeddings
2023-03-30 02:39:04,332 - BERTopic - Reduced number of topics from 32 to 12


Coherence score: -0.2643042078520306
Topics:     Topic  Count                                               Name
0      -1   1365  -1_similac organic_raw cacao_organic raw_organ...
1       0   2192     0_quality tea_tasting tea_loose tea_tea really
2       1    211              1_food cat_cats eat_cat eat_pet foods
3       2    182  2_deal product_price stores_item purchased_pri...
4       3     86  3_keurig coffee_different coffees_keurig nice_...
5       4     54  4_salt delicious_regular salt_salt tastes_salt...
6       5     51  5_protein vanilla_protein powders_protein mixe...
7       6     50  6_flavor popcorn_tasting popcorn_popcorn taste...
8       7     46  7_drinking coconut_purchase coconut_pure cocon...
9       8     43   8_olive oils_coconut oils_walnut oil_avocado oil
10      9     38  9_tasting water_trivedi water_water tastes_pur...
11     10     37  10_recommend soup_soup tastes_love soup_said soup
Words and score of topic 0:
 [('quality tea', 0.54296124), ('tasting te

In [144]:
config_file = parse_config(config_path)
vectorizer_params = dict(config_file['model']['vectorizer_params'])    
vectorizer_params['ngram_range']

'(1,2)'

In [46]:
model.topic_model.visualize_heatmap()

In [47]:
model.topic_model.visualize_topics(custom_labels=True)
# topic_model.visualize_barchart(top_n_topics=16, custom_labels=True)

In [13]:
topic_model.visualize_barchart(top_n_topics=16, custom_labels=True)

In [44]:
topics_over_time = model.topic_model.topics_over_time(docs=train.text, 
                                                timestamps=train.date, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)
model.topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=15)

In [25]:
# Save model
topic_model.save("bertopic_model")	

In [28]:
# Load model
topic_model = BERTopic.load("bertopic_model")	