In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from re import sub

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# First BERTopic experiment: load the reviews and normalise the two columns
# used further down.
#   Time -> pandas datetimes (passed to topics_over_time later on)
#   Text -> HTML-like tags removed, lower-cased, surrounding whitespace trimmed
# NOTE(review): if 'Time' holds Unix epoch integers, pd.to_datetime needs an
# explicit unit (e.g. unit='s') -- confirm against the CSV.
df = pd.read_csv('reviews.csv').assign(
    Time=lambda frame: pd.to_datetime(frame['Time']),
    Text=lambda frame: frame['Text'].map(
        lambda review: sub("<[^>]+>", "", review).lower().strip()
    ),
)

In [8]:
# Baseline model: BERTopic with a custom CountVectorizer, so topic keywords
# skip English stop words and rare terms (min_df=10).
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=10,
)
topic_model = BERTopic(vectorizer_model=vectorizer_model)

# Fit on the cleaned review text and keep the per-document topic assignments
# together with the probabilities returned alongside them.
topics, probs = topic_model.fit_transform(df['Text'])

In [None]:
# Alternative to the cell above: default vectorizer, but request the topic
# probabilities for every document (calculate_probabilities=True) and log
# progress while fitting (verbose=True).
# Fix: only the `BERTopic` class is imported (`from bertopic import BERTopic`),
# so the previous `bertopic.BERTopic(...)` raised NameError.
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True)
topics, probs = topic_model.fit_transform(df['Text'])

# Fine-tune topic representations after training BERTopic, this time allowing
# 1- to 3-grams as topic keywords.
# Fix: `docs` was never defined in this notebook; the training documents are
# df['Text'], and update_topics must receive the same documents used for fitting.
# TODO: check whether stop-word removal is actually needed here.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 3), min_df=10)
topic_model.update_topics(df['Text'], vectorizer_model=vectorizer_model)

In [10]:
# Summary table of the fitted topics (Topic id, Count, Name); display the
# first five rows.
# To inspect the keywords of one topic, call topic_model.get_topic(<topic id>).
freq = topic_model.get_topic_info()
freq.head(n=5)

Unnamed: 0,Topic,Count,Name
0,-1,1269,-1_great_product_good_taste
1,0,347,0_tea_green_iced_drink
2,1,324,1_coffee_cup_starbucks_beans
3,2,207,2_product_tastes_taste_great
4,3,167,3_snacks_snack_tasty_fruit


In [7]:
# Plot the topics of the fitted model (the figure is the cell's output).
topic_model.visualize_topics()

In [10]:
# The probabilities returned by transform() / fit_transform() show how confident
# BERTopic is that certain topics can be found in a document.
# Fix: visualize_distribution() plots the topic probabilities of ONE document,
# so it must receive a single row of `probs`, not the whole array -- passing
# the full array is why this cell did not work as expected.
# The per-topic probabilities are only available when the model was created
# with calculate_probabilities=True.
# NOTE(review): if no topic of the chosen document exceeds min_probability the
# call raises; lower the threshold in that case -- confirm on real data.
doc_index = 0  # position of the review whose topic distribution is plotted
topics, probs = topic_model.transform(df['Text'])
topic_model.visualize_distribution(probs[doc_index], min_probability=0.2)

In [11]:
# Hierarchy plot of the topics, limited to 50 via top_n_topics.
topic_model.visualize_hierarchy(top_n_topics=50)

In [13]:
# Bar charts of topic keywords, limited to 10 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=10)

In [23]:
# Optional post-processing of the fitted topics. Both calls modify topic_model.
# 1) Manual: collapse hand-picked topic ids into a single topic.
topic_model.merge_topics(docs=df['Text'], topics_to_merge=[1, 4, 5])
# 2) Iterative: keep reducing until 10 topics remain.
topic_model.reduce_topics(docs=df['Text'], nr_topics=10)

<bertopic._bertopic.BERTopic at 0x145130a6c80>

In [24]:
# The number of topics can be reduced after the model is built (see the
# merge_topics / reduce_topics cell); re-draw the topic plot to inspect the result.
topic_model.visualize_topics()

In [29]:
# Compute the topic representations over time: review timestamps are grouped
# into 20 bins, with both global and evolution tuning switched on.
topics_over_time = topic_model.topics_over_time(
    docs=df['Text'],
    timestamps=df['Time'],
    nr_bins=20,
    evolution_tuning=True,
    global_tuning=True,
)

In [30]:
# Plot the previously computed topics_over_time result, limited to 20 topics.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [33]:
# Generate a custom label for every topic: three keywords joined with '-',
# without the topic-id prefix, using word_length=15 as the per-word length limit.
topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    topic_prefix=False,
    word_length=15,
    separator='-',
)
topic_model.set_topic_labels(topic_labels)

# Alternatively, hand-written labels can be set for specific topic ids only.
topic_model.set_topic_labels({0: "Cold Drink", 1: "Coffee beans"})

# The labels appear in the CustomName column of the topic table.
topic_model.get_topic_info().head(n=10)

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,1269,-1_great_product_good_taste,great-product-good
1,0,347,0_tea_green_iced_drink,Cold Drink
2,1,324,1_coffee_cup_starbucks_beans,Coffee beans
3,2,207,2_product_tastes_taste_great,product-tastes-taste
4,3,167,3_snacks_snack_tasty_fruit,snacks-snack-tasty
5,4,147,4_chips_potato_bag_bbq,chips-potato-bag
6,5,135,5_orange_juice_soda_drink,orange-juice-soda
7,6,134,6_price_grocery_store_amazon,price-grocery-store
8,7,109,7_dog_treats_dogs_treat,dog-treats-dogs
9,8,98,8_nuts_almond_roasted_nut,nuts-almond-roasted


In [36]:
# Search for the 3 topics that best match the term 'animal'.
# The result is a pair: (topic ids, matching similarity scores).
topic_model.find_topics('animal', top_n=3)

([12, 7, 17], [0.7597972611987327, 0.7444208026393664, 0.6740989352739281])

In [42]:
# Keywords and their weights for topic 17 (one of the hits for 'animal').
topic_model.get_topic(17)

[('china', 0.12490522790589333),
 ('treats', 0.08203824691364132),
 ('dogs', 0.05424114952046814),
 ('dog', 0.0519195313209965),
 ('chicken', 0.04853867168122317),
 ('products', 0.035656979428674934),
 ('usa', 0.03498047359496498),
 ('country', 0.017893434052750193),
 ('sick', 0.017820012181890593),
 ('food', 0.01726687124620462)]

In [25]:
# Save the fitted model to the path "bertopic_model" (relative to the working directory).
# NOTE(review): BERTopic's default serialization is presumably pickle-based --
# confirm for the installed version.
topic_model.save("bertopic_model")	

In [28]:
# Load a previously saved model from "bertopic_model", replacing the in-memory topic_model.
# NOTE(review): if the file is pickle-based, only load model files from a
# trusted source -- unpickling can execute arbitrary code.
topic_model = BERTopic.load("bertopic_model")	