In [23]:
pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from bertopic import BERTopic
import pandas as pd

In [28]:

# Path of the CSV file holding the reviews (relative to the working directory).
reviews = "reviews.csv"

df = pd.read_csv(reviews)

# `Text` is the column that is later embedded and clustered by BERTopic;
# `Time` is the column that is later passed as timestamps to topics_over_time.
data = df['Text']
time = df['Time']

# Plain Python lists of the same two columns, in the same row order.
# (The former mid-cell `df.head()` calls were dropped: their results were
# discarded and never displayed because they were not the last expression.)
data_list = data.to_list()
time_list = time.to_list()

In [29]:
from sentence_transformers import SentenceTransformer

# Load the 'all-MiniLM-L6-v2' sentence-transformer and echo it as the cell output.
# NOTE(review): `model` is rebound to a BERTopic instance in a later cell, and the
# same checkpoint is loaded again there as `embedding_model` -- this cell looks
# like an inspection-only step; confirm before relying on `model` here.
model = SentenceTransformer('all-MiniLM-L6-v2')
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [30]:
!pip install umap-learn
from sentence_transformers import SentenceTransformer
import umap.umap_ as UMAP
from hdbscan import HDBSCAN

# Sentence encoder handed to BERTopic to embed each document.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# UMAP is stochastic: without a fixed seed the reduced embeddings -- and so the
# clusters and topics built on them -- differ between runs.
# NOTE(review): umap-learn may run single-threaded when a seed is set -- confirm
# if runtime matters.
UMAP_RANDOM_STATE = 42

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP.UMAP(n_neighbors=3, n_components=3, min_dist=0.05,
                       random_state=UMAP_RANDOM_STATE)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(min_cluster_size=80, min_samples=40,
                        gen_min_span_tree=True,
                        prediction_data=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
!pip install nltk
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# English stopwords plus a few URL/HTML fragments; removing them keeps such
# tokens from polluting the topics.
# NOTE: this rebinds the name `stopwords` from the object imported from
# nltk.corpus to a plain list of words.
stopwords = [*stopwords.words('english'), 'http', 'https', 'amp', 'com']

# Count unigrams and bigrams, skipping every entry of the stop-word list.
vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1, 2))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [31]:
from bertopic import BERTopic

# Assemble BERTopic from the components configured in the earlier cells.
# NOTE: this rebinds `model`, which previously held a SentenceTransformer.
model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    top_n_words=10,
    language='english',
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the plain list of documents: this is the type BERTopic documents for
# `fit_transform`, and the same object that is later passed to
# `topics_over_time`, so fitting and the time analysis share one input.
topics, probs = model.fit_transform(data_list)

Batches:   0%|          | 0/171 [00:00<?, ?it/s]

2023-04-11 20:00:01,641 - BERTopic - Transformed documents to Embeddings
2023-04-11 20:00:23,003 - BERTopic - Reduced dimensionality
2023-04-11 20:00:23,914 - BERTopic - Clustered reduced embeddings


In [32]:
# Save the fitted BERTopic model to "./nigel_bert" (relative to the working directory).
model.save("./nigel_bert")

In [35]:
# These downloads are only needed if the NLTK packages are not already present.
for resource in ('wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource)
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Pass every entry of `data` through the lemmatizer, preserving order.
# NOTE(review): each entry is handed to lemmatize() as one whole string, and the
# printed sample looks identical to a raw review -- confirm whether per-token
# lemmatization was intended. The coherence cell pairs `filtered_text` with
# topics fitted on the raw text, so changing this affects that computation.
filtered_text = [lemmatizer.lemmatize(doc) for doc in data]
print(filtered_text[:1])

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


['This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.']


In [37]:
from bertopic.vectorizers import ClassTfidfTransformer
# Create the c-TF-IDF transformer with default settings; it is passed to
# BERTopic as `ctfidf_model` when `topic_model` is constructed.
ctfidf_model = ClassTfidfTransformer()

In [43]:
# Second BERTopic configuration that additionally wires in the explicit c-TF-IDF model.
# NOTE(review): the only fit call for `topic_model` in the visible cells is
# commented out, so this instance appears to be unused -- confirm.
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  nr_topics=10                        # Step 6 - Reduce the fitted topics to 10 (per BERTopic docs; this is not a word-diversity setting)
)

In [None]:
#topics, probabilities = topic_model.fit_transform(filtered_text)

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora

In [51]:
# Pair every entry of `filtered_text` with the topic assigned to the document
# at the same position, then join all documents of a topic into one string.
documents = pd.DataFrame({"Document": filtered_text,
                          "ID": range(len(filtered_text)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = model._preprocess_text(documents_per_topic.Document.values)

# Reuse the vectorizer held by the BERTopic model so that tokenization here
# matches the one behind the topic words.
vectorizer = model.vectorizer_model
analyzer = vectorizer.build_analyzer()

# Extract features for Topic Coherence evaluation.
# `words` is the vectorizer vocabulary; the coherence computation below does
# not read it.
words = vectorizer.get_feature_names_out()
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]

# Score exactly the topic ids that were assigned, skipping -1 (BERTopic's
# outlier label). The previous `range(len(set(topics)) - 1)` was only correct
# when an outlier topic existed and the ids were contiguous from 0; otherwise
# it dropped the last topic or requested ids that were never assigned.
topic_ids = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _ in model.get_topic(topic)]
               for topic in topic_ids]

# Evaluate
coherence_model = CoherenceModel(topics=topic_words,
                                 texts=tokens,
                                 corpus=corpus,
                                 dictionary=dictionary,
                                 coherence='c_v')
coherence = coherence_model.get_coherence()

In [52]:
# Show the 'c_v' coherence value returned by CoherenceModel.get_coherence().
print(coherence)

0.6808601199657041


In [None]:

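# Load the serialized model.
# NOTE: this notebook saves the model to "./nigel_bert"; adjust the path below to match before uncommenting.
# my_best_model = BERTopic.load("./model_dir/my_best_model")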

In [None]:
# Bar-chart visualisation of the fitted model's topics, shown as the cell output.
model.visualize_barchart()

In [14]:
# `top_10_words_berttopic` is not created by any visible cell, so on a fresh
# kernel the export below would raise NameError. When the table is missing,
# build it from the fitted model: one row per (topic, word) with the word's
# score as reported by `model.get_topics()`.
if 'top_10_words_berttopic' not in globals():
    top_10_words_berttopic = pd.DataFrame(
        [
            {"Topic": topic_id, "Word": word, "Score": score}
            for topic_id, word_scores in model.get_topics().items()
            for word, score in word_scores
        ],
        columns=["Topic", "Word", "Score"],
    )

# Write the table to CSV without the DataFrame index.
top_10_words_berttopic.to_csv('top_10_words_bert.csv', index=False)

In [13]:
# Keep the bar-chart figure in `fig` so it can be exported and displayed later.
fig = model.visualize_barchart()

In [10]:
# Export the figure to "file.html" in the working directory.
fig.write_html("file.html")

In [14]:
# Display the figure as the cell output.
fig

In [15]:
# Build BERTopic's topics-over-time table from the documents and their
# timestamps, with the timestamps grouped into 20 bins.
topics_over_time = model.topics_over_time(
    docs=data_list,
    timestamps=time_list,
    nr_bins=20,
)

19it [00:18,  1.03it/s]


In [17]:
# Plot the topics-over-time table computed above, limited to 20 topics via `top_n_topics`.
model.visualize_topics_over_time(topics_over_time, top_n_topics=20)