# Topic Modeling with [BERTopic](https://github.com/MaartenGr/BERTopic)
This notebook fits a BERTopic topic model on sentence-split review text, merges and labels the resulting topics, and visualizes them.

In [2]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import nltk
from textblob import TextBlob
import random
from nltk.tokenize import sent_tokenize
# Fetch the NLTK tokenizer resources needed by `sent_tokenize`.
# A `for` loop is a statement, so the cell shows no return value.
for nltk_resource in ('punkt_tab', 'punkt'):
    nltk.download(nltk_resource, quiet=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the cleaned reviews; missing values are replaced by empty strings.
df = pd.read_csv('../1-etl/reviews_clean.csv').fillna('')

# Build the sentence-level corpus in one pass:
#   1. keep reviews rated at most 5 stars,
#   2. drop reviews whose text is 20 characters or shorter,
#   3. split every remaining review into sentences.
corpus = [
    sentence
    for review in df.loc[df['stars'] <= 5, 'content']
    if len(review) > 20
    for sentence in sent_tokenize(review)
]
len(corpus)

2821

In [4]:
# Pre-compute one embedding per sentence so the same vectors can be reused
# for fitting the topic model and for the document visualisation.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
# `corpus` is already a list, so it is passed as-is.
embeddings = sentence_model.encode(corpus)

In [14]:
# Define sub-models.
# The n-gram range must be configured on the CountVectorizer itself:
# BERTopic ignores its own `n_gram_range` argument whenever a custom
# `vectorizer_model` is supplied, which silently limited the topic
# representations to unigrams.
vectorizer = CountVectorizer(stop_words="english", ngram_range=(1, 3))
# NOTE: `umap_model` is defined for experimentation but is not passed to
# BERTopic below (the argument is commented out).
umap_model = UMAP(n_neighbors=5, n_components=10,
                  min_dist=0.0, metric='cosine')
# hdbscan_model = HDBSCAN(min_cluster_size=10, min_samples=2, metric='euclidean')

# Train our topic model with BERTopic on the pre-computed sentence embeddings.
# `calculate_probabilities=True` is needed for the per-document topic
# distribution plotted later via `topic_model.probabilities_`.
topic_model = BERTopic(
    language="english",
    calculate_probabilities=True,
    verbose=True,
    embedding_model=sentence_model,
    # umap_model=umap_model,
    # hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
)
topics, probs = topic_model.fit_transform(corpus, embeddings)
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,1161,-1_the_to_it_and,"the, to, it","[the, to, it, and, printer, this, that, of, fo...","[Constantly ""offline"", disconnecting from my r..."
1,0,124,0_hp_with_support_with hp,"hp, with, support","[hp, with, support, with hp, hp and, and, amaz...",[I spent literal hours on the laptop with hp s...
2,1,83,1_service_customer_customer service_he,"service, customer, customer servic","[service, customer, customer service, he, me, ...",[When calling customer support they immediatel...
3,2,77,2_set_easy_set up_up,"set, easy, set up","[set, easy, set up, up, to set, setup, printer...",[This printer was vry easy to set up and conne...
4,3,74,3_quality_print quality_quality is_prints,"quality, print quality, quality is","[quality, print quality, quality is, prints, t...",[The print quality is not very good on this on...
5,4,74,4_tray_paper_paper tray_the paper,"tray, paper, paper tray","[tray, paper, paper tray, the paper, the, slid...",[The only thing I don’t like is the paper tray...
6,5,67,5_hp_hp printer_hp printers_printers,"hp, hp printer, hp printers","[hp, hp printer, hp printers, printers, years,...",[I bought this HP printer to replace a very si...
7,6,62,6_printer_this_printer is_for,"printer, this, printer is","[printer, this, printer is, for, good, this pr...",[Have not learned everything about it but so f...
8,7,49,7_scan_scanning_to scan_sided,"scan, scanning, to scan","[scan, scanning, to scan, sided, page, documen...",[The duplex function is useless: When trying t...
9,8,47,8_buy_would_price_buy it,"buy, would, price","[buy, would, price, buy it, recommend, would n...","[Would definitely buy again., Would definitely..."


In [None]:
# topic_model.save('model.bertopic')

In [52]:
# Groups of topic ids to collapse into a single topic each.
# NOTE(review): the ids refer to the topic numbering of one specific fit;
# re-fitting the model may renumber topics, so confirm them before re-running.
topics_to_merge = [
    [-1, 5, 6, 26, 45],
    [2, 9],
    [13, 18, 20],
    [12, 21, 36, 40],
    [11, 16],
    [27, 4],
    [14, 50],
    [15, 22],
]
topic_model.merge_topics(corpus, topics_to_merge=topics_to_merge)

In [None]:
# Take the per-topic summary table, persist it as CSV next to the notebook,
# and display it as the cell output.
results_df = topic_model.get_topic_info()
results_df.to_csv(path_or_buf='topic_modeling.csv')
results_df

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,1325,-1_the_printer_to_it,outliers,"[the, printer, to, it, and, this, hp, for, is,...",[If you don't print much this printer is not f...
1,0,124,0_hp_with_support_and,support,"[hp, with, support, and, with hp, hp and, to, ...",[I spent literal hours on the laptop with hp s...
2,1,122,1_set_set up_easy_up,set up,"[set, set up, easy, up, to set, easy to, setup...","[This printer is easy to set up., This was eas..."
3,2,101,2_ink_hp_the_you,ink and cartridge,"[ink, hp, the, you, to, cartridges, for, and, ...",[I paid for instant ink for months and everyth...
4,3,96,3_wifi_to_my_the,wifi and connection,"[wifi, to, my, the, it, my wifi, wireless, con...",[But since this one is STILL NOT in wifi conne...
5,4,95,4_tray_paper_paper tray_the paper,paper tray,"[tray, paper, paper tray, the paper, the, slid...",[The only thing I don’t like is the paper tray...
6,5,83,5_service_customer_customer service_he,customer service,"[service, customer, customer service, he, me, ...",[When calling customer support they immediatel...
7,6,78,6_print_slow_to print_printer,print time,"[print, slow, to print, printer, it, the, take...",[memories insufficient on the printer it takes...
8,7,74,7_quality_print quality_quality is_print,quality,"[quality, print quality, quality is, print, pr...",[The print quality is not very good on this on...
9,8,60,8_subscription_ink_the subscription_the,subscription,"[subscription, ink, the subscription, the, for...",[I canceled the subscription and was told I wo...


In [54]:
# Assign human-readable labels to the merged topics.
# The list is positional: the topic-info table shows the first entry applied
# to topic -1 (outliers) and the following entries to topics 0, 1, 2, ...
# The trailing comments give the topic id each label is expected to land on,
# assuming that ascending order holds for the whole list.
# NOTE(review): the list length presumably has to match the number of topics
# in the model -- confirm after any change to the merge step.
topic_model.set_topic_labels([
    'outliers',  # -1
    'support',  # 0
    'set up',  # 1
    'ink and cartridge',  # 2
    'wifi and connection',  # 3
    'paper tray',  # 4
    'customer service',  # 5
    'print time',  # 6
    'quality',  # 7
    'subscription',  # 8
    'will return',  # 9
    'scanner',  # 10
    'price',  # 11
    'free ink',  # 12
    'uninstalling',  # 13
    '8015',  # 14
    'waste',  # 15
    'smart',  # 16
    'shipping',  # 17
    '*surprise',  # 18
    'disappointed ',  # 19
    'pages',  # 20
    'touch screen',  # 21
    'works great',  # 22
    'other brands',  # 23
    'frustrating ',  # 24
    'not recommend',  # 25
    'fax',  # 26
    'flimsy/sturdy',  # 27
    'give stars',  # 28
    'jams',  # 29
    'at home',  # 30
    'lot of issues',  # 31
    'print with phone',  # 32
    'headaches',  # 33
    'worst printer, hate',  # 34
    'low cartridge',  # 35
    'wanted features',  # 36
    'Apple users',  # 37
    'printing envelopes',  # 38
    'customer service problems',  # 39
])

In [None]:
# Bar charts of the top terms for the first 16 (4 * 4) topics,
# titled with the custom labels.
topic_model.visualize_barchart(
    top_n_topics=16,
    custom_labels=True,
    width=300,
    height=200,
)

In [59]:
# Persist the merged and labelled model so later cells can reload it
# without re-fitting.
topic_model.save(path='model_postprocessed.bertopic')



In [5]:
# Reload the merged and labelled model written by the save cell above.
# The previous path ('model.bertopic') is only produced by a commented-out
# cell, so on a fresh run it was either missing or a stale, unlabelled model,
# while the visualisations below rely on the merged topics and custom labels.
topic_model = BERTopic.load('model_postprocessed.bertopic')

In [89]:
# Show one example sentence, then plot its topic probability distribution,
# hiding topics with probability below 0.04.
print(corpus[1266])
topic_model.visualize_distribution(
    topic_model.probabilities_[1266],
    min_probability=0.04,
    custom_labels=True,
    width=1000,
    height=400,
)

Once again I am unable to print anything.It is very hard to take customer service reps and their supervisors seriously when they contradict themselves don't listen to you and don't execute on their promises.In general, I think they have a serious attitude problem in customer service.


In [6]:
# Build the topic map, export it as a standalone HTML file,
# and display it inline as the cell output.
fig = topic_model.visualize_topics(
    custom_labels=True,
    width=1000,
    height=700,
)
fig.write_html('intertopic_map.html')
fig

In [None]:
# Approximate the topic distribution of every sentence; with
# `calculate_tokens=True` the call also returns token-level distributions,
# which the per-token visualisation below consumes.
topic_distr, topic_token_distr = topic_model.approximate_distribution(
    corpus,
    calculate_tokens=True,
)

In [69]:
# Token-level topic attribution for a single example sentence.
# The index is fixed for a stable example; the trailing comment keeps the
# random alternative at hand.
index = 1266  # random.randrange(0, len(corpus))
topic_model.visualize_approximate_distribution(
    corpus[index],
    topic_token_distr[index],
    normalize=True,
)

Unnamed: 0,Once,again,am,unable,to,print,anything,It,is,very,hard,to.1,take,customer,service,reps,and,their,supervisors,seriously,when,they,contradict,themselves,don,listen,to.2,you,and.1,don.1,execute,on,their.1,promises,In,general,think,they.1,have,serious,attitude,problem,in,customer.1,service.1
4_tray_paper_paper tray_the paper,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013,0.028,0.028,0.028,0.016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5_service_customer_customer service_he,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014,0.032,0.061,0.084,0.091,0.073,0.044,0.022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015,0.042,0.042,0.042,0.027
6_print_slow_to print_printer,0.0,0.0,0.0,0.013,0.033,0.048,0.048,0.035,0.015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7_quality_print quality_quality is_print,0.0,0.0,0.0,0.0,0.0,0.013,0.013,0.013,0.013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13_have tried_uninstalling_tried_troubleshooting,0.0,0.013,0.013,0.013,0.013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
# Topic-by-topic heatmap, with axes labelled by the custom topic labels.
topic_model.visualize_heatmap(
    custom_labels=True,
    width=1000,
    height=800,
)

In [None]:
# Plot every sentence of the corpus, reusing the pre-computed sentence
# embeddings, with topic annotations shown and custom labels applied.
fig = topic_model.visualize_documents(
    corpus,
    embeddings=embeddings,
    hide_annotations=False,
    custom_labels=True,
    width=1200,
    height=800,
)
# Export a static PNG (scale factor 3.0) and an HTML copy.
# NOTE(review): the PNG export presumably needs a plotly image-export backend
# installed and the ../img directory to exist -- confirm on a fresh checkout.
# The PNG is written first, so a failure there also skips the HTML export.
fig.write_image("../img/clustering.png", scale=3.0)
fig.write_html("clustering.html")
# Display the figure inline as the cell output.
fig