In [None]:
# only if you haven't already installed these
!pip install bertopic[all] sentence-transformers


In [1]:
import dask.dataframe as dd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import pandas as pd


In [None]:
# reload & filter to English if needed
DF_PATH = '../Step_3_analysis/top_100_parquet/70.parquet'

df = dd.read_parquet(DF_PATH,
                     columns=['review', 'voted_up', 'review_language'])
df = df[df['review_language']=='english']
df = df.persist()

# split into pos/neg and sample up to 50k each
def sample_reviews(ddf, label, n=50000):
    bucket = ddf[ddf['voted_up']==label]['review'].dropna()
    total = bucket.count().compute()
    frac  = min(1.0, n/total)
    return bucket.sample(frac=frac).compute().tolist()

likes_docs    = sample_reviews(df, True)
dislikes_docs = sample_reviews(df, False)


In [3]:
# use a lightweight SBERT model
emb_model = SentenceTransformer("all-MiniLM-L6-v2")

# BERTopic will use UMAP & HDBSCAN under the hood
topic_model = BERTopic(
    embedding_model=emb_model,
    n_gram_range=(1,2),
    min_topic_size=50,
    verbose=True
)


In [4]:
# Cell 4 (Option B): Fit & inspect “Likes” topics manually
likes_topics, likes_probs = topic_model.fit_transform(likes_docs)

likes_info = topic_model.get_topic_info().head(10)
print("Top ‘like’ topics by frequency:")
display(likes_info)

for topic_id in likes_info.Topic.iloc[1:6]:
    terms = topic_model.get_topic(topic_id)
    # find first index where likes_topics == topic_id
    idx = next((i for i, t in enumerate(likes_topics) if t == topic_id), None)
    example = likes_docs[idx] if idx is not None else "(no example)"
    print(f"\nTopic {topic_id} — terms: {[t for t,_ in terms]}")
    print("Example review:", example)


2025-05-03 00:28:22,469 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/835 [00:00<?, ?it/s]

2025-05-03 00:28:25,053 - BERTopic - Embedding - Completed ✓
2025-05-03 00:28:25,053 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-03 00:28:34,155 - BERTopic - Dimensionality - Completed ✓
2025-05-03 00:28:34,155 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

Top ‘like’ topics by frequency:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,7237,-1_the_in_this_game,"[the, in, this, game, it, and, in the, to, thi...","[One of the best game ever, Started playing CS..."
1,0,1964,0_de_que_el_la,"[de, que, el, la, juego, si, un, es, jogo, joc]",[El mejor juego de shooter en la historia en s...
2,1,1181,1_fps_best fps_fps game_best,"[fps, best fps, fps game, best, ever, the best...","[Best FPS game ever, Best FPS game ever 3, the..."
3,2,1085,2_counterstrike_the_and_of,"[counterstrike, the, and, of, to, is, that, in...",[alfLife CounterStrike is one of the most unus...
4,3,852,3_shooter_person_first person_shooting,"[shooter, person, first person, shooting, shoo...","[Best first person shooter game in the world, ..."
5,4,797,4_strike_counter_counter strike_strike 16,"[strike, counter, counter strike, strike 16, s...","[counter strike, counter strike, counter strike]"
6,5,697,5_cs 16_16_cs_16 is,"[cs 16, 16, cs, 16 is, is, cs16, and, the, to,...","[cs 16 CS 16, CS 16, CS 16]"
7,6,676,6_1010_game 1010_1010 would_would,"[1010, game 1010, 1010 would, would, game, 101...","[1010, 1010, 1010]"
8,7,576,7_but gold_gold old_gold_old but,"[but gold, gold old, gold, old but, old, but, ...","[Old but gold, Old but gold, old but gold]"
9,8,524,8_csgo_than csgo_better_than,"[csgo, than csgo, better, than, this, but, thi...","[CSGO, CSGO, csgo]"



Topic 0 — terms: ['de', 'que', 'el', 'la', 'juego', 'si', 'un', 'es', 'jogo', 'joc']
Example review: es number uno high damage

Topic 1 — terms: ['fps', 'best fps', 'fps game', 'best', 'ever', 'the best', 'fps games', 'fps ever', 'of', 'the']
Example review: One of my first fps online shooter games Ive ever played and the best one

Topic 2 — terms: ['counterstrike', 'the', 'and', 'of', 'to', 'is', 'that', 'in', 'as', 'its']
Example review: Since then there has been nothing fundamentally new in multiplayer cooperative shooters except for eyebleeding effects and ridiculous mechanics Why look for something better when there is CounterStrike

Topic 3 — terms: ['shooter', 'person', 'first person', 'shooting', 'shooter game', 'first', 'person shooter', 'best', 'best shooter', 'the']
Example review: One of the most legendary shooters of the last decade A competitive shooter of the combination of tactics teamwork game knowledge and aim

Topic 4 — terms: ['strike', 'counter', 'counter strike',

In [5]:
# Cell 5: Fit & Inspect “Dislikes” topics manually (Option B)

# Re-initialize a fresh BERTopic with the same settings
dislike_model = BERTopic(
    embedding_model=emb_model,
    n_gram_range=(1,2),
    min_topic_size=50,
    verbose=True
)

# Fit on the negative reviews
dis_topics, dis_probs = dislike_model.fit_transform(dislikes_docs)

# Show top-10 topic info
dis_info = dislike_model.get_topic_info().head(10)
print("Top ‘dislike’ topics by frequency:")
display(dis_info)

# For each of the next 5 topics, print its top terms and a representative example
for topic_id in dis_info.Topic.iloc[1:6]:  # skip -1 (outliers)
    terms = dislike_model.get_topic(topic_id)
    # find first index where dis_topics == topic_id
    idx = next((i for i, t in enumerate(dis_topics) if t == topic_id), None)
    example = dislikes_docs[idx] if idx is not None else "(no example)"
    print(f"\nTopic {topic_id} — terms: {[t for t,_ in terms]}")
    print("Example review:", example)


2025-05-03 00:28:35,834 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

2025-05-03 00:28:35,960 - BERTopic - Embedding - Completed ✓
2025-05-03 00:28:35,961 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-03 00:28:37,445 - BERTopic - Dimensionality - Completed ✓
2025-05-03 00:28:37,446 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-03 00:28:37,462 - BERTopic - Cluster - Completed ✓
2025-05-03 00:28:37,463 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-03 00:28:37,500 - BERTopic - Representation - Completed ✓


Top ‘dislike’ topics by frequency:


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,886,0_the_game_and_to,"[the, game, and, to, this, is, you, it, of, in]",[Your Mom probably would enjoy this Kids Every...
1,1,76,1_de_que_jogo_os,"[de, que, jogo, os, eu, no, com, na, se, voc]",[no consegui configurar mouse bugado caindo do...



Topic 1 — terms: ['de', 'que', 'jogo', 'os', 'eu', 'no', 'com', 'na', 'se', 'voc']
Example review: igra dlya pidorov konchenee igri ne bivaet autisti tolko mogyt igrat v eto dermo


In [None]:
# Save the top-10 topics + term lists to JSON
likes_out = {
    int(row.Topic): topic_model.get_topic(int(row.Topic))
    for _, row in likes_info.iloc[1:11].iterrows()
}
dis_out = {
    int(row.Topic): dislike_model.get_topic(int(row.Topic))
    for _, row in dis_info.iloc[1:11].iterrows()
}

import json
with open("bertinsights_likes.json",    "w") as f: json.dump(likes_out, f, indent=2)
with open("bertinsights_dislikes.json", "w") as f: json.dump(dis_out,   f, indent=2)


Exception ignored in: <function ResourceTracker.__del__ at 0x7d33fdd3b1a0>
Traceback (most recent call last):
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x73865004f1a0>
Traceback (most recent call last):
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 