In [None]:
!pip install "dask[complete]" bertopic sentence-transformers umap-learn hdbscan joblib

In [1]:
# ## 1. Imports and Global Setup

# %%
import os
import gc
import torch
import cudf
import dask_cudf
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from cuml.manifold import UMAP as cuUMAP

# Directory paths
RAW_DIR       = "../Step_3_analysis/top_100_parquet"
MODELS_DIR_L  = "models/bertopic_en_L_gpu"
MODELS_DIR_M  = "models/bertopic_en_M_gpu"
OUTPUT_DIR_L  = "outputs/topic_labels_en_L_gpu"
OUTPUT_DIR_M  = "outputs/topic_labels_en_M_gpu"
for d in [MODELS_DIR_L, MODELS_DIR_M, OUTPUT_DIR_L, OUTPUT_DIR_M]:
    os.makedirs(d, exist_ok=True)

# Instantiate a single GPU embedder
EMBEDDER = SentenceTransformer("all-MiniLM-L6-v2", device="cuda")


In [2]:
# ## 2. Compute English-Only Review Volumes & Buckets

# %%
# Read minimal columns to compute per-game counts
ddf = dask_cudf.read_parquet(f"{RAW_DIR}/*.parquet",
                             columns=["steam_appid","review_language"])
vol = (
    ddf[ddf.review_language == "english"]
      .groupby("steam_appid")
      .size()
      .compute()
      .to_pandas()
      .rename("review_count")
)

# Define buckets
LARGE_EN  = vol[vol > 50_000].index.to_list()
MEDIUM_EN = vol[(vol >= 10_000) & (vol <= 50_000)].index.to_list()
SMALL_EN  = vol[vol < 10_000].index.to_list()

print(f"Large games (>50k reviews): {len(LARGE_EN)}")
print(f"Medium games (10k–50k reviews): {len(MEDIUM_EN)}")
print(f"Small games (<10k reviews): {len(SMALL_EN)}")

Large games (>50k reviews): 41
Medium games (10k–50k reviews): 58
Small games (<10k reviews): 1


In [3]:
# ## 3. Define Per-Game Processing Function

# %%
def process_game_gpu(app_id: str, bucket: str):
    # 3.1 Load English reviews via cuDF
    gdf = cudf.read_parquet(f"{RAW_DIR}/{app_id}.parquet")[["review_language","review"]]
    gdf = gdf[gdf["review_language"] == "english"].dropna(subset=["review"])
    texts = gdf["review"].astype(str).to_pandas().tolist()
    del gdf; gc.collect()
    
    # 3.2 Embed texts on GPU
    embeddings = EMBEDDER.encode(
        texts, batch_size=512, show_progress_bar=False, device="cuda"
    )
    
    # 3.3 Configure UMAP & HDBSCAN
    umap_model    = cuUMAP(n_components=5, random_state=42)
    min_size      = 100 if bucket == "large" else 20
    hdbscan_model = HDBSCAN(min_cluster_size=min_size, core_dist_n_jobs=4)
    
    # 3.4 Fit BERTopic
    topic_model = BERTopic(
        embedding_model=EMBEDDER,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        low_memory=(bucket == "large")
    )
    topics, _ = topic_model.fit_transform(texts, embeddings)
    
    # 3.5 Compute topic count before cleanup
    n_topics = len(set(topics))
    
    # 3.6 Save model & labels
    model_dir = MODELS_DIR_L if bucket == "large" else MODELS_DIR_M
    out_dir   = OUTPUT_DIR_L if bucket == "large" else OUTPUT_DIR_M
    topic_model.save(f"{model_dir}/{app_id}")
    pd.DataFrame({"review": texts, "topic": topics}) \
      .to_csv(f"{out_dir}/{app_id}.csv", index=False)
    
    # 3.7 Cleanup GPU memory
    del embeddings, topic_model, topics
    gc.collect(); torch.cuda.empty_cache()
    
    return app_id, n_topics

In [4]:
 ## 4. Sequential Processing for Large & Medium Games

# %%
results = []
# Large games
for gid in LARGE_EN:
    results.append(process_game_gpu(gid, bucket="large"))
# Medium games
for gid in MEDIUM_EN:
    results.append(process_game_gpu(gid, bucket="medium"))

[2025-05-01 00:34:30.690] [CUML] [info] build_algo set to brute_force_knn because random_state is given


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[2025-05-01 00:34:55.542] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:35:15.877] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:35:31.196] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:35:45.194] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:36:10.190] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:36:32.125] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:36:42.908] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:37:04.387] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:37:22.288] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:37:43.122] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:38:11.742] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:38:27.936] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:38:39.736] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:38:59.792] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:39:21.906] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:39:46.809] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:40:03.246] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:40:22.127] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:40:51.877] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:41:17.683] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:41:37.922] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:41:58.029] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:42:16.723] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:42:30.134] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:42:42.620] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:42:55.894] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:43:13.775] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:43:27.381] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:43:42.437] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:44:03.042] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:44:18.314] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:44:36.065] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:44:55.675] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:45:07.060] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:45:21.732] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:45:37.109] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:45:52.632] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:46:13.869] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:46:30.672] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:46:55.855] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:47:15.393] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:47:22.340] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:47:29.764] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:47:39.988] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:47:50.140] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:47:56.050] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:03.330] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:12.442] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:19.467] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:26.659] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:34.496] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:38.561] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:48:45.243] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:01.001] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:12.923] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:21.398] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:32.143] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:38.323] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:43.431] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:52.169] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:49:59.701] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:04.382] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:12.993] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:22.448] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:30.238] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:37.377] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:46.046] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:50:56.675] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:03.415] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:09.264] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:14.192] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:20.024] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:29.535] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:37.516] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:44.867] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:51.956] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:51:59.772] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:12.032] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:22.368] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:31.579] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:38.633] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:45.206] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:51.330] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:52:57.259] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:07.192] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:14.719] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:26.855] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:37.333] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:44.624] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:52.407] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:53:57.718] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:02.640] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:09.512] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:15.766] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:24.369] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:31.175] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:37.887] [CUML] [info] build_algo set to brute_force_knn because random_state is given




[2025-05-01 00:54:46.871] [CUML] [info] build_algo set to brute_force_knn because random_state is given




In [5]:
# ## 5. Summarize Topic Counts per Game

# %%
df_summary = pd.DataFrame([
    {
      "steam_appid": app_id,
      "bucket": "Large-EN" if app_id in LARGE_EN else "Medium-EN",
      "n_topics": n_topics
    }
    for app_id, n_topics in results
]).sort_values(["bucket","n_topics"], ascending=[True, False]) \
  .reset_index(drop=True)

df_summary.head(20)

Unnamed: 0,steam_appid,bucket,n_topics
0,284160,Large-EN,158
1,322170,Large-EN,120
2,291550,Large-EN,75
3,107410,Large-EN,72
4,311210,Large-EN,54
5,444090,Large-EN,53
6,322330,Large-EN,47
7,294100,Large-EN,41
8,238960,Large-EN,37
9,393380,Large-EN,34


In [6]:
# ## 6. Small Games Fallback Preparation

# %%
if SMALL_EN:
    sid = SMALL_EN[0]
    gdf_small = cudf.read_parquet(f"{RAW_DIR}/{sid}.parquet")[["review_language","review"]]
    df_small = (
        gdf_small[gdf_small["review_language"]=="english"]
        .dropna(subset=["review"])
        .sample(frac=1.0, random_state=42)
        .to_pandas()
    )
    sample_small = df_small.review.sample(n=1000, random_state=42).tolist()
    pd.DataFrame({"sample_review": sample_small}).head()

Exception ignored in: <function ResourceTracker.__del__ at 0x7ed64dd971a0>
Traceback (most recent call last):
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 111, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7c1ce5d1f1a0>
Traceback (most recent call last):
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 77, in __del__
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 86, in _stop
  File "/home/rgmatr1x/anaconda3/envs/rapids-25.04/lib/python3.12/multiprocessing/resource_tracker.py", line 

In [8]:
from bertopic import BERTopic
import pandas as pd

app_id = "10"   # replace with your game ID

# 1. Load model
model = BERTopic.load(f"models/bertopic_en_M_gpu/{app_id}")  # or ..._M_gpu

# 2. See topic info
topic_info = model.get_topic_info()     # DataFrame with topic IDs, sizes
print(topic_info.head(10))

# 3. Get the top words per topic
for topic in topic_info.Topic.head(5):
    if topic == -1: continue           # skip outliers
    print("Topic", topic, "words:", model.get_topic(topic))

   Topic  Count                             Name  \
0     -1  15454             -1_game_and_the_this   
1      0   1664                   0_de_que_el_si   
2      1    594              1_fps_best_of_still   
3      2    554  2_shooter_person_shooting_first   
4      3    531                 3_without_what__   
5      4    504       4_counterstrike_and_the_of   
6      5    418            5_1010_910_would_1110   
7      6    353             6_ever_best_game_duh   
8      7    344                7_cs_go_best_love   
9      8    321           8_gold_old_but_spoiler   

                                      Representation  \
0   [game, and, the, this, to, it, of, is, you, for]   
1     [de, que, el, si, juego, la, un, jogo, es, se]   
2  [fps, best, of, still, classic, one, games, mu...   
3  [shooter, person, shooting, first, shoot, shoo...   
4                    [without, what, , , , , , , , ]   
5  [counterstrike, and, the, of, its, that, to, i...   
6  [1010, 910, would, 1110, again, 

In [3]:
# 4. Representative docs for all topics
reprs = model.get_representative_docs()   # returns { topic_id: [doc1, doc2, …] }

# If you only want the first 5 topics:
top_topics = model.get_topic_info().Topic.tolist()[:30]

for topic in top_topics:
    docs = reprs.get(topic, [])[:3]      # take up to 3 example docs
    print(f"\nTopic {topic}:")
    for doc in docs:
        # truncate to 200 chars and remove newlines
        clean = doc.replace("\n", " ")[:200]
        print(" •", clean)



Topic -1:
 • Another update of my almost 10 years old review With the release of CS2 I feel compelled to share my sentiments about this gaming masterpiece It has LITERALLY given me more joy and the ability to thin
 • This will be more of a my experience with this game type of review because saying things like great gameplay will not suit something Ive experienced with CounterStrike Here you go I remember back in 2
 • This will be more of a my experience with this game type of review because saying things like great gameplay will not suit something Ive experienced with CounterStrike Here you go I remember back in 2

Topic 0:
 • Del juego no puedo decir nada es buensimo es un juego para pasarla bien y muy buen nivel de competicin Ahora si lo que buscas es ser Pro y sentirte algo en Chile pierdes el tiempo ac nadie nunca conse
 • Mira Enorme te puedo tirar un par de consejos te cuento una historia ojal te ayude Trabajo en una rotiseria y siempre viene el gordo termotanque hijo de remil p

In [2]:
from bertopic import BERTopic
import pandas as pd
import cudf

app_id = "10"  # your game ID

# 1. Load your model
model = BERTopic.load(f"models/bertopic_en_M_gpu/{app_id}")

# 2. Load the CSV of topic assignments and compute counts
df = pd.read_csv(f"outputs/topic_labels_en_M_gpu/{app_id}.csv")
counts = (df.topic
            .value_counts()
            .rename_axis("topic")
            .reset_index(name="mentions"))

# 3. Load original review metadata to compute avg votes_up per topic
reviews = cudf.read_parquet(f"../Step_3_analysis/top_100_parquet/{app_id}.parquet").to_pandas()[["review","votes_up"]]
merged = df.merge(reviews, on="review")
sentiment = (merged.groupby("topic")
                  .votes_up.mean()
                  .rename("avg_votes_up")
                  .reset_index())

# 4. Combine counts & sentiment
stats = counts.merge(sentiment, on="topic")

# 5. For each of the top 10 topics (by mentions), print stats & top-5 words
top_topics = stats.sort_values("mentions", ascending=False).topic.head(30)

for tid in top_topics:
    row = stats[stats.topic == tid].iloc[0]
    words = [w for w, _ in model.get_topic(int(tid))][:30]  # top 5 words
    print(f"Topic {tid}: mentions={row.mentions:,}, avg_votes_up={row.avg_votes_up:.2f}")
    print("  Top words:", ", ".join(words))
    print()


Topic -1: mentions=15,454.0, avg_votes_up=2.25
  Top words: game, and, the, this, to, it, of, is, you, for

Topic 0: mentions=1,664.0, avg_votes_up=0.61
  Top words: de, que, el, si, juego, la, un, jogo, es, se

Topic 1: mentions=594.0, avg_votes_up=2.04
  Top words: fps, best, of, still, classic, one, games, multiplayer, the, all

Topic 2: mentions=554.0, avg_votes_up=3.16
  Top words: shooter, person, shooting, first, shoot, shooters, best, online, of, the

Topic 3: mentions=531.0, avg_votes_up=0.56
  Top words: without, what, , , , , , , , 

Topic 4: mentions=504.0, avg_votes_up=4.07
  Top words: counterstrike, and, the, of, its, that, to, is, as, 16

Topic 5: mentions=418.0, avg_votes_up=7.07
  Top words: 1010, 910, would, 1110, again, my, childhood, this, 100, game

Topic 6: mentions=353.0, avg_votes_up=1.07
  Top words: ever, best, game, duh, boom, end, true, off, , 

Topic 7: mentions=344.0, avg_votes_up=3.04
  Top words: cs, go, best, love, is, version, the, css, all, time

Top