In [None]:
# Cell 1: Install all dependencies (run once)
!pip install sentence-transformers umap-learn hdbscan scikit-learn transformers dask pandas matplotlib bertopic

In [None]:
# Cell 2: Imports & Seed Themes
import torch
import dask.dataframe as dd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from transformers import pipeline
from collections import defaultdict

# Domain seed topics: each key is a theme name, each value is the list of seed
# words/phrases for that theme. Later cells read the keys and values in
# insertion order, so keep the order of the entries stable.
themes = {
    'gameplay': [
        'gameplay', 'mechanics', 'tactical shooter', 'precision', 'aim',
        'headshot', 'spray-control', 'burst-fire', 'recoil', 'crosshair',
        'peek', 'counter-strafe', 'movement', 'jump-peek', 'clutch',
        'bomb plant', 'defuse', 'round', 'eco-round', 'overtime',
    ],
    'weapons': [
        'weapon', 'gun', 'rifle', 'ak', 'm4', 'awp', 'pistol', 'deagle',
        'smg', 'shotgun', 'sniper', 'knife', 'grenade', 'flashbang',
        'smoke', 'molotov', 'he-nade', 'incendiary', 'zeus', 'spray',
        'pull-out time', 'reload',
    ],
    'maps': [
        'map', 'layout', 'bombsite', 'call-out', 'rotation', 'angles',
        'cover', 'line-up', 'utility spot', 'choke-point', 'dust2',
        'mirage', 'inferno', 'nuke', 'overpass', 'ancient', 'vertigo',
        'office', 'train', 'cache',
    ],
    'competitive': [
        'competitive', 'matchmaking', 'rank', 'elo', 'premier', 'global elite',
        'silver', 'faceit', 'esport', 'tournament', 'major', 'league',
        'teamplay', 'strat', 'timeout', 'coach', 'demo review', 'practice',
    ],
    'economy & skins': [
        'economy', 'money', 'buy', 'force', 'save', 'full-buy', 'bonus-loss',
        'skin', 'knife skin', 'case', 'capsule', 'stattrak', 'souvenir',
        'sticker', 'trade-up', 'market', 'auction', 'rarity', 'float value',
        'pattern', 'lootbox',
    ],
    'anti_cheat': [
        'cheater', 'cheat', 'hacker', 'hack', 'wallhack', 'aimbot', 'spinbot',
        'vac', 'vac ban', 'prime', 'overwatch', 'smurf', 'rage', 'backtracking',
        'triggerbot', 'report', 'banwave', 'trust factor',
    ],
    'performance': [
        'fps', 'frame rate', 'stutter', 'lag', 'ping', 'tickrate', 'sub-tick',
        'server', 'hit-reg', 'netcode', 'desync', 'packet loss', 'freeze',
        'crash', 'memory leak', 'loading time', 'update', 'patch', 'driver',
    ],
    'visuals': [
        'visuals', 'graphics', 'shader', 'lighting', 'smoke effect', 'blood splatter',
        'particle', 'texture', 'model', 'animation', 'ui', 'hud', 'crosshair style',
        'ray tracing', 'color', 'resolution', 'fov', 'viewmodel',
    ],
    'audio': [
        'audio', 'sound', 'footstep', 'sound cue', 'directional', 'occlusion',
        'gunshot', 'reverb', 'bomb beep', 'defuse sound', 'voice chat', 'callout',
        'microphone', 'radio command', 'volume', 'sound bug', 'muffle', 'mix',
    ],
    'community': [
        'community', 'friends', 'lobby', 'party', 'team-mate', 'toxic', 'grief',
        'vote kick', 'chat', 'text chat', 'mute', 'spray logo', 'workshop',
        'community server', 'surf', 'bhop', 'mods', 'plugin', 'custom map',
        'training map',
    ],
}


In [None]:
#Cell 3 - Load & Filter reviews
# Parquet file holding the reviews to analyse (path is relative to the notebook).
DF_PATH = '../Step_3_analysis/top_100_parquet/10.parquet'

# Read only the four columns the notebook uses, as a lazy Dask dataframe.
df = dd.read_parquet(DF_PATH, columns=['review', 'votes_up', 'voted_up', 'review_language'])

# Keep the rows whose language is 'english', then persist the filtered frame so
# later cells work from the already-filtered data.
df = df[df['review_language'] == 'english']
df = df.persist()

In [None]:
#Cell 4 - Sampling function & draw balanced subsets.
def sample_bucket(df, label, n = 50000, random_state=42):
    """Draw roughly ``n`` reviews whose ``voted_up`` flag equals ``label``.

    Rows with a missing ``review`` or ``votes_up`` value are dropped before
    sampling. Sampling is done by fraction (``n / bucket size``, capped at 1.0),
    so the number of rows returned is approximately ``n`` rather than exactly
    ``n``; when the bucket holds fewer than ``n`` rows, all of them are returned
    (in shuffled order).

    Parameters
    ----------
    df : Dask or pandas DataFrame
        Must contain the columns ``voted_up``, ``review`` and ``votes_up``.
    label : bool
        Value of ``voted_up`` selecting the bucket.
    n : int, default 50000
        Target sample size.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` for reproducibility.

    Returns
    -------
    tuple[list, list]
        ``(reviews, votes)`` - two parallel lists; both empty when the bucket is
        empty or ``n`` is not positive.
    """
    bucket = df[df['voted_up'] == label][['review', 'votes_up']].dropna()

    # len() works for both pandas and Dask frames (Dask computes the row count).
    total = len(bucket)
    if total == 0 or n <= 0:
        return [], []

    frac = min(1.0, n / total)
    sampled = bucket.sample(frac=frac, random_state=random_state)
    # Dask returns a lazy frame that must be materialised; pandas is already eager.
    if hasattr(sampled, 'compute'):
        sampled = sampled.compute()

    return sampled['review'].tolist(), sampled['votes_up'].tolist()
# Draw one sample per recommendation flag: voted_up == True first, then False.
likes_docs, likes_votes = sample_bucket(df, label=True)
dis_docs, dis_votes = sample_bucket(df, label=False)


In [None]:
#Cell 5 - Encode with BERT
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Sentence-embedding model used for the reviews and handed to BERTopic later.
# The published model id is hyphenated: 'all-MiniLM-L6-v2'.
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def encode_docs(docs, batch_size=64):
    """Embed ``docs`` with the notebook-level ``embedder``.

    Parameters
    ----------
    docs : sequence of str
        Texts to embed.
    batch_size : int, default 64
        Number of texts the model processes per forward pass.

    Returns
    -------
    numpy.ndarray
        Array with one row per document, in the same order as ``docs``. For an
        empty input the result has zero rows and the model's embedding width,
        so it can still be stacked with other embedding arrays.
    """
    docs = list(docs)
    if not docs:
        # np.vstack([]) would raise; return a correctly shaped empty matrix instead.
        return np.empty((0, embedder.get_sentence_embedding_dimension()), dtype=np.float32)

    # Let the encoder do the batching so that batch_size really is the model's
    # batch size (slicing the list by hand left the encoder on its own default).
    return embedder.encode(docs, batch_size=batch_size, convert_to_numpy=True)

# Embed each sentiment bucket separately, then stack the two matrices so that
# row i of all_emb belongs to all_docs[i] (likes first, dislikes second).
likes_emb = encode_docs(likes_docs)
dis_emb = encode_docs(dis_docs)

all_docs = [*likes_docs, *dis_docs]
all_emb = np.vstack((likes_emb, dis_emb))

In [None]:
#Cell 6: Fit BERTopic with seed topics
# One list of seed words per theme, in the insertion order of the themes dict.
seed_topic_list = [seed_words for seed_words in themes.values()]

# Model configuration, collected in one place before the model is built.
bertopic_options = dict(
    embedding_model=embedder,
    n_gram_range=(1, 2),
    seed_topic_list=seed_topic_list,
    representation_model=KeyBERTInspired(),
    calculate_probabilities=False,
    verbose=False,
)
topic_model = BERTopic(**bertopic_options)

# Fit on every sampled review, passing the embeddings computed earlier so the
# texts are not encoded a second time. Only the topic assignments are kept.
topics, _ = topic_model.fit_transform(all_docs, embeddings=all_emb)

In [None]:
#Cell 7: Extract per-topic keywords, counts & best example
# Build a simple DataFrame to help aggregate: one row per sampled review, in the
# same order as all_docs / topics (likes first, then dislikes).

sentiments = ['Like']*len(likes_docs) + ['Dislikes']*len(dis_docs)
df_all = pd.DataFrame({
    "Sentiment": sentiments,
    "TopicID": topics,
    "Review": all_docs,
    "Votes": likes_votes + dis_votes
})

# BERTopic numbers its topics by cluster size (-1 is the outlier bucket); the
# seed topics only nudge the clustering, so topic N is NOT "the N-th seed theme".
# Each topic is therefore labelled with the seed theme whose vocabulary is most
# similar (cosine similarity of sentence embeddings) to the topic's keywords.
MIN_THEME_SIMILARITY = 0.25  # below this a topic is reported as "Other"; tune per dataset


def _unit_rows(matrix):
    """Scale every row of a 2-D array to unit length (all-zero rows stay zero)."""
    matrix = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.maximum(norms, 1e-12)


theme_names = list(themes)
theme_vectors = _unit_rows(
    embedder.encode([", ".join(words) for words in themes.values()], convert_to_numpy=True)
)


def _match_theme(keywords):
    """Return the seed theme closest to ``keywords``, or "Other" if none is close enough."""
    if not keywords:
        return "Other"
    keyword_vector = _unit_rows(
        embedder.encode([", ".join(keywords)], convert_to_numpy=True)
    )[0]
    similarities = theme_vectors @ keyword_vector
    closest = int(np.argmax(similarities))
    if similarities[closest] < MIN_THEME_SIMILARITY:
        return "Other"
    return theme_names[closest]


insights = {}
# groupby visits each topic once, in ascending topic-id order.
for tid, sub in df_all.groupby('TopicID', sort=True):
    tid = int(tid)
    if tid < 0:
        # Outlier bucket: not a real topic, so it is left out of the report.
        continue

    #count & best example (the review with the most up-votes in this topic)
    cnt = len(sub)
    best = sub.loc[sub['Votes'].idxmax(), 'Review']

    #Top Keywords from BERTopic (get_topic may return a falsy value for an unknown id)
    kw = [w for w, _ in (topic_model.get_topic(tid) or []) if w][: 10]

    insights[tid] = {
        "Theme": _match_theme(kw),
        "Count": cnt,
        "Keywords": kw,
        "Example": best
    }
    


In [None]:
#Cell 8: Summarize each example
# Device index for the pipeline: 0 when CUDA is available, otherwise -1.
summarizer_device = 0 if torch.cuda.is_available() else -1

# Summarization pipeline used to condense each topic's example review.
summarizer = pipeline(
    task='summarization',
    model='sshleifer/distilbart-cnn-12-6',
    device=summarizer_device,
)

def safe_summarize(text):
    """Summarize ``text`` with the notebook-level ``summarizer``, never raising.

    Parameters
    ----------
    text : str
        Text to summarize.

    Returns
    -------
    str
        The model's summary. If ``text`` is empty (or ``None``) an empty string
        is returned. If summarization fails, a warning is emitted and the text
        itself is returned, cut to its first 200 characters followed by '...'
        only when it is actually longer than that.
    """
    import warnings

    if not text:
        return ""

    try:
        out = summarizer(text, max_length=40, min_length=15, do_sample=False, truncation=True)
        return out[0]['summary_text']
    except Exception as exc:
        # Best-effort fallback. `Exception` (not a bare except) keeps Ctrl-C and
        # kernel interrupts working, and the warning makes the failure visible.
        warnings.warn(f"Summarization failed ({exc!r}); using truncated text instead.")
        return text if len(text) <= 200 else text[:200] + '...'
    
# Attach a short summary to every topic entry. The example review is summarized
# when there is one; otherwise the topic's keywords are joined and used instead.
for entry in insights.values():
    source_text = entry['Example']
    if not source_text:
        source_text = " ".join(entry['Keywords'])
    entry['Summary'] = safe_summarize(source_text)

In [None]:
#Cell 9: Build final DataFrame & display
def _report_row(topic_id, info):
    """Flatten one insights entry into a single row of the report table."""
    return {
        "Theme": info['Theme'],
        "Cluster (ID)": topic_id,
        "Count": info['Count'],
        "Keywords": ", ".join(info['Keywords']),
        "Example Review": info['Example'],
        "Summary": info['Summary'],
    }


# One row per topic, in the order the topics were stored in insights.
df_report = pd.DataFrame([_report_row(topic_id, info) for topic_id, info in insights.items()])

# Render the table with a caption and without the index column.
report_style = df_report.style.set_caption("BERTopic-guided Review Insights")
report_style = report_style.hide(axis="index")
display(report_style)

In [None]:
# Cell 10: Plot review counts by theme & sentiment
if df_report.empty:
    # No topics were extracted, so there is no Theme column to merge on or plot.
    print("No topics to plot.")
else:
    # Attach each review's theme via its topic id; the inner join drops reviews
    # whose topic is not in the report.
    plot_df = df_all.merge(df_report[['Cluster (ID)','Theme']],
                          left_on='TopicID', right_on='Cluster (ID)', how='inner')

    # Rows = themes, columns = sentiments, values = number of reviews
    # (0 where a theme has no review with that sentiment).
    theme_counts = (
        plot_df.groupby(['Theme','Sentiment'])['Review']
               .count()
               .unstack('Sentiment', fill_value=0)
    )

    # DataFrame.plot opens a figure of its own unless it is given an Axes, which
    # would leave a blank figure behind and ignore figsize; draw on our Axes instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    theme_counts.plot(kind='bar', rot=45, ax=ax)
    ax.set_title("Review Counts per Theme & Sentiment")
    ax.set_xlabel("Theme")
    ax.set_ylabel("Number of Reviews")
    fig.tight_layout()
    plt.show()