In [1]:
# Steam Theme Analysis with SBERT & Dask

# This notebook reads per-game review Parquet files, embeds the reviews with SBERT, assigns each review to one of the game's predefined themes via cosine similarity to the theme centroids, and then summarizes per-theme review counts and like-ratios, processing the games in parallel with Dask.


In [2]:
# Cell 2: Imports & Dask client
import os, random, numpy as np, pandas as pd
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import json

# Start Dask client
# Called with no arguments, Client() sets up a local cluster and becomes the
# default scheduler, so the later compute(...) call on the delayed tasks runs
# through it.
client = Client()

In [3]:
# Cell 3: Load Theme Dictionary
# JSON text is UTF-8 by specification; name the encoding explicitly so that
# non-ASCII theme names/keywords decode identically on every platform instead
# of depending on the locale's default encoding.
with open('game_themes.json', 'r', encoding='utf-8') as f:
    raw = json.load(f)

# steam_appid → { theme_name → [keywords...] }
# JSON object keys are always strings, so each appid key is converted to int.
GAME_THEMES = {int(appid): themes for appid, themes in raw.items()}


In [4]:
# Cell 4: Initialize SBERT Model and Precompute Theme Embeddings
# Sentence-BERT model used to embed the theme seed keywords.
# (Pass device='cuda' to SentenceTransformer to run on a GPU instead.)
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# For every game, build a matrix with one row per theme. Each row is the
# centroid (mean over axis 0) of the embeddings of that theme's seed keywords;
# rows follow the iteration order of the game's theme dictionary.
THEME_EMBEDDINGS = {
    game_id: np.vstack([
        embedder.encode(keywords, convert_to_numpy=True).mean(axis=0)
        for keywords in theme_map.values()
    ])
    for game_id, theme_map in GAME_THEMES.items()
}

In [5]:

@delayed
def analyse_one_game(appid, themes):
    """Classify one game's English reviews into themes and summarise them.

    The game's Parquet file is looked up as ``<appid>.parquet`` in the
    candidate output folders. Each English review with non-null text is
    embedded with SBERT and assigned to the theme whose precomputed embedding
    (row of the notebook-level ``THEME_EMBEDDINGS[appid]``) has the highest
    cosine similarity.

    Parameters
    ----------
    appid : int
        Steam app id; used for the file name and the ``THEME_EMBEDDINGS`` key.
    themes : dict
        Mapping of theme name to seed keywords. Only the keys and their order
        are used here; the order must match the rows of
        ``THEME_EMBEDDINGS[appid]``.

    Returns
    -------
    pandas.DataFrame
        One row per theme with columns ``steam_appid``, ``Theme``,
        ``#Reviews``, ``LikeRatio`` (percentage string) and ``Reviews`` (list
        of review texts). An empty DataFrame is returned when no Parquet file
        exists for the game.

    Notes
    -----
    The Parquet file is expected to provide the columns ``review_language``,
    ``review`` and ``voted_up`` (the latter is used as a boolean mask).
    """
    import os
    import numpy as np
    import pandas as pd
    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    theme_names = list(themes.keys())
    n_themes = len(theme_names)

    # locate the parquet file for this game
    CANDIDATE = ['parquet_output_indie','parquet_output_theme_combo']
    path = next((os.path.join(d, f"{appid}.parquet") for d in CANDIDATE
                 if os.path.exists(os.path.join(d, f"{appid}.parquet"))), None)
    if path is None:
        print(f"⚠️ No file for {appid}")
        return pd.DataFrame()

    # load into pandas
    df = pd.read_parquet(path)
    # Keep English reviews that actually have text. Null reviews must be
    # removed from the frame itself (not only from the list that gets
    # embedded); otherwise the topic ids computed below would not line up
    # row-for-row with `df`. reset_index returns a fresh frame, so adding a
    # column later does not write into a slice of the original.
    keep = (df['review_language'] == 'english') & df['review'].notna()
    df = df[keep].reset_index(drop=True)
    reviews = df['review'].tolist()
    if not reviews:
        print(f"⚠️ No reviews for {appid}")
        # Every column needs one entry per theme, including steam_appid.
        return pd.DataFrame({
            'steam_appid': [appid] * n_themes,
            'Theme': theme_names,
            '#Reviews': [0] * n_themes,
            'LikeRatio': ['0%'] * n_themes,
            'Reviews': [[] for _ in theme_names]
        })

    # embed reviews; encode() does the batching itself
    # (use SentenceTransformer(..., device='cuda') to run on a GPU)
    model = SentenceTransformer('all-MiniLM-L6-v2')
    review_embeds = model.encode(reviews, batch_size=64, convert_to_numpy=True)

    # get precomputed theme embeddings for this game
    theme_embeds = THEME_EMBEDDINGS[appid]  # shape (n_themes, dim)

    # compute cosine similarity and assign each review to the closest theme
    sims = cosine_similarity(review_embeds, theme_embeds)
    df['topic_id'] = sims.argmax(axis=1)

    # build the report; reindex so themes that received no review still get a row
    topic_range = range(n_themes)
    counts = df.groupby('topic_id')['review'].count().reindex(topic_range, fill_value=0)
    likes  = df[df['voted_up']].groupby('topic_id')['review'].count().reindex(topic_range, fill_value=0)

    report = pd.DataFrame({
        'steam_appid': appid,
        'Theme': theme_names,
        '#Reviews': counts.values,
        # replace(0, 1) avoids dividing by zero for themes without reviews
        'LikeRatio': ((likes / counts.replace(0, 1) * 100)
                       .round(1)
                       .astype(str) + '%').values,
        'Reviews': [df[df['topic_id'] == tid]['review'].tolist()
                    for tid in topic_range]
    })
    return report

In [6]:
# One lazy task per game; calling the delayed function only records the work.
tasks = [analyse_one_game(*game) for game in GAME_THEMES.items()]
# compute() runs all tasks and returns one report per task, in task order.
reports = compute(*tasks)
# Stack the per-game reports into a single DataFrame with a fresh index.
final_report = pd.concat(reports, ignore_index=True)
final_report


Unnamed: 0,steam_appid,Theme,#Reviews,LikeRatio,Reviews
0,391540,story,11039,96.6%,[[h1][b]Undertale...[/b][/h1] \n\nWhere do I e...
1,391540,characters,8630,95.9%,[I pirated this game and played through it mul...
2,391540,gameplay,41944,96.7%,[1. Accidentally kill the first boss\n2. Cry\n...
3,391540,combat,2867,95.5%,"[I love this game so much, I made the trailer ..."
4,391540,music,14245,97.9%,"[At a first glance, [b]Undertale[/b] probably ..."
5,391540,visuals,1675,92.5%,[Knowing the mouse might one day escape the ma...
6,391540,humor_dialogue,4823,91.1%,"[To be honest, I played the game because of th..."
7,391540,morality,1906,91.3%,"[An amazing and unique game, that offers the u..."
8,391540,secrets_meta,1180,90.4%,[This game has one weird trick that makes the ...
9,391540,warp,1894,96.5%,[The earth is 4.6 billion years old and we man...


In [8]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise a run from a fresh checkout fails on this cell.
os.makedirs('output_csvs', exist_ok=True)
# Note: the 'Reviews' column holds Python lists, which are written to the CSV
# as their string representation.
final_report.to_csv('output_csvs/SBERT_per_task_report.csv', index=False)