In [1]:
# Cell 1: Install dependencies (once)
# conda install -c conda-forge \
#     dask distributed dask-ml scikit-learn pandas pyarrow -y


In [2]:
# Cell 2: Imports & Dask client
import os, random, numpy as np, pandas as pd
import dask.dataframe as dd
from dask import delayed, compute
from dask.distributed import Client

# dask-ml modules
from dask_ml.feature_extraction.text import HashingVectorizer
from dask_ml.wrappers import Incremental

# scikit-learn
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Reproducibility
random.seed(42)
np.random.seed(42)

# Launch a local Dask client
client = Client()
client


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 30.49 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:43343,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 30.49 GiB

0,1
Comm: tcp://127.0.0.1:41325,Total threads: 4
Dashboard: http://127.0.0.1:42521/status,Memory: 7.62 GiB
Nanny: tcp://127.0.0.1:35351,
Local directory: /tmp/dask-scratch-space/worker-jrmmz8i8,Local directory: /tmp/dask-scratch-space/worker-jrmmz8i8

0,1
Comm: tcp://127.0.0.1:45309,Total threads: 4
Dashboard: http://127.0.0.1:32951/status,Memory: 7.62 GiB
Nanny: tcp://127.0.0.1:41611,
Local directory: /tmp/dask-scratch-space/worker-l63pvr86,Local directory: /tmp/dask-scratch-space/worker-l63pvr86

0,1
Comm: tcp://127.0.0.1:38487,Total threads: 4
Dashboard: http://127.0.0.1:38915/status,Memory: 7.62 GiB
Nanny: tcp://127.0.0.1:46189,
Local directory: /tmp/dask-scratch-space/worker-nbrtyl1e,Local directory: /tmp/dask-scratch-space/worker-nbrtyl1e

0,1
Comm: tcp://127.0.0.1:46509,Total threads: 4
Dashboard: http://127.0.0.1:40935/status,Memory: 7.62 GiB
Nanny: tcp://127.0.0.1:46487,
Local directory: /tmp/dask-scratch-space/worker-sixl2znf,Local directory: /tmp/dask-scratch-space/worker-sixl2znf


In [3]:
import json
with open('../game_themes.json', 'r') as f:
    raw = json.load(f)
GAME_THEMES = {int(appid): themes for appid, themes in raw.items()}
n_themes = len(GAME_THEMES)


In [4]:
# Cell 4: Global constants & shared HashingVectorizer

N_FEATURES   = 2**18   # ~260K dims
BATCH_SIZE   = 5000    # partial_fit chunk size
SAMPLE_PER_C = 5000    # TF-IDF docs per cluster

# A single, stateless vectoriser we’ll reuse in each task
global_vec = HashingVectorizer(
    n_features=N_FEATURES,
    alternate_sign=False,
    stop_words='english',
    dtype=np.float32
)


In [5]:
@delayed
def analyse_one_game(appid, themes):
    import os, numpy as np, pandas as pd
    from dask_ml.feature_extraction.text import HashingVectorizer
    from dask_ml.wrappers import Incremental
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import TfidfVectorizer

    # 1) Find your parquet file
    CANDIDATE = [
      'parquet_output_theme_combo',
      'parquet_output_indie'
    ]
    path = next((os.path.join(d, f"{appid}.parquet") for d in CANDIDATE 
                 if os.path.exists(os.path.join(d, f"{appid}.parquet"))), None)
    if path is None:
        print(f"⚠️ No file for {appid}")
        return None

    # 2) Load & filter (pandas is fine for <30 MB per game)
    df = pd.read_parquet(path, columns=['review','votes_up','voted_up','review_language'])
    df = df[df.review_language=='english'].dropna(subset=['review'])
    df['review'] = df.review.astype(str)
    n = len(df)
    if n == 0:
        print(f"⚠️ No English reviews for {appid}")
        return None

    # ─── NOW instantiate your vectorizer *inside* the function ───────────────
    vec = HashingVectorizer(
        n_features=2**18,
        alternate_sign=False,
        stop_words='english',
        dtype=np.float32
    )

    # 3) Seed centroids
    pseudo = [" ".join(ws) for ws in themes.values()]
    pseudo_sparse  = vec.transform(pseudo)                       # SciPy CSR
    init_centroids = np.vstack([r.toarray().ravel() for r in pseudo_sparse])

    # 4) Out-of-core clustering
    mbkm = MiniBatchKMeans(
        n_clusters=   len(themes),
        init=         init_centroids,
        n_init=       1,
        random_state= 42,
        batch_size=   5000
    )
    km = Incremental(mbkm, predict_meta=np.zeros(1, dtype=int))

    labels = np.empty(n, dtype=int)
    for i in range(0, n, 5000):
        block = df['review'].iloc[i:i+5000]
        Xb    = vec.transform(block)
        km.partial_fit(Xb)
        labels[i:i+5000] = km.predict(Xb)

    df['topic_id'] = labels

    # 5) Collect all reviews per theme
    reviews_per_theme = {
        tid: df.loc[df.topic_id==tid, 'review'].tolist()
        for tid in range(len(themes))
    }

    # 6) Build your report
    counts = df.groupby('topic_id').review.count()
    likes  = df[df.voted_up].groupby('topic_id').review.count()

    report = pd.DataFrame({
        'steam_appid':   appid,
        'Theme':         list(themes.keys()),
        '#Reviews':      counts.values,
        'LikeRatio':     (likes/counts*100).round(1).astype(str) + '%',
        'Reviews':       [reviews_per_theme[tid] for tid in range(len(themes))]
    })
    return report


In [6]:
# Cell 6: dispatch all games in parallel on threads
from dask import compute

tasks   = [analyse_one_game(appid, themes) for appid, themes in GAME_THEMES.items()]

# Use the threaded scheduler so everything runs in‐process (no Dask worker death)
reports = compute(*tasks, scheduler='threads')

# Filter out any None (missing/empty) and concat
reports      = [r for r in reports if r is not None]
final_report = pd.concat(reports, ignore_index=True)

final_report.to_csv('steam_theme_reports.csv', index=False)
display(final_report.head(20))


⚠️ No file for 391540
⚠️ No file for 383870
⚠️ No file for 504230
⚠️ No file for 1145360
⚠️ No file for 1794680
⚠️ No file for 1868140
⚠️ No file for 413150
⚠️ No file for 367520
⚠️ No file for 105600
⚠️ No file for 945360
⚠️ No file for 239030
⚠️ No file for 3164500
⚠️ No file for 1966720
⚠️ No file for 10
⚠️ No file for 1145350
⚠️ No file for 268910




ValueError: No objects to concatenate