In [3]:
import dask.dataframe as dd
import re
import json

# ——— CONFIG ———
DD_PATH = '../Step_3_analysis/cleaned_data_polars/10.parquet'   # adjust to your parquet files
TOP_N   = 5                             # how many top reviews per theme
# 1. Define your themes & compile regexes
themes = {
    # Core moment‑to‑moment play
    'gameplay': [
        'gameplay', 'mechanics', 'tactical shooter', 'precision', 'aim',
        'headshot', 'spray‑control', 'burst‑fire', 'recoil', 'crosshair',
        'peek', 'counter‑strafe', 'movement', 'jump‑peek', 'clutch',
        'bomb plant', 'defuse', 'round', 'eco‑round', 'overtime'
    ],

    # Weapons, grenades, recoil patterns
    'weapons': [
        'weapon', 'gun', 'rifle', 'ak', 'm4', 'awp', 'pistol', 'deagle',
        'smg', 'shotgun', 'sniper', 'knife', 'grenade', 'flashbang',
        'smoke', 'molotov', 'he‑nade', 'incendiary', 'zeus', 'spray',
        'pull‑out time', 'reload'
    ],

    # Map design and call‑outs
    'maps': [
        'map', 'layout', 'bombsite', 'call‑out', 'rotation', 'angles',
        'cover', 'line‑up', 'utility spot', 'choke‑point', 'dust2',
        'mirage', 'inferno', 'nuke', 'overpass', 'ancient', 'vertigo',
        'office', 'train', 'cache'
    ],

    # Ranked play, esport angle, organised competition
    'competitive': [
        'competitive', 'matchmaking', 'rank', 'elo', 'premier', 'global elite',
        'silver', 'faceit', 'esport', 'tournament', 'major', 'league',
        'teamplay', 'strat', 'timeout', 'coach', 'demo review', 'practice'
    ],

    # Game economy, skins, trading and cases
    'economy & skins': [
        'economy', 'money', 'buy', 'force', 'save', 'full‑buy', 'bonus‑loss',
        'skin', 'knife skin', 'case', 'capsule', 'stattrak', 'souvenir',
        'sticker', 'trade‑up', 'market', 'auction', 'rarity', 'float value',
        'pattern', 'lootbox'
    ],

    # Anti‑cheat and integrity concerns
    'anti_cheat': [
        'cheater', 'cheat', 'hacker', 'hack', 'wallhack', 'aimbot', 'spinbot',
        'vac', 'vac ban', 'prime', 'overwatch', 'smurf', 'rage', 'backtracking',
        'triggerbot', 'report', 'banwave', 'trust factor'
    ],

    # Performance, networking and technical stability
    'performance': [
        'fps', 'frame rate', 'stutter', 'lag', 'ping', 'tickrate', 'sub‑tick',
        'server', 'hit‑reg', 'netcode', 'desync', 'packet loss', 'freeze',
        'crash', 'memory leak', 'loading time', 'update', 'patch', 'driver'
    ],

    # Visual fidelity and art direction
    'visuals': [
        'visuals', 'graphics', 'shader', 'lighting', 'smoke effect',
        'blood splatter', 'particle', 'texture', 'model', 'animation',
        'ui', 'hud', 'crosshair style', 'ray tracing', 'color', 'resolution',
        'fov', 'viewmodel'
    ],

    # Audio design and voice comms
    'audio': [
        'audio', 'sound', 'footstep', 'sound cue', 'directional',
        'occlusion', 'gunshot', 'reverb', 'bomb beep', 'defuse sound',
        'voice chat', 'callout', 'microphone', 'radio command', 'volume',
        'sound bug', 'muffle', 'mix'
    ],

    # Social experience and community features
    'community': [
        'community', 'friends', 'lobby', 'party', 'team‑mate', 'toxic',
        'grief', 'vote kick', 'chat', 'text chat', 'mute', 'spray logo',
        'workshop', 'community server', 'surf', 'bhop', 'mods', 'plugin',
        'custom map', 'training map'
    ]
}


def slugify(name):
    return name.lower().replace(' ', '_')

# 2. Read only the columns we need
df = dd.read_parquet(DD_PATH, columns=['review', 'votes_up'])

# 3. Normalize text
df['review_text'] = df['review'].fillna('').str.lower()

# 4. For each theme, build a regex string and flag rows
for theme, words in themes.items():
    pat = r'\b(?:' + '|'.join(map(re.escape, words)) + r')\b'
    col = f"is_{slugify(theme)}"
    df[col] = df['review_text'].str.contains(pat,
                                             flags=re.IGNORECASE,
                                             regex=True)

# 5. Persist so subsequent operations reuse cached partitions
df = df.persist()

# 6. Compute total counts per theme
bool_cols = [f"is_{slugify(t)}" for t in themes]
counts_series = df[bool_cols].sum().compute()

theme_counts = {
    theme: int(counts_series[f"is_{slugify(theme)}"])
    for theme in themes
}

# 7. Extract top-N reviews by votes_up for each theme
top_reviews = {}
for theme in themes:
    col = f"is_{slugify(theme)}"
    # filter, sort, and grab top N
    top_df = (
        df[df[col]]
        .nlargest(TOP_N, 'votes_up')[['review', 'votes_up']]
        .compute()
    )
    # convert to list of dicts
    top_reviews[theme] = top_df.to_dict(orient='records')

# 8. Bundle and save
summary = {
    'theme_counts': theme_counts,
    'top_reviews':   top_reviews
}

# Print to console
print(json.dumps(summary, indent=2))

# Optionally write out for your dashboard/front-end
with open('review_theme_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)
  out = getattr(getattr(obj, accessor, obj), att

{
  "theme_counts": {
    "gameplay": 1656,
    "weapons": 1021,
    "maps": 502,
    "competitive": 759,
    "economy & skins": 1230,
    "anti_cheat": 809,
    "performance": 5845,
    "visuals": 944,
    "audio": 322,
    "community": 2247
  },
  "top_reviews": {
    "gameplay": [
      {
        "review": "I never played a better first person shooter Graphics doesnt matter for gamer gameplay only",
        "votes_up": 712
      },
      {
        "review": "CS is one of the greatest and undermined games in history It is in my opinion the best game to ever have a crosshair It is not a contact sport so for the girly girls it is an activity to indulge in It starts off with a team trying to kill another team I started playing cs when I noticed jacking off isnt the only activity to do on a computer I started inviting a lot of friends out and getting drinks as well as a boom box to spend a nice summer day in the living room playing this wonderful game I wanted to play every chance that i