In [1]:
# Cell 1: Imports & Dask Client
import os
from dask.distributed import Client
import dask.dataframe as dd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Start a local Dask client
# Client() is called without a scheduler address or any other argument.
# NOTE(review): presumably this makes Dask create a local cluster on this
# machine with default worker/thread counts — confirm against the Dask docs.
client = Client()



In [2]:
# Cell 2: Load Theme Dictionary & Precompute Embeddings
import json

# Read the per-game theme keywords. The top-level keys are converted with
# int() so the dictionary can be looked up by numeric app id.
with open('game_themes.json', 'r') as f:
    raw = json.load(f)
GAME_THEMES = {int(appid): themes for appid, themes in raw.items()}

# SBERT model used to embed the theme seed phrases
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# For every game build one matrix with one row per theme. Each row is the mean
# of the embeddings of that theme's seed phrases, and rows follow the key
# order of GAME_THEMES[appid].
THEME_EMBEDDINGS = {
    appid: np.vstack([
        embedder.encode(seeds, convert_to_numpy=True).mean(axis=0)
        for seeds in themes.values()
    ])
    for appid, themes in GAME_THEMES.items()
}



In [3]:
# Cell 3: Read All Parquet Files into a Dask DataFrame
# Every game's parquet file is expected to live in the same folder; only the
# four columns listed below are requested from the files.
parquet_glob = 'parquet_output_theme_combo/*.parquet'
wanted_columns = ['steam_appid', 'review', 'review_language', 'voted_up']
ddf = dd.read_parquet(parquet_glob, columns=wanted_columns)


In [4]:
# Cell 4: Filter & Clean Data
# Restrict to rows tagged as English, then discard rows whose review text is missing
is_english = ddf['review_language'] == 'english'
ddf = ddf[is_english].dropna(subset=['review'])



In [5]:
# Cell 5: Define Partition-wise Topic Assignment

# Must stay identical to the model that produced THEME_EMBEDDINGS, otherwise
# review and theme vectors are not comparable.
MODEL_NAME = 'all-MiniLM-L6-v2'

# Previously a second SentenceTransformer was loaded here and never used.
# The name is kept as an alias of the existing embedder so any cell that
# refers to `model` still works, without loading the weights twice.
model = embedder


def _get_worker_model():
    """Return a SentenceTransformer cached on the current Dask worker.

    The model is loaded the first time a task runs on a worker and stored as
    an attribute of that worker object, so later partitions reuse it.
    assign_topic deliberately does not reference the notebook-level model
    objects: the intent is to keep the model weights out of the function that
    Dask serializes and ships with every task.

    Returns:
        A SentenceTransformer for MODEL_NAME.
    """
    from dask.distributed import get_worker

    try:
        worker = get_worker()
    except ValueError:
        # Not running inside a distributed worker (e.g. the function was
        # called directly in the notebook): load a throwaway instance.
        return SentenceTransformer(MODEL_NAME)

    cached = getattr(worker, '_sbert_model', None)
    if cached is None:
        cached = SentenceTransformer(MODEL_NAME)
        worker._sbert_model = cached
    return cached


def assign_topic(df_partition):
    """Add a `topic_id` column holding the best-matching theme for each review.

    For every row, the review text is embedded and compared (cosine
    similarity) with the theme embeddings of that row's game; `topic_id` is
    the row index of the most similar theme in THEME_EMBEDDINGS[appid].

    Args:
        df_partition: pandas DataFrame with at least the columns
            `steam_appid` and `review`.

    Returns:
        A new DataFrame equal to the input plus an int64 `topic_id` column.
        The input partition is not modified.

    Raises:
        KeyError: if a `steam_appid` has no entry in THEME_EMBEDDINGS.
    """
    # Empty partitions still need an int64 column so the result matches `meta`
    if df_partition.empty:
        return df_partition.assign(topic_id=np.array([], dtype='int64'))

    sbert = _get_worker_model()

    # Embed all reviews of the partition in one call
    review_embeds = sbert.encode(
        df_partition['review'].tolist(),
        convert_to_numpy=True,
        batch_size=64,
    )

    # Score each game's reviews against that game's themes in one matrix
    # operation instead of one cosine_similarity call per row.
    appids = df_partition['steam_appid'].to_numpy()
    topic_ids = np.empty(len(df_partition), dtype='int64')
    for appid in np.unique(appids):
        rows = np.flatnonzero(appids == appid)
        sims = cosine_similarity(review_embeds[rows], THEME_EMBEDDINGS[int(appid)])
        topic_ids[rows] = sims.argmax(axis=1)

    # assign() returns a new frame, leaving the partition Dask handed us untouched
    return df_partition.assign(topic_id=topic_ids)


# Apply to each partition; specify output metadata
meta = ddf._meta.assign(topic_id=np.int64())
ddf_with_topic = ddf.map_partitions(assign_topic, meta=meta)

In [6]:
# Cell 6: Aggregate Counts, Likes, and Collect Reviews per Theme
group_keys = ['steam_appid', 'topic_id']

# Per (game, theme): number of reviews and the sum of the voted_up column
agg = ddf_with_topic.groupby(group_keys).agg(
    review_count=('review', 'count'),
    likes_sum=('voted_up', 'sum'),
)

# Per (game, theme): the review texts gathered into a Python list
reviews_series = (
    ddf_with_topic
    .groupby(group_keys)['review']
    .apply(lambda x: list(x), meta=('review', object))
)

# Evaluate both lazy results with a single compute call
agg_df, reviews_df = dd.compute(agg, reviews_series)

# Turn the group keys back into ordinary columns
agg_df = agg_df.reset_index()
reviews_df = reviews_df.reset_index().rename(columns={'review': 'Reviews'})

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [7]:
# Cell 7: Construct Final Report DataFrame
import pandas as pd

# Merge counts, likes, and reviews on the (game, theme) key
report_df = pd.merge(
    agg_df,
    reviews_df,
    on=['steam_appid', 'topic_id'],
    how='left'
)

# topic_id is a row index into THEME_EMBEDDINGS[appid], whose rows follow the
# key order of GAME_THEMES[appid]. Cache that ordered name list per game
# instead of rebuilding it for every report row.
theme_names = {}

# Build the final output structure
rows = []
for rec in report_df.itertuples(index=False):
    appid = int(rec.steam_appid)
    tid = int(rec.topic_id)
    if appid not in theme_names:
        theme_names[appid] = list(GAME_THEMES[appid])
    total = int(rec.review_count)
    likes = int(rec.likes_sum)
    like_ratio = f"{(likes / total * 100):.1f}%" if total > 0 else '0%'
    rows.append({
        'steam_appid': appid,
        'Theme': theme_names[appid][tid],
        '#Reviews': total,
        'LikeRatio': like_ratio,
        'Reviews': rec.Reviews
    })

final_report = pd.DataFrame(rows)

# Optionally, save to CSV. Create the target folder first so the write does
# not fail when the directory does not exist yet.
os.makedirs('output_csvs', exist_ok=True)
final_report.to_csv('output_csvs/SBERT_DD_report.csv', index=False)

In [8]:

# Cell 8: View the Report
# Print the first five rows of the report as plain text
print(final_report.head(n=5))

   steam_appid        Theme  #Reviews LikeRatio  \
0           10    community      2511     96.2%   
1           10   anti_cheat      3654     93.7%   
2           10  performance      2527     91.7%   
3           10  competitive      9644     98.1%   
4           10     gameplay      2416     96.9%   

                                             Reviews  
0  [Actually the best game in this world. It stil...  
1  [So here's a little story.\nBefore my dad and ...  
2  [How to correctly play this game:\n-Noisiest f...  
3  [[h1] Once a fire lit in my heart and now it w...  
4  [Counter-Strike won't ever be canceled.\nCount...  
