In [None]:
# Cell 1: Imports & Optimized Dask Client
import os
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dask.distributed import Client, LocalCluster
import dask.dataframe as dd
import dask.bag as db
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Start a local Dask cluster with constrained resources
# Reserve ~75% of available RAM for Dask, leaving room for other processes
cluster = LocalCluster(
    n_workers=4,  # Adjust based on your CPU cores
    threads_per_worker=2,
    memory_limit='4GB'  # 16GB total across 4 workers, leaving 4GB for system
)
client = Client(cluster)
print(f"Dashboard link: {client.dashboard_link}")

Dashboard link: http://127.0.0.1:8787/status


2025-05-07 01:50:16,728 - distributed.scheduler - ERROR - Task ('from_sequence-process_summary_item-42e21853e75f5bee5f1ce4c84bd89b70', 9) marked as failed because 4 workers died while trying to run it
2025-05-07 01:50:20,667 - distributed.scheduler - ERROR - Task ('from_sequence-process_summary_item-42e21853e75f5bee5f1ce4c84bd89b70', 1) marked as failed because 4 workers died while trying to run it
2025-05-07 01:50:20,668 - distributed.scheduler - ERROR - Task ('from_sequence-process_summary_item-42e21853e75f5bee5f1ce4c84bd89b70', 8) marked as failed because 4 workers died while trying to run it


In [2]:
# Cell 2: Load Theme Dictionary & Optimize Theme Embeddings
# Load per-game theme keywords
with open('game_themes.json', 'r') as f:
    raw = json.load(f)
GAME_THEMES = {int(appid): themes for appid, themes in raw.items()}

# Initialize SBERT embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get theme embeddings for specific app IDs
# This avoids loading all embeddings at once
def get_theme_embeddings(app_ids):
    """Get theme embeddings for a specific set of app IDs"""
    embeddings = {}
    for appid in app_ids:
        if appid not in embeddings and appid in GAME_THEMES:
            emb_list = []
            for theme, seeds in GAME_THEMES[appid].items():
                seed_emb = embedder.encode(seeds, convert_to_numpy=True)
                emb_list.append(seed_emb.mean(axis=0))
            embeddings[appid] = np.vstack(emb_list)
    return embeddings

In [3]:
# Cell 3: Read Parquet Files with Optimized Blocksize
# Read with explicit blocksize optimization
ddf = dd.read_parquet(
    'parquet_output_theme_combo/*.parquet',
    columns=['steam_appid', 'review', 'review_language', 'voted_up'],
    blocksize='64MB'  # Adjust based on available RAM
)

In [4]:
# Cell 4: Filter & Clean Data
# Keep only English reviews and drop missing text
ddf = ddf[ddf['review_language'] == 'english']
ddf = ddf.dropna(subset=['review'])

In [5]:
# Cell 5: Optimized Partition-wise Topic Assignment
def assign_topic(df_partition):
    """Assign topics using only theme embeddings for app IDs in this partition"""
    # If no rows, return as-is
    if df_partition.empty:
        df_partition['topic_id'] = []
        return df_partition
    
    # Get unique app IDs in this partition
    app_ids = df_partition['steam_appid'].unique().tolist()
    app_ids = [int(appid) for appid in app_ids]
    
    # Get embeddings only for app IDs in this partition
    local_theme_embeddings = get_theme_embeddings(app_ids)
    
    reviews = df_partition['review'].tolist()
    # Compute embeddings in one go with batching
    review_embeds = embedder.encode(reviews, convert_to_numpy=True, batch_size=64)
    
    # Assign each review to its game-specific theme
    topic_ids = []
    for idx, appid in enumerate(df_partition['steam_appid']):
        appid = int(appid)
        if appid in local_theme_embeddings:
            theme_embs = local_theme_embeddings[appid]
            sims = cosine_similarity(review_embeds[idx:idx+1], theme_embs)
            topic_ids.append(int(sims.argmax()))
        else:
            # Default topic if theme embeddings not available
            topic_ids.append(0)
    
    df_partition['topic_id'] = topic_ids
    return df_partition

# Apply to each partition; specify output metadata
meta = ddf._meta.assign(topic_id=np.int64())
ddf_with_topic = ddf.map_partitions(assign_topic, meta=meta)

In [6]:
# Cell 6: Aggregate Counts, Likes, and Collect Reviews per Theme
# Process in smaller chunks to avoid memory issues
# Get unique app IDs
unique_app_ids = ddf['steam_appid'].unique().compute()

# Initialize empty dataframes for results
all_agg_dfs = []
all_review_dfs = []

# Process in batches of app IDs
batch_size = 5  # Adjust based on your memory constraints
for i in tqdm(range(0, len(unique_app_ids), batch_size)):
    batch_app_ids = unique_app_ids[i:i+batch_size]
    
    # Filter data for this batch of app IDs
    batch_ddf = ddf_with_topic[ddf_with_topic['steam_appid'].isin(batch_app_ids)]
    
    # Aggregate for this batch
    agg = batch_ddf.groupby(['steam_appid', 'topic_id']).agg(
        review_count=('review', 'count'),
        likes_sum=('voted_up', 'sum')
    )
    
    # Collect reviews for this batch
    reviews_series = batch_ddf.groupby(['steam_appid', 'topic_id'])['review'] \
        .apply(lambda x: list(x), meta=('review', object))
    
    # Compute both in parallel
    agg_df, reviews_df = dd.compute(agg, reviews_series)
    
    # Convert to DataFrames
    agg_df = agg_df.reset_index()
    reviews_df = reviews_df.reset_index().rename(columns={'review': 'Reviews'})
    
    # Append to results
    all_agg_dfs.append(agg_df)
    all_review_dfs.append(reviews_df)

# Combine results
agg_df = pd.concat(all_agg_dfs)
reviews_df = pd.concat(all_review_dfs)

  0%|          | 0/1 [00:00<?, ?it/s]

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [7]:
# Cell 7: Construct Final Report DataFrame
# Merge counts, likes, and reviews
report_df = pd.merge(
    agg_df,
    reviews_df,
    on=['steam_appid', 'topic_id'],
    how='left'
)

# Build the final output structure
rows = []
for _, row in report_df.iterrows():
    appid = int(row['steam_appid'])
    tid = int(row['topic_id'])
    
    # Check if appid exists in GAME_THEMES
    if appid in GAME_THEMES:
        theme_keys = list(GAME_THEMES[appid].keys())
        # Check if tid is a valid index
        if tid < len(theme_keys):
            theme_name = theme_keys[tid]
        else:
            theme_name = f"Unknown Theme {tid}"
    else:
        theme_name = f"Unknown Theme {tid}"
    
    total = int(row['review_count'])
    likes = int(row['likes_sum'])
    like_ratio = f"{(likes / total * 100):.1f}%" if total > 0 else '0%'
    rows.append({
        'steam_appid': appid,
        'Theme': theme_name,
        '#Reviews': total,
        'LikeRatio': like_ratio,
        'Reviews': row['Reviews']
    })

final_report = pd.DataFrame(rows)

# Save intermediate results to avoid recomputation if summarization fails
final_report.to_csv('output_csvs/SBERT_DD_new_report.csv', index=False)

In [8]:
# Cell 8: View the Report
# Print preview of the DataFrame (excluding the Reviews column as it contains lists)
print("Final report preview (Reviews column contains lists of review texts):")
print(final_report[['steam_appid', 'Theme', '#Reviews', 'LikeRatio']].head())

# Verify that Reviews column contains lists
sample_reviews = final_report['Reviews'].iloc[0]
print(f"\nSample from first Reviews entry (showing first review only):")
if isinstance(sample_reviews, list) and len(sample_reviews) > 0:
    print(f"Number of reviews in list: {len(sample_reviews)}")
    print(f"First review (truncated): {sample_reviews[0][:100]}...")

Final report preview (Reviews column contains lists of review texts):
   steam_appid        Theme  #Reviews LikeRatio
0           10    community      2511     96.2%
1           10   anti_cheat      3654     93.7%
2           10  performance      2527     91.7%
3           10  competitive      9644     98.1%
4           10     gameplay      2416     96.9%

Sample from first Reviews entry (showing first review only):
Number of reviews in list: 2511
First review (truncated): Actually the best game in this world. It still doesnt matter if u have NASA PC or you are playing on...


In [9]:
# Cell 9: Parallelize Hierarchical Summarization with Dask (OPTIONAL)
# NOTE: This cell is optional. The basic report with the required columns is 
# already saved in Cell 7. Only run this if you want theme summarization.

# First save the report with required columns
final_report_basic = final_report[['steam_appid', 'Theme', '#Reviews', 'LikeRatio', 'Reviews']]
final_report_basic.to_csv('output_csvs/theme_report.csv', index=False)
print(f"Saved basic report with required columns to 'output_csvs/theme_report.csv'")

# Initialize summarizer
summarizer = pipeline(
    task='summarization',
    model='sshleifer/distilbart-cnn-12-6',
    device=0,  # change to -1 if no GPU
    framework='pt'
)

def hierarchical_summary(reviews, chunk_size=200,
                         max_len=60, min_len=20):
    """
    Summarize a long list of reviews into one short summary:
      1) Chunk the reviews into batches of chunk_size
      2) Summarize each batch
      3) Summarize the concatenation of batch summaries
    
    Params:
      reviews    : list of str, the reviews to summarize
      chunk_size : int, number of reviews per intermediate chunk
      max_len    : int, max summary tokens per call
      min_len    : int, min summary tokens per call
    
    Returns:
      str, final "quick read" summary
    """
    # If there are fewer than chunk_size, just do one summary
    if len(reviews) <= chunk_size:
        doc = "\n\n".join(reviews[:chunk_size])  # Limit to chunk_size to avoid OOM
        return summarizer(
            doc,
            max_length=max_len,
            min_length=min_len,
            truncation=True,
            do_sample=False
        )[0]['summary_text']
    
    # 2) Summarize each chunk
    intermediate = []
    for i in range(0, min(len(reviews), 1000), chunk_size):  # Limit to 1000 reviews max
        batch = reviews[i:i+chunk_size]
        text = "\n\n".join(batch)
        summ = summarizer(
            text,
            max_length=max_len,
            min_length=min_len,
            truncation=True,
            do_sample=False
        )[0]['summary_text']
        intermediate.append(summ)
    
    # 3) Summarize the intermediate summaries
    joined = " ".join(intermediate)
    return summarizer(
        joined,
        max_length=max_len,
        min_length=min_len,
        truncation=True,
        do_sample=False
    )[0]['summary_text']

# Prepare data for parallel processing
summary_items = []
for _, row in final_report.iterrows():
    summary_items.append({
        'appid': row['steam_appid'], 
        'theme': row['Theme'], 
        'reviews': row['Reviews']
    })

# Function for parallel processing
def process_summary_item(item):
    try:
        summary = hierarchical_summary(
            item['reviews'],
            chunk_size=200,
            max_len=60,
            min_len=20
        )
        return {
            'appid': item['appid'],
            'theme': item['theme'],
            'summary': summary
        }
    except Exception as e:
        return {
            'appid': item['appid'],
            'theme': item['theme'],
            'summary': f"Error: {str(e)}"
        }

# Process in parallel with Dask
# Split into smaller batches to avoid memory issues
batch_size = 10  # Process 10 themes at a time
all_summaries = []

for i in tqdm(range(0, len(summary_items), batch_size), desc="Processing summary batches"):
    batch = summary_items[i:i+batch_size]
    bag = db.from_sequence(batch)
    batch_results = bag.map(process_summary_item).compute()
    all_summaries.extend(batch_results)

# Create a mapping from (appid, theme) to summary
summary_map = {
    (item['appid'], item['theme']): item['summary'] 
    for item in all_summaries
}

# Create a copy of the final report and add summaries
final_report_with_summary = final_report.copy()
final_report_with_summary['QuickSummary'] = final_report_with_summary.apply(
    lambda row: summary_map.get((row['steam_appid'], row['Theme']), "No summary available"),
    axis=1
)

# Save enhanced results
final_report_with_summary.to_csv('output_csvs/theme_report_with_summary.csv', index=False)

# Display sample of final results
print(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# Clean up
client.close()
cluster.close()

Saved basic report with required columns to 'output_csvs/theme_report.csv'


Device set to use cuda:0


Processing summary batches:   0%|          | 0/5 [00:00<?, ?it/s]

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


KilledWorker: Attempted to run task ('from_sequence-process_summary_item-42e21853e75f5bee5f1ce4c84bd89b70', 9) on 4 different workers, but all those workers died while running it. The last worker that attempt to run the task was tcp://127.0.0.1:32827. Inspecting worker logs is often a good next step to diagnose what went wrong. For more information see https://distributed.dask.org/en/stable/killed.html.