# 

In [None]:
# Cell 1: Imports & Dask Client
import os
from dask.distributed import Client
import dask.dataframe as dd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Start a local Dask client with default settings; the Dask work in the
# following cells is executed through it.
client = Client()
# Bare expression: as the last statement of a notebook cell it displays the
# client's summary.
client

In [None]:
# Cell 2: Load Theme Dictionary & Precompute Embeddings
# TODO: send each worker only the theme embeddings it needs instead of shipping all embeddings at once.
import json

# Load per-game theme keywords.  JSON object keys are always strings, so the
# app ids are converted to int to match the integer lookups done later.
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding.
with open('game_themes.json', 'r', encoding='utf-8') as f:
    raw = json.load(f)
GAME_THEMES = {int(appid): themes for appid, themes in raw.items()}

# Initialize SBERT embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute theme embeddings for each game.
# THEME_EMBEDDINGS[appid] stacks one row per theme, in the same order as the
# keys of GAME_THEMES[appid]; each row is the mean of that theme's seed
# embeddings (assumes every `seeds` value is a list of seed phrases).
# The row position is what later code uses as the numeric topic id, so no
# theme may be skipped here.
THEME_EMBEDDINGS = {}
for appid, themes in GAME_THEMES.items():
    if not themes:
        # np.vstack([]) would fail with an unhelpful message; name the game.
        raise ValueError(f"Game {appid} has no themes in game_themes.json")
    emb_list = []
    for theme, seeds in themes.items():
        if not seeds:
            # An empty seed list has no meaningful mean embedding; fail fast
            # instead of producing a NaN / mis-shaped row.
            raise ValueError(
                f"Theme {theme!r} of game {appid} has no seed keywords"
            )
        seed_emb = embedder.encode(seeds, convert_to_numpy=True)
        emb_list.append(seed_emb.mean(axis=0))
    THEME_EMBEDDINGS[appid] = np.vstack(emb_list)



In [None]:
# Cell 3: Read All Parquet Files into a Dask DataFrame
# Assumes all game parquet files are in the same folder.
# Every file matching the glob becomes part of one Dask DataFrame, and only
# the four columns used by the later cells are requested.
ddf = dd.read_parquet(
    'parquet_output_theme_combo/*.parquet',
    columns=['steam_appid', 'review', 'review_language', 'voted_up']
)


In [None]:
# Cell 4: Filter & Clean Data
# Restrict the data to English-language reviews that have review text
is_english = ddf['review_language'] == 'english'
ddf = ddf[is_english].dropna(subset=['review'])



In [None]:
# Cell 5: Define Partition-wise Topic Assignment
# Second instance of the same SBERT model.
# NOTE(review): assign_topic() below encodes with the global `embedder`, not
# with `model`, so this instance appears to be unused in this cell - confirm
# before relying on it.
model = SentenceTransformer('all-MiniLM-L6-v2')

def assign_topic(df_partition):
    """Return a copy of ``df_partition`` with an int64 ``topic_id`` column.

    Every review is embedded with the module-level ``embedder`` and compared
    (cosine similarity) against the theme embeddings of its own game taken
    from ``THEME_EMBEDDINGS``; ``topic_id`` is the row position of the most
    similar theme.

    Args:
        df_partition: pandas DataFrame with at least the columns
            ``'steam_appid'`` and ``'review'``.

    Returns:
        A new DataFrame with the same rows plus ``topic_id``.  The input
        frame is left untouched.

    Raises:
        KeyError: if a ``steam_appid`` has no entry in ``THEME_EMBEDDINGS``.
    """
    # Work on a copy: partitions handed to map_partitions should not be
    # modified in place.
    result = df_partition.copy()

    # Empty partition: still add the column, with the int64 dtype promised
    # by the `meta` passed to map_partitions (a bare [] would not be int64).
    if result.empty:
        result['topic_id'] = np.array([], dtype=np.int64)
        return result

    # Compute all review embeddings in one go
    review_embeds = embedder.encode(
        result['review'].tolist(), convert_to_numpy=True, batch_size=64
    )

    # Score reviews game by game so each game needs a single similarity call
    # instead of one call per review.
    appids = result['steam_appid'].to_numpy()
    topic_ids = np.empty(len(result), dtype=np.int64)
    for appid in np.unique(appids):
        row_positions = np.flatnonzero(appids == appid)
        theme_embs = THEME_EMBEDDINGS[int(appid)]
        sims = cosine_similarity(review_embeds[row_positions], theme_embs)
        topic_ids[row_positions] = sims.argmax(axis=1)

    result['topic_id'] = topic_ids
    return result

# Apply assign_topic to each partition.  `meta` describes the output frame for
# Dask: the current columns plus a `topic_id` column whose dtype comes from
# np.int64() (the int64 scalar 0, used here only for its dtype).
meta = ddf._meta.assign(topic_id=np.int64())
ddf_with_topic = ddf.map_partitions(assign_topic, meta=meta)

In [None]:
# Cell 6: Aggregate Counts, Likes, and Collect Reviews per Theme
# Per (game, theme): number of reviews and sum of the `voted_up` column
group_keys = ['steam_appid', 'topic_id']
agg = ddf_with_topic.groupby(group_keys).agg(
    review_count=('review', 'count'),
    likes_sum=('voted_up', 'sum'),
)

# Per (game, theme): the review texts gathered into one Python list
reviews_series = (
    ddf_with_topic
    .groupby(group_keys)['review']
    .apply(lambda texts: list(texts), meta=('review', object))
)

# Evaluate both results together in a single dd.compute call
agg_df, reviews_df = dd.compute(agg, reviews_series)

# Turn the group keys back into ordinary columns
agg_df = agg_df.reset_index()
reviews_df = reviews_df.reset_index()
reviews_df = reviews_df.rename(columns={'review': 'Reviews'})

In [None]:
# Cell 7: Construct Final Report DataFrame
import pandas as pd

# Merge counts, likes, and reviews on the (game, theme) key
report_df = pd.merge(
    agg_df,
    reviews_df,
    on=['steam_appid', 'topic_id'],
    how='left'
)

# topic_id is a position into the game's theme dict; build the ordered name
# list once per game instead of once per report row.
theme_names = {appid: list(themes) for appid, themes in GAME_THEMES.items()}

# Build the final output structure: one dict per (game, theme) row
rows = []
for rec in report_df.itertuples(index=False):
    appid = int(rec.steam_appid)
    total = int(rec.review_count)
    likes = int(rec.likes_sum)
    # Percentage of reviews with voted_up set, formatted with one decimal
    like_ratio = f"{(likes / total * 100):.1f}%" if total > 0 else '0%'
    rows.append({
        'steam_appid': appid,
        'Theme': theme_names[appid][int(rec.topic_id)],
        '#Reviews': total,
        'LikeRatio': like_ratio,
        'Reviews': rec.Reviews
    })

final_report = pd.DataFrame(rows)

# Save to CSV.  Create the output folder first so a missing directory cannot
# make the export fail after all the expensive work above.
os.makedirs('output_csvs', exist_ok=True)
final_report.to_csv('output_csvs/SBERT_DD_report.csv', index=False)

In [None]:

# Cell 8: View the Report
# Print the first rows of the report (DataFrame.head() with its default count)
print(final_report.head())
# Close the Dask client opened in Cell 1; Cell 9 creates its own cluster and
# client.
client.close()


In [None]:
# Cell 9 (FULLY OPTIMIZED - FIXED): GPU-optimized hierarchical summarization with Dask

import pandas as pd
import numpy as np
import torch
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from tqdm.auto import tqdm
import time

# Start a local Dask cluster.
# n_workers is also the number of partitions / summarization tasks created
# below, and every task loads its own copy of the model.
# NOTE(review): confirm that n_workers model copies fit on the available GPU.
n_workers = 4  # Adjust based on your CPU core count
cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1)
client = Client(cluster)
print(f"Dask dashboard available at: {client.dashboard_link}")

# Model parameters read by process_partition()
MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
MAX_GPU_BATCH_SIZE = 64  # Large batch size for RTX 4080 Super

# First, load the data once and distribute it to avoid repetition
@dask.delayed
def prepare_partition(start_idx, end_idx):
    """Return an independent copy of rows ``[start_idx, end_idx)`` of ``final_report``.

    The rows are selected by position (``iloc``) from the module-level
    ``final_report``.  Because of ``@dask.delayed``, calling this function
    only creates a task; the slice is taken when the task is computed.

    NOTE(review): the function refers to the global ``final_report``; whether
    the whole frame travels to the workers together with the task should be
    confirmed.
    """
    row_window = slice(start_idx, end_idx)
    selected_rows = final_report.iloc[row_window]
    return selected_rows.copy()

# Split final_report into n_workers contiguous row ranges.  Each range holds
# `partition_size` rows, except the last one, which runs to the end of the
# frame and therefore also takes the remainder.
partition_size = len(final_report) // n_workers
bounds = [k * partition_size for k in range(n_workers)] + [len(final_report)]
partitions = [
    prepare_partition(lo, hi)
    for lo, hi in zip(bounds, bounds[1:])
]

# The main processing function - FIXED: Removed dependency on datasets library
@dask.delayed
def process_partition(partition_df, worker_id):
    """Summarize the reviews of every row of ``partition_df``.

    Each call loads its own tokenizer, model and summarization pipeline for
    ``MODEL_NAME`` and produces one short summary per row from the review
    texts stored in the row's ``'Reviews'`` column.  Rows with more than 200
    reviews are summarized hierarchically: the reviews are joined into
    chunks, every chunk is summarized, and the chunk summaries are then
    summarized once more.

    Args:
        partition_df: DataFrame whose ``'Reviews'`` column holds, per row, a
            list of review strings.
        worker_id: Integer used to label this task's log line and to position
            its progress bar.

    Returns:
        A list of ``(index_label, summary_text)`` tuples, one per row, in the
        row order of ``partition_df``.  Rows without reviews get ``""``.
    """
    # Import packages needed in the worker
    from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
    import torch
    from tqdm.auto import tqdm

    # Load tokenizer first
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load model in half precision with device_map="auto"
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Create pipeline with model AND tokenizer
    summarizer = pipeline(
        task='summarization',
        model=model,
        tokenizer=tokenizer,
        framework='pt',
        model_kwargs={"use_cache": True}
    )

    # Report worker GPU status
    gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
    print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")

    # One shared set of generation settings for every summarizer call.
    # NOTE(review): truncation=True presumably cuts each input at the model's
    # maximum input length, so long joined chunks may only be partially
    # summarized - confirm this is acceptable.
    summary_kwargs = dict(
        max_length=60,
        min_length=20,
        truncation=True,
        do_sample=False,
    )

    def summarize_one(text):
        """Summarize a single text and return the summary string."""
        return summarizer(text, **summary_kwargs)[0]['summary_text']

    def process_chunks_batched(chunks):
        """Summarize a list of texts in device batches of MAX_GPU_BATCH_SIZE."""
        # batch_size has to be passed explicitly; handing the pipeline a list
        # without it does not batch the inputs on the device.
        outputs = summarizer(
            chunks,
            batch_size=MAX_GPU_BATCH_SIZE,
            **summary_kwargs
        )
        return [out["summary_text"] for out in outputs]

    def hierarchical_summary(reviews, chunk_size=200):
        """Summarize a list of reviews, in two levels when it is long."""
        # Missing values (None / NaN) or an empty list: nothing to summarize.
        if reviews is None or isinstance(reviews, float) or len(reviews) == 0:
            return ""

        # Few enough reviews: a single summary over the joined text
        if len(reviews) <= chunk_size:
            return summarize_one("\n\n".join(reviews))

        # Join every run of `chunk_size` reviews into one chunk text
        all_chunks = [
            "\n\n".join(reviews[i:i + chunk_size])
            for i in range(0, len(reviews), chunk_size)
        ]

        # Summarize the chunks, then summarize the intermediate summaries
        intermediate_summaries = process_chunks_batched(all_chunks)
        return summarize_one(" ".join(intermediate_summaries))

    # Process the partition with a progress bar for this worker
    results = []
    with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
        for idx, row in partition_df.iterrows():
            summary = hierarchical_summary(row['Reviews'], chunk_size=200)
            results.append((idx, summary))
            pbar.update(1)

            # Clean up every few iterations
            if len(results) % 5 == 0:
                torch.cuda.empty_cache()

    # Clean up at the end: drop the references first so that emptying the
    # cache can actually hand the model's memory back.
    del summarizer
    del model
    torch.cuda.empty_cache()

    # Return the results for this partition
    return results

# Build one delayed summarization task per prepared partition
print(f"Scheduling {n_workers} partitions for processing...")
delayed_results = []
for worker_id in range(n_workers):
    task = process_partition(partitions[worker_id], worker_id)
    delayed_results.append(task)
    print(f"Scheduled partition {worker_id + 1}/{n_workers}")

# Overall progress bar, counted in report rows
print("\nStarting distributed computation with progress tracking:")
total_rows = len(final_report)
main_progress = tqdm(desc="Overall Progress", total=total_rows)

# Reference point for the elapsed-time report
start_time = time.time()

# Create a global progress updater
def update_main_progress(future):
    """Refresh the overall progress bar from the state of the given futures.

    Args:
        future: A single future, or an iterable of futures.  Each one only
            needs a ``status`` attribute; ``"finished"`` counts as completed.

    The bar position is set to the number of finished tasks multiplied by the
    average number of report rows per task, capped at ``len(final_report)``.
    Only the futures passed in are counted, not every future the client
    happens to know about.
    """
    tracked = [future] if hasattr(future, "status") else list(future)
    completed_tasks = sum(task.status == "finished" for task in tracked)
    # max(..., 1) guards the division when no tasks were scheduled.
    rows_per_task = len(final_report) // max(len(delayed_results), 1)
    main_progress.n = min(len(final_report), completed_tasks * rows_per_task)
    main_progress.refresh()

# Submit the delayed tasks to the cluster; `futures` is what client.compute
# returns for the list of delayed results.
futures = client.compute(delayed_results)

# Background progress reporting.  `stop_flag` is the module-level switch that
# progress_monitor() checks on every iteration; setting it to True ends the
# monitor loop.
import threading
stop_flag = False

def progress_monitor():
    """Poll loop for the monitor thread.

    Until the module-level ``stop_flag`` becomes true, refresh the overall
    progress bar from ``futures`` and then sleep for half a second.
    """
    while True:
        if stop_flag:
            return
        update_main_progress(futures)
        time.sleep(0.5)

# Start the progress monitor in a separate thread.  It is a daemon thread so
# that a monitor which was never told to stop cannot keep the process alive.
monitor_thread = threading.Thread(target=progress_monitor, daemon=True)
monitor_thread.start()

# Wait for the computation to complete.  The try/finally guarantees that the
# monitor thread is stopped and the progress bar is closed even when both
# the gather and the fallback computation fail (or the run is interrupted).
computation_ok = False
try:
    try:
        print("Computing all partitions...")
        results = client.gather(futures)
    except Exception as e:
        # Fallback to direct computation if future gathering fails
        print(f"Error with futures: {e}")
        print("Falling back to direct computation...")
        results = dask.compute(*delayed_results)
    computation_ok = True
finally:
    # Stop the progress monitor before touching the bar, so the monitor
    # cannot overwrite the final position.
    stop_flag = True
    monitor_thread.join()

    # Show 100% only when the results were actually produced
    if computation_ok:
        main_progress.n = len(final_report)
        main_progress.refresh()
    main_progress.close()

# Merge the per-partition result lists into one list of (index, summary)
all_results = [pair for worker_results in results for pair in worker_results]

# Order by the original row index, then keep only the summary texts
all_results.sort(key=lambda pair: pair[0])
summaries = [summary for _, summary in all_results]

# Attach the summaries to the report as a new column
final_report['QuickSummary'] = summaries

# Report how long the run took, measured from start_time
elapsed_time = time.time() - start_time
print(f"\nCompleted in {elapsed_time:.2f} seconds")

# Show the first rows of the summarized report
preview_columns = ['steam_appid', 'Theme', 'QuickSummary']
display(final_report[preview_columns].head())

# Shut down the client and cluster
client.close()
cluster.close()

In [None]:
# Save the summarized report.  Unlike the Cell 7 export, index=False is not
# passed here, so the DataFrame index is written as the first CSV column.
final_report.to_csv('output_csvs/summarized_report.csv')