In [None]:
# Cell 1: Imports & Dask Client
import os
from dask.distributed import Client
import dask.dataframe as dd
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Start a local Dask client.
# Client() is called without arguments, so the cluster layout is left to Dask's defaults.
client = Client()



In [None]:
# Cell 2: Load Theme Dictionary & Precompute Embeddings
import json

# Load per-game theme keywords.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying on the
# platform's locale encoding (which open() would otherwise fall back to).
with open('game_themes.json', 'r', encoding='utf-8') as f:
    raw = json.load(f)
# JSON object keys are always strings; convert the app ids back to int.
GAME_THEMES = {int(appid): themes for appid, themes in raw.items()}

# Initialize SBERT embedder
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Precompute theme embeddings for each game.
# Every theme is represented by the mean of its seed embeddings, and the rows of
# THEME_EMBEDDINGS[appid] follow the key order of GAME_THEMES[appid].
THEME_EMBEDDINGS = {}
for appid, themes in GAME_THEMES.items():
    if not themes:
        # np.vstack([]) would fail with an unrelated-looking message; say what is wrong.
        raise ValueError(f"App {appid} has no themes in game_themes.json")
    emb_list = []
    for theme, seeds in themes.items():
        if not seeds:
            # An empty seed list has no mean embedding; fail early with a clear message.
            raise ValueError(f"Theme {theme!r} of app {appid} has no seed keywords")
        seed_emb = embedder.encode(seeds, convert_to_numpy=True)
        emb_list.append(seed_emb.mean(axis=0))
    THEME_EMBEDDINGS[appid] = np.vstack(emb_list)



In [None]:
# Cell 3: Read All Parquet Files into a Dask DataFrame
# The glob matches every *.parquet file in the folder (all games are expected to live there);
# only the four columns used by the later cells are requested.
ddf = dd.read_parquet(
    'parquet_output_theme_combo/*.parquet',
    columns=[
        'steam_appid',
        'review',
        'review_language',
        'voted_up',
    ],
)


In [None]:
# Cell 4: Filter & Clean Data
# Restrict to rows tagged as English, then discard rows whose review text is missing.
ddf = ddf[ddf['review_language'] == 'english'].dropna(subset=['review'])



In [5]:
# Cell 5: Define Partition-wise Topic Assignment
# `assign_topic` encodes with `embedder` (created in Cell 2), so constructing another
# SentenceTransformer here only loaded the same 'all-MiniLM-L6-v2' weights a second time.
# Keep the `model` name as an alias of the existing instance instead.
model = embedder

def assign_topic(df_partition):
    """Label every review of one partition with its best-matching game theme.

    Parameters
    ----------
    df_partition : pandas.DataFrame
        One partition holding at least the columns 'review' and 'steam_appid'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with an extra int64 column
        'topic_id': the row index into THEME_EMBEDDINGS[appid] with the highest
        cosine similarity to the review embedding.

    Raises
    ------
    KeyError
        If a 'steam_appid' value has no entry in THEME_EMBEDDINGS.
    """
    # Empty partition: still return the declared schema (int64 topic_id) without
    # touching the embedding model.
    if df_partition.empty:
        return df_partition.assign(topic_id=np.empty(0, dtype=np.int64))

    reviews = df_partition['review'].tolist()
    # Compute embeddings in one go
    review_embeds = embedder.encode(reviews, convert_to_numpy=True, batch_size=64)

    # Score all reviews of the same game in a single similarity call instead of
    # one call per row.
    appids = df_partition['steam_appid'].to_numpy()
    topic_ids = np.empty(len(appids), dtype=np.int64)
    for appid in np.unique(appids):
        rows = np.flatnonzero(appids == appid)
        sims = cosine_similarity(review_embeds[rows], THEME_EMBEDDINGS[int(appid)])
        topic_ids[rows] = sims.argmax(axis=1)

    # assign() returns a copy, so the partition handed in by Dask is never mutated.
    return df_partition.assign(topic_id=topic_ids)

# Run assign_topic on every partition. `meta` tells Dask the output schema up front:
# the input columns plus an int64 `topic_id`.
meta = ddf._meta.assign(topic_id=np.int64(0))
ddf_with_topic = ddf.map_partitions(assign_topic, meta=meta)

In [6]:
# Cell 6: Aggregate Counts, Likes, and Collect Reviews per Theme
# Per (game, theme): number of reviews and sum of the voted_up flags
agg = (
    ddf_with_topic
    .groupby(['steam_appid', 'topic_id'])
    .agg(
        review_count=('review', 'count'),
        likes_sum=('voted_up', 'sum'),
    )
)

# Per (game, theme): the review texts gathered into one Python list
reviews_series = (
    ddf_with_topic
    .groupby(['steam_appid', 'topic_id'])['review']
    .apply(lambda texts: list(texts), meta=('review', object))
)

# Evaluate both graphs with a single compute call
agg_df, reviews_df = dd.compute(agg, reviews_series)

# Turn the group keys back into ordinary columns
agg_df = agg_df.reset_index()
reviews_df = reviews_df.reset_index().rename(columns={'review': 'Reviews'})

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.


In [7]:
# Cell 7: Construct Final Report DataFrame
import pandas as pd

# Merge counts, likes, and reviews
report_df = pd.merge(
    agg_df,
    reviews_df,
    on=['steam_appid', 'topic_id'],
    how='left'
)

# topic_id is a row position in THEME_EMBEDDINGS[appid], which follows the key order of
# GAME_THEMES[appid]; build each game's name list once instead of once per report row.
theme_names = {appid: list(themes) for appid, themes in GAME_THEMES.items()}

# Build the final output structure
rows = []
for rec in report_df.itertuples(index=False):
    appid = int(rec.steam_appid)
    total = int(rec.review_count)
    likes = int(rec.likes_sum)
    rows.append({
        'steam_appid': appid,
        'Theme': theme_names[appid][int(rec.topic_id)],
        '#Reviews': total,
        'LikeRatio': f"{(likes / total * 100):.1f}%" if total > 0 else '0%',
        'Reviews': rec.Reviews
    })

final_report = pd.DataFrame(rows)

# Optionally, save to CSV (create the target folder first so a fresh checkout does not fail)
os.makedirs('output_csvs', exist_ok=True)
final_report.to_csv('output_csvs/SBERT_DD_report.csv', index=False)

In [8]:

# Cell 8: View the Report
# Print the first rows of the report, then shut down the Dask client created in Cell 1.
print(final_report.head())
client.close()


   steam_appid        Theme  #Reviews LikeRatio  \
0           10    community      2511     96.2%   
1           10   anti_cheat      3654     93.7%   
2           10  performance      2527     91.7%   
3           10  competitive      9644     98.1%   
4           10     gameplay      2416     96.9%   

                                             Reviews  
0  [Actually the best game in this world. It stil...  
1  [So here's a little story.\nBefore my dad and ...  
2  [How to correctly play this game:\n-Noisiest f...  
3  [[h1] Once a fire lit in my heart and now it w...  
4  [Counter-Strike won't ever be canceled.\nCount...  


In [None]:
# Cell 9 (FIXED - FIRST): Hierarchical summarization of all reviews per theme

from transformers import pipeline
from tqdm.auto import tqdm

# 1) Build the single summarization pipeline shared by the rest of this cell
summarizer = pipeline(
    task='summarization',
    model='sshleifer/distilbart-cnn-12-6',
    framework='pt',
    device=0,  # change to -1 if no GPU
)

def hierarchical_summary(reviews, chunk_size=200,
                         max_len=60, min_len=20):
    """
    Summarize a long list of reviews into one short summary:
      1) Chunk the reviews into batches of chunk_size
      2) Summarize each batch
      3) Summarize the concatenation of batch summaries

    Params:
      reviews    : iterable of str, the reviews to summarize
      chunk_size : int, number of reviews per intermediate chunk
      max_len    : int, max summary tokens per call
      min_len    : int, min summary tokens per call

    Returns:
      str, final "quick read" summary; "" when there are no reviews

    NOTE(review): every call uses truncation=True, so text beyond the model's input
    limit is presumably dropped; with 200 joined reviews per chunk only the leading
    part of each chunk would reach the model. Confirm, and consider token-based chunking.
    """
    reviews = list(reviews)
    # Nothing to summarize: do not feed an empty document to the model.
    if not reviews:
        return ""

    # Identical generation settings for every stage
    gen_kwargs = dict(
        max_length=max_len,
        min_length=min_len,
        truncation=True,
        do_sample=False,
    )

    # If there are at most chunk_size reviews, just do one summary
    if len(reviews) <= chunk_size:
        return summarizer("\n\n".join(reviews), **gen_kwargs)[0]['summary_text']

    # 2) Build one document per chunk and summarize them in a single call
    all_chunks = [
        "\n\n".join(reviews[i:i + chunk_size])
        for i in range(0, len(reviews), chunk_size)
    ]
    intermediate = [s['summary_text'] for s in summarizer(all_chunks, **gen_kwargs)]

    # 3) Summarize the intermediate summaries
    return summarizer(" ".join(intermediate), **gen_kwargs)[0]['summary_text']

# 4) Summarize the review list of every report row, with a progress bar
quick_summaries = [
    hierarchical_summary(revs, chunk_size=200, max_len=60, min_len=20)
    for revs in tqdm(final_report['Reviews'],
                     total=len(final_report),
                     desc="Summarizing themes")
]

# 5) Store results in a new column
final_report['QuickSummary'] = quick_summaries

# 6) Inspect
display(final_report[['steam_appid','Theme','QuickSummary']].head())

In [None]:
# Cell 9 (BIG DATA - BUT WITH PYTHON FUTURES): Hierarchical summarization of all reviews per theme using parallel processing

import pandas as pd
import numpy as np
from transformers import pipeline
from tqdm.auto import tqdm
import concurrent.futures
import multiprocessing

# Set up the number of workers based on CPU cores (with one less to avoid overloading)
# max(1, ...) keeps at least one worker on a single-core machine.
num_workers = max(1, multiprocessing.cpu_count() - 1)

# 1) Initialize the summarizer pipeline globally
# This avoids serialization issues
# NOTE(review): no `device` is passed here, unlike the other Cell 9 variants that use
# device=0 - presumably this pipeline runs on the library's default device; confirm.
summarizer = pipeline(
    task='summarization',
    model='sshleifer/distilbart-cnn-12-6',
    framework='pt'
)

def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
    """
    Summarize a long list of reviews into one short summary.

    Reviews are joined in groups of `chunk_size`, each group is summarized, and the
    group summaries are summarized once more. All calls go through the module-level
    `summarizer` pipeline.

    Params:
      reviews    : iterable of str
      chunk_size : int, reviews per intermediate chunk
      max_len    : int, max summary tokens per call
      min_len    : int, min summary tokens per call

    Returns:
      str, the final summary; "" when there are no reviews
    """
    reviews = list(reviews)
    # Nothing to summarize: do not feed an empty document to the model.
    if not reviews:
        return ""

    # Identical generation settings for every stage
    gen_kwargs = dict(
        max_length=max_len,
        min_length=min_len,
        truncation=True,
        do_sample=False,
    )

    # If there are at most chunk_size reviews, just do one summary
    if len(reviews) <= chunk_size:
        return summarizer("\n\n".join(reviews), **gen_kwargs)[0]['summary_text']

    # 2) One document per chunk, all handed to the pipeline in a single call
    all_chunks = [
        "\n\n".join(reviews[i:i + chunk_size])
        for i in range(0, len(reviews), chunk_size)
    ]
    intermediate = [s['summary_text'] for s in summarizer(all_chunks, **gen_kwargs)]

    # 3) Summarize the intermediate summaries
    return summarizer(" ".join(intermediate), **gen_kwargs)[0]['summary_text']

# 4) Function to process each batch of rows in parallel
def process_batch(batch_df):
    """Return a list of (row label, summary) pairs, one per row of `batch_df`."""
    return [
        (label, hierarchical_summary(row['Reviews'], chunk_size=200, max_len=60, min_len=20))
        for label, row in batch_df.iterrows()
    ]

# Split the dataframe into batches for parallel processing
def split_dataframe(df, batch_size):
    """Cut `df` into consecutive positional slices of at most `batch_size` rows."""
    starts = range(0, len(df), batch_size)
    return [df.iloc[start:start + batch_size] for start in starts]

# Calculate batch size from dataset size and worker count (at least one row per batch)
batch_size = max(1, len(final_report) // num_workers)
batches = split_dataframe(final_report, batch_size)

# Use ThreadPoolExecutor for parallel processing with progress bar
all_results = []
with tqdm(total=len(final_report), desc="Summarizing themes") as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all batches to the executor
        future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}

        # Collect batches as they finish (completion order is arbitrary)
        for future in concurrent.futures.as_completed(future_to_batch):
            batch_results = future.result()
            all_results.extend(batch_results)
            # Update progress bar by the number of rows processed in this batch
            pbar.update(len(future_to_batch[future]))

# Sort results by the original index and extract summaries
all_results.sort(key=lambda x: x[0])  # Sort by index
summaries = [result[1] for result in all_results]

# 5) Store results in a new column.
# Each result carries its row label, so align on the index instead of assigning the sorted
# list positionally: a positional assignment is only right when final_report's rows already
# appear in sorted-label order.
final_report['QuickSummary'] = pd.Series(dict(all_results), dtype=object)

# 6) Inspect
display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

In [11]:
# Cell 9 (OPTIMIZED - FASTEST SO FAR): GPU-optimized hierarchical summarization for RTX 4080 Super

import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer
from tqdm.auto import tqdm
import torch
import concurrent.futures
import multiprocessing

# Check GPU memory and set optimal batch sizes
# `total_memory` is the total memory of CUDA device 0, not the currently free amount,
# even though the message below says "Available".
gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
print(f"Available GPU memory: {gpu_mem:.2f} GB")

# RTX 4080 Super optimization parameters
# With 16GB VRAM, we can use larger batch sizes and optimize throughput
MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# Upper bound on how many chunk documents hierarchical_summary passes to one summarizer call
MAX_GPU_BATCH_SIZE = 32  # Larger batch size for 16GB VRAM
# Used below both as the ThreadPoolExecutor thread count and to size the row batches
PARALLEL_PROCESSES = 4   # Optimal number for balancing CPU and GPU workloads

# 1) Initialize tokenizer to estimate token counts for optimal batching
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 2) Initialize the summarizer pipeline with optimized settings for RTX 4080 Super
# device=0 pins the pipeline to the first CUDA device.
summarizer = pipeline(
    task='summarization',
    model=MODEL_NAME,
    device=0,
    framework='pt',
    # Optimized settings for higher throughput
    model_kwargs={
        "use_cache": True,  # Enable KV caching for faster inference
    },
    # Enable half-precision for faster processing and lower memory usage
    torch_dtype=torch.float16
)

def estimate_tokens(text):
    """Return how many token ids `tokenizer.encode` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
    """
    Hierarchical summarization: summarize chunks of reviews, then summarize the summaries.

    Chunk documents are handed to the module-level `summarizer` in slices of at most
    MAX_GPU_BATCH_SIZE per call.

    Params:
      reviews    : iterable of str
      chunk_size : int, reviews joined into one chunk document
      max_len    : int, max summary tokens per call
      min_len    : int, min summary tokens per call

    Returns:
      str, the final summary; "" when there are no reviews
    """
    reviews = list(reviews)
    # Nothing to summarize: do not feed an empty document to the model.
    if not reviews:
        return ""

    # Identical generation settings for every stage
    gen_kwargs = dict(
        max_length=max_len,
        min_length=min_len,
        truncation=True,
        do_sample=False,
    )

    # If there are at most chunk_size reviews, just do one summary
    if len(reviews) <= chunk_size:
        return summarizer("\n\n".join(reviews), **gen_kwargs)[0]['summary_text']

    # 2) Prepare one document per chunk
    all_chunks = [
        "\n\n".join(reviews[i:i + chunk_size])
        for i in range(0, len(reviews), chunk_size)
    ]

    # Summarize the chunk documents, MAX_GPU_BATCH_SIZE at a time
    summaries = []
    for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
        batch = all_chunks[i:i + MAX_GPU_BATCH_SIZE]
        summaries.extend(s['summary_text'] for s in summarizer(batch, **gen_kwargs))

    # 3) Summarize the intermediate summaries
    return summarizer(" ".join(summaries), **gen_kwargs)[0]['summary_text']

# 4) Function to process each batch of rows with GPU optimization
def process_gpu_batch(batch_df):
    """Summarize the 'Reviews' list of every row in `batch_df`.

    Returns a list of (row label, summary) pairs in row order.
    """
    results = []
    for position, (idx, reviews) in enumerate(batch_df['Reviews'].items(), start=1):
        # Use optimized hierarchical summary function
        summary = hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20)
        results.append((idx, summary))

        # Optional: clear the CUDA cache after every 10 processed rows. The cadence is
        # counted on rows processed, not on the row label, so it also works when the
        # index labels are not integers.
        if position % 10 == 0:
            torch.cuda.empty_cache()

    return results

# Calculate optimal processing strategy based on dataset size
total_rows = len(final_report)
# Determine batch size for parallel processing (at least one row per batch)
optimal_batch_size = max(1, total_rows // PARALLEL_PROCESSES)

# Split dataframe into optimized batches
batches = [final_report.iloc[i:i+optimal_batch_size] for i in range(0, total_rows, optimal_batch_size)]

# Process with concurrent.futures and progress tracking
all_results = []
with tqdm(total=total_rows, desc="GPU Summarizing (RTX 4080 Super)") as pbar:
    # Use ThreadPoolExecutor to manage parallel GPU tasks
    with concurrent.futures.ThreadPoolExecutor(max_workers=PARALLEL_PROCESSES) as executor:
        future_to_batch = {executor.submit(process_gpu_batch, batch): batch for batch in batches}

        for future in concurrent.futures.as_completed(future_to_batch):
            try:
                batch_results = future.result()
            except Exception as e:
                print(f"Error processing batch: {e}")
                # Continue with remaining batches; the rows of this batch stay unsummarized
                continue
            all_results.extend(batch_results)
            pbar.update(len(future_to_batch[future]))

# Sort results by the original index
all_results.sort(key=lambda x: x[0])
summaries = [result[1] for result in all_results]

# 5) Store results in a new column.
# Align on the row label rather than assigning the list positionally: when a batch failed
# above, `summaries` is shorter than the frame and a positional assignment would raise a
# length-mismatch error. Rows without a summary are left as NaN.
final_report['QuickSummary'] = pd.Series(dict(all_results), dtype=object)
missing_rows = int(final_report['QuickSummary'].isna().sum())
if missing_rows:
    print(f"Warning: {missing_rows} of {total_rows} rows have no summary (failed batches)")

# 6) Inspect results
display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# 7) Clean up GPU memory
torch.cuda.empty_cache()

Available GPU memory: 15.56 GB


Device set to use cuda:0


GPU Summarizing (RTX 4080 Super):   0%|          | 0/45 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Your max_length is set to 60, but your input_length is only 5. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Your max_length is set to 60, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Unnamed: 0,steam_appid,Theme,QuickSummary
0,10,community,Counter-Strike 1.6 was a significant part of ...
1,10,anti_cheat,Counter-Strike is a first-person shooter vide...
2,10,performance,"The game crashes constantly, there is no impl..."
3,10,competitive,"CounterStrike 1.6 is like a time travel, just..."
4,10,gameplay,Counter Strike is a first-person shooter game...


In [12]:
# Cell 9 (ULTRA OPTIMIZED - FIXED): Maximum GPU utilization for RTX 4080 Super

import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm
import torch
import gc

# Force CUDA initialization and check memory
torch.cuda.init()
# Total memory of CUDA device 0, converted from bytes
total_gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
print(f"Total GPU memory: {total_gpu_mem:.2f} GB")

# Ultra-aggressive GPU optimization parameters for RTX 4080 Super
MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# Upper bound on how many chunk documents hierarchical_summary passes to one summarizer call.
# NOTE(review): no batch_size is given to the pipeline, so this presumably bounds the list
# length per call rather than the tensor batch on the GPU - confirm.
MAX_GPU_BATCH_SIZE = 64  # Much larger batch size to fully utilize VRAM
# NOTE(review): not referenced anywhere else in this cell.
MAX_SEQUENCE_LENGTH = 1024  # Set maximum context length to optimize memory usage

# Load model and tokenizer directly for maximum control
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME, 
    torch_dtype=torch.float16,  # Half-precision for maximum throughput
    device_map="auto"           # Automatically map to available GPU
)

# Move model to GPU and optimize for inference
# NOTE(review): the model was already placed via device_map="auto" above, so this explicit
# move looks redundant - confirm before removing.
model.to("cuda")
model.eval()  # Set to evaluation mode

# Create the summarization pipeline around the already-loaded model and tokenizer
summarizer = pipeline(
    task='summarization',
    model=model,
    tokenizer=tokenizer,
    framework='pt',
    # NOTE(review): a ready model object is passed above; model_kwargs are presumably only
    # applied when the pipeline loads the model itself - confirm this has any effect.
    model_kwargs={"use_cache": True}
)

# Monitor GPU memory usage
def gpu_memory_usage():
    """Print the allocated and reserved memory of CUDA device 0 and return both in GB.

    Returns:
      (allocated, reserved) as floats.
    """
    bytes_per_gb = 1024**3
    allocated = torch.cuda.memory_allocated(0) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(0) / bytes_per_gb
    print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
    return allocated, reserved

def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
    """
    Hierarchical summarization with per-batch GPU memory logging.

    Reviews are joined in groups of `chunk_size`; the chunk documents are summarized in
    slices of at most MAX_GPU_BATCH_SIZE per call, with gpu_memory_usage() printed
    before and after each slice; the chunk summaries are then summarized once more.

    Params:
      reviews    : iterable of str
      chunk_size : int, reviews joined into one chunk document
      max_len    : int, max summary tokens per call
      min_len    : int, min summary tokens per call

    Returns:
      str, the final summary; "" when there are no reviews
    """
    reviews = list(reviews)
    # Nothing to summarize: do not feed an empty document to the model.
    if not reviews:
        return ""

    # Identical generation settings for every stage
    gen_kwargs = dict(
        max_length=max_len,
        min_length=min_len,
        truncation=True,
        do_sample=False,
    )

    # If there are at most chunk_size reviews, just do one summary
    if len(reviews) <= chunk_size:
        return summarizer("\n\n".join(reviews), **gen_kwargs)[0]['summary_text']

    # 2) Prepare one document per chunk
    all_chunks = [
        "\n\n".join(reviews[i:i + chunk_size])
        for i in range(0, len(reviews), chunk_size)
    ]

    # Summarize the chunk documents, MAX_GPU_BATCH_SIZE at a time
    summaries = []
    for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
        batch = all_chunks[i:i + MAX_GPU_BATCH_SIZE]

        # Log memory usage before batch
        print(f"Processing batch of size {len(batch)} ({i}/{len(all_chunks)})")
        gpu_memory_usage()

        summaries.extend(s['summary_text'] for s in summarizer(batch, **gen_kwargs))

        # Log memory after batch
        gpu_memory_usage()

    # 3) Summarize the intermediate summaries in a single call
    return summarizer(" ".join(summaries), **gen_kwargs)[0]['summary_text']

# Pair every review list with its row position (0-based, in frame order)
print("Preparing all reviews for processing...")
all_rows = list(enumerate(final_report['Reviews']))

# Walk the rows in groups of 10, summarizing one row at a time
all_results = []
with tqdm(total=len(final_report), desc="Ultra GPU Optimization") as pbar:
    for i in range(0, len(all_rows), 10):
        batch_rows = all_rows[i:i+10]
        batch_results = []

        for batch_idx, (row_idx, reviews) in enumerate(batch_rows):
            # Release cached GPU memory and collect garbage on every 5th row of the group
            if batch_idx % 5 == 0:
                torch.cuda.empty_cache()
                gc.collect()

            batch_results.append(
                (row_idx, hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20))
            )
            pbar.update(1)

        all_results += batch_results
        # Same cleanup once more after each group of rows
        torch.cuda.empty_cache()
        gc.collect()

# Order by row position and keep only the summary texts
all_results.sort(key=lambda pair: pair[0])
summaries = [text for _, text in all_results]

# Store results in a new column
final_report['QuickSummary'] = summaries

# Inspect results
display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# Final cleanup
torch.cuda.empty_cache()

Total GPU memory: 15.56 GB


Device set to use cuda:0


Preparing all reviews for processing...


Ultra GPU Optimization:   0%|          | 0/45 [00:00<?, ?it/s]

Processing batch of size 13 (0/13)
GPU Memory: 0.69GB allocated, 1.34GB reserved
GPU Memory: 0.69GB allocated, 1.34GB reserved
Processing batch of size 19 (0/19)
GPU Memory: 0.69GB allocated, 1.34GB reserved
GPU Memory: 0.69GB allocated, 1.34GB reserved
Processing batch of size 13 (0/13)
GPU Memory: 0.69GB allocated, 1.34GB reserved
GPU Memory: 0.69GB allocated, 1.34GB reserved
Processing batch of size 49 (0/49)
GPU Memory: 0.69GB allocated, 1.34GB reserved
GPU Memory: 0.69GB allocated, 1.34GB reserved
Processing batch of size 13 (0/13)
GPU Memory: 0.69GB allocated, 1.34GB reserved
GPU Memory: 0.69GB allocated, 1.34GB reserved
Processing batch of size 11 (0/11)
GPU Memory: 0.69GB allocated, 1.06GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 8 (0/8)
GPU Memory: 0.69GB allocated, 1.08GB reserved


Your max_length is set to 60, but your input_length is only 5. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 2 (0/2)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 13 (0/13)
GPU Memory: 0.69GB allocated, 1.09GB reserved
GPU Memory: 0.69GB allocated, 1.09GB reserved
Processing batch of size 4 (0/4)
GPU Memory: 0.69GB allocated, 1.09GB reserved
GPU Memory: 0.69GB allocated, 1.09GB reserved
Processing batch of size 64 (0/210)
GPU Memory: 0.69GB allocated, 1.06GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 64 (64/210)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 64 (128/210)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 18 (192/210)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 56 (0/56)
GPU Memory: 0.69GB alloca

Your max_length is set to 60, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 19 (0/19)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 13 (0/13)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 28 (0/28)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 15 (0/15)
GPU Memory: 0.69GB allocated, 1.06GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 16 (0/16)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 9 (0/9)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 64 (0/64)
GPU Memory: 0.69GB allocated, 1.08GB reserved
GPU Memory: 0.69GB allocated, 1.08GB reserved
Processing batch of size 64 (0/124)
GPU Memory: 0.69GB allocated, 1

Unnamed: 0,steam_appid,Theme,QuickSummary
0,10,community,Counter-Strike 1.6 was a significant part of ...
1,10,anti_cheat,Counter-Strike is a first-person shooter vide...
2,10,performance,"The game crashes constantly, there is no impl..."
3,10,competitive,"CounterStrike 1.6 is like a time travel, just..."
4,10,gameplay,Counter Strike is a first-person shooter game...


In [None]:
# Cell 9 (DASK DISTRIBUTED - FINAL WITH PROGRESS): GPU-optimized hierarchical summarization with Dask

import pandas as pd
import numpy as np
import torch
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from tqdm.auto import tqdm
import time

# Start a local Dask cluster
# n_workers is also the number of partitions the report is split into further down.
n_workers = 4  # Adjust based on your CPU core count
cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1)
client = Client(cluster)
print(f"Dask dashboard available at: {client.dashboard_link}")

# Define model parameters 
MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# Upper bound on how many chunk documents are passed to one summarizer call in process_partition
MAX_GPU_BATCH_SIZE = 64  # Large batch size for RTX 4080 Super

@dask.delayed
def process_partition(partition_df, worker_id):
    """Summarize the 'Reviews' list of every row of one partition on a worker.

    A tokenizer, model and summarization pipeline are created inside the task, so
    nothing model-related has to be shipped from the notebook process.

    Params:
      partition_df : pandas.DataFrame with a 'Reviews' column (list of str per row)
      worker_id    : int, used for log messages and the progress-bar position

    Returns:
      list of (row label, summary) pairs in row order
    """
    # An empty partition needs no model at all (happens when there are fewer rows
    # than partitions).
    if len(partition_df) == 0:
        return []

    # Import packages needed in the worker
    import gc
    from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
    import torch
    from tqdm.auto import tqdm

    # Load tokenizer first
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load model with device_map="auto" only
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"  # This will handle device placement automatically
    )

    # Create pipeline with both model AND tokenizer
    summarizer = pipeline(
        task='summarization',
        model=model,
        tokenizer=tokenizer,
        framework='pt',
        model_kwargs={"use_cache": True}
    )

    # Report worker GPU status
    gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
    print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")

    # Define the hierarchical summary function within the worker
    def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
        """Summarize chunks of reviews, then summarize the chunk summaries."""
        reviews = list(reviews)
        # Nothing to summarize: do not feed an empty document to the model.
        if not reviews:
            return ""

        gen_kwargs = dict(
            max_length=max_len,
            min_length=min_len,
            truncation=True,
            do_sample=False,
        )

        # If there are at most chunk_size reviews, just do one summary
        if len(reviews) <= chunk_size:
            return summarizer("\n\n".join(reviews), **gen_kwargs)[0]['summary_text']

        # Prepare one document per chunk
        all_chunks = [
            "\n\n".join(reviews[i:i + chunk_size])
            for i in range(0, len(reviews), chunk_size)
        ]

        # Summarize the chunk documents, MAX_GPU_BATCH_SIZE at a time
        summaries = []
        for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
            batch = all_chunks[i:i + MAX_GPU_BATCH_SIZE]
            summaries.extend(s['summary_text'] for s in summarizer(batch, **gen_kwargs))

        # Summarize the intermediate summaries
        return summarizer(" ".join(summaries), **gen_kwargs)[0]['summary_text']

    # Process the partition with a progress bar
    results = []
    # Create a progress bar for this worker
    with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
        for idx, row in partition_df.iterrows():
            summary = hierarchical_summary(row['Reviews'], chunk_size=200, max_len=60, min_len=20)
            results.append((idx, summary))
            pbar.update(1)

            # Clean up every few iterations
            if len(results) % 5 == 0:
                torch.cuda.empty_cache()

    # Clean up at the end: drop every reference to the model first (the nested function
    # holds one through its closure), then collect and release the cached GPU blocks.
    # Emptying the cache before the references are gone cannot free the model's memory.
    del hierarchical_summary
    del summarizer
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # Return the results for this partition
    return results

# Convert pandas DataFrame to Dask DataFrame
dask_df = dd.from_pandas(final_report, npartitions=n_workers)

# Set up manual progress tracking
print(f"Processing {len(final_report)} rows across {n_workers} partitions...")

# Simple approach to split the dataframe
partition_size = len(final_report) // n_workers
delayed_results = []

# Process each partition separately
print(f"Scheduling {n_workers} partitions for processing...")
for i in range(n_workers):
    # Get start and end index for this partition
    start_idx = i * partition_size
    end_idx = (i + 1) * partition_size if i < n_workers - 1 else len(final_report)
    
    # Get this partition as a pandas DataFrame
    partition_df = final_report.iloc[start_idx:end_idx].copy()
    
    # Create a delayed task to process this partition
    delayed_result = process_partition(partition_df, i)
    delayed_results.append(delayed_result)
    print(f"Scheduled partition {i+1}/{n_workers} with {len(partition_df)} rows")

# Create a main progress bar for overall progress
print("\nStarting distributed computation with progress tracking:")
main_progress = tqdm(total=len(final_report), desc="Overall Progress")

# Start timing
start_time = time.time()

# Create a global progress updater
def update_main_progress(future):
    """Refresh the overall progress bar from the state of the submitted tasks.

    Parameters
    ----------
    future : Future or list/tuple of Future
        The future(s) returned by ``client.compute`` for the partition tasks.
        Only these are inspected, so unrelated futures known to the client
        cannot inflate the count.

    Notes
    -----
    Progress is estimated per task: the bar is set to the share of the report
    rows that corresponds to the fraction of tracked futures whose ``status``
    is ``"finished"``. The bar itself is the module-level ``main_progress``
    and the row total comes from the module-level ``final_report``.
    """
    tracked = future if isinstance(future, (list, tuple)) else [future]
    if not tracked:
        # Nothing submitted yet; leave the bar untouched.
        return

    completed_tasks = sum(1 for task in tracked if task.status == "finished")
    total_rows = len(final_report)
    # Multiply before dividing so small reports (rows < tasks) still advance,
    # and so the bar reaches the full total when every task has finished.
    main_progress.n = total_rows * completed_tasks // len(tracked)
    main_progress.refresh()

# Submit the tasks to the cluster; the returned futures are what the progress
# monitor below passes to update_main_progress.
futures = client.compute(delayed_results)

# Start a loop to update the main progress bar
import threading
# Module-level flag read by progress_monitor on every iteration; setting it to
# True makes the monitor loop exit.
stop_flag = False

def progress_monitor():
    """Keep the overall progress bar current until ``stop_flag`` is set.

    Runs as the target of the monitor thread: on every pass it checks the
    module-level ``stop_flag`` and, while that is falsy, hands the module-level
    ``futures`` to ``update_main_progress`` and then pauses for half a second.
    """
    poll_interval = 0.5  # seconds between two refreshes of the bar
    while True:
        if stop_flag:
            break
        update_main_progress(futures)
        time.sleep(poll_interval)

# Start the progress monitor in a separate thread. It is a daemon thread so it
# can never keep the kernel process alive on its own.
monitor_thread = threading.Thread(target=progress_monitor, daemon=True)
monitor_thread.start()

# Wait for computation to complete. The cleanup lives in ``finally`` so that a
# failed task or a kernel interrupt cannot leave the monitor thread polling
# forever or the worker processes (and the models they loaded) alive.
computation_succeeded = False
try:
    results = dask.compute(*delayed_results)
    computation_succeeded = True
finally:
    # Stop the progress monitor before touching the bar, so the monitor cannot
    # overwrite the final value.
    stop_flag = True
    monitor_thread.join()

    # Only show 100% when the computation actually finished.
    if computation_succeeded:
        main_progress.n = len(final_report)
        main_progress.refresh()
    main_progress.close()

    # Shut down the client and cluster; the results are already local.
    client.close()
    cluster.close()

# Flatten the nested list of results. Partitions were cut as contiguous row
# ranges and are returned in submission order, and each partition yields its
# rows in order, so the flattened list is already in final_report's row order.
all_results = []
for worker_results in results:
    all_results.extend(worker_results)

# Keep that positional order. Re-sorting by index label would pair summaries
# with the wrong rows whenever the index is not sorted, because the column
# assignment below is positional.
summaries = [summary for _, summary in all_results]

# Store results in a new column
final_report['QuickSummary'] = summaries

# Report final timing
elapsed_time = time.time() - start_time
print(f"\nCompleted in {elapsed_time:.2f} seconds")

# Display results
display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

In [9]:
# Cell 9 (FULLY OPTIMIZED - FIXED): GPU-optimized hierarchical summarization with Dask

import pandas as pd
import numpy as np
import torch
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from tqdm.auto import tqdm
import time

# Start a local Dask cluster: n_workers worker processes with one thread each.
# n_workers is also the number of partitions the report is split into below.
n_workers = 4  # Adjust based on your CPU core count
cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1)
client = Client(cluster)
print(f"Dask dashboard available at: {client.dashboard_link}")

# Define model parameters 
# Hugging Face model id loaded inside every process_partition task
MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# Upper bound on how many chunk texts are handed to the summarizer per call
MAX_GPU_BATCH_SIZE = 64  # Large batch size for RTX 4080 Super

# Slice the report on the client and wrap every slice as its own delayed
# object, so each task only carries the rows it will process.
def prepare_partition(start_idx, end_idx):
    """Return a Delayed wrapping rows ``[start_idx, end_idx)`` of ``final_report``.

    The slice is taken (and copied) here, in the notebook process, and only the
    slice is wrapped with ``dask.delayed``. A delayed *function* that reads the
    global ``final_report`` would have the whole DataFrame pickled along with
    it and sent with every task, which is what triggers Dask's large-graph
    warning.

    Parameters
    ----------
    start_idx : int
        Positional index of the first row of the partition.
    end_idx : int
        Positional index one past the last row of the partition.

    Returns
    -------
    dask.delayed.Delayed
        Delayed object that computes to the copied partition DataFrame.
    """
    partition_df = final_report.iloc[start_idx:end_idx].copy()
    return dask.delayed(partition_df)

# Prepare partitions with delayed: n_workers contiguous row ranges of equal
# size, with the last range also absorbing the division remainder.
partition_size = len(final_report) // n_workers
partitions = []
for i in range(n_workers):
    start_idx = i * partition_size
    end_idx = (i + 1) * partition_size if i < n_workers - 1 else len(final_report)
    partitions.append(prepare_partition(start_idx, end_idx))

# The main processing function - FIXED: Removed dependency on datasets library
@dask.delayed
def process_partition(partition_df, worker_id):
    """Summarize the ``Reviews`` entry of every row in one partition.

    Runs as a Dask task: it loads the tokenizer and the seq2seq model named by
    ``MODEL_NAME``, builds a summarization pipeline, and produces one
    hierarchical summary per row.

    Parameters
    ----------
    partition_df : pandas.DataFrame
        Rows to process. Each row's ``Reviews`` value is used as a sequence of
        review strings (it is measured with ``len``, sliced and joined).
    worker_id : int
        Partition number; used in the log line and as the tqdm bar position.

    Returns
    -------
    list of tuple
        ``(index_label, summary_text)`` pairs, in the row order of
        ``partition_df``. Rows with no reviews get an empty summary.
    """
    # Import packages needed in the worker
    import gc
    from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
    import torch
    from tqdm.auto import tqdm

    # Load tokenizer first
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Load model with device_map="auto"
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    # Create pipeline with model AND tokenizer
    summarizer = pipeline(
        task='summarization',
        model=model,
        tokenizer=tokenizer,
        framework='pt',
        model_kwargs={"use_cache": True}
    )

    # Report worker GPU status
    gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
    print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")

    # Generation settings shared by every summarizer call in this task
    generation_kwargs = dict(
        max_length=60,
        min_length=20,
        truncation=True,
        do_sample=False,
    )

    def summarize_one(text):
        """Summarize a single document and return the summary text."""
        return summarizer(text, **generation_kwargs)[0]['summary_text']

    def process_chunks_batched(chunks):
        """Summarize a list of chunk texts, at most MAX_GPU_BATCH_SIZE per call."""
        all_summaries = []
        for i in range(0, len(chunks), MAX_GPU_BATCH_SIZE):
            batch = chunks[i:i+MAX_GPU_BATCH_SIZE]
            # The pipeline's batch_size defaults to 1, so it has to be passed
            # explicitly for the slice to really go through the model as one
            # batch. Lower MAX_GPU_BATCH_SIZE if this runs out of GPU memory.
            batch_summaries = summarizer(
                batch,
                batch_size=len(batch),
                **generation_kwargs
            )
            all_summaries.extend(s["summary_text"] for s in batch_summaries)
        return all_summaries

    def hierarchical_summary(reviews, chunk_size=200):
        """Summarize reviews directly, or chunk -> summarize -> summarize again."""
        # Nothing to summarize: do not ask the model to generate from an empty
        # prompt. len() is used instead of truthiness so array inputs work too.
        if len(reviews) == 0:
            return ""

        # If there are no more than chunk_size reviews, one summary is enough
        if len(reviews) <= chunk_size:
            return summarize_one("\n\n".join(reviews))

        # Prepare all chunks for processing
        all_chunks = [
            "\n\n".join(reviews[i:i+chunk_size])
            for i in range(0, len(reviews), chunk_size)
        ]

        # Summarize every chunk, then summarize the intermediate summaries
        intermediate_summaries = process_chunks_batched(all_chunks)
        return summarize_one(" ".join(intermediate_summaries))

    # Process the partition with a progress bar
    results = []
    try:
        # Create a progress bar for this worker
        with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
            for idx, row in partition_df.iterrows():
                summary = hierarchical_summary(row['Reviews'], chunk_size=200)
                results.append((idx, summary))
                pbar.update(1)

                # Clean up every few iterations
                if len(results) % 5 == 0:
                    torch.cuda.empty_cache()
    finally:
        # Drop the references first and only then empty the cache; emptying it
        # while the model is still referenced cannot release the model's
        # memory. Doing this in ``finally`` also covers failed tasks.
        del summarizer
        del model
        gc.collect()
        torch.cuda.empty_cache()

    # Return the results for this partition
    return results

# Schedule the tasks with the delayed partitions: one process_partition task
# per prepared partition, with the partition number as its worker id.
# Nothing runs yet; the delayed objects are only collected here.
print(f"Scheduling {n_workers} partitions for processing...")
delayed_results = []
for i in range(n_workers):
    delayed_result = process_partition(partitions[i], i)
    delayed_results.append(delayed_result)
    print(f"Scheduled partition {i+1}/{n_workers}")

# Create a main progress bar for overall progress (one tick per report row)
print("\nStarting distributed computation with progress tracking:")
main_progress = tqdm(total=len(final_report), desc="Overall Progress")

# Start timing; the elapsed time is reported after the results are stored
start_time = time.time()

# Create a global progress updater
def update_main_progress(future):
    """Refresh the overall progress bar from the state of the submitted tasks.

    Parameters
    ----------
    future : Future or list/tuple of Future
        The future(s) returned by ``client.compute`` for the partition tasks.
        Only these are inspected, so unrelated futures known to the client
        cannot inflate the count.

    Notes
    -----
    Progress is estimated per task: the bar is set to the share of the report
    rows that corresponds to the fraction of tracked futures whose ``status``
    is ``"finished"``. The bar itself is the module-level ``main_progress``
    and the row total comes from the module-level ``final_report``.
    """
    tracked = future if isinstance(future, (list, tuple)) else [future]
    if not tracked:
        # Nothing submitted yet; leave the bar untouched.
        return

    completed_tasks = sum(1 for task in tracked if task.status == "finished")
    total_rows = len(final_report)
    # Multiply before dividing so small reports (rows < tasks) still advance,
    # and so the bar reaches the full total when every task has finished.
    main_progress.n = total_rows * completed_tasks // len(tracked)
    main_progress.refresh()

# Submit the tasks to the cluster; the returned futures are gathered further
# down and are what the progress monitor passes to update_main_progress.
futures = client.compute(delayed_results)

# Start a loop to update the main progress bar
import threading
# Module-level flag read by progress_monitor on every iteration; setting it to
# True makes the monitor loop exit.
stop_flag = False

def progress_monitor():
    """Keep the overall progress bar current until ``stop_flag`` is set.

    Runs as the target of the monitor thread: on every pass it checks the
    module-level ``stop_flag`` and, while that is falsy, hands the module-level
    ``futures`` to ``update_main_progress`` and then pauses for half a second.
    """
    poll_interval = 0.5  # seconds between two refreshes of the bar
    while True:
        if stop_flag:
            break
        update_main_progress(futures)
        time.sleep(poll_interval)

# Start the progress monitor in a separate thread. It is a daemon thread so it
# can never keep the kernel process alive on its own.
monitor_thread = threading.Thread(target=progress_monitor, daemon=True)
monitor_thread.start()

# Wait for computation to complete. The cleanup lives in ``finally`` so that a
# failed task or a kernel interrupt cannot leave the monitor thread polling
# forever or the worker processes (and the models they loaded) alive.
computation_succeeded = False
try:
    try:
        print("Computing all partitions...")
        results = client.gather(futures)
    except Exception as e:
        # Fallback to direct computation if future gathering fails
        print(f"Error with futures: {e}")
        print("Falling back to direct computation...")
        results = dask.compute(*delayed_results)
    computation_succeeded = True
finally:
    # Stop the progress monitor before touching the bar, so the monitor cannot
    # overwrite the final value.
    stop_flag = True
    monitor_thread.join()

    # Only show 100% when the computation actually finished.
    if computation_succeeded:
        main_progress.n = len(final_report)
        main_progress.refresh()
    main_progress.close()

    # Shut down the client and cluster; the results are already local.
    client.close()
    cluster.close()

# Flatten the nested list of results. Partitions were cut as contiguous row
# ranges and are returned in submission order, and each partition yields its
# rows in order, so the flattened list is already in final_report's row order.
all_results = []
for worker_results in results:
    all_results.extend(worker_results)

# Keep that positional order. Re-sorting by index label would pair summaries
# with the wrong rows whenever the index is not sorted, because the column
# assignment below is positional.
summaries = [summary for _, summary in all_results]

# Store results in a new column
final_report['QuickSummary'] = summaries

# Report final timing
elapsed_time = time.time() - start_time
print(f"\nCompleted in {elapsed_time:.2f} seconds")

# Display results
display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

Dask dashboard available at: http://127.0.0.1:8787/status
Scheduling 4 partitions for processing...
Scheduled partition 1/4
Scheduled partition 2/4
Scheduled partition 3/4
Scheduled partition 4/4

Starting distributed computation with progress tracking:


Overall Progress:   0%|          | 0/45 [00:00<?, ?it/s]

Computing all partitions...


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
Device set to use cuda:0


Worker 2:   0%|          | 0/11 [00:00<?, ?it/s][A[ADevice set to use cuda:0



Worker 3:   0%|          | 0/12 [00:00<?, ?it/s][A[A[ADevice set to use cuda:0

Worker 1:   0%|          | 0/11 [00:00<?, ?it/s][ADevice set to use cuda:0
Worker 0:   0%|          | 0/11 [00:00<?, ?it/s]

Worker 2: GPU Memory: 0.57GB allocated
Worker 3: GPU Memory: 0.57GB allocated
Worker 1: GPU Memory: 0.57GB allocated
Worker 0: GPU Memory: 0.57GB allocated


Worker 0:   9%|▉         | 1/11 [00:05<00:56,  5.65s/it]

Worker 0:  27%|██▋       | 3/11 [00:19<00:50,  6.34s/it][A[A
Worker 1:   9%|▉         | 1/11 [00:26<04:27, 26.75s/it][A


Worker 3:   8%|▊         | 1/12 [00:27<05:01, 27.40s/it][A[A[A

Worker 2:  18%|█▊        | 2/11 [00:28<02:18, 15.38s/it][A[A

Worker 0:  45%|████▌     | 5/11 [00:43<00:55,  9.31s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Worker 0:  55%|█████▍    | 6/11 [00:49<00:39,  7.97s/it][A[AYour max_length is set to 60, but your input_length is only 5. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)
Worker 0:  82%|████████▏ | 9/11 [00:58<00:09,  4.78s/it]
Worker 1:  18%|█▊        | 2/11 [00:59<04:32, 30.26s/it][AYour max_length is set to 60, but your input_length is only 46. Since this is a summarization t


Completed in 227.71 seconds





Worker 3: 100%|██████████| 12/12 [03:43<00:00, 18.61s/it][A[A[A


Unnamed: 0,steam_appid,Theme,QuickSummary
0,10,community,Counter-Strike 1.6 was a significant part of ...
1,10,anti_cheat,Counter-Strike is a first-person shooter vide...
2,10,performance,"The game crashes constantly, there is no impl..."
3,10,competitive,"CounterStrike 1.6 is like a time travel, just..."
4,10,gameplay,Counter Strike is a first-person shooter game...


In [10]:
# Make sure the output folder exists first: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs('output_csvs', exist_ok=True)
final_report.to_csv('output_csvs/summarized_report.csv')