In [None]:
# # Cell 9 (DASK DISTRIBUTED - FINAL WITH PROGRESS): GPU-optimized hierarchical summarization with Dask

# import pandas as pd
# import numpy as np
# import torch
# import dask
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster
# from tqdm.auto import tqdm
# import time

# # Start a local Dask cluster
# n_workers = 4  # Adjust based on your CPU core count
# cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1)
# client = Client(cluster)
# print(f"Dask dashboard available at: {client.dashboard_link}")

# # Define model parameters 
# MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# MAX_GPU_BATCH_SIZE = 64  # Large batch size for RTX 4080 Super

# @dask.delayed
# def process_partition(partition_df, worker_id):
#     """Process a partition of the data on a worker"""
#     # Import packages needed in the worker
#     from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
#     import torch
#     from tqdm.auto import tqdm
    
#     # Load tokenizer first
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
#     # Load model with device_map="auto" only
#     model = AutoModelForSeq2SeqLM.from_pretrained(
#         MODEL_NAME,
#         torch_dtype=torch.float16,
#         device_map="auto"  # This will handle device placement automatically
#     )
    
#     # Create pipeline with both model AND tokenizer
#     summarizer = pipeline(
#         task='summarization',
#         model=model,
#         tokenizer=tokenizer,
#         framework='pt',
#         model_kwargs={"use_cache": True}
#     )
    
#     # Report worker GPU status
#     gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
#     print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")
    
#     # Define the hierarchical summary function within the worker
#     def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#         # If there are at most chunk_size reviews, just do one summary
#         if len(reviews) <= chunk_size:
#             doc = "\n\n".join(reviews)
#             return summarizer(
#                 doc,
#                 max_length=max_len,
#                 min_length=min_len,
#                 truncation=True,
#                 do_sample=False
#             )[0]['summary_text']
        
#         # Prepare all chunks for processing
#         all_chunks = []
#         for i in range(0, len(reviews), chunk_size):
#             batch = reviews[i:i+chunk_size]
#             text = "\n\n".join(batch)
#             all_chunks.append(text)
        
#         # Process in large batches to utilize GPU
#         summaries = []
#         for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
#             batch = all_chunks[i:i+MAX_GPU_BATCH_SIZE]
#             batch_summaries = summarizer(
#                 batch,
#                 max_length=max_len,
#                 min_length=min_len,
#                 truncation=True,
#                 do_sample=False
#             )
#             summaries.extend([s['summary_text'] for s in batch_summaries])
        
#         # Summarize the intermediate summaries
#         joined = " ".join(summaries)
#         return summarizer(
#             joined,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # Process the partition with a progress bar
#     results = []
#     # Create a progress bar for this worker
#     with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
#         for idx, row in partition_df.iterrows():
#             summary = hierarchical_summary(row['Reviews'], chunk_size=200, max_len=60, min_len=20)
#             results.append((idx, summary))
#             pbar.update(1)
            
#             # Clean up every few iterations
#             if len(results) % 5 == 0:
#                 torch.cuda.empty_cache()
    
#     # Clean up at the end
#     torch.cuda.empty_cache()
#     del model
#     del summarizer
    
#     # Return the results for this partition
#     return results

# # Convert pandas DataFrame to Dask DataFrame
# dask_df = dd.from_pandas(final_report, npartitions=n_workers)

# # Set up manual progress tracking
# print(f"Processing {len(final_report)} rows across {n_workers} partitions...")

# # Simple approach to split the dataframe
# partition_size = len(final_report) // n_workers
# delayed_results = []

# # Process each partition separately
# print(f"Scheduling {n_workers} partitions for processing...")
# for i in range(n_workers):
#     # Get start and end index for this partition
#     start_idx = i * partition_size
#     end_idx = (i + 1) * partition_size if i < n_workers - 1 else len(final_report)
    
#     # Get this partition as a pandas DataFrame
#     partition_df = final_report.iloc[start_idx:end_idx].copy()
    
#     # Create a delayed task to process this partition
#     delayed_result = process_partition(partition_df, i)
#     delayed_results.append(delayed_result)
#     print(f"Scheduled partition {i+1}/{n_workers} with {len(partition_df)} rows")

# # Create a main progress bar for overall progress
# print("\nStarting distributed computation with progress tracking:")
# main_progress = tqdm(total=len(final_report), desc="Overall Progress")

# # Start timing
# start_time = time.time()

# # Create a global progress updater
# def update_main_progress(future):
#     # Update main progress bar based on worker progress
#     # This function will be called repeatedly to update the main progress bar
#     completed_tasks = sum(future.status == "finished" for future in client.futures.values())
#     main_progress.n = min(len(final_report), completed_tasks * (len(final_report) // len(delayed_results)))
#     main_progress.refresh()

# # Submit the tasks to the cluster
# futures = client.compute(delayed_results)

# # Start a loop to update the main progress bar
# import threading
# stop_flag = False

# def progress_monitor():
#     while not stop_flag:
#         update_main_progress(futures)
#         time.sleep(0.5)

# # Start the progress monitor in a separate thread
# monitor_thread = threading.Thread(target=progress_monitor)
# monitor_thread.start()

# # Wait for computation to complete
# results = dask.compute(*delayed_results)

# # Stop the progress monitor
# stop_flag = True
# monitor_thread.join()

# # Update progress bar to completion
# main_progress.n = len(final_report)
# main_progress.refresh()
# main_progress.close()

# # Flatten the nested list of results
# all_results = []
# for worker_results in results:
#     all_results.extend(worker_results)

# # Sort by index
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # Store results in a new column
# final_report['QuickSummary'] = summaries

# # Report final timing
# elapsed_time = time.time() - start_time
# print(f"\nCompleted in {elapsed_time:.2f} seconds")

# # Display results
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # Shut down the client and cluster
# client.close()
# cluster.close()

In [None]:
# # Cell 9 (ULTRA OPTIMIZED - FIXED): Maximum GPU utilization for RTX 4080 Super

# import pandas as pd
# import numpy as np
# from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
# from tqdm.auto import tqdm
# import torch
# import gc

# # Force CUDA initialization and check memory
# torch.cuda.init()
# total_gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
# print(f"Total GPU memory: {total_gpu_mem:.2f} GB")

# # Ultra-aggressive GPU optimization parameters for RTX 4080 Super
# MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# MAX_GPU_BATCH_SIZE = 64  # Much larger batch size to fully utilize VRAM
# MAX_SEQUENCE_LENGTH = 1024  # Set maximum context length to optimize memory usage

# # Load model and tokenizer directly for maximum control
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForSeq2SeqLM.from_pretrained(
#     MODEL_NAME, 
#     torch_dtype=torch.float16,  # Half-precision for maximum throughput
#     device_map="auto"           # Automatically map to available GPU
# )

# # Move model to GPU and optimize for inference
# model.to("cuda")
# model.eval()  # Set to evaluation mode

# # Create a custom pipeline with maximum batch efficiency
# summarizer = pipeline(
#     task='summarization',
#     model=model,
#     tokenizer=tokenizer,
#     framework='pt',
#     # Force maximum GPU memory usage
#     model_kwargs={"use_cache": True}
# )

# # Monitor GPU memory usage
# def gpu_memory_usage():
#     """Return GPU memory usage in GB"""
#     reserved = torch.cuda.memory_reserved(0) / (1024**3)
#     allocated = torch.cuda.memory_allocated(0) / (1024**3)
#     print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
#     return allocated, reserved

# def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#     """
#     Ultra-optimized hierarchical summarization for maximum GPU utilization
#     """
#     # If there are at most chunk_size reviews, just do one summary
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for processing with ultra-large batches
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process in maximally large batches to saturate GPU
#     # This is the key optimization - use much larger batches to fill VRAM
#     summaries = []
#     for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
#         batch = all_chunks[i:i+MAX_GPU_BATCH_SIZE]
        
#         # Log memory usage before batch
#         print(f"Processing batch of size {len(batch)} ({i}/{len(all_chunks)})")
#         gpu_memory_usage()
        
#         # Process maximum-sized batch
#         batch_summaries = summarizer(
#             batch,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )
#         summaries.extend([s['summary_text'] for s in batch_summaries])
        
#         # Log memory after batch
#         gpu_memory_usage()
    
#     # 3) Summarize the intermediate summaries in a single batch
#     joined = " ".join(summaries)
    
#     # Final summary
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # Pre-process all reviews to maximize throughput - FIXED THIS LINE
# print("Preparing all reviews for processing...")
# all_rows = [(i, row['Reviews']) for i, (_, row) in enumerate(final_report.iterrows())]

# # Process the entire dataset in sequential maximum-sized batches
# # This approach ensures GPU is fully saturated
# all_results = []
# with tqdm(total=len(final_report), desc="Ultra GPU Optimization") as pbar:
#     # Process each row with maximum batch efficiency
#     for i in range(0, len(all_rows), 10):  # Process in batches of 10 rows
#         batch_rows = all_rows[i:i+10]
#         batch_results = []
        
#         for batch_idx, (row_idx, reviews) in enumerate(batch_rows):
#             # Force garbage collection before large operations
#             if batch_idx % 5 == 0:
#                 torch.cuda.empty_cache()
#                 gc.collect()
            
#             # Process with maximum GPU utilization
#             summary = hierarchical_summary(
#                 reviews, 
#                 chunk_size=200, 
#                 max_len=60, 
#                 min_len=20
#             )
#             batch_results.append((row_idx, summary))
#             pbar.update(1)
        
#         all_results.extend(batch_results)
#         # Force GPU memory cleanup between large batches
#         torch.cuda.empty_cache()
#         gc.collect()

# # Sort and store results
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # Store results in a new column
# final_report['QuickSummary'] = summaries

# # Inspect results
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # Final cleanup
# torch.cuda.empty_cache()

In [None]:
# # Cell 9 (OPTIMIZED - FASTEST SO FAR): GPU-optimized hierarchical summarization for RTX 4080 Super

# import pandas as pd
# import numpy as np
# from transformers import pipeline, AutoTokenizer
# from tqdm.auto import tqdm
# import torch
# import concurrent.futures
# import multiprocessing

# # Check GPU memory and set optimal batch sizes
# gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
# print(f"Available GPU memory: {gpu_mem:.2f} GB")

# # RTX 4080 Super optimization parameters
# # With 16GB VRAM, we can use larger batch sizes and optimize throughput
# MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# MAX_GPU_BATCH_SIZE = 32  # Larger batch size for 16GB VRAM
# PARALLEL_PROCESSES = 4   # Optimal number for balancing CPU and GPU workloads

# # 1) Initialize tokenizer to estimate token counts for optimal batching
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # 2) Initialize the summarizer pipeline with optimized settings for RTX 4080 Super
# summarizer = pipeline(
#     task='summarization',
#     model=MODEL_NAME,
#     device=0,
#     framework='pt',
#     # Optimized settings for higher throughput
#     model_kwargs={
#         "use_cache": True,  # Enable KV caching for faster inference
#     },
#     # Enable half-precision for faster processing and lower memory usage
#     torch_dtype=torch.float16
# )

# def estimate_tokens(text):
#     """Estimate token count to optimize batching"""
#     return len(tokenizer.encode(text))

# def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#     """
#     GPU-optimized hierarchical summarization with dynamic batching
#     """
#     # If there are at most chunk_size reviews, just do one summary
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for processing
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process chunks in fixed batches of MAX_GPU_BATCH_SIZE (token counts are not consulted here)
#     # For RTX 4080 Super with 16GB, we can process larger batches
#     summaries = []
#     for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
#         batch = all_chunks[i:i+MAX_GPU_BATCH_SIZE]
#         batch_summaries = summarizer(
#             batch,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )
#         summaries.extend([s['summary_text'] for s in batch_summaries])
    
#     # 3) Summarize the intermediate summaries
#     # RTX 4080 Super can handle the full set of intermediate summaries
#     joined = " ".join(summaries)
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # 4) Function to process each batch of rows with GPU optimization
# def process_gpu_batch(batch_df):
#     results = []
#     # Pre-collect all reviews to optimize memory transfers to GPU
#     all_rows = [(row.name, row['Reviews']) for _, row in batch_df.iterrows()]
    
#     for idx, reviews in all_rows:
#         # Use optimized hierarchical summary function
#         summary = hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20)
#         results.append((idx, summary))
        
#         # Optional: Force CUDA cache clearing every few iterations to prevent memory fragmentation
#         if idx % 10 == 0:
#             torch.cuda.empty_cache()
            
#     return results

# # Calculate optimal processing strategy based on dataset size
# total_rows = len(final_report)
# # Determine batch size for parallel processing
# optimal_batch_size = max(1, total_rows // PARALLEL_PROCESSES)

# # Split dataframe into optimized batches
# batches = [final_report.iloc[i:i+optimal_batch_size] for i in range(0, total_rows, optimal_batch_size)]

# # Process with concurrent.futures and progress tracking
# all_results = []
# with tqdm(total=total_rows, desc="GPU Summarizing (RTX 4080 Super)") as pbar:
#     # Use ThreadPoolExecutor to manage parallel GPU tasks
#     with concurrent.futures.ThreadPoolExecutor(max_workers=PARALLEL_PROCESSES) as executor:
#         future_to_batch = {executor.submit(process_gpu_batch, batch): batch for batch in batches}
        
#         for future in concurrent.futures.as_completed(future_to_batch):
#             try:
#                 batch_results = future.result()
#                 all_results.extend(batch_results)
#                 batch_size = len(future_to_batch[future])
#                 pbar.update(batch_size)
#             except Exception as e:
#                 print(f"Error processing batch: {e}")
#                 # Continue with remaining batches

# # Sort results by the original index
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # 5) Store results in a new column
# final_report['QuickSummary'] = summaries

# # 6) Inspect results
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # 7) Clean up GPU memory
# torch.cuda.empty_cache()

In [None]:
# # Cell 9 (BIG DATA - BUT WITH PYTHON-FUTURES): Hierarchical summarization of all reviews per theme using parallel processing

# import pandas as pd
# import numpy as np
# from transformers import pipeline
# from tqdm.auto import tqdm
# import concurrent.futures
# import multiprocessing

# # Set up the number of workers based on CPU cores (with one less to avoid overloading)
# num_workers = max(1, multiprocessing.cpu_count() - 1)

# # 1) Initialize the summarizer pipeline globally
# # This avoids serialization issues
# summarizer = pipeline(
#     task='summarization',
#     model='sshleifer/distilbart-cnn-12-6',
#     framework='pt'
# )

# def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#     """
#     Summarize a long list of reviews into one short summary using parallel processing for chunks
#     """
#     # If there are at most chunk_size reviews, just do one summary
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for processing
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process chunks in a batch to maximize GPU usage
#     intermediate_summaries = summarizer(
#         all_chunks,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )
    
#     # Extract summary texts
#     intermediate = [summary['summary_text'] for summary in intermediate_summaries]
    
#     # 3) Summarize the intermediate summaries
#     joined = " ".join(intermediate)
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # 4) Function to process each batch of rows in parallel
# def process_batch(batch_df):
#     results = []
#     for _, row in batch_df.iterrows():
#         summary = hierarchical_summary(row['Reviews'], chunk_size=200, max_len=60, min_len=20)
#         results.append((row.name, summary))
#     return results

# # Split the dataframe into batches for parallel processing
# def split_dataframe(df, batch_size):
#     batches = []
#     for i in range(0, len(df), batch_size):
#         batches.append(df.iloc[i:i+batch_size])
#     return batches

# # Calculate optimal batch size based on dataset size and worker count
# batch_size = max(1, len(final_report) // num_workers)
# batches = split_dataframe(final_report, batch_size)

# # Use ThreadPoolExecutor for parallel processing with progress bar
# all_results = []
# with tqdm(total=len(final_report), desc="Summarizing themes") as pbar:
#     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
#         # Submit all batches to the executor
#         future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}
        
#         # Process completed batches and update progress
#         for future in concurrent.futures.as_completed(future_to_batch):
#             batch_results = future.result()
#             all_results.extend(batch_results)
#             # Update progress bar by the number of rows processed in this batch
#             pbar.update(len(future_to_batch[future]))

# # Sort results by the original index and extract summaries
# all_results.sort(key=lambda x: x[0])  # Sort by index
# summaries = [result[1] for result in all_results]

# # 5) Store results in a new column
# final_report['QuickSummary'] = summaries

# # 6) Inspect
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

In [None]:
# # Cell 9 (FIXED - FIRST): Hierarchical summarization of all reviews per theme

# from transformers import pipeline
# from tqdm.auto import tqdm

# # 1) Initialize a single summarizer pipeline
# summarizer = pipeline(
#     task='summarization',
#     model='sshleifer/distilbart-cnn-12-6',
#     device=0,              # change to -1 if no GPU
#     framework='pt'
# )

# def hierarchical_summary(reviews, chunk_size=200,
#                          max_len=60, min_len=20):
#     """
#     Summarize a long list of reviews into one short summary:
#       1) Chunk the reviews into batches of chunk_size
#       2) Summarize each batch
#       3) Summarize the concatenation of batch summaries
    
#     Params:
#       reviews    : list of str, the reviews to summarize
#       chunk_size : int, number of reviews per intermediate chunk
#       max_len    : int, max summary tokens per call
#       min_len    : int, min summary tokens per call
    
#     Returns:
#       str, final "quick read" summary
#     """
#     # If there are at most chunk_size reviews, just do one summary
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for batch processing
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process all chunks in one batch
#     intermediate_summaries = summarizer(
#         all_chunks,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )
    
#     # Extract summary texts
#     intermediate = [summary['summary_text'] for summary in intermediate_summaries]
    
#     # 3) Summarize the intermediate summaries
#     joined = " ".join(intermediate)
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # 4) Apply to each row of final_report with progress bar
# quick_summaries = []
# for _, row in tqdm(final_report.iterrows(),
#                   total=len(final_report),
#                   desc="Summarizing themes"):
#     revs = row['Reviews']
#     quick = hierarchical_summary(revs,
#                                  chunk_size=200,
#                                  max_len=60,
#                                  min_len=20)
#     quick_summaries.append(quick)

# # 5) Store results in a new column
# final_report['QuickSummary'] = quick_summaries

# # 6) Inspect
# display(final_report[['steam_appid','Theme','QuickSummary']].head())

# Tuned for my hardware: 1 min 50 s inference time

In [None]:
# # Cell 9: Hardware-optimized GPU summarization with Dask - Tuned for Ryzen 9700X & RTX 4080 Super

# import pandas as pd
# import numpy as np
# import torch
# import dask
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster
# from tqdm.auto import tqdm
# import time
# import os
# import threading

# # Create checkpoint directory if it doesn't exist (minimal overhead)
# os.makedirs('checkpoints', exist_ok=True)

# # Optimized configuration for your specific hardware
# # RTX 4080 Super (12GB usable VRAM) + Ryzen 9700X + 20GB usable RAM
# HARDWARE_CONFIG = {
#     'worker_count': 6,                # Optimal for Ryzen 9700X
#     'memory_per_worker': '3GB',       # 18GB total for workers, leaving headroom
#     'gpu_batch_size': 96,             # Aggressive batch size for RTX 4080 Super
#     'model_name': 'sshleifer/distilbart-cnn-12-6',  # Best model for your GPU
#     'chunk_size': 400,                # Larger chunks for faster processing
#     'checkpoint_frequency': 25,       # Less frequent checkpoints for speed
#     'cleanup_frequency': 10,          # Less frequent memory cleanup
# }

# print(f"Starting optimized Dask cluster for Ryzen 9700X + RTX 4080 Super configuration")
# cluster = LocalCluster(
#     n_workers=HARDWARE_CONFIG['worker_count'], 
#     threads_per_worker=2,
#     memory_limit=HARDWARE_CONFIG['memory_per_worker']
# )
# client = Client(cluster)
# print(f"Dask dashboard available at: {client.dashboard_link}")

# # Determine optimal partition sizes - larger for better throughput
# @dask.delayed
# def prepare_partition(start_idx, end_idx):
#     """Prepare a partition optimized for high-end hardware"""
#     return final_report.iloc[start_idx:end_idx].copy()

# # Create larger partitions for better throughput
# n_workers = HARDWARE_CONFIG['worker_count']
# partition_size = len(final_report) // n_workers
# partitions = []
# for i in range(n_workers):
#     start_idx = i * partition_size
#     end_idx = (i + 1) * partition_size if i < n_workers - 1 else len(final_report)
#     partitions.append(prepare_partition(start_idx, end_idx))
#     print(f"Prepared partition {i+1} with {end_idx-start_idx} items")

# # Optimized worker function with aggressive resource usage
# @dask.delayed
# def process_partition(partition_df, worker_id):
#     """Optimized worker for RTX 4080 Super"""
#     # Import needed packages
#     from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
#     import torch
    
#     # Load model components with optimal settings for RTX 4080 Super
#     print(f"Worker {worker_id} initializing with optimized settings for RTX 4080 Super")
    
#     # Load tokenizer
#     tokenizer = AutoTokenizer.from_pretrained(HARDWARE_CONFIG['model_name'])
    
#     # Load model with optimized settings for RTX 4080 Super
#     model = AutoModelForSeq2SeqLM.from_pretrained(
#         HARDWARE_CONFIG['model_name'],
#         torch_dtype=torch.float16,        # Half precision for speed
#         device_map="auto",                # Automatic device placement
#         low_cpu_mem_usage=True            # Optimized memory usage
#     )
    
#     # Create optimized pipeline
#     summarizer = pipeline(
#         task='summarization',
#         model=model,
#         tokenizer=tokenizer,
#         framework='pt',
#         model_kwargs={
#             "use_cache": True,            # Enable caching for speed
#             "return_dict_in_generate": True  # More efficient generation
#         }
#     )
    
#     # Report GPU status
#     gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
#     print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")
    
#     # Highly optimized batch processing function
#     def process_chunks_batched(chunks):
#         """Process chunks in large batches for RTX 4080 Super"""
#         all_summaries = []
        
#         # Use large batches for the RTX 4080 Super
#         for i in range(0, len(chunks), HARDWARE_CONFIG['gpu_batch_size']):
#             batch = chunks[i:i+HARDWARE_CONFIG['gpu_batch_size']]
#             batch_summaries = summarizer(
#                 batch,
#                 max_length=60,
#                 min_length=20,
#                 truncation=True,
#                 do_sample=False,
#                 num_beams=2  # Use beam search for better quality with minimal speed impact
#             )
#             all_summaries.extend([s["summary_text"] for s in batch_summaries])
            
#             # Minimal cleanup - only when really needed
#             if i % (HARDWARE_CONFIG['gpu_batch_size'] * 3) == 0 and torch.cuda.is_available():
#                 torch.cuda.empty_cache()
                    
#         return all_summaries
    
#     # Optimized hierarchical summary function
#     def hierarchical_summary(reviews):
#         """Create hierarchical summary with optimized chunk sizes"""
#         # Handle edge cases efficiently
#         if not reviews or not isinstance(reviews, list):
#             return "No reviews available for summarization."
        
#         # Fast path for small review sets
#         if len(reviews) <= HARDWARE_CONFIG['chunk_size']:
#             doc = "\n\n".join(reviews)
#             return summarizer(
#                 doc,
#                 max_length=60,
#                 min_length=20,
#                 truncation=True,
#                 do_sample=False
#             )[0]['summary_text']
        
#         # Process larger review sets with optimized chunking
#         all_chunks = []
#         for i in range(0, len(reviews), HARDWARE_CONFIG['chunk_size']):
#             batch = reviews[i:i+HARDWARE_CONFIG['chunk_size']]
#             text = "\n\n".join(batch)
#             all_chunks.append(text)
        
#         # Process chunks with optimized batching
#         intermediate_summaries = process_chunks_batched(all_chunks)
        
#         # Create final summary
#         joined = " ".join(intermediate_summaries)
#         return summarizer(
#             joined,
#             max_length=60,
#             min_length=20,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # Process the partition with minimal overhead
#     results = []
    
#     # Use tqdm for progress tracking
#     with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
#         for idx, row in partition_df.iterrows():
#             # Process the review
#             summary = hierarchical_summary(row['Reviews'])
#             results.append((idx, summary))
            
#             # Minimal cleanup - only every N iterations
#             if len(results) % HARDWARE_CONFIG['cleanup_frequency'] == 0:
#                 torch.cuda.empty_cache()
                
#             # Update progress bar
#             pbar.update(1)
    
#     # Final cleanup
#     torch.cuda.empty_cache()
    
#     print(f"Worker {worker_id} completed successfully")
#     return results

# # Schedule tasks
# print(f"Scheduling {n_workers} optimized partitions...")
# delayed_results = []
# for i in range(n_workers):
#     delayed_result = process_partition(partitions[i], i)
#     delayed_results.append(delayed_result)

# # Streamlined progress tracking
# print("\nStarting optimized computation...")
# main_progress = tqdm(total=len(final_report), desc="Overall Progress")

# # Start timing
# start_time = time.time()

# # Progress monitor - polls future status and refreshes the overall progress bar
# def update_main_progress(futures):
#     while not stop_flag:
#         # Count completed futures
#         completed_count = sum(f.status == 'finished' for f in futures)
#         completed_percentage = completed_count / len(futures)
        
#         # Update progress bar
#         main_progress.n = int(len(final_report) * completed_percentage)
#         main_progress.refresh()
        
#         # Only check every 5 seconds to reduce overhead
#         time.sleep(5)

# # Submit tasks to cluster
# futures = client.compute(delayed_results)

# # Start progress monitor with minimal overhead
# stop_flag = False
# monitor_thread = threading.Thread(target=update_main_progress, args=(futures,))
# monitor_thread.daemon = True
# monitor_thread.start()

# # Wait for computation
# try:
#     print("Computing with optimal settings for RTX 4080 Super...")
#     results = client.gather(futures)
# except Exception as e:
#     print(f"Error with futures: {e}")
#     print("Falling back to direct computation...")
#     results = dask.compute(*delayed_results)

# # Stop progress monitor
# stop_flag = True
# monitor_thread.join(timeout=3)

# # Update progress to completion
# main_progress.n = len(final_report)
# main_progress.refresh()
# main_progress.close()

# # Process results efficiently
# all_results = []
# for worker_results in results:
#     all_results.extend(worker_results)

# # Sort results
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # Store results
# final_report['QuickSummary'] = summaries

# # Report timing
# elapsed_time = time.time() - start_time
# print(f"\nOptimized processing completed in {elapsed_time:.2f} seconds")
# print(f"Average time per item: {elapsed_time/len(final_report):.2f} seconds")

# # Display results
# print("\nResults sample:")
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # Save results
# final_report.to_csv('output_csvs/optimized_hardware_report.csv')
# print("Results saved to output_csvs/optimized_hardware_report.csv")

# # Clean up
# client.close()
# cluster.close()

# Dynamic allocation: takes twice as long, but can run on most systems because it reads the available resources.

In [None]:
# Cell 9: Dynamically optimized GPU hierarchical summarization with Dask

import pandas as pd
import numpy as np
import torch
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from tqdm.auto import tqdm
import time
import os
import psutil
import json
import threading

# 1. Dynamic resource allocation based on system capabilities
def get_system_resources():
    """Probe RAM, CPU and GPU capacity and derive worker/chunk settings.

    Prints a short summary of what was detected.

    Returns:
        dict with the keys 'worker_count', 'memory_per_worker' (GB),
        'chunk_size', 'gpu_available', 'gpu_count' and 'gpu_memory'
        (list of per-device total memory in GB).
    """
    bytes_per_gb = 1024**3

    # Host memory, converted to GB
    total_memory = psutil.virtual_memory().total / bytes_per_gb
    available_memory = psutil.virtual_memory().available / bytes_per_gb

    # Physical core count when psutil reports one, logical count otherwise
    cpu_count = psutil.cpu_count(logical=False)
    if not cpu_count:
        cpu_count = psutil.cpu_count(logical=True)

    # GPU inventory: device count and total memory per device
    gpu_available = torch.cuda.is_available()
    gpu_count = 0
    gpu_memory = []
    if gpu_available:
        gpu_count = torch.cuda.device_count()
        for device in range(gpu_count):
            device_props = torch.cuda.get_device_properties(device)
            gpu_memory.append(device_props.total_memory / bytes_per_gb)

    # Worker count: CPU-only runs use all cores but one; GPU runs are capped
    # at half the cores and at one more worker than there are GPUs
    if not gpu_available:
        worker_count = max(1, cpu_count - 1)
    else:
        half_cores = max(1, cpu_count // 2)
        worker_count = min(half_cores, gpu_count + 1)

    # Budget 70% of the currently available RAM, split evenly across workers
    memory_per_worker = (available_memory * 0.7) / worker_count

    # Chunk size tiers keyed on the per-worker memory budget (GB)
    for floor_gb, tier_size in ((8, 300), (4, 200)):
        if memory_per_worker > floor_gb:
            chunk_size = tier_size
            break
    else:
        chunk_size = 100

    print(f"System resources: {total_memory:.1f}GB total RAM, {available_memory:.1f}GB available")
    print(f"CPU cores: {cpu_count}, GPU count: {gpu_count}")
    for i, mem in enumerate(gpu_memory):
        print(f"GPU {i}: {mem:.1f}GB memory")

    return dict(
        worker_count=worker_count,
        memory_per_worker=memory_per_worker,
        chunk_size=chunk_size,
        gpu_available=gpu_available,
        gpu_count=gpu_count,
        gpu_memory=gpu_memory,
    )

# Get system resources (module-level dict; also read by select_model and
# process_partition further down)
resources = get_system_resources()

# Create checkpoint directory if it doesn't exist
# (the checkpoint JSON is read and written under 'checkpoints/')
os.makedirs('checkpoints', exist_ok=True)

# Start a local Dask cluster with dynamic resources:
# worker count and per-worker memory limit both come from get_system_resources()
n_workers = resources['worker_count']
print(f"Starting Dask cluster with {n_workers} workers, {resources['memory_per_worker']:.1f}GB per worker")
cluster = LocalCluster(
    n_workers=n_workers, 
    threads_per_worker=2,
    memory_limit=f"{resources['memory_per_worker']:.1f}GB"
)
# The client is used later to submit the delayed tasks and gather their results
client = Client(cluster)
print(f"Dask dashboard available at: {client.dashboard_link}")

# 2. Determine model based on available resources
def select_model(system_resources=None):
    """Select a summarization checkpoint sized to the available hardware.

    The three tiers are ordered by model size so that weaker hardware always
    gets a smaller model, and every tier is a checkpoint fine-tuned for
    CNN/DailyMail summarization (a plain pretrained BART is not).

    Args:
        system_resources: Optional dict with the keys 'gpu_available' (bool)
            and 'gpu_memory' (list of per-GPU memory in GB). Defaults to the
            module-level ``resources`` dict.

    Returns:
        The Hugging Face model identifier to load.
    """
    info = resources if system_resources is None else system_resources

    if not info['gpu_available']:
        # CPU-only: smallest distilled summarization checkpoint
        return 'sshleifer/distilbart-cnn-6-6'
    if any(mem > 8 for mem in info['gpu_memory']):
        # High-end GPU (>8GB on at least one device): full-size model
        return 'facebook/bart-large-cnn'
    # Lower-end GPU: distilled model with a smaller footprint
    return 'sshleifer/distilbart-cnn-12-6'

# Select model based on resources
# (MODEL_NAME is read inside process_partition when it loads the tokenizer and model)
MODEL_NAME = select_model()
print(f"Selected model: {MODEL_NAME}")

# 3. First, load the data and check for existing checkpoints
def load_with_checkpoint():
    """Return the rows still to be summarized plus any saved summaries.

    Reads 'checkpoints/summarization_progress.json' when it exists. Its keys
    are converted to int and matched against ``final_report.index``; rows
    whose index appears there are left out of the returned frame.

    Returns:
        (DataFrame, dict): the rows to process and the checkpoint contents
        (an empty dict, and ``final_report`` itself, when no checkpoint file
        is present).
    """
    checkpoint_file = 'checkpoints/summarization_progress.json'

    # Nothing saved yet: the whole report still needs processing
    if not os.path.exists(checkpoint_file):
        print("No checkpoint found, processing all items")
        return final_report, {}

    with open(checkpoint_file, 'r') as handle:
        checkpoint = json.load(handle)
    print(f"Found checkpoint with {len(checkpoint)} completed summaries")

    # JSON object keys are strings; convert them back before matching the index
    finished_labels = [int(key) for key in checkpoint]
    already_done = final_report.index.isin(finished_labels)
    remaining_df = final_report[~already_done].copy()

    print(f"Resuming processing for {len(remaining_df)} remaining items")
    return remaining_df, checkpoint

# Load data with checkpoint support:
# df_to_process holds the rows still to summarize, existing_summaries the
# checkpoint contents ({} when there is no checkpoint file)
df_to_process, existing_summaries = load_with_checkpoint()

# 4. Prepare partitions with optimized distribution
@dask.delayed
def prepare_partition(start_idx, end_idx, df):
    """Return an independent copy of the rows at positions [start_idx, end_idx) of df."""
    # Positional slice first, then copy so the result does not alias df
    selected_rows = df.iloc[start_idx:end_idx]
    return selected_rows.copy()

# Split the pending rows into n_workers contiguous slices (one delayed task each)
partition_size = len(df_to_process) // n_workers
partitions = []
for i in range(n_workers):
    start_idx = partition_size * i
    # The last slice runs to the end so the division remainder is not dropped
    end_idx = len(df_to_process) if i == n_workers - 1 else start_idx + partition_size
    partitions.append(prepare_partition(start_idx, end_idx, df_to_process))

# 5. Worker processing function with dynamic GPU batch sizing
@dask.delayed
def process_partition(partition_df, worker_id):
    """Summarize the 'Reviews' cell of every row in one partition.

    Loads the tokenizer and model named by the module-level ``MODEL_NAME``
    (half precision with ``device_map="auto"`` when CUDA is available, full
    precision otherwise), then produces one summary per row with a two-level
    scheme: reviews are grouped into chunks, each chunk is summarized, and the
    chunk summaries are summarized again.

    Args:
        partition_df: DataFrame whose rows carry a 'Reviews' cell. A cell that
            is empty or not a list yields a fixed placeholder summary.
        worker_id: Integer label used in log lines and as the tqdm bar position.

    Returns:
        list of ``(index_label, summary_text)`` tuples, one per attempted row.
        Rows whose processing raised are recorded with an error placeholder.
        An empty list is returned when the model cannot be loaded at all, and
        the list is cut short when the first rows all fail (see below).
    """
    # Import packages needed in the worker
    from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
    import torch
    import gc

    # Number of leading rows that must all raise before the worker gives up
    max_initial_failures = 5

    def determine_gpu_batch_size():
        """Pick a batch size from the total memory of CUDA device 0."""
        if not torch.cuda.is_available():
            return 8  # Conservative default for CPU

        try:
            # Total memory of device 0, in GB
            total_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)
            # Reserve 10% for system processes and overhead
            usable_mem = total_mem * 0.9

            # Scale batch size based on usable GPU memory
            if usable_mem > 16:
                return 64
            elif usable_mem > 8:
                return 32
            elif usable_mem > 4:
                return 16
            else:
                return 8
        except Exception as e:
            print(f"Error determining GPU batch size: {e}")
            return 8  # Conservative fallback

    # Worker initialization with error handling
    try:
        # Load tokenizer first
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        # Configure device placement based on available resources
        if torch.cuda.is_available():
            device_map = "auto"
            dtype = torch.float16  # Use half precision with GPU
        else:
            device_map = None
            dtype = torch.float32  # Use full precision with CPU

        # Load model with appropriate configuration
        model = AutoModelForSeq2SeqLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=dtype,
            device_map=device_map,
            low_cpu_mem_usage=True
        )

        # Create pipeline with model AND tokenizer
        summarizer = pipeline(
            task='summarization',
            model=model,
            tokenizer=tokenizer,
            framework='pt',
            model_kwargs={"use_cache": True}
        )

        # Report worker status
        if torch.cuda.is_available():
            gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
            print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")
            MAX_GPU_BATCH_SIZE = determine_gpu_batch_size()
            print(f"Worker {worker_id}: Using GPU batch size: {MAX_GPU_BATCH_SIZE}")
        else:
            MAX_GPU_BATCH_SIZE = 8
            print(f"Worker {worker_id}: Using CPU with batch size: {MAX_GPU_BATCH_SIZE}")
    except Exception as e:
        print(f"Worker {worker_id} initialization error: {e}")
        # Fall back to a simpler configuration (library defaults, no device map)
        try:
            print(f"Falling back to CPU-only mode for worker {worker_id}")
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
            summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)
            MAX_GPU_BATCH_SIZE = 4  # Conservative batch size for fallback mode
        except Exception as e2:
            print(f"Critical failure in worker {worker_id}: {e2}")
            return []  # Return empty results to avoid deadlock

    def process_chunks_batched(chunks):
        """Summarize chunk texts in batches of MAX_GPU_BATCH_SIZE.

        A failing batch is retried one item at a time; items that still fail
        are replaced by a placeholder so the output keeps one entry per chunk.
        """
        all_summaries = []

        for i in range(0, len(chunks), MAX_GPU_BATCH_SIZE):
            batch = chunks[i:i+MAX_GPU_BATCH_SIZE]
            try:
                batch_summaries = summarizer(
                    batch,
                    max_length=60,
                    min_length=20,
                    truncation=True,
                    do_sample=False
                )
                all_summaries.extend([s["summary_text"] for s in batch_summaries])

                # Proactively release cached GPU memory every other batch
                if i % (MAX_GPU_BATCH_SIZE * 2) == 0 and torch.cuda.is_available():
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"Error in batch {i//MAX_GPU_BATCH_SIZE} of worker {worker_id}: {e}")
                # Try smaller batch on failure
                if len(batch) > 1:
                    print("Retrying with smaller batches...")
                    for single_item in batch:
                        try:
                            summary = summarizer(
                                [single_item],
                                max_length=60,
                                min_length=20,
                                truncation=True,
                                do_sample=False
                            )
                            all_summaries.append(summary[0]["summary_text"])
                        except Exception as e2:
                            print(f"Failed to process single item: {e2}")
                            all_summaries.append("Error generating summary.")
                else:
                    all_summaries.append("Error generating summary.")

                # Clean up after errors
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                gc.collect()

        return all_summaries

    def hierarchical_summary(reviews, base_chunk_size=200):
        """Summarize a list of review strings, chunking when there are many.

        Up to ``base_chunk_size`` reviews are joined and summarized in a single
        call. Larger lists are split into chunks (chunk size adapted to the
        average review length), each chunk is summarized, and the joined chunk
        summaries are summarized once more.
        """
        # Defense against empty or invalid reviews
        if not reviews or not isinstance(reviews, list):
            return "No reviews available for summarization."

        # Few enough reviews: a single summarization call
        if len(reviews) <= base_chunk_size:
            try:
                # Join reviews with clear separation
                doc = "\n\n".join(reviews)
                return summarizer(
                    doc,
                    max_length=60,
                    min_length=20,
                    truncation=True,
                    do_sample=False
                )[0]['summary_text']
            except Exception as e:
                print(f"Error summarizing small batch: {e}")
                # Retry on the first half; keep at least one review so a
                # single-review list is not reduced to an empty document
                try:
                    half_size = max(1, len(reviews) // 2)
                    doc = "\n\n".join(reviews[:half_size])
                    return summarizer(
                        doc,
                        max_length=60,
                        min_length=20,
                        truncation=True,
                        do_sample=False
                    )[0]['summary_text']
                except Exception:
                    return "Error generating summary for this batch."

        # Adapt chunk size to review length, sampled from the first 100 reviews:
        # short reviews get larger chunks, long reviews get smaller ones
        avg_review_len = sum(len(r) for r in reviews[:100]) / min(100, len(reviews))
        if avg_review_len < 100:  # Very short reviews
            chunk_size = min(base_chunk_size * 2, 500)
        elif avg_review_len > 500:  # Very long reviews
            chunk_size = max(base_chunk_size // 2, 50)
        else:
            chunk_size = base_chunk_size

        print(f"Worker {worker_id}: Using chunk size {chunk_size} for avg review length {avg_review_len:.1f}")

        # Prepare all chunks for processing
        all_chunks = [
            "\n\n".join(reviews[i:i+chunk_size])
            for i in range(0, len(reviews), chunk_size)
        ]

        # Bound before the try block so the salvage path below can always
        # read it, even when the chunk pass itself raises
        intermediate_summaries = []
        try:
            intermediate_summaries = process_chunks_batched(all_chunks)

            # Summarize the intermediate summaries
            joined = " ".join(intermediate_summaries)
            final_summary = summarizer(
                joined,
                max_length=60,
                min_length=20,
                truncation=True,
                do_sample=False
            )[0]['summary_text']

            return final_summary
        except Exception as e:
            print(f"Error in hierarchical summarization: {e}")
            # Try to salvage what we can
            if intermediate_summaries:
                try:
                    return f"Partial summary: {' '.join(intermediate_summaries[:3])}"
                except Exception:
                    pass
            return "Error generating hierarchical summary."

    # Process the partition
    results = []
    processed_count = 0  # rows whose processing completed without raising
    failed_count = 0     # rows whose processing raised

    # Create a progress bar for this worker
    with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
        for idx, row in partition_df.iterrows():
            # Give up when the first rows all raised: nothing has succeeded
            # yet, so the remaining rows are unlikely to fare better
            if processed_count == 0 and failed_count >= max_initial_failures:
                print(f"Worker {worker_id} failing consistently, aborting")
                break

            try:
                # Process the review with the adaptive chunk size
                summary = hierarchical_summary(row['Reviews'], base_chunk_size=resources['chunk_size'])
                results.append((idx, summary))
                processed_count += 1

                # Clean up every few iterations
                if processed_count % 5 == 0:
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    gc.collect()

                # Log progress every 10 items
                if processed_count % 10 == 0:
                    print(f"Worker {worker_id}: Processed {processed_count}/{len(partition_df)} items")

            except Exception as e:
                print(f"Error processing row {idx} in worker {worker_id}: {e}")
                # Still record the error so we know this row was attempted
                results.append((idx, "Error: Failed to generate summary."))
                failed_count += 1

            pbar.update(1)

    # Final cleanup
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Report rows that completed without raising, not every attempted row
    print(f"Worker {worker_id} completed: {processed_count}/{len(partition_df)} successful")
    return results

# 6. Schedule the tasks with the delayed partitions
# (process_partition is wrapped in dask.delayed, so these calls only build
# delayed tasks; they are submitted to the cluster further down)
print(f"Scheduling {n_workers} partitions for processing...")
delayed_results = []
for i in range(n_workers):
    # The loop index doubles as the worker_id used for logging and bar position
    delayed_result = process_partition(partitions[i], i)
    delayed_results.append(delayed_result)
    print(f"Scheduled partition {i+1}/{n_workers}")

# 7. Progress tracking and checkpointing
# Create main progress bar for overall progress
# (its total is the number of rows still to process, not the full report)
print("\nStarting distributed computation with progress tracking:")
main_progress = tqdm(total=len(df_to_process), desc="Overall Progress")

# Start timing (elapsed time is reported after the results are merged)
start_time = time.time()

# Create a global progress updater with checkpointing
def update_main_progress(futures):
    """Update the overall progress bar and persist finished results.

    Runs until the module-level ``stop_flag`` becomes true. On each pass it
    sets the bar from the fraction of futures whose status is 'finished',
    merges the results of newly finished futures into the checkpoint dict
    (seeded from ``existing_summaries``) and rewrites the checkpoint file.
    One last pass runs after the stop request so results that finished just
    before it are still written.

    Args:
        futures: Sequence of futures exposing ``.status`` and ``.result()``;
            each result is an iterable of ``(index, summary)`` pairs.
    """
    checkpoint_file = 'checkpoints/summarization_progress.json'
    summaries_so_far = existing_summaries.copy()
    harvested = set()     # positions in `futures` whose results are already merged
    poll_interval = 5.0   # seconds between passes
    sleep_slice = 0.25    # granularity at which a stop request is noticed

    def harvest_and_save():
        # Positions of the futures that have completed successfully
        finished = [pos for pos, f in enumerate(futures) if f.status == 'finished']

        # Update progress bar from the fraction of finished futures
        if futures:
            completed_percentage = len(finished) / len(futures)
            main_progress.n = int(len(df_to_process) * completed_percentage)
            main_progress.refresh()

        # Fetch each finished future's result only once
        for pos in finished:
            if pos in harvested:
                continue
            try:
                for idx, summary in futures[pos].result():
                    summaries_so_far[str(idx)] = summary
            except Exception as e:
                # Leave it unharvested so the next pass tries again
                print(f"Could not read result of partition {pos}: {e}")
                continue
            harvested.add(pos)

        # Write to a temporary file and swap it in, so a reader (or an
        # interrupted run) never sees a half-written checkpoint
        temp_file = checkpoint_file + '.tmp'
        try:
            with open(temp_file, 'w') as f:
                json.dump(summaries_so_far, f)
            os.replace(temp_file, checkpoint_file)
        except OSError as e:
            print(f"Could not write checkpoint: {e}")

    while not stop_flag:
        harvest_and_save()

        # Wait for the next pass in short slices so that a stop request
        # does not have to sit out the whole interval
        waited = 0.0
        while waited < poll_interval and not stop_flag:
            time.sleep(sleep_slice)
            waited += sleep_slice

    # Final pass: capture anything that finished since the last save
    harvest_and_save()

# Hand the delayed tasks over to the cluster
futures = client.compute(delayed_results)

# Flag read by update_main_progress; setting it to True ends the monitor loop
stop_flag = False

# Run the progress monitor on a daemon thread so it cannot keep the program alive
monitor_thread = threading.Thread(
    target=update_main_progress,
    args=(futures,),
    daemon=True,
)
monitor_thread.start()

# 8. Wait for computation to complete with robust error handling
try:
    try:
        print("Computing all partitions...")
        results = client.gather(futures)
    except Exception as e:
        # Fallback to direct computation if future gathering fails
        print(f"Error with futures: {e}")
        print("Falling back to direct computation...")
        results = dask.compute(*delayed_results)
finally:
    # Stop the progress monitor even when the fallback fails too; otherwise
    # the monitor thread would keep running and rewriting the checkpoint
    stop_flag = True
    monitor_thread.join(timeout=5)  # Wait for thread to terminate, but with timeout

# Update progress bar to completion (only reached when results were obtained)
main_progress.n = len(df_to_process)
main_progress.refresh()
main_progress.close()

# 9. Process results with checkpoint recovery
all_results = []

# Gather results from all workers (a worker that failed to start returns [])
for worker_results in results:
    if worker_results:
        all_results.extend(worker_results)

# Load checkpoint file for any results we already had
checkpoint_file = 'checkpoints/summarization_progress.json'
if os.path.exists(checkpoint_file):
    with open(checkpoint_file, 'r') as f:
        checkpoint_data = json.load(f)

    # Add checkpoint data for any missing indices; fresh results take priority.
    # A set keeps each membership test O(1) instead of scanning a list.
    result_indices = {idx for idx, _ in all_results}
    for idx_str, summary in checkpoint_data.items():
        idx = int(idx_str)  # JSON keys are strings
        if idx not in result_indices:
            all_results.append((idx, summary))

# Sort by index to maintain order
all_results.sort(key=lambda x: x[0])

# Create a dictionary mapping of indices to summaries
result_dict = {idx: summary for idx, summary in all_results}

# Apply to final report; rows without any result get a placeholder
final_report['QuickSummary'] = final_report.index.map(
    lambda idx: result_dict.get(idx, "Summary not generated")
)

try:
    # Report final timing
    elapsed_time = time.time() - start_time
    print(f"\nCompleted in {elapsed_time:.2f} seconds")
    print(f"Successfully summarized {len(result_dict)}/{len(final_report)} items")

    # Display results
    print("\nSample results:")
    display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

    # 10. Save the results (create the output directory first so the write
    # cannot fail on a missing folder)
    os.makedirs('output_csvs', exist_ok=True)
    final_report.to_csv('output_csvs/dynamic_summarized_report.csv')
    print("Results saved to output_csvs/dynamic_summarized_report.csv")
finally:
    # Shut down the client and cluster even if reporting or saving raised,
    # so worker processes are not left running
    client.close()
    cluster.close()