In [None]:
# # Cell 9 (DASK DISTRIBUTED - FINAL WITH PROGRESS): GPU-optimized hierarchical summarization with Dask

# import pandas as pd
# import numpy as np
# import torch
# import dask
# import dask.dataframe as dd
# from dask.distributed import Client, LocalCluster
# from tqdm.auto import tqdm
# import time

# # Start a local Dask cluster
# n_workers = 4  # Adjust based on your CPU core count
# cluster = LocalCluster(n_workers=n_workers, threads_per_worker=1)
# client = Client(cluster)
# print(f"Dask dashboard available at: {client.dashboard_link}")

# # Define model parameters 
# MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# MAX_GPU_BATCH_SIZE = 64  # Large batch size for RTX 4080 Super

# @dask.delayed
# def process_partition(partition_df, worker_id):
#     """Process a partition of the data on a worker"""
#     # Import packages needed in the worker
#     from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
#     import torch
#     from tqdm.auto import tqdm
    
#     # Load tokenizer first
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
#     # Load model with device_map="auto" only
#     model = AutoModelForSeq2SeqLM.from_pretrained(
#         MODEL_NAME,
#         torch_dtype=torch.float16,
#         device_map="auto"  # This will handle device placement automatically
#     )
    
#     # Create pipeline with both model AND tokenizer
#     summarizer = pipeline(
#         task='summarization',
#         model=model,
#         tokenizer=tokenizer,
#         framework='pt',
#         model_kwargs={"use_cache": True}
#     )
    
#     # Report worker GPU status
#     gpu_mem = torch.cuda.memory_allocated(0) / (1024**3)
#     print(f"Worker {worker_id}: GPU Memory: {gpu_mem:.2f}GB allocated")
    
#     # Define the hierarchical summary function within the worker
#     def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#         # If there are at most chunk_size reviews, a single summarization pass is enough
#         if len(reviews) <= chunk_size:
#             doc = "\n\n".join(reviews)
#             return summarizer(
#                 doc,
#                 max_length=max_len,
#                 min_length=min_len,
#                 truncation=True,
#                 do_sample=False
#             )[0]['summary_text']
        
#         # Prepare all chunks for processing
#         all_chunks = []
#         for i in range(0, len(reviews), chunk_size):
#             batch = reviews[i:i+chunk_size]
#             text = "\n\n".join(batch)
#             all_chunks.append(text)
        
#         # Process in large batches to utilize GPU
#         summaries = []
#         for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
#             batch = all_chunks[i:i+MAX_GPU_BATCH_SIZE]
#             batch_summaries = summarizer(
#                 batch,
#                 max_length=max_len,
#                 min_length=min_len,
#                 truncation=True,
#                 do_sample=False
#             )
#             summaries.extend([s['summary_text'] for s in batch_summaries])
        
#         # Summarize the intermediate summaries
#         joined = " ".join(summaries)
#         return summarizer(
#             joined,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # Process the partition with a progress bar
#     results = []
#     # Create a progress bar for this worker
#     with tqdm(total=len(partition_df), desc=f"Worker {worker_id}", position=worker_id) as pbar:
#         for idx, row in partition_df.iterrows():
#             summary = hierarchical_summary(row['Reviews'], chunk_size=200, max_len=60, min_len=20)
#             results.append((idx, summary))
#             pbar.update(1)
            
#             # Clean up every few iterations
#             if len(results) % 5 == 0:
#                 torch.cuda.empty_cache()
    
#     # Clean up at the end
#     torch.cuda.empty_cache()
#     del model
#     del summarizer
    
#     # Return the results for this partition
#     return results

# # Convert pandas DataFrame to Dask DataFrame
# dask_df = dd.from_pandas(final_report, npartitions=n_workers)

# # Set up manual progress tracking
# print(f"Processing {len(final_report)} rows across {n_workers} partitions...")

# # Simple approach to split the dataframe
# partition_size = len(final_report) // n_workers
# delayed_results = []

# # Process each partition separately
# print(f"Scheduling {n_workers} partitions for processing...")
# for i in range(n_workers):
#     # Get start and end index for this partition
#     start_idx = i * partition_size
#     end_idx = (i + 1) * partition_size if i < n_workers - 1 else len(final_report)
    
#     # Get this partition as a pandas DataFrame
#     partition_df = final_report.iloc[start_idx:end_idx].copy()
    
#     # Create a delayed task to process this partition
#     delayed_result = process_partition(partition_df, i)
#     delayed_results.append(delayed_result)
#     print(f"Scheduled partition {i+1}/{n_workers} with {len(partition_df)} rows")

# # Create a main progress bar for overall progress
# print("\nStarting distributed computation with progress tracking:")
# main_progress = tqdm(total=len(final_report), desc="Overall Progress")

# # Start timing
# start_time = time.time()

# # Create a global progress updater
# def update_main_progress(future):
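#     # Estimate overall progress from the number of finished futures: each finished
#     # task is counted as len(final_report) // len(delayed_results) rows.
#     # Polled repeatedly by the monitor thread; the `future` argument itself is not used.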
#     completed_tasks = sum(future.status == "finished" for future in client.futures.values())
#     main_progress.n = min(len(final_report), completed_tasks * (len(final_report) // len(delayed_results)))
#     main_progress.refresh()

# # Submit the tasks to the cluster
# futures = client.compute(delayed_results)

# # Start a loop to update the main progress bar
# import threading
# stop_flag = False

# def progress_monitor():
#     while not stop_flag:
#         update_main_progress(futures)
#         time.sleep(0.5)

# # Start the progress monitor in a separate thread
# monitor_thread = threading.Thread(target=progress_monitor)
# monitor_thread.start()

# # Wait for computation to complete
# results = dask.compute(*delayed_results)

# # Stop the progress monitor
# stop_flag = True
# monitor_thread.join()

# # Update progress bar to completion
# main_progress.n = len(final_report)
# main_progress.refresh()
# main_progress.close()

# # Flatten the nested list of results
# all_results = []
# for worker_results in results:
#     all_results.extend(worker_results)

# # Sort by index
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # Store results in a new column
# final_report['QuickSummary'] = summaries

# # Report final timing
# elapsed_time = time.time() - start_time
# print(f"\nCompleted in {elapsed_time:.2f} seconds")

# # Display results
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # Shut down the client and cluster
# client.close()
# cluster.close()

In [None]:
# # Cell 9 (ULTRA OPTIMIZED - FIXED): Maximum GPU utilization for RTX 4080 Super

# import pandas as pd
# import numpy as np
# from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
# from tqdm.auto import tqdm
# import torch
# import gc

# # Force CUDA initialization and check memory
# torch.cuda.init()
# total_gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
# print(f"Total GPU memory: {total_gpu_mem:.2f} GB")

# # Ultra-aggressive GPU optimization parameters for RTX 4080 Super
# MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# MAX_GPU_BATCH_SIZE = 64  # Much larger batch size to fully utilize VRAM
# MAX_SEQUENCE_LENGTH = 1024  # Set maximum context length to optimize memory usage

# # Load model and tokenizer directly for maximum control
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForSeq2SeqLM.from_pretrained(
#     MODEL_NAME, 
#     torch_dtype=torch.float16,  # Half-precision for maximum throughput
#     device_map="auto"           # Automatically map to available GPU
# )

# # Move model to GPU and optimize for inference
# model.to("cuda")
# model.eval()  # Set to evaluation mode

# # Create a custom pipeline with maximum batch efficiency
# summarizer = pipeline(
#     task='summarization',
#     model=model,
#     tokenizer=tokenizer,
#     framework='pt',
#     # Enable key/value caching during generation
#     model_kwargs={"use_cache": True}
# )

# # Monitor GPU memory usage
# def gpu_memory_usage():
#     """Return GPU memory usage in GB"""
#     reserved = torch.cuda.memory_reserved(0) / (1024**3)
#     allocated = torch.cuda.memory_allocated(0) / (1024**3)
#     print(f"GPU Memory: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved")
#     return allocated, reserved

# def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#     """
#     Ultra-optimized hierarchical summarization for maximum GPU utilization
#     """
#     # If there are at most chunk_size reviews, a single summarization pass is enough
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for processing with ultra-large batches
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process in maximally large batches to saturate GPU
#     # This is the key optimization - use much larger batches to fill VRAM
#     summaries = []
#     for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
#         batch = all_chunks[i:i+MAX_GPU_BATCH_SIZE]
        
#         # Log memory usage before batch
#         print(f"Processing batch of size {len(batch)} ({i}/{len(all_chunks)})")
#         gpu_memory_usage()
        
#         # Process maximum-sized batch
#         batch_summaries = summarizer(
#             batch,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )
#         summaries.extend([s['summary_text'] for s in batch_summaries])
        
#         # Log memory after batch
#         gpu_memory_usage()
    
#     # 3) Summarize the intermediate summaries in a single batch
#     joined = " ".join(summaries)
    
#     # Final summary
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # Collect (position, reviews) pairs for every row before summarizing
# print("Preparing all reviews for processing...")
# all_rows = [(i, row['Reviews']) for i, (_, row) in enumerate(final_report.iterrows())]

# # Walk the dataset sequentially in groups of 10 rows; each row is still
# # summarized on its own, and GPU batching happens inside hierarchical_summary
# all_results = []
# with tqdm(total=len(final_report), desc="Ultra GPU Optimization") as pbar:
#     # Process each row with maximum batch efficiency
#     for i in range(0, len(all_rows), 10):  # Process in batches of 10 rows
#         batch_rows = all_rows[i:i+10]
#         batch_results = []
        
#         for batch_idx, (row_idx, reviews) in enumerate(batch_rows):
#             # Force garbage collection before large operations
#             if batch_idx % 5 == 0:
#                 torch.cuda.empty_cache()
#                 gc.collect()
            
#             # Process with maximum GPU utilization
#             summary = hierarchical_summary(
#                 reviews, 
#                 chunk_size=200, 
#                 max_len=60, 
#                 min_len=20
#             )
#             batch_results.append((row_idx, summary))
#             pbar.update(1)
        
#         all_results.extend(batch_results)
#         # Force GPU memory cleanup between large batches
#         torch.cuda.empty_cache()
#         gc.collect()

# # Sort and store results
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # Store results in a new column
# final_report['QuickSummary'] = summaries

# # Inspect results
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # Final cleanup
# torch.cuda.empty_cache()

In [None]:
# # Cell 9 (OPTIMIZED - FASTEST SO FAR): GPU-optimized hierarchical summarization for RTX 4080 Super

# import pandas as pd
# import numpy as np
# from transformers import pipeline, AutoTokenizer
# from tqdm.auto import tqdm
# import torch
# import concurrent.futures
# import multiprocessing

# # Check GPU memory and set optimal batch sizes
# gpu_mem = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # Convert to GB
# print(f"Available GPU memory: {gpu_mem:.2f} GB")

# # RTX 4080 Super optimization parameters
# # With 16GB VRAM, we can use larger batch sizes and optimize throughput
# MODEL_NAME = 'sshleifer/distilbart-cnn-12-6'
# MAX_GPU_BATCH_SIZE = 32  # Larger batch size for 16GB VRAM
# PARALLEL_PROCESSES = 4   # Optimal number for balancing CPU and GPU workloads

# # 1) Initialize tokenizer to estimate token counts for optimal batching
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# # 2) Initialize the summarizer pipeline with optimized settings for RTX 4080 Super
# summarizer = pipeline(
#     task='summarization',
#     model=MODEL_NAME,
#     device=0,
#     framework='pt',
#     # Optimized settings for higher throughput
#     model_kwargs={
#         "use_cache": True,  # Enable KV caching for faster inference
#     },
#     # Enable half-precision for faster processing and lower memory usage
#     torch_dtype=torch.float16
# )

# def estimate_tokens(text):
#     """Estimate token count to optimize batching"""
#     return len(tokenizer.encode(text))

# def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#     """
#     GPU-oriented hierarchical summarization using fixed-size batches of chunks
#     """
#     # If there are at most chunk_size reviews, a single summarization pass is enough
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for processing
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Summarize the chunks in fixed-size batches of MAX_GPU_BATCH_SIZE
#     # (token counts are not consulted here; estimate_tokens is never called in this cell)
#     summaries = []
#     for i in range(0, len(all_chunks), MAX_GPU_BATCH_SIZE):
#         batch = all_chunks[i:i+MAX_GPU_BATCH_SIZE]
#         batch_summaries = summarizer(
#             batch,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )
#         summaries.extend([s['summary_text'] for s in batch_summaries])
    
#     # 3) Summarize the intermediate summaries
#     # RTX 4080 Super can handle the full set of intermediate summaries
#     joined = " ".join(summaries)
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # 4) Function to process each batch of rows with GPU optimization
# def process_gpu_batch(batch_df):
#     results = []
#     # Pre-collect all reviews to optimize memory transfers to GPU
#     all_rows = [(row.name, row['Reviews']) for _, row in batch_df.iterrows()]
    
#     for idx, reviews in all_rows:
#         # Use optimized hierarchical summary function
#         summary = hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20)
#         results.append((idx, summary))
        
#         # Optional: clear the CUDA cache whenever the row's index label is a multiple of 10
#         # (this tests the label, not an iteration count) to limit memory fragmentation
#         if idx % 10 == 0:
#             torch.cuda.empty_cache()
            
#     return results

# # Calculate optimal processing strategy based on dataset size
# total_rows = len(final_report)
# # Determine batch size for parallel processing
# optimal_batch_size = max(1, total_rows // PARALLEL_PROCESSES)

# # Split dataframe into optimized batches
# batches = [final_report.iloc[i:i+optimal_batch_size] for i in range(0, total_rows, optimal_batch_size)]

# # Process with concurrent.futures and progress tracking
# all_results = []
# with tqdm(total=total_rows, desc="GPU Summarizing (RTX 4080 Super)") as pbar:
#     # Use ThreadPoolExecutor to manage parallel GPU tasks
#     with concurrent.futures.ThreadPoolExecutor(max_workers=PARALLEL_PROCESSES) as executor:
#         future_to_batch = {executor.submit(process_gpu_batch, batch): batch for batch in batches}
        
#         for future in concurrent.futures.as_completed(future_to_batch):
#             try:
#                 batch_results = future.result()
#                 all_results.extend(batch_results)
#                 batch_size = len(future_to_batch[future])
#                 pbar.update(batch_size)
#             except Exception as e:
#                 print(f"Error processing batch: {e}")
#                 # Continue with remaining batches

# # Sort results by the original index
# all_results.sort(key=lambda x: x[0])
# summaries = [result[1] for result in all_results]

# # 5) Store results in a new column
# final_report['QuickSummary'] = summaries

# # 6) Inspect results
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

# # 7) Clean up GPU memory
# torch.cuda.empty_cache()

In [None]:
# # Cell 9 (BIG DATA - BUT WITH PYTHON-FUTURES): Hierarchical summarization of all reviews per theme using parallel processing

# import pandas as pd
# import numpy as np
# from transformers import pipeline
# from tqdm.auto import tqdm
# import concurrent.futures
# import multiprocessing

# # Set up the number of workers based on CPU cores (with one less to avoid overloading)
# num_workers = max(1, multiprocessing.cpu_count() - 1)

# # 1) Initialize the summarizer pipeline globally
# # This avoids serialization issues
# summarizer = pipeline(
#     task='summarization',
#     model='sshleifer/distilbart-cnn-12-6',
#     framework='pt'
# )

# def hierarchical_summary(reviews, chunk_size=200, max_len=60, min_len=20):
#     """
#     Summarize a long list of reviews into one short summary, summarizing all chunks in a single batched pipeline call
#     """
#     # If there are at most chunk_size reviews, a single summarization pass is enough
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for processing
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process chunks in a batch to maximize GPU usage
#     intermediate_summaries = summarizer(
#         all_chunks,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )
    
#     # Extract summary texts
#     intermediate = [summary['summary_text'] for summary in intermediate_summaries]
    
#     # 3) Summarize the intermediate summaries
#     joined = " ".join(intermediate)
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # 4) Function to process each batch of rows in parallel
# def process_batch(batch_df):
#     results = []
#     for _, row in batch_df.iterrows():
#         summary = hierarchical_summary(row['Reviews'], chunk_size=200, max_len=60, min_len=20)
#         results.append((row.name, summary))
#     return results

# # Split the dataframe into batches for parallel processing
# def split_dataframe(df, batch_size):
#     batches = []
#     for i in range(0, len(df), batch_size):
#         batches.append(df.iloc[i:i+batch_size])
#     return batches

# # Calculate optimal batch size based on dataset size and worker count
# batch_size = max(1, len(final_report) // num_workers)
# batches = split_dataframe(final_report, batch_size)

# # Use ThreadPoolExecutor for parallel processing with progress bar
# all_results = []
# with tqdm(total=len(final_report), desc="Summarizing themes") as pbar:
#     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
#         # Submit all batches to the executor
#         future_to_batch = {executor.submit(process_batch, batch): batch for batch in batches}
        
#         # Process completed batches and update progress
#         for future in concurrent.futures.as_completed(future_to_batch):
#             batch_results = future.result()
#             all_results.extend(batch_results)
#             # Update progress bar by the number of rows processed in this batch
#             pbar.update(len(future_to_batch[future]))

# # Sort results by the original index and extract summaries
# all_results.sort(key=lambda x: x[0])  # Sort by index
# summaries = [result[1] for result in all_results]

# # 5) Store results in a new column
# final_report['QuickSummary'] = summaries

# # 6) Inspect
# display(final_report[['steam_appid', 'Theme', 'QuickSummary']].head())

In [None]:
# # Cell 9 (FIXED - FIRST): Hierarchical summarization of all reviews per theme

# from transformers import pipeline
# from tqdm.auto import tqdm

# # 1) Initialize a single summarizer pipeline
# summarizer = pipeline(
#     task='summarization',
#     model='sshleifer/distilbart-cnn-12-6',
#     device=0,              # change to -1 if no GPU
#     framework='pt'
# )

# def hierarchical_summary(reviews, chunk_size=200,
#                          max_len=60, min_len=20):
#     """
#     Summarize a long list of reviews into one short summary:
#       1) Chunk the reviews into batches of chunk_size
#       2) Summarize each batch
#       3) Summarize the concatenation of batch summaries
    
#     Params:
#       reviews    : list of str, the reviews to summarize
#       chunk_size : int, number of reviews per intermediate chunk
#       max_len    : int, max summary tokens per call
#       min_len    : int, min summary tokens per call
    
#     Returns:
#       str, final "quick read" summary
#     """
#     # If there are at most chunk_size reviews, a single summarization pass is enough
#     if len(reviews) <= chunk_size:
#         doc = "\n\n".join(reviews)
#         return summarizer(
#             doc,
#             max_length=max_len,
#             min_length=min_len,
#             truncation=True,
#             do_sample=False
#         )[0]['summary_text']
    
#     # 2) Prepare all chunks for batch processing
#     all_chunks = []
#     for i in range(0, len(reviews), chunk_size):
#         batch = reviews[i:i+chunk_size]
#         text = "\n\n".join(batch)
#         all_chunks.append(text)
    
#     # Process all chunks in one batch
#     intermediate_summaries = summarizer(
#         all_chunks,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )
    
#     # Extract summary texts
#     intermediate = [summary['summary_text'] for summary in intermediate_summaries]
    
#     # 3) Summarize the intermediate summaries
#     joined = " ".join(intermediate)
#     return summarizer(
#         joined,
#         max_length=max_len,
#         min_length=min_len,
#         truncation=True,
#         do_sample=False
#     )[0]['summary_text']

# # 4) Apply to each row of final_report with progress bar
# quick_summaries = []
# for _, row in tqdm(final_report.iterrows(),
#                   total=len(final_report),
#                   desc="Summarizing themes"):
#     revs = row['Reviews']
#     quick = hierarchical_summary(revs,
#                                  chunk_size=200,
#                                  max_len=60,
#                                  min_len=20)
#     quick_summaries.append(quick)

# # 5) Store results in a new column
# final_report['QuickSummary'] = quick_summaries

# # 6) Inspect
# display(final_report[['steam_appid','Theme','QuickSummary']].head())