# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt
from dask.diagnostics import ProgressBar

# Data Loading and Preparation

In [None]:
# Register a global Dask progress bar so every later .compute() reports progress
ProgressBar().register()

# A single glob pattern reads every parquet file into one lazy Dask dataframe
print("Reading parquet files...")
df = dd.read_parquet("top_100_parquet/*.parquet")

# Missing playtime is treated as zero minutes
playtime_minutes = df['author_playtime_forever'].fillna(0)
df['author_playtime_forever'] = playtime_minutes
# Integer copy of the vote flag (True=1, False=0) so it can be summed per game
df['voted_up_int'] = df['voted_up'].astype('bool').astype('int')
# Playtime is stored in minutes; derive an hours column
df['playtime_hours'] = playtime_minutes / 60.0

# Peek at the first rows to confirm the load worked
df.head()

# Task 1 - Volume per Game

In [None]:
print("\n=== VOLUME PER GAME ===")
# Count reviews per game (lazy Dask groupby; nothing is computed yet)
game_review_counts = df.groupby('steam_appid').size()
game_review_counts = game_review_counts.reset_index()
game_review_counts.columns = ['steam_appid', 'review_count']
# Materialise the per-game counts in pandas, then keep the 100 most-reviewed games
top_games_by_volume = game_review_counts.compute().sort_values('review_count', ascending=False).head(100)
print("Top 20 games by review count:")
# head(20) so the printed rows match the "Top 20" label (a bare head() shows only 5)
print(top_games_by_volume.head(20))
print('\n')
# Label the tail so both printed tables are identifiable, as in the English-only cell
print("Bottom 5 of top 100:")
print(top_games_by_volume.tail())


In [None]:
print("\n=== VOLUME PER GAME (ENGLISH REVIEWS ONLY) ===")
# Keep only the rows whose review language is English
english_df = df[df['review_language'] == 'english']
# Per-game review counts, turned into a two-column frame
game_review_counts = english_df.groupby('steam_appid').size().reset_index()
game_review_counts.columns = ['steam_appid', 'review_count']
# Materialise in pandas, rank by review count (largest first), keep the top 100
top_games_by_volume = (
    game_review_counts.compute()
    .sort_values('review_count', ascending=False)
    .head(100)
)
print("Top 20 games by review count:")
print(top_games_by_volume.head(20))
print('\n')
print("Bottom 5 of top 100:")
print(top_games_by_volume.tail())

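# SKIP THIS (one-time helper, kept commented out: copies the top-100 games' parquet files into top_100_parquet/)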

In [None]:
# import os
# import shutil

# # Extract top 100 appids from your dataframe
# top_100_appids = top_games_by_volume['steam_appid'].tolist()

# # Source and destination
# source_folder = 'cleaned_data_polars'
# destination_folder = 'top_100_parquet'
# os.makedirs(destination_folder, exist_ok=True)

# # Copy .parquet files
# for appid in top_100_appids:
#     src = os.path.join(source_folder, f"{appid}.parquet")
#     dst = os.path.join(destination_folder, f"{appid}.parquet")
#     if os.path.exists(src):
#         shutil.copy2(src, dst)
#         print(f"Copied: {appid}.parquet")
#     else:
#         print(f"Missing: {appid}.parquet")


# Task 2 - Sentiment Proxy (Votes-Up Ratio)

In [None]:
print("\n=== SENTIMENT PROXY - VOTES-UP RATIO ===")
# One grouped pass: number of positive votes and number of reviews for every game
sentiment_agg = (
    df.groupby('steam_appid')
    .agg({'voted_up_int': ['sum', 'count']})
    .compute()
)

# From here on this is plain pandas: flatten the two-level columns and derive the ratio
sentiment_agg.columns = ['votes_up_sum', 'review_count']
sentiment_agg['positive_ratio'] = sentiment_agg['votes_up_sum'] / sentiment_agg['review_count']

# Only rank games that have at least 100 reviews
has_enough_reviews = sentiment_agg['review_count'] >= 100
top_sentiment = (
    sentiment_agg[has_enough_reviews]
    .sort_values('positive_ratio', ascending=False)
    .head(20)
    .reset_index()
)

print("Top 20 games by positive ratio (minimum 100 reviews):")
top_sentiment[['steam_appid', 'review_count', 'positive_ratio']]

# Playtime Statistics per Game

In [None]:
print("\n=== PLAY-TIME DISTRIBUTIONS ===")

# Define function to compute playtime stats for a partition of data
def compute_playtime_stats(partition_df):
    """Summarise `playtime_hours` per game for one partition.

    Args:
        partition_df: pandas DataFrame with `steam_appid` and
            `playtime_hours` columns.

    Returns:
        DataFrame with one row per `steam_appid` present in the partition
        (sorted by app id) and the columns `steam_appid`, `count`,
        `sum_hours`, `median_hours`, `percentile_95_hours`. Rows whose
        `steam_appid` is missing are ignored. An empty partition yields an
        empty frame that still carries these columns.
    """
    if partition_df.empty:
        # Typed empty frame: keeps the schema stable when partition results are combined
        return pd.DataFrame({
            'steam_appid': pd.Series(dtype=partition_df['steam_appid'].dtype),
            'count': pd.Series(dtype='int64'),
            'sum_hours': pd.Series(dtype='float64'),
            'median_hours': pd.Series(dtype='float64'),
            'percentile_95_hours': pd.Series(dtype='float64'),
        })

    # A single groupby replaces one full boolean-mask scan of the partition per game.
    # observed=True only matters for categorical ids: report games that actually occur.
    hours_by_app = partition_df.groupby('steam_appid', observed=True)['playtime_hours']

    stats = pd.DataFrame({
        'count': hours_by_app.size(),  # row count, including rows with missing playtime
        'sum_hours': hours_by_app.sum(),
        'median_hours': hours_by_app.median(),
        'percentile_95_hours': hours_by_app.quantile(0.95),
    })

    return stats.reset_index()

# Summarise every partition independently, then collect the partial results in pandas
print("Computing playtime statistics in parallel...")
partition_stats = df.map_partitions(compute_playtime_stats).compute()

# Merge the per-partition rows that belong to the same game
print("Combining results...")
combine_rules = {
    'count': 'sum',
    'sum_hours': 'sum',
    # Approximation: average of partition medians
    'median_hours': 'mean',
    # Conservative estimate: max of partition 95th percentiles
    'percentile_95_hours': 'max',
}
playtime_stats_combined = (
    partition_stats.groupby('steam_appid')
    .agg(combine_rules)
    .reset_index()
)

# The mean is exact: total hours divided by total review count
playtime_stats_combined['mean_hours'] = playtime_stats_combined['sum_hours'] / playtime_stats_combined['count']

# Column order used for display
report_columns = ['steam_appid', 'count', 'mean_hours', 'median_hours', 'percentile_95_hours']
playtime_stats_df = playtime_stats_combined[report_columns]

print("Playtime statistics per game (showing first 20):")
playtime_stats_df.head(20)

# Generate Global Playtime Histogram

In [None]:
print("Generating playtime histogram...")

# Exact row count (triggers a Dask computation); decides whether to sample
estimated_size = df.shape[0].compute()

if estimated_size > 1_000_000:
    # Sampling fraction that yields roughly 1M records
    sample_frac = 1_000_000 / estimated_size
    print(f"Sampling {sample_frac:.2%} of data for histogram ({estimated_size:,} records)")
    # Fixed seed so the histogram is reproducible between runs
    playtime_data = df['playtime_hours'].sample(frac=sample_frac, random_state=42).compute()
else:
    # For smaller datasets, use all data
    playtime_data = df['playtime_hours'].compute()

# Filter outliers for better visualization (keep playtimes under 1000 hours)
filtered_playtime = playtime_data[playtime_data < 1000]

# Create histogram with 1-hour bins.
# The bin edges must run through 1000: np.arange excludes its stop value, so a stop of
# 1000 would end at edge 999 and silently drop every value in [999, 1000).
plt.figure(figsize=(12, 8))
plt.hist(filtered_playtime, bins=np.arange(0, 1001, 1), alpha=0.75)
# The filter above removes playtimes of 1000 hours or more, so say ">=" in the title
plt.title('Distribution of Playtime Hours (excluding outliers >= 1000 hours)')
plt.xlabel('Playtime (hours)')
plt.ylabel('Number of Reviews')
plt.grid(True, linestyle='--', alpha=0.7)

# Add vertical lines for key statistics (computed on the filtered data that is plotted)
global_mean = filtered_playtime.mean()
global_median = filtered_playtime.median()

plt.axvline(global_mean, color='r', linestyle='--', label=f'Mean: {global_mean:.2f} hours')
plt.axvline(global_median, color='g', linestyle='--', label=f'Median: {global_median:.2f} hours')

plt.legend()
plt.tight_layout()

# Save the histogram before showing it
plt.savefig('playtime_hist.png', dpi=300)
print("Histogram saved as 'playtime_hist.png'")

# Display the plot in the notebook
plt.show()

In [None]:
# Install required libraries
%pip install bertopic sentence-transformers umap-learn hdbscan dask[dataframe] pyarrow
%pip install plotly  # For visualizations
%mkdir -p models outputs  # Create directories for outputs

In [None]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm

# Logging: INFO level on the root logger, plus a logger for this notebook
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Lazily read every parquet file, then keep English reviews only
logger.info("Loading parquet files...")
df = dd.read_parquet("top_100_parquet/*.parquet")
is_english = df['review_language'] == 'english'
df = df[is_english]

# Row count of the filtered data, logged before any sampling happens
total_rows = len(df)
logger.info(f"Total rows before sampling: {total_rows}")

# Draw a 10% sample with a fixed seed (reproducible) and materialise it in pandas
logger.info("Sampling data...")
df_sample = df.sample(frac=0.1, random_state=42).compute()

logger.info(f"Sample shape: {df_sample.shape}")
df_sample.head()

In [None]:
import re
import string
from tqdm.auto import tqdm

# Patterns are compiled once here instead of being rebuilt on every clean_text() call.

# A URL must start at a word boundary with "http://", "https://" or "www." so that
# ordinary words such as "awww" are left alone.
_URL_PATTERN = re.compile(r'\b(?:https?://|www\.)\S+')

# Emoji and pictograph ranges (simple approach, not exhaustive)
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0000257F"  # Enclosed characters
    "]+",
    flags=re.UNICODE,
)

_WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_text(text):
    """Normalise one review: lowercase it, strip URLs and emojis, collapse whitespace.

    Punctuation and other special characters are kept.

    Args:
        text: The raw review. Anything that is not a `str` (e.g. None or NaN)
            is treated as an empty review.

    Returns:
        The cleaned string; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = _URL_PATTERN.sub('', text)

    # Remove emojis
    text = _EMOJI_PATTERN.sub('', text)

    # Collapse runs of whitespace (including those left behind by the removals)
    return _WHITESPACE_PATTERN.sub(' ', text).strip()

# Add a cleaned copy of every review next to the raw text
logger.info("Cleaning text data...")
df_sample['clean_review'] = df_sample['review'].apply(clean_text)

# Drop reviews whose cleaned text is shorter than 20 characters, then renumber the rows
logger.info("Filtering short reviews...")
long_enough = df_sample['clean_review'].str.len() >= 20
df_sample = df_sample[long_enough].reset_index(drop=True)

logger.info(f"Shape after cleaning: {df_sample.shape}")
df_sample[['review', 'clean_review']].head(2)

In [None]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import umap
import hdbscan

# Set up the embedding model
logger.info("Loading sentence transformer model...")
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Set up UMAP for dimensionality reduction
logger.info("Configuring UMAP...")
umap_model = umap.UMAP(
    n_components=5,      # Dimension of the low dimensional space
    n_neighbors=15,      # Size of local neighborhood
    min_dist=0.0,        # Minimum distance between points in low dimensional space
    metric='cosine',     # Distance metric
    random_state=42      # For reproducibility
)

# Set up HDBSCAN for clustering
logger.info("Configuring HDBSCAN...")
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=20,     # Minimum size of clusters
    metric='euclidean',      # Distance metric
    cluster_selection_method='eom',  # Excess of Mass algorithm
    prediction_data=True     # Required for predicting new examples
)

# Set up BERTopic with the three components configured above
logger.info("Configuring BERTopic...")
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    top_n_words=10,
    verbose=True
)

# Prepare corpus for topic modeling: one cleaned review per document
corpus = df_sample['clean_review'].tolist()

# Fit the model and transform to get one topic assignment per document
logger.info("Fitting BERTopic model...")
topics, probs = topic_model.fit_transform(corpus)

# Add topics to the dataframe (same row order as `corpus`)
df_sample['topic'] = topics

# -1 is the outlier label, not a topic, so leave it out of the count
n_topics = len(set(topics) - {-1})
logger.info(f"Number of topics found: {n_topics}")

In [None]:
# Overview table of the discovered topics
logger.info("Top topics:")
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

# Print a few example reviews for the first 5 topics; -1 (outliers) is not a topic
logger.info("Example reviews for top topics:")

non_outlier_topics = [topic for topic in topic_info['Topic'].tolist() if topic != -1]
top_topics = non_outlier_topics[:5]

for topic in top_topics:
    # Header line: topic id followed by its first five keywords
    keywords = ', '.join(word for word, _ in topic_model.get_topic(topic)[:5])
    print(f"\n\n--- TOPIC {topic}: {keywords} ---")

    # Row positions of the documents assigned to this topic
    doc_indices = [i for i, t in enumerate(topics) if t == topic]

    # Up to three examples, each cut to 200 characters for readability
    for example_no, idx in enumerate(doc_indices[:3], start=1):
        review_text = df_sample.iloc[idx]['clean_review']
        if len(review_text) > 200:
            print(f"Example {example_no}: {review_text[:200]}...")
        else:
            print(f"Example {example_no}: {review_text}")

In [None]:
import plotly.io as pio
pio.renderers.default = "notebook"  # Set default renderer for Jupyter

# A notebook cell only auto-renders its LAST expression, and this cell ends with a
# try statement. Each figure is therefore displayed explicitly with .show(); a bare
# `fig` line in the middle of the cell or inside a try block renders nothing.

# Create topic frequency bar chart
logger.info("Generating topic frequency visualization...")
fig1 = topic_model.visualize_barchart(top_n_topics=10)
fig1.write_html("outputs/topic_barchart.html")
fig1.show()

# Create UMAP 2D scatter plot
logger.info("Generating UMAP topic visualization...")
fig2 = topic_model.visualize_topics()
fig2.write_html("outputs/topics_scatter.html")
fig2.show()

# Third visualization is best-effort: hierarchy first, document map as a fallback
logger.info("Generating hierarchical topic visualization...")
try:
    # Try hierarchical topic visualization
    fig3 = topic_model.visualize_hierarchy()
    fig3.write_html("outputs/topics_hierarchy.html")
    fig3.show()
except Exception as e:
    # Deliberately broad: a failed optional plot must not abort the notebook
    logger.warning(f"Hierarchical visualization failed: {e}")

    # If that fails, try document visualization
    try:
        logger.info("Trying document visualization instead...")
        fig3 = topic_model.visualize_documents(df_sample['clean_review'])
        fig3.write_html("outputs/document_viz.html")
        fig3.show()
    except Exception as e2:
        logger.warning(f"Document visualization also failed: {e2}")
        print("Skipping third visualization due to compatibility issues.")

In [None]:
# Let's analyze if certain game features correlate with specific topics
from collections import Counter  # imported once here rather than on every loop iteration

logger.info("Analyzing game features by topic...")

# Key features to analyze
features = ['categories', 'genres', 'voted_up', 'author_playtime_forever', 'achievements']

# Summary statistics per topic id: review count, average playtime in hours,
# percentage of positive reviews and (when a 'genres' column exists) the top 3 genres
topic_stats = {}

for topic in set(topics):
    if topic == -1:  # Skip outliers
        continue

    # Get data for this topic
    topic_data = df_sample[df_sample['topic'] == topic]

    # Calculate statistics
    stats = {
        'count': len(topic_data),
        # author_playtime_forever is in minutes (the loading cell divides it by 60.0 to
        # get hours); convert here so the "hours" labels that use this value are true
        'avg_playtime': topic_data['author_playtime_forever'].mean() / 60.0,
        'pct_positive': (topic_data['voted_up'] == True).mean() * 100,
    }

    # Get top genres
    if 'genres' in topic_data.columns:
        genre_counts = Counter()
        for genres_list in topic_data['genres'].dropna():
            # Accept arrays and tuples as well as lists: list-valued parquet columns
            # may arrive as numpy arrays and would otherwise all be skipped
            if isinstance(genres_list, (list, tuple, np.ndarray)):
                genre_counts.update(genres_list)

        stats['top_genres'] = genre_counts.most_common(3)

    topic_stats[topic] = stats

# Display statistics for top topics
for topic in top_topics:
    if topic in topic_stats:
        print(f"\nTopic {topic} Statistics:")
        print(f"Count: {topic_stats[topic]['count']}")
        print(f"Average Playtime: {topic_stats[topic]['avg_playtime']:.1f} hours")
        print(f"Percentage Positive: {topic_stats[topic]['pct_positive']:.1f}%")
        if 'top_genres' in topic_stats[topic]:
            print("Top Genres:", topic_stats[topic]['top_genres'])
        print("-" * 40)

In [None]:
import json

# Persist the fitted model
logger.info("Saving BERTopic model...")
topic_model.save("models/bertopic_steam")

# Persist the raw text, cleaned text and assigned topic of every sampled review
logger.info("Saving topic assignments...")
label_columns = ['review', 'clean_review', 'topic']
df_sample[label_columns].to_csv("outputs/topic_labels.csv", index=False)

# Persist the topic overview table
topic_model.get_topic_info().to_csv("outputs/topic_info.csv", index=False)

# Keyword list of every topic except -1 (outliers)
all_topics = {
    topic: topic_model.get_topic(topic)
    for topic in set(topics)
    if topic != -1
}

with open("outputs/topic_words.json", "w") as f:
    # JSON needs string keys; each (word, score) pair becomes a small object
    json_friendly = {
        str(topic_id): [{"word": w, "score": s} for w, s in words]
        for topic_id, words in all_topics.items()
    }
    json.dump(json_friendly, f, indent=2)

logger.info("All artifacts saved!")

In [None]:
# Generate a simple HTML report
# `escape` is imported by name because the variable `html` below holds the report text.
from html import escape

logger.info("Generating HTML report...")

# Collect the fragments in a list and join once at the end
html_parts = ["""
<html>
<head>
    <meta charset="utf-8">
    <title>Steam Reviews Topic Analysis</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }
        h1 { color: #333; }
        .topic { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; border-radius: 5px; }
        .keywords { color: #1a73e8; font-weight: bold; }
        .stats { color: #666; }
        .example { background-color: #f8f9fa; padding: 15px; margin: 10px 0; border-radius: 5px; }
    </style>
</head>
<body>
    <h1>Steam Reviews Topic Analysis</h1>
    <p>This report presents the key topics discovered in Steam game reviews.</p>
"""]

# Add topics to the report
for topic in top_topics:
    if topic not in topic_stats:
        continue

    # Keywords come from review text, so escape them like any other user content
    topic_words = [escape(word) for word, _ in topic_model.get_topic(topic)[:10]]

    # Up to three example reviews, cut to 300 characters; "..." only marks a real cut
    doc_indices = [i for i, t in enumerate(topics) if t == topic][:3]
    examples = []
    for idx in doc_indices:
        review_text = df_sample.iloc[idx]['clean_review']
        if len(review_text) > 300:
            review_text = review_text[:300] + "..."
        # Reviews are user-written: escape so "<", ">" and "&" cannot break or
        # inject markup into the report
        examples.append(escape(review_text))

    html_parts.append(f"""
        <div class="topic">
            <h2>Topic {topic}</h2>
            <p class="keywords">Keywords: {', '.join(topic_words)}</p>
            <p class="stats">
                Number of reviews: {topic_stats[topic]['count']}<br>
                Average playtime: {topic_stats[topic]['avg_playtime']:.1f} hours<br>
                Percentage positive: {topic_stats[topic]['pct_positive']:.1f}%
            </p>
            <h3>Example Reviews:</h3>
        """)

    html_parts.extend(f'<div class="example">{example}</div>' for example in examples)

    html_parts.append("</div>")

html_parts.append("""
</body>
</html>
""")

html = "".join(html_parts)

# Write HTML report; explicit UTF-8 matches the <meta charset> declared above and
# avoids encoding errors on platforms whose default encoding is not UTF-8
with open("outputs/topic_report.html", "w", encoding="utf-8") as f:
    f.write(html)

logger.info("HTML report generated at 'outputs/topic_report.html'")