In [1]:
# Install required libraries
%pip install bertopic sentence-transformers umap-learn hdbscan dask[dataframe] pyarrow
%pip install plotly  # For visualizations
%mkdir -p models_english outputs_english  # Create directories for outputs

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm

# Set up logging
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load all parquet files with Dask
logger.info("Loading parquet files...")
df = dd.read_parquet("top_100_parquet/*.parquet")
df = df[df['review_language'] == 'english']
# Check the shape before sampling
total_rows = len(df)
logger.info(f"Total rows before sampling: {total_rows}")

# Sample 10% of the data and convert to pandas
# Using random_state for reproducibility
logger.info("Sampling data...")
df_sample = df.sample(frac=0.1, random_state=42).compute()

logger.info(f"Sample shape: {df_sample.shape}")
df_sample.head()

INFO:__main__:Loading parquet files...
INFO:__main__:Total rows before sampling: 5645316
INFO:__main__:Sampling data...
INFO:__main__:Sample shape: (564533, 29)


Unnamed: 0,name,steam_appid,required_age,is_free,controller_support,detailed_description,about_the_game,short_description,price_overview,metacritic_score,...,author_num_reviews,author_playtime_forever,author_play_time_last_two_weeks,author_playtime_at_review,author_last_played,review,voted_up,votes_up,votes_funny,weighted_vote_score
59466,Counter-Strike,10,0,False,,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,$9.99,88,...,2,9515.0,0.0,9507.0,1387817000.0,Would recommend anytime best competetive FPS e...,True,0,1,0.5
63382,Counter-Strike,10,0,False,,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,$9.99,88,...,1,1095.0,0.0,637.0,1539682000.0,table tr td httpswwwyoutubecomwatchvBQ2lEHs3lG...,True,1,0,0.509804
52936,Counter-Strike,10,0,False,,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,$9.99,88,...,7,626.0,0.0,371.0,1641059000.0,Getting flamed by 20 year olds is always fun,True,0,0,0.5
84145,Counter-Strike,10,0,False,,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,$9.99,88,...,14,471.0,4.0,419.0,1743653000.0,csgo on min specs,True,1,0,0.52381
38855,Counter-Strike,10,0,False,,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,Play the worlds number 1 online action game En...,$9.99,88,...,1,47189.0,0.0,9965.0,1741425000.0,1993,True,0,0,0.5


In [2]:
import re
import string
from tqdm.auto import tqdm

def clean_text(text):
    """
    Clean text by removing URLs, emojis, and special characters
    """
    if not isinstance(text, str):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Remove emojis (simple approach)
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0000257F"  # Enclosed characters
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply cleaning function and filter short reviews
logger.info("Cleaning text data...")
df_sample['clean_review'] = df_sample['review'].apply(clean_text)

# Filter reviews that are too short (less than 20 characters)
logger.info("Filtering short reviews...")
df_sample = df_sample[df_sample['clean_review'].str.len() >= 20].reset_index(drop=True)

logger.info(f"Shape after cleaning: {df_sample.shape}")
df_sample[['review', 'clean_review']].head(2)

INFO:__main__:Cleaning text data...
INFO:__main__:Filtering short reviews...
INFO:__main__:Shape after cleaning: (439139, 30)


Unnamed: 0,review,clean_review
0,Would recommend anytime best competetive FPS e...,would recommend anytime best competetive fps e...
1,Getting flamed by 20 year olds is always fun,getting flamed by 20 year olds is always fun


In [3]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import umap
import hdbscan

# Set up the embedding model
logger.info("Loading sentence transformer model...")
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Set up UMAP for dimensionality reduction
logger.info("Configuring UMAP...")
umap_model = umap.UMAP(
    n_components=5,      # Dimension of the low dimensional space
    n_neighbors=15,      # Size of local neighborhood
    min_dist=0.0,        # Minimum distance between points in low dimensional space
    metric='cosine',     # Distance metric
    random_state=42      # For reproducibility
)

# Set up HDBSCAN for clustering
logger.info("Configuring HDBSCAN...")
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=20,     # Minimum size of clusters
    metric='euclidean',      # Distance metric
    cluster_selection_method='eom',  # Excess of Mass algorithm
    prediction_data=True     # Required for predicting new examples
)

# Set up BERTopic
logger.info("Configuring BERTopic...")
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    top_n_words=10,
    verbose=True
)

# Prepare corpus for topic modeling
corpus = df_sample['clean_review'].tolist()

# Fit the model and transform to get topic assignments
logger.info("Fitting BERTopic model...")
topics, probs = topic_model.fit_transform(corpus)

# Add topics to the dataframe
df_sample['topic'] = topics

logger.info(f"Number of topics found: {len(set(topics))}")

INFO:__main__:Loading sentence transformer model...
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda:0
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Configuring UMAP...
INFO:__main__:Configuring HDBSCAN...
INFO:__main__:Configuring BERTopic...
INFO:__main__:Fitting BERTopic model...
2025-04-29 23:28:38,392 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/13724 [00:00<?, ?it/s]

2025-04-29 23:29:41,614 - BERTopic - Embedding - Completed ✓
2025-04-29 23:29:41,614 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-04-29 23:33:40,738 - BERTopic - Dimensionality - Completed ✓
2025-04-29 23:33:40,740 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

In [4]:
# Show top topics
logger.info("Top topics:")
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

# For each of the first 5 topics (excluding -1 which is outliers), print example reviews
logger.info("Example reviews for top topics:")

# Get list of topics excluding -1 (outliers)
top_topics = [topic for topic in topic_info['Topic'].tolist() if topic != -1][:5]

for topic in top_topics:
    print(f"\n\n--- TOPIC {topic}: {', '.join(word for word, _ in topic_model.get_topic(topic)[:5])} ---")
    
    # Get indices of documents in this topic
    doc_indices = [i for i, t in enumerate(topics) if t == topic]
    
    # Print 3 example reviews (or fewer if there aren't 3)
    samples = min(3, len(doc_indices))
    for i in range(samples):
        idx = doc_indices[i]
        # Print truncated version of the review for readability
        review_text = df_sample.iloc[idx]['clean_review']
        print(f"Example {i+1}: {review_text[:200]}..." if len(review_text) > 200 else f"Example {i+1}: {review_text}")

INFO:__main__:Top topics:
INFO:__main__:Example reviews for top topics:


   Topic   Count                                  Name  \
0     -1  173574                      -1_you_to_get_it   
1      0    4775                0_dlc_dlcs_buy_edition   
2      1    4536       1_story_storyline_line_gameplay   
3      2    3957            2_1010_amazing_would_again   
4      3    3177  3_soundtrack_music_soundtracks_sound   
5      4    3129                4_gtx_fps_settings_ram   
6      5    2895      5_borderlands_handsome_vault_bl2   
7      6    2556          6_survival_mode_survive_best   
8      7    2301            7_guns_gun_shooter_bullets   
9      8    2245   8_fallout_vegas_bethesda_settlement   

                                      Representation  \
0  [you, to, get, it, and, my, your, this, or, have]   
1  [dlc, dlcs, buy, edition, sale, content, expen...   
2  [story, storyline, line, gameplay, plot, graph...   
3  [1010, amazing, would, again, soundtrack, grap...   
4  [soundtrack, music, soundtracks, sound, art, a...   
5  [gtx, fps, settings, r

In [5]:
import plotly.io as pio
pio.renderers.default = "notebook"  # Set default renderer for Jupyter

# Create topic frequency bar chart
logger.info("Generating topic frequency visualization...")
fig1 = topic_model.visualize_barchart(top_n_topics=10)
fig1.write_html("outputs_english/topic_barchart.html")
fig1

# Create UMAP 2D scatter plot
logger.info("Generating UMAP topic visualization...")
fig2 = topic_model.visualize_topics()
fig2.write_html("outputs_english/topics_scatter.html")
fig2

# Try a different visualization instead
logger.info("Generating hierarchical topic visualization...")
try:
    # Try hierarchical topic visualization
    fig3 = topic_model.visualize_hierarchy()
    fig3.write_html("outputs_english/topics_hierarchy.html")
    fig3
except Exception as e:
    logger.warning(f"Hierarchical visualization failed: {e}")
    
    # If that fails, try document visualization
    try:
        logger.info("Trying document visualization instead...")
        fig3 = topic_model.visualize_documents(df_sample['clean_review'])
        fig3.write_html("outputs_english/document_viz.html")
        fig3
    except Exception as e2:
        logger.warning(f"Document visualization also failed: {e2}")
        print("Skipping third visualization due to compatibility issues.")

INFO:__main__:Generating topic frequency visualization...
INFO:__main__:Generating UMAP topic visualization...
INFO:__main__:Generating hierarchical topic visualization...


In [6]:
# Let's analyze if certain game features correlate with specific topics
logger.info("Analyzing game features by topic...")

# Key features to analyze
features = ['categories', 'genres', 'voted_up', 'author_playtime_forever', 'achievements']

# Create summary statistics for each topic
topic_stats = {}

for topic in set(topics):
    if topic == -1:  # Skip outliers
        continue
        
    # Get data for this topic
    topic_data = df_sample[df_sample['topic'] == topic]
    
    # Calculate statistics
    stats = {
        'count': len(topic_data),
        'avg_playtime': topic_data['author_playtime_forever'].mean(),
        'pct_positive': (topic_data['voted_up'] == True).mean() * 100,
    }
    
    # Get top genres and categories
    if 'genres' in topic_data.columns:
        # Flatten the list of genres
        all_genres = []
        for genres_list in topic_data['genres'].dropna():
            if isinstance(genres_list, list):
                all_genres.extend(genres_list)
        
        # Count occurrences
        from collections import Counter
        genre_counts = Counter(all_genres)
        stats['top_genres'] = genre_counts.most_common(3)
    
    topic_stats[topic] = stats

# Display statistics for top topics
for topic in top_topics:
    if topic in topic_stats:
        print(f"\nTopic {topic} Statistics:")
        print(f"Count: {topic_stats[topic]['count']}")
        print(f"Average Playtime: {topic_stats[topic]['avg_playtime']:.1f} hours")
        print(f"Percentage Positive: {topic_stats[topic]['pct_positive']:.1f}%")
        if 'top_genres' in topic_stats[topic]:
            print("Top Genres:", topic_stats[topic]['top_genres'])
        print("-" * 40)

INFO:__main__:Analyzing game features by topic...



Topic 0 Statistics:
Count: 4775
Average Playtime: 24188.2 hours
Percentage Positive: 80.9%
Top Genres: []
----------------------------------------

Topic 1 Statistics:
Count: 4536
Average Playtime: 6720.6 hours
Percentage Positive: 96.6%
Top Genres: []
----------------------------------------

Topic 2 Statistics:
Count: 3957
Average Playtime: 15273.0 hours
Percentage Positive: 99.0%
Top Genres: []
----------------------------------------

Topic 3 Statistics:
Count: 3177
Average Playtime: 7829.9 hours
Percentage Positive: 97.1%
Top Genres: []
----------------------------------------

Topic 4 Statistics:
Count: 3129
Average Playtime: 11736.4 hours
Percentage Positive: 74.7%
Top Genres: []
----------------------------------------


In [7]:
# Save the model
logger.info("Saving BERTopic model...")
topic_model.save("models_english/bertopic_steam")

# Save topic assignments
logger.info("Saving topic assignments...")
df_sample[['review', 'clean_review', 'topic']].to_csv("outputs_english/topic_labels.csv", index=False)

# Save topic information
topic_model.get_topic_info().to_csv("outputs_english/topic_info.csv", index=False)

# Save topic words
all_topics = {}
for topic in set(topics):
    if topic != -1:  # Skip outliers
        all_topics[topic] = topic_model.get_topic(topic)
        
import json
with open("outputs_english/topic_words.json", "w") as f:
    # Convert to a more JSON-friendly format
    json_friendly = {str(k): [{"word": w, "score": s} for w, s in v] for k, v in all_topics.items()}
    json.dump(json_friendly, f, indent=2)

logger.info("All artifacts saved!")

INFO:__main__:Saving BERTopic model...
INFO:__main__:Saving topic assignments...
INFO:__main__:All artifacts saved!


In [8]:
# Generate a simple HTML report
logger.info("Generating HTML report...")

html = """
<html>
<head>
    <title>Steam Reviews Topic Analysis</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 40px; line-height: 1.6; }
        h1 { color: #333; }
        .topic { margin-bottom: 30px; border: 1px solid #ddd; padding: 20px; border-radius: 5px; }
        .keywords { color: #1a73e8; font-weight: bold; }
        .stats { color: #666; }
        .example { background-color: #f8f9fa; padding: 15px; margin: 10px 0; border-radius: 5px; }
    </style>
</head>
<body>
    <h1>Steam Reviews Topic Analysis</h1>
    <p>This report presents the key topics discovered in Steam game reviews.</p>
"""

# Add topics to the report
for topic in top_topics:
    if topic in topic_stats:
        # Get topic words
        topic_words = [word for word, _ in topic_model.get_topic(topic)[:10]]
        
        # Get example reviews
        doc_indices = [i for i, t in enumerate(topics) if t == topic][:3]
        examples = [df_sample.iloc[idx]['clean_review'][:300] + "..." for idx in doc_indices]
        
        html += f"""
        <div class="topic">
            <h2>Topic {topic}</h2>
            <p class="keywords">Keywords: {', '.join(topic_words)}</p>
            <p class="stats">
                Number of reviews: {topic_stats[topic]['count']}<br>
                Average playtime: {topic_stats[topic]['avg_playtime']:.1f} hours<br>
                Percentage positive: {topic_stats[topic]['pct_positive']:.1f}%
            </p>
            <h3>Example Reviews:</h3>
        """
        
        for i, example in enumerate(examples):
            html += f'<div class="example">{example}</div>'
        
        html += "</div>"

html += """
</body>
</html>
"""

# Write HTML report
with open("outputs_english/topic_report.html", "w") as f:
    f.write(html)

logger.info("HTML report generated at 'outputs_english/topic_report.html'")

INFO:__main__:Generating HTML report...
INFO:__main__:HTML report generated at 'outputs_english/topic_report.html'
