In [None]:
!pip install sentence-transformers umap-learn hdbscan scikit-learn transformers dask pandas matplotlib bertopic graphviz

In [None]:
# Cell 1: Install dependencies (run once)

# Cell 2: Imports, reproducibility seeds, and theme definitions
import random, numpy as np, torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
import hdbscan
from transformers import pipeline
from collections import defaultdict

from IPython.display import Image, display

# Domain seed topics for guided BERTopic: each key is a human-readable theme
# label and each value is the list of seed terms for that theme. The lists are
# later handed to BERTopic in dict order via `list(themes.values())`.
# Hyphenated terms use the plain ASCII hyphen-minus: the U+2011 non-breaking
# hyphen used previously does not match text typed with a regular "-".
themes = {
    # Branching narrative and emotional beats
    'story': [
        'story', 'plot', 'narrative', 'pacifist', 'genocide', 'neutral',
        'branching', 'ending', 'choice', 'consequence', 'moral',
        'friendship', 'redemption', 'hope', 'love', 'loss', 'memory',
        'determination'
    ],

    # Cast of monsters, humans, and the player avatar
    'characters': [
        'character', 'frisk', 'chara', 'sans', 'papyrus', 'toriel',
        'asgore', 'alphys', 'undyne', 'mettaton', 'flowey', 'asriel',
        'napstablook', 'monster kid', 'temmie', 'burgerpants',
        'npc', 'relationship', 'backstory'
    ],

    # Core RPG systems and moment-to-moment interaction
    'gameplay': [
        'gameplay', 'rpg', 'adventure', 'exploration', 'puzzle',
        'bullet hell', 'dodging', 'timing', 'menu', 'save', 'load',
        'checkpoint', 'inventory', 'quest', 'side quest', 'boss fight'
    ],

    # Turn-based battles, ACT options, and mercy mechanics
    'combat': [
        'combat', 'battle', 'fight', 'act', 'mercy', 'spare', 'talk',
        'attack', 'defense', 'hp', 'exp', 'lv', 'soul', 'heart',
        'pattern', 'special attack', 'blue soul', 'green soul',
        'orange soul', 'purple soul', 'yellow soul'
    ],

    # Iconic chiptune OST by Toby Fox
    'music': [
        'music', 'soundtrack', 'ost', 'song', 'track', 'theme', 'melody',
        'motif', 'chiptune', 'chip-tune', 'Megalovania', 'Hopes and Dreams',
        'His Theme', 'Undertale OST', 'piano', '8-bit', 'orchestral'
    ],

    # Retro pixel art presentation and UI
    'visuals': [
        'visuals', 'graphics', 'sprite', 'pixel art', 'retro', 'color',
        'palette', 'animation', 'effect', 'lighting', 'ui', 'hud',
        'text box', 'portrait', 'scanline', 'CRT'
    ],

    # Signature humor, meta commentary, and fourth-wall breaks
    'humor_dialogue': [
        'humor', 'dialogue', 'pun', 'joke', 'quirky', 'whimsical',
        'meta', 'fourth wall', 'narrator', 'sarcasm', 'flavor text',
        'reference', 'running gag', 'comic timing'
    ],

    # Moral philosophy, player agency, and in-game judgment
    'morality': [
        'morality', 'ethics', 'choice', 'consequence', 'violence',
        'mercy', 'guilt', 'atonement', 'judgment hall', 'sins',
        'player agency', 'self-reflection'
    ],

    # Hidden events, ARG elements, and file manipulation
    'secrets_meta': [
        'secret', 'hidden', 'easter egg', 'gaster', 'fun value',
        'mystery door', 'alternate route', 'hard mode', 'reset',
        'true lab', 'file delete', 'save file', 'corruption',
        'exe', 'meta narrative'
    ],

    # Regions, set pieces, and environmental storytelling
    'world': [
        'world', 'setting', 'underground', 'ruins', 'snowdin',
        'waterfall', 'hotland', 'core', 'new home', 'castle',
        'true lab', 'temmie village', 'river person', 'map',
        'environment', 'atmosphere'
    ]
}

# Cell 3: Load & filter reviews (lazy), persist once, report, and draw the DAG
DF_PATH = '../../../parquet_output_indie/*.parquet'

# Build the lazy pipeline: read only the columns used later, keep English
# reviews, and repartition into 8 partitions.
df_lazy = dd.read_parquet(
    DF_PATH,
    columns=['review', 'votes_up', 'voted_up', 'review_language']
)
df_lazy = df_lazy[df_lazy['review_language'] == 'english']
df_lazy = df_lazy.repartition(npartitions=8)

# Persist the filtered & repartitioned DataFrame before counting rows.
# Calling `.compute()` on the lazy graph for the row count and then `persist()`
# would execute the whole read/filter/repartition pipeline twice.
df = df_lazy.persist()

# Diagnostics (the row count is taken from the persisted collection)
print("▶ Dask partitions:", df_lazy.npartitions)
print("▶ Total rows:", df.shape[0].compute())

# Visualize the lazy task graph (`df_lazy` itself is still the lazy collection)
df_lazy.visualize(filename="dask_graph.png", rankdir="LR")
display(Image("dask_graph.png"))

# Cell 4: Sampling function & balanced subsets
def sample_bucket(df, label, n=50000, random_state=42):
    """Sample roughly ``n`` reviews whose ``voted_up`` flag equals ``label``.

    Rows with a missing ``review`` or ``votes_up`` are dropped first. The
    bucket's row count is computed eagerly and turned into a sampling fraction
    capped at 1.0; the sampled rows are then materialised with ``.compute()``.
    ``df`` must support ``.compute()`` on its row count and on the sample
    (the surrounding code passes a Dask DataFrame).

    Returns:
        A ``(reviews, votes)`` pair of parallel lists. Both are empty when the
        bucket has no rows or the sampling fraction is not positive.
    """
    wanted_columns = ['review', 'votes_up']
    matching = df[df['voted_up'] == label][wanted_columns].dropna()
    available = matching.shape[0].compute()

    if available > 0:
        keep_fraction = min(1.0, n/available)
    else:
        keep_fraction = 0.0

    if keep_fraction > 0:
        picked = matching.sample(frac=keep_fraction, random_state=random_state).compute()
    else:
        # Nothing to draw from: fall back to an empty frame with the same columns.
        picked = pd.DataFrame(columns=['review','votes_up'])

    reviews = picked['review'].tolist()
    votes = picked['votes_up'].tolist()
    return reviews, votes

# Sample dislikes first (default cap), then cap the number of likes at
# min(10000, 5 x number of sampled dislikes).
dis_docs, dis_votes = sample_bucket(df, label=False)
likes_docs, likes_votes = sample_bucket(
    df,
    label=True,
    n=min(10000, 5*len(dis_votes)),
)

# Cell 5: Embeddings
# Put the sentence encoder on CUDA when torch reports it as available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def encode_docs(docs, batch_size=64):
    """Embed ``docs`` with the module-level ``embedder``, one slice at a time.

    Args:
        docs: Sequence of texts to embed.
        batch_size: Number of texts handed to each ``embedder.encode`` call.

    Returns:
        A NumPy array with one row per document, in input order. For empty
        input a ``(0, dim)`` array is returned, with ``dim`` taken from
        ``embedder.get_sentence_embedding_dimension()``, so the result can
        still be stacked with other embedding matrices.
    """
    if len(docs) == 0:
        # np.vstack([]) raises ValueError, so build the empty matrix explicitly.
        # float32 is assumed to match the encoder's output dtype; vstack would
        # upcast if it does not.
        dim = embedder.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack([
        embedder.encode(docs[start:start + batch_size], convert_to_numpy=True)
        for start in range(0, len(docs), batch_size)
    ])

# Likes come first, then dislikes; the documents and the rows of the
# embedding matrix are combined in that same order.
all_docs = [*likes_docs, *dis_docs]
likes_emb = encode_docs(likes_docs)
dis_emb = encode_docs(dis_docs)
all_emb = np.concatenate((likes_emb, dis_emb), axis=0)

# Cell 6: Fit BERTopic with seed topics
# Seed term lists are passed in the insertion order of `themes`.
seed_topic_list = list(themes.values())

# Custom clusterer handed to BERTopic.
# NOTE(review): `min_topic_size=30` below and `min_cluster_size=15` here look
# like they target the same setting -- confirm which one is meant to apply
# when a custom `hdbscan_model` is supplied.
cluster_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)
topic_model = BERTopic(
    embedding_model=embedder,
    seed_topic_list=seed_topic_list,
    n_gram_range=(1, 2),
    min_topic_size=30,
    hdbscan_model=cluster_model,
    representation_model=KeyBERTInspired(),
    verbose=True,
)

# The precomputed embedding matrix is passed alongside the documents; only the
# per-document topic assignments are kept.
topics, _ = topic_model.fit_transform(all_docs, all_emb)

# Cell 7: Extract per-topic keywords, counts & best example
# One row per sampled review; likes first, then dislikes, matching `all_docs`.
sentiments = ['Like']*len(likes_docs) + ['Dislikes']*len(dis_docs)
df_all = pd.DataFrame({
    "Sentiment": sentiments,
    "TopicID":   topics,
    "Review":    all_docs,
    "Votes":     likes_votes + dis_votes
})


def _theme_for_keywords(keywords, seed_themes):
    """Return the theme whose seed terms match the most of ``keywords``.

    A keyword matches a theme when the whole keyword, or any single word of
    it, equals one of that theme's seed terms (case-insensitive). Ties go to
    the theme listed first; ``"Other"`` is returned when nothing matches.
    """
    best_theme, best_hits = "Other", 0
    for theme, seeds in seed_themes.items():
        seed_terms = {seed.lower() for seed in seeds}
        hits = 0
        for keyword in keywords:
            lowered = keyword.lower()
            if lowered in seed_terms or any(tok in seed_terms for tok in lowered.split()):
                hits += 1
        if hits > best_hits:
            best_theme, best_hits = theme, hits
    return best_theme


insights = {}
for tid in sorted(set(topics)):
    if tid < 0:  # skip outliers
        continue
    sub = df_all[df_all['TopicID'] == tid]
    if sub.empty:
        continue
    kw = [w for w, _ in topic_model.get_topic(tid)][:10]
    insights[tid] = {
        # BERTopic numbers topics by frequency, not by seed order, so the topic
        # ID cannot be used as an index into `themes`; label by keyword overlap.
        "Theme":    _theme_for_keywords(kw, themes),
        "Count":    len(sub),
        "Keywords": kw,
        # Representative example: the review with the most up-votes in the topic.
        "Example":  sub.loc[sub['Votes'].idxmax(), 'Review']
    }

# Cell 8: Summarize each example review
# Device index for the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    summarizer_device = 0
else:
    summarizer_device = -1

summarizer = pipeline(
    'summarization',
    model='sshleifer/distilbart-cnn-12-6',
    device=summarizer_device,
)

def safe_summarize(text):
    """Summarize ``text`` with the module-level ``summarizer``, best effort.

    Returns the ``summary_text`` of the first pipeline result. If the
    summarization call fails, falls back to the first 200 characters of
    ``text`` followed by ``'...'`` so that report generation can continue.
    """
    try:
        out = summarizer(text, max_length=40, min_length=15, do_sample=False, truncation=True)
        return out[0]['summary_text']
    except Exception:
        # `except Exception` rather than a bare `except` keeps the fallback but
        # lets KeyboardInterrupt / SystemExit propagate.
        return text[:200] + '...'

# Attach a short summary to every topic; when the example review is empty the
# topic's keywords are summarized instead.
for data in insights.values():
    text_to_summarize = data['Example'] or " ".join(data['Keywords'])
    data['Summary'] = safe_summarize(text_to_summarize)

# Cell 9: Build and display the report DataFrame
# One report row per topic, in the iteration order of `insights`.
report_rows = []
for tid, data in insights.items():
    report_rows.append({
        "Theme":         data['Theme'],
        "Cluster (ID)":  tid,
        "Count":         data['Count'],
        "Keywords":      ", ".join(data['Keywords']),
        "Example Review":data['Example'],
        "Summary":       data['Summary']
    })
df_report = pd.DataFrame(report_rows)

styled_report = df_report.style.set_caption("BERTopic-guided Review Insights").hide(axis="index")
display(styled_report)

# Cell 10: Build & display summary table of counts & like ratios
def _merge_keywords(keyword_strings):
    """Join several ', '-separated keyword strings, keeping each keyword once."""
    unique = {}
    for joined in keyword_strings:
        for keyword in joined.split(", "):
            if keyword:
                unique[keyword] = None
    return ", ".join(unique)


# Attach the theme label of its cluster to every review (outliers drop out
# because they have no row in the report).
plot_df = df_all.merge(
    df_report[['Cluster (ID)', 'Theme']],
    left_on='TopicID', right_on='Cluster (ID)',
    how='inner'
)

counts = plot_df.groupby('Theme')['Review'].count().rename('#Reviews')
# Reindex onto every theme so a theme without any 'Like' review gets 0 likes
# (and shows '0.0%') instead of NaN (which rendered as 'nan%').
likes = (
    plot_df[plot_df['Sentiment']=='Like']
    .groupby('Theme')['Review'].count()
    .reindex(counts.index, fill_value=0)
)
ratios = (likes / counts * 100).round(1).astype(str) + '%'

# Several clusters can share one theme (e.g. "Other"). Collapse their keywords
# to one row per theme first; merging the per-cluster report directly would
# repeat each theme's totals once per cluster.
theme_keywords = (
    df_report.groupby('Theme')['Keywords']
    .agg(_merge_keywords)
    .reset_index()
)

summary_df = (
    pd.concat([counts, ratios.rename('LikeRatio')], axis=1)
      .reset_index()
      .merge(theme_keywords, on='Theme')
      .sort_values('#Reviews', ascending=False)
)
display(summary_df)

# Cell 11: Plot review counts by theme & sentiment
# Create the figure/axes explicitly and pass the axes to pandas. Without `ax=`,
# DataFrame.plot draws on a figure of its own, so the (10, 6) figure created
# beforehand stayed blank and the chart used the default size.
fig, ax = plt.subplots(figsize=(10, 6))
theme_sentiment_counts = (
    plot_df.groupby(['Theme', 'Sentiment'])['Review']
    .count()
    .unstack('Sentiment')
)
theme_sentiment_counts.plot(kind='bar', rot=45, ax=ax)
ax.set_title("Review Counts per Theme & Sentiment")
ax.set_xlabel("Theme")
ax.set_ylabel("Number of Reviews")
fig.tight_layout()
plt.show()


In [None]:
# ─── Cell 1: (Run once) Install dependencies ───
# !pip install sentence-transformers umap-learn hdbscan scikit-learn transformers pandas matplotlib bertopic

# ─── Cell 2: Imports & reproducibility seeds ───
import random, numpy as np, torch
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
import hdbscan
from transformers import pipeline
from collections import defaultdict
from IPython.display import display

# ─── Cell 3: Load & Filter reviews from all Parquet files ───
PARQUET_GLOB = "../../../parquet_output_indie/*.parquet"
# glob.glob returns matches in a filesystem-dependent order. Sort them so the
# concatenated row order, and with it any seeded sampling of `df`, is the same
# on every machine.
files = sorted(glob.glob(PARQUET_GLOB))
if not files:
    raise FileNotFoundError(f"No files match {PARQUET_GLOB!r}")

# Read only the columns used later & concatenate into one pandas DataFrame
df = pd.concat(
    [pd.read_parquet(f, columns=['review','votes_up','voted_up','review_language'])
     for f in files],
    ignore_index=True
)

# Filter English reviews
df = df[df['review_language'] == 'english']
print(f"▶ Total English reviews: {len(df)}")

# ─── Cell 4: Sampling function & balanced subsets ───
def sample_bucket_py(df, label, n=50000, random_state=42):
    """Sample roughly ``n`` reviews whose ``voted_up`` flag equals ``label``.

    Rows with a missing ``review`` or ``votes_up`` are dropped before sampling.
    The requested size is converted into a sampling fraction capped at 1.0, so
    at most the whole bucket is returned (in shuffled order).

    Returns:
        A ``(reviews, votes)`` pair of parallel lists. Both are empty when the
        bucket has no rows or the sampling fraction is not positive.
    """
    matching = df.loc[df['voted_up'] == label, ['review', 'votes_up']].dropna()
    available = len(matching)

    # Empty bucket: nothing to sample.
    if not available > 0:
        return [], []

    keep_fraction = min(1.0, n/available)
    # A non-positive request (e.g. n=0) also yields an empty sample.
    if not keep_fraction > 0:
        return [], []

    picked = matching.sample(frac=keep_fraction, random_state=random_state)
    return picked['review'].tolist(), picked['votes_up'].tolist()

# Sample dislikes first (default cap), then cap the number of likes at
# min(10000, 5 x number of sampled dislikes).
dis_docs, dis_votes = sample_bucket_py(df, label=False)
likes_docs, likes_votes = sample_bucket_py(
    df,
    label=True,
    n=min(10000, 5*len(dis_votes)),
)

# ─── Cell 5: Embeddings ───
# Put the sentence encoder on CUDA when torch reports it as available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def encode_docs(docs, batch_size=64):
    """Embed ``docs`` with the module-level ``embedder``, one slice at a time.

    Args:
        docs: Sequence of texts to embed.
        batch_size: Number of texts handed to each ``embedder.encode`` call.

    Returns:
        A NumPy array with one row per document, in input order. For empty
        input a ``(0, dim)`` array is returned, with ``dim`` taken from
        ``embedder.get_sentence_embedding_dimension()``, so the result can
        still be stacked with other embedding matrices.
    """
    if len(docs) == 0:
        # np.vstack([]) raises ValueError, so build the empty matrix explicitly.
        # float32 is assumed to match the encoder's output dtype; vstack would
        # upcast if it does not.
        dim = embedder.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)
    return np.vstack([
        embedder.encode(docs[start:start + batch_size], convert_to_numpy=True)
        for start in range(0, len(docs), batch_size)
    ])

# Likes come first, then dislikes; the documents and the rows of the
# embedding matrix are combined in that same order.
all_docs = [*likes_docs, *dis_docs]
likes_emb = encode_docs(likes_docs)
dis_emb = encode_docs(dis_docs)
all_emb = np.concatenate((likes_emb, dis_emb), axis=0)

# ─── Cell 6: Fit BERTopic with seed topics ───
# Domain seed topics for guided BERTopic: each key is a human-readable theme
# label and each value is the list of seed terms for that theme. The lists are
# later handed to BERTopic in dict order via `list(themes.values())`.
# Hyphenated terms use the plain ASCII hyphen-minus: the U+2011 non-breaking
# hyphen used previously does not match text typed with a regular "-".
themes = {
    # Branching narrative and emotional beats
    'story': [
        'story', 'plot', 'narrative', 'pacifist', 'genocide', 'neutral',
        'branching', 'ending', 'choice', 'consequence', 'moral',
        'friendship', 'redemption', 'hope', 'love', 'loss', 'memory',
        'determination'
    ],

    # Cast of monsters, humans, and the player avatar
    'characters': [
        'character', 'frisk', 'chara', 'sans', 'papyrus', 'toriel',
        'asgore', 'alphys', 'undyne', 'mettaton', 'flowey', 'asriel',
        'napstablook', 'monster kid', 'temmie', 'burgerpants',
        'npc', 'relationship', 'backstory'
    ],

    # Core RPG systems and moment-to-moment interaction
    'gameplay': [
        'gameplay', 'rpg', 'adventure', 'exploration', 'puzzle',
        'bullet hell', 'dodging', 'timing', 'menu', 'save', 'load',
        'checkpoint', 'inventory', 'quest', 'side quest', 'boss fight'
    ],

    # Turn-based battles, ACT options, and mercy mechanics
    'combat': [
        'combat', 'battle', 'fight', 'act', 'mercy', 'spare', 'talk',
        'attack', 'defense', 'hp', 'exp', 'lv', 'soul', 'heart',
        'pattern', 'special attack', 'blue soul', 'green soul',
        'orange soul', 'purple soul', 'yellow soul'
    ],

    # Iconic chiptune OST by Toby Fox
    'music': [
        'music', 'soundtrack', 'ost', 'song', 'track', 'theme', 'melody',
        'motif', 'chiptune', 'chip-tune', 'Megalovania', 'Hopes and Dreams',
        'His Theme', 'Undertale OST', 'piano', '8-bit', 'orchestral'
    ],

    # Retro pixel art presentation and UI
    'visuals': [
        'visuals', 'graphics', 'sprite', 'pixel art', 'retro', 'color',
        'palette', 'animation', 'effect', 'lighting', 'ui', 'hud',
        'text box', 'portrait', 'scanline', 'CRT'
    ],

    # Signature humor, meta commentary, and fourth-wall breaks
    'humor_dialogue': [
        'humor', 'dialogue', 'pun', 'joke', 'quirky', 'whimsical',
        'meta', 'fourth wall', 'narrator', 'sarcasm', 'flavor text',
        'reference', 'running gag', 'comic timing'
    ],

    # Moral philosophy, player agency, and in-game judgment
    'morality': [
        'morality', 'ethics', 'choice', 'consequence', 'violence',
        'mercy', 'guilt', 'atonement', 'judgment hall', 'sins',
        'player agency', 'self-reflection'
    ],

    # Hidden events, ARG elements, and file manipulation
    'secrets_meta': [
        'secret', 'hidden', 'easter egg', 'gaster', 'fun value',
        'mystery door', 'alternate route', 'hard mode', 'reset',
        'true lab', 'file delete', 'save file', 'corruption',
        'exe', 'meta narrative'
    ],

    # Regions, set pieces, and environmental storytelling
    'world': [
        'world', 'setting', 'underground', 'ruins', 'snowdin',
        'waterfall', 'hotland', 'core', 'new home', 'castle',
        'true lab', 'temmie village', 'river person', 'map',
        'environment', 'atmosphere'
    ]
}

# Seed term lists are passed in the insertion order of `themes`.
seed_topic_list = list(themes.values())

# Custom clusterer handed to BERTopic.
# NOTE(review): `min_topic_size=30` below and `min_cluster_size=15` here look
# like they target the same setting -- confirm which one is meant to apply
# when a custom `hdbscan_model` is supplied.
cluster_model = hdbscan.HDBSCAN(
    min_cluster_size=15,
    min_samples=5,
    metric='euclidean',
    cluster_selection_method='eom',
)
topic_model = BERTopic(
    embedding_model=embedder,
    seed_topic_list=seed_topic_list,
    n_gram_range=(1, 2),
    min_topic_size=30,
    hdbscan_model=cluster_model,
    representation_model=KeyBERTInspired(),
    verbose=True,
)

# The precomputed embedding matrix is passed alongside the documents; only the
# per-document topic assignments are kept.
topics, _ = topic_model.fit_transform(all_docs, all_emb)

# ─── Cell 7: Build insights DataFrame ───
# One row per sampled review; likes first, then dislikes, matching `all_docs`.
sentiments = ['Like'] * len(likes_docs) + ['Dislikes'] * len(dis_docs)
df_all = pd.DataFrame({
    "Sentiment": sentiments,
    "TopicID":   topics,
    "Review":    all_docs,
    "Votes":     likes_votes + dis_votes
})


def _theme_for_keywords(keywords, seed_themes):
    """Return the theme whose seed terms match the most of ``keywords``.

    A keyword matches a theme when the whole keyword, or any single word of
    it, equals one of that theme's seed terms (case-insensitive). Ties go to
    the theme listed first; ``"Other"`` is returned when nothing matches.
    """
    best_theme, best_hits = "Other", 0
    for theme, seeds in seed_themes.items():
        seed_terms = {seed.lower() for seed in seeds}
        hits = 0
        for keyword in keywords:
            lowered = keyword.lower()
            if lowered in seed_terms or any(tok in seed_terms for tok in lowered.split()):
                hits += 1
        if hits > best_hits:
            best_theme, best_hits = theme, hits
    return best_theme


insights = {}
for tid in sorted(set(topics)):
    if tid < 0:  # skip outliers
        continue
    sub = df_all[df_all['TopicID'] == tid]
    if sub.empty:
        continue
    kw = [w for w, _ in topic_model.get_topic(tid)][:10]
    insights[tid] = {
        # BERTopic numbers topics by frequency, not by seed order, so the topic
        # ID cannot be used as an index into `themes`; label by keyword overlap.
        "Theme":    _theme_for_keywords(kw, themes),
        "Count":    len(sub),
        "Keywords": kw,
        # Representative example: the review with the most up-votes in the topic.
        "Example":  sub.loc[sub['Votes'].idxmax(), 'Review']
    }

# ─── Cell 8: Summarize each example ───
# Device index for the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    summarizer_device = 0
else:
    summarizer_device = -1

summarizer = pipeline(
    'summarization',
    model='sshleifer/distilbart-cnn-12-6',
    device=summarizer_device,
)

def safe_summarize(text):
    """Summarize ``text`` with the module-level ``summarizer``, best effort.

    Returns the ``summary_text`` of the first pipeline result. If the
    summarization call fails, falls back to the first 200 characters of
    ``text`` followed by ``'...'`` so that report generation can continue.
    """
    try:
        out = summarizer(text, max_length=40, min_length=15,
                         do_sample=False, truncation=True)
        return out[0]['summary_text']
    except Exception:
        # `except Exception` rather than a bare `except` keeps the fallback but
        # lets KeyboardInterrupt / SystemExit propagate.
        return text[:200] + '...'

# Attach a short summary to every topic; when the example review is empty the
# topic's keywords are summarized instead.
for data in insights.values():
    text_to_summarize = data['Example'] or " ".join(data['Keywords'])
    data['Summary'] = safe_summarize(text_to_summarize)

# ─── Cell 9: Build & display the report ───
# One report row per topic, in the iteration order of `insights`.
report_rows = []
for tid, data in insights.items():
    report_rows.append({
        "Theme":          data['Theme'],
        "Cluster (ID)":   tid,
        "Count":          data['Count'],
        "Keywords":       ", ".join(data['Keywords']),
        "Example Review": data['Example'],
        "Summary":        data['Summary']
    })
df_report = pd.DataFrame(report_rows)

styled_report = df_report.style.set_caption("BERTopic-guided Review Insights").hide(axis="index")
display(styled_report)

# ─── Cell 10: Summary of counts & like-ratios ───
def _merge_keywords(keyword_strings):
    """Join several ', '-separated keyword strings, keeping each keyword once."""
    unique = {}
    for joined in keyword_strings:
        for keyword in joined.split(", "):
            if keyword:
                unique[keyword] = None
    return ", ".join(unique)


# Attach the theme label of its cluster to every review (outliers drop out
# because they have no row in the report).
plot_df = df_all.merge(
    df_report[['Cluster (ID)','Theme']],
    left_on='TopicID', right_on='Cluster (ID)'
)
counts = plot_df.groupby('Theme')['Review'].count().rename('#Reviews')
# Reindex onto every theme so a theme without any 'Like' review gets 0 likes
# (and shows '0.0%') instead of NaN (which rendered as 'nan%').
likes = (
    plot_df[plot_df['Sentiment']=='Like']
    .groupby('Theme')['Review'].count()
    .reindex(counts.index, fill_value=0)
)
ratios = (likes/counts*100).round(1).astype(str) + '%'

# Several clusters can share one theme (e.g. "Other"). Collapse their keywords
# to one row per theme first; merging the per-cluster report directly would
# repeat each theme's totals once per cluster.
theme_keywords = (
    df_report.groupby('Theme')['Keywords']
    .agg(_merge_keywords)
    .reset_index()
)

summary_df = (
    pd.concat([counts, ratios.rename('LikeRatio')], axis=1)
      .reset_index()
      .merge(theme_keywords, on='Theme')
      .sort_values('#Reviews', ascending=False)
)
display(summary_df)

# ─── Cell 11: Plot review counts by theme & sentiment ───
# Create the figure/axes explicitly and pass the axes to pandas. Without `ax=`,
# DataFrame.plot draws on a figure of its own, so the (10, 6) figure created
# beforehand stayed blank and the chart used the default size.
fig, ax = plt.subplots(figsize=(10, 6))
theme_sentiment_counts = (
    plot_df.groupby(['Theme', 'Sentiment'])['Review']
    .count()
    .unstack('Sentiment')
)
theme_sentiment_counts.plot(kind='bar', rot=45, ax=ax)
ax.set_title("Review Counts per Theme & Sentiment")
ax.set_xlabel("Theme")
ax.set_ylabel("Number of Reviews")
fig.tight_layout()
plt.show()
