In [6]:
import pandas as pd
from bertopic import BERTopic
import plotly.io as pio
from umap import UMAP

ImportError: cannot import name 'PreTrainedModel' from 'transformers' (d:\School Files\ITS132L\redditwebscraping\.venv\Lib\site-packages\transformers\__init__.py)

In [None]:
# 1. Read the post summary CSV into the notebook's working DataFrame
df = pd.read_csv(filepath_or_buffer="post_summary.csv")

In [None]:
# 2. Build the BERTopic inputs: one text string and one year label per row
text_column = df['text'].astype(str)
year_column = df['year'].astype(str)  # years are kept as string labels, not numbers
texts = text_column.tolist()
timestamps = year_column.tolist()

In [None]:
# Distinct subreddit (game) names and distinct years found in the dataset
games, years = (df[column].unique() for column in ('subreddit', 'year'))

In [None]:
# 6. Fit one BERTopic model per game, keep its top 20 topics, and save a topic map
pio.renderers.default = "browser"

# Number of real (non-outlier) topics kept in the summary table and shown in the map.
TOP_N_TOPICS = 20

top20_by_game = {}          # game -> topic-info rows for its top topics
topic_models_by_game = {}   # game -> fitted BERTopic model

for game in games:
    print(f"\n🎮 Processing game: {game}")
    game_df = df[df['subreddit'] == game]

    # Clean texts: drop missing posts and whitespace-only strings.
    # A loop-specific name is used so the notebook-level `texts` list (built
    # row-for-row alongside `timestamps`) is not overwritten by this cell.
    game_texts = [
        text
        for text in game_df['text'].dropna().astype(str)
        if text.strip() != ""
    ]

    if not game_texts:
        print(f"⚠️ Skipping {game} (no valid texts)")
        continue

    # Fit BERTopic with an explicitly seeded UMAP reducer.
    umap_model = UMAP(n_neighbors=15, n_components=2, metric='cosine', random_state=42)
    topic_model = BERTopic(umap_model=umap_model)
    topics, _ = topic_model.fit_transform(game_texts)

    # Keep the fitted model in memory and persist it to disk.
    topic_models_by_game[game] = topic_model
    topic_model.save(f"{game}_bertopic_model")

    # Get topic info. The outlier bucket (Topic == -1) is not a real topic, so it
    # is kept for context but not counted against the top-N limit. This relies on
    # the outlier row, when present, being listed first, as in the printed output.
    topic_info = topic_model.get_topic_info()
    n_outlier_rows = int((topic_info['Topic'] == -1).sum())
    top20 = topic_info.head(TOP_N_TOPICS + n_outlier_rows)
    top20_by_game[game] = top20

    # Save interactive visualization to HTML
    fig = topic_model.visualize_topics(top_n_topics=TOP_N_TOPICS)
    html_path = f"{game}_topics_map.html"
    fig.write_html(html_path)
    print(f"✅ Saved topic map to: {html_path}")


🎮 Processing game: callofduty
✅ Saved topic map to: callofduty_topics_map.html

🎮 Processing game: gaming
✅ Saved topic map to: gaming_topics_map.html

🎮 Processing game: genshinimpact
✅ Saved topic map to: genshinimpact_topics_map.html

🎮 Processing game: gta
✅ Saved topic map to: gta_topics_map.html

🎮 Processing game: leagueoflegends
✅ Saved topic map to: leagueoflegends_topics_map.html

🎮 Processing game: minecraft
✅ Saved topic map to: minecraft_topics_map.html

🎮 Processing game: overwatch
✅ Saved topic map to: overwatch_topics_map.html

🎮 Processing game: roblox
✅ Saved topic map to: roblox_topics_map.html

🎮 Processing game: sims
✅ Saved topic map to: sims_topics_map.html

🎮 Processing game: skyrim
✅ Saved topic map to: skyrim_topics_map.html

🎮 Processing game: valorant
✅ Saved topic map to: valorant_topics_map.html


In [None]:
# 7. Show the stored topic table (id, name, size) for every game
summary_columns = ['Topic', 'Name', 'Count']
for game, topic_table in top20_by_game.items():
    print(f"\nTop 20 topics for {game}", topic_table.loc[:, summary_columns], sep="\n")


Top 20 topics for callofduty
    Topic                                Name  Count
0      -1                -1_get_game_play_one   1975
1       0    0_warzone_zone_streamer_warzone2    193
2       1             1_duty_call_mobile_best    152
3       2               2_zombie_map_cold_mob    132
4       3                 3_cdl_reward_iq_200    116
5       4          4_modern_warfare_siege_six    108
6       5        5_ghost_cosplay_reboot_would    106
7       6             6_ops_black_cold_series     96
8       7           7_sniper_shotgun_gun_shot     95
9       8    8_camo_challenge_unlock_prestige     73
10      9  9_activision_account_email_support     72
11     10  10_kill_killstreak_airstrike_death     72
12     11    11_relate_else_experience_wallet     67
13     12    12_rank_resurgence_ranked_casual     64
14     13              13_buy_worth_cpt_price     63
15     14       14_year_happy_birthday_cordis     61
16     15  15_map_underrate_favorite_crossmap     59
17     16       

----------------------------------------------------------

# LDA

In [None]:
from gensim import corpora
from gensim.models import LdaModel

# By game

In [None]:
# Cast the subreddit column to strings (written back onto the same DataFrame).
subreddit_as_text = df['subreddit'].astype(str)
df['subreddit'] = subreddit_as_text

NameError: name 'df' is not defined

In [None]:
# Train one LDA model per game and keep the model together with its corpus and
# dictionary so later cells can visualise it.
lda_by_game = {}

NUM_LDA_TOPICS = 15  # number of topics fitted (and printed) for every game

for game, group in df.groupby('subreddit'):
    print(f"\n Training LDA for {game}...")

    # Tokenize cleaned text on whitespace. Missing posts are dropped first:
    # astype(str) on its own would turn NaN into the literal token "nan".
    tokenized_docs = [doc.split() for doc in group['text'].dropna().astype(str)]

    # Create dictionary and corpus
    dictionary = corpora.Dictionary(tokenized_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    if len(dictionary) == 0:
        # Small groups can lose every term to the frequency filter; LdaModel
        # cannot be trained on an empty vocabulary, so skip instead of aborting
        # the remaining games.
        print(f"⚠️ Skipping {game} (no terms left after filtering)")
        continue
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]

    # Train LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_LDA_TOPICS,
        random_state=42,
        passes=10,
        alpha='auto'
    )

    # Store results
    lda_by_game[game] = {
        'model': lda_model,
        'corpus': corpus,
        'dictionary': dictionary
    }

    # Print top topics, labelled with the topic id reported by the model
    # rather than a positional counter.
    print(f"Top topics for {game}:")
    for topic_id, topic_terms in lda_model.print_topics(num_topics=NUM_LDA_TOPICS):
        print(f"  Topic {topic_id}: {topic_terms}")


 Training LDA for callofduty...
Top topics for callofduty:
  Topic 0: 0.034*"back" + 0.033*"get" + 0.027*"game" + 0.022*"remember" + 0.021*"bo1" + 0.020*"new" + 0.018*"fun" + 0.018*"problem" + 0.017*"love" + 0.017*"sound"
  Topic 1: 0.058*"war" + 0.046*"season" + 0.044*"cold" + 0.042*"4" + 0.033*"end" + 0.031*"go" + 0.028*"gun" + 0.022*"mission" + 0.017*"start" + 0.017*"live"
  Topic 2: 0.041*"im" + 0.029*"team" + 0.029*"mobile" + 0.028*"happen" + 0.024*"even" + 0.024*"finally" + 0.023*"dead" + 0.023*"server" + 0.021*"way" + 0.021*"na"
  Topic 3: 0.069*"one" + 0.061*"year" + 0.035*"old" + 0.033*"series" + 0.027*"game" + 0.026*"much" + 0.019*"go" + 0.019*"many" + 0.019*"aw" + 0.019*"wrong"
  Topic 4: 0.139*"get" + 0.048*"kill" + 0.039*"rank" + 0.037*"player" + 0.027*"cant" + 0.025*"camo" + 0.025*"bug" + 0.023*"worth" + 0.022*"match" + 0.021*"lobby"
  Topic 5: 0.087*"like" + 0.064*"good" + 0.046*"really" + 0.031*"feel" + 0.028*"stop" + 0.027*"thing" + 0.025*"new" + 0.025*"ever" + 0.024*

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [None]:
# Pick which game's fitted LDA model to inspect (edit this value to switch games).
game = 'callofduty'

# Pull the stored model, corpus and dictionary for the selected game.
selected = lda_by_game[game]
lda_model = selected['model']
corpus = selected['corpus']
dictionary = selected['dictionary']

# Build the pyLDAvis data, then display it in the notebook.
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis_data)

NameError: name 'lda_by_game' is not defined

---------------------------------------------------------------------------------

# NMF

# By game

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [None]:
# Fit one TF-IDF + NMF topic model per game and print the top words per topic.
nmf_by_game = {}

N_NMF_TOPICS = 20  # number of NMF components (topics) per game
N_TOP_WORDS = 10   # words printed for each topic

for game, group in df.groupby("subreddit"):
    print(f"\n🎮 Running NMF for {game}...")

    # Drop missing and whitespace-only posts, matching the BERTopic cleaning.
    docs = [doc for doc in group["text"].dropna().astype(str) if doc.strip() != ""]

    if not docs:
        # TfidfVectorizer raises on an empty document list; skip this game so
        # the remaining games are still processed.
        print(f"⚠️ Skipping {game} (no valid texts)")
        continue

    # TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = vectorizer.fit_transform(docs)

    # NMF model
    nmf = NMF(n_components=N_NMF_TOPICS, random_state=42)
    nmf.fit(tfidf)

    # Store model and vectorizer
    nmf_by_game[game] = {
        "model": nmf,
        "vectorizer": vectorizer,
        "tfidf": tfidf
    }

    # Display top words per topic (highest component weight first)
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf.components_):
        top_word_ids = topic.argsort()[::-1][:N_TOP_WORDS]
        top_words = [feature_names[i] for i in top_word_ids]
        print(f"Topic {topic_idx + 1}: {' | '.join(top_words)}")


🎮 Running NMF for callofduty...
Topic 1: game | video | different | pas | hate | favorite | look | crash | ive | love
Topic 2: duty | vanguard | mobile | franchise | activision | world | tribute | favorite | update | come
Topic 3: ops | black | series | release | cold | 2022 | spec | screen | year | ago
Topic 4: play | safe | rank | want | xbox | let | right | multiplayer | people | friend
Topic 5: like | look | really | player | feel | tho | character | people | lobby | doesnt
Topic 6: warfare | modern | infinite | remastered | advance | 2019 | multiplayer | 2007 | xbox | fix
Topic 7: warzone | camo | player | kill | better | hacker | moment | lag | bug | cheater
Topic 8: know | dont | fix | want | whats | true | na | anybody | gon | yall
Topic 9: best | whats | map | multiplayer | series | right | list | way | opinion | villain
Topic 10: new | map | season | update | drop | player | favorite | idea | skin | im
Topic 11: ghost | reboot | map | version | sequel | xbox | skin | cosplay