In [2]:
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 1. Load the dataset
df = pd.read_csv("preprocessed_redditData-removedtopic.csv")

In [4]:
# 2. Prepare inputs for BERTopic
texts = df['text'].astype(str).tolist()
timestamps = df['year'].astype(str).tolist()  # Treat year as string labels

In [5]:
# 4. Fit BERTopic model
topic_model = BERTopic(verbose=True)
topics = topic_model.fit_transform(texts)

2025-07-18 06:30:22,957 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2000/2000 [01:43<00:00, 19.36it/s]
2025-07-18 06:32:13,206 - BERTopic - Embedding - Completed ✓
2025-07-18 06:32:13,206 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-18 06:32:53,318 - BERTopic - Dimensionality - Completed ✓
2025-07-18 06:32:53,320 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-18 06:32:57,886 - BERTopic - Cluster - Completed ✓
2025-07-18 06:32:57,916 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-18 06:32:58,882 - BERTopic - Representation - Completed ✓


In [6]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,22279,-1_game_play_im_try,"[game, play, im, try, ive, people, would, exte...",[title say drop hard one week dont know happen...
1,0,1264,0_roblox_robux_studio_moderation,"[roblox, robux, studio, moderation, robloxs, h...","[roblox game dont feel like roblox, look roblo..."
2,1,445,1_mod_modding_modders_author,"[mod, modding, modders, author, modpack, modde...","[anyone know mod, help mod, get mod]"
3,2,395,2_genshin_impact_cbt2_walkthrough,"[genshin, impact, cbt2, walkthrough, hsr, gens...","[genshin impact character, genshin impact play..."
4,3,389,3_skin_prestige_drx_bundle,"[skin, prestige, drx, bundle, chroma, riot, va...","[still get skin, cant get skin, skin]"
...,...,...,...,...,...
1032,1031,10,1031_shootout_havoc_humble_ngl,"[shootout, havoc, humble, ngl, impression, fir...","[first game havoc, 1st game humble ngl, first ..."
1033,1032,10,1032_charge_rmb_knockdown_plating,"[charge, rmb, knockdown, plating, lav, ocs, un...",[lav gig aggressively charge titan backline re...
1034,1033,10,1033_dwemer_ruin_theoddone_grimsever,"[dwemer, ruin, theoddone, grimsever, edate, dy...","[one tell dwemer ruin run power, leave dwemer ..."
1035,1034,10,1034_involuntariy_wacr_thenwhen_everytimeso,"[involuntariy, wacr, thenwhen, everytimeso, sp...","[202110 kill 22 sec wacr, hashinshin stream 5 ..."


In [7]:
# 5. Get topics per year (class)
topics_by_year = topic_model.topics_per_class(
    docs=texts,
    classes=timestamps,
    global_tuning=True
)

6it [00:02,  2.66it/s]


In [8]:
# 6. Extract top 20 topics per year
top20_by_year = {}
for yr in sorted(df['year'].unique()):
    df_year = topics_by_year[topics_by_year['Class'] == str(yr)]
    top20 = df_year.sort_values('Frequency', ascending=False).head(20)
    top20_by_year[yr] = top20

In [9]:
print(topics_by_year.columns)

Index(['Topic', 'Words', 'Frequency', 'Class'], dtype='object')


In [10]:
# 7. Print results 
for yr, df_topics in top20_by_year.items():
    print(f"\nTop 20 topics for {yr}")
    print(df_topics[['Topic', 'Words', 'Frequency']])


Top 20 topics for 2020
    Topic                                              Words  Frequency
0      -1                csgo, valorant, gameplay, would, im       3933
3       2            genshin, impact, cbt2, walkthrough, cbt        130
1       0         roblox, studio, robux, customizer, robloxs        121
90     89               beta, close, register, closed, email         63
11     10            dragon, dragonborn, ender, defeat, frea         60
18     17                          euw, na, norm, lf, friend         60
73     72                cyberpunk, 2077, projekt, 2077s, cd         53
16     15  valorant, runeterra, valoran, announcement, va...         51
8       7  map, geoculuses, pixelsmash, interactable, bac...         51
9       8        skyrim, meadery, skyrimtogether, 058, tipsy         51
45     44                meme, effort, meepcity, sfs, mw2low         51
46     45                gacha, pull, weapon, currency, beta         49
2       1           mod, modders, moddin

# For Each Game

In [11]:
df = df.dropna(subset=['text'])
df = df[df['text'].str.strip() != ""]

In [12]:
# Get unique game categories
games = df['subreddit'].unique()

In [13]:
# Container for results
top20_by_game = {}


In [17]:
for game in games:
    print(f"\n🎮 Processing game: {game}")
    game_df = df[df['subreddit'] == game]

    texts = game_df['text'].tolist()

    # Fit BERTopic
    topic_model = BERTopic()
    topics = topic_model.fit_transform(texts)

    # Get topic info
    topic_info = topic_model.get_topic_info()
    top20 = topic_info.head(22)

    top20_by_game[game] = top20


🎮 Processing game: callofduty

🎮 Processing game: gaming

🎮 Processing game: genshinimpact

🎮 Processing game: gta

🎮 Processing game: leagueoflegends

🎮 Processing game: minecraft

🎮 Processing game: overwatch

🎮 Processing game: roblox

🎮 Processing game: sims

🎮 Processing game: skyrim

🎮 Processing game: valorant


In [18]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1836,-1_game_play_team_im,"[game, play, team, im, like, get, csgo, valora...",[hey everyone past couple month hardstuck iron...
1,0,303,0_valorant_lore_new_professional,"[valorant, lore, new, professional, news, game...","[get valorant, play valorant, valorant]"
2,1,117,1_agent_teaser_new_harbor,"[agent, teaser, new, harbor, 14, egg, easter, ...","[agent agent animation, help agent, new agent]"
3,2,108,2_ace_insane_second_fastest,"[ace, insane, second, fastest, lobby, first, e...","[second ace, 2 second ace, first ace]"
4,3,101,3_voice_sound_chat_mic,"[voice, sound, chat, mic, comms, mute, talk, h...","[voice comms sometimes work, need help voice c..."
...,...,...,...,...,...
120,119,11,119_rex_paper_genius_playoff,"[rex, paper, genius, playoff, copenhagen, evil...",[evil genius vs paper rex champion tour 2023 m...
121,120,11,120_yay_2022_window_future,"[yay, 2022, window, future, ggs, fashion, dram...","[welcome yay bleed, yay predict future, first ..."
122,121,11,121_advice_serious_need_helpadvice,"[advice, serious, need, helpadvice, desperate,...","[cant understand whats wrong need advice, advi..."
123,122,11,122_bomb_wolf_split_50,"[bomb, wolf, split, 50, light, 41k, doubleshoc...","[analysis wolf split, near 50 bomb split, near..."


In [19]:
# Print top 20 topics per game
for game, topic_df in top20_by_game.items():
    print(f"\n👑 Top 20 Topics for {game.capitalize()}")
    print(topic_df[['Topic', 'Name', 'Count']])


👑 Top 20 Topics for Callofduty
    Topic                                   Name  Count
0      -1                   -1_find_one_get_play   1587
1       0           0_game_player_community_play    220
2       1              1_kill_sniper_shotgun_gun    199
3       2         2_warzone_zone_streamer_animal    195
4       3            3_duty_call_many_soundtrack    165
5       4                 4_zombie_map_camos_fan    137
6       5             5_ghost_cosplay_mask_riley    131
7       6                   6_ops_black_cold_war    119
8       7             7_modern_warfare_siege_six    113
9       8     8_activision_email_account_support     72
10      9       9_camo_unlock_prestige_challenge     70
11     10        10_map_underrate_crossmap_villa     67
12     11                11_chat_sound_audio_mic     63
13     12        12_memory_og_nostalgia_remember     60
14     13          13_rank_resurgence_ranked_bug     60
15     14                  14_mwr_mwii_mwiii_mw1     53
16     15       

# LDA

In [25]:
texts = df['text'].dropna().astype(str).apply(lambda x: x.split()).tolist()
df['year'] = df['year'].astype(int)


In [29]:
from gensim import corpora
from gensim.models import LdaModel

# Store models and topics per year
lda_per_year = {}
# Loop through each year
for year, group in df.groupby('year'):
    print(f"\n Training LDA for year {year}...")
    
    # Tokenize cleaned text (split into list of words)
    texts = group['text'].astype(str).apply(lambda x: x.split()).tolist()
    
    # Create dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    
    # Train LDA
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=20,
        random_state=42,
        passes=10,
        alpha='auto',
        per_word_topics=True
    )
    
    # Store results
    lda_per_year[year] = {
        'model': lda_model,
        'corpus': corpus,
        'dictionary': dictionary
    }

    # Display top topics
    topics = lda_model.print_topics(num_words=10)
    for i, topic in enumerate(topics):
        print(f"Topic {i}: {topic[1]}")


 Training LDA for year 2020...
Topic 0: 0.092*"new" + 0.043*"year" + 0.035*"build" + 0.027*"community" + 0.026*"ultimate" + 0.026*"close" + 0.022*"server" + 0.019*"current" + 0.018*"leave" + 0.018*"happy"
Topic 1: 0.045*"far" + 0.043*"run" + 0.040*"sure" + 0.028*"nothing" + 0.024*"mean" + 0.018*"body" + 0.017*"number" + 0.017*"note" + 0.017*"short" + 0.014*"game"
Topic 2: 0.123*"game" + 0.096*"play" + 0.036*"look" + 0.035*"make" + 0.032*"thing" + 0.024*"skyrim" + 0.020*"able" + 0.016*"fps" + 0.015*"like" + 0.015*"help"
Topic 3: 0.074*"beta" + 0.031*"name" + 0.031*"skin" + 0.029*"give" + 0.028*"pick" + 0.022*"lol" + 0.021*"house" + 0.019*"scene" + 0.019*"stuff" + 0.019*"info"
Topic 4: 0.099*"know" + 0.049*"na" + 0.049*"anyone" + 0.041*"dont" + 0.039*"content" + 0.031*"bit" + 0.023*"gon" + 0.020*"life" + 0.019*"come" + 0.018*"im"
Topic 5: 0.171*"ability" + 0.073*"buy" + 0.070*"csgo" + 0.039*"love" + 0.033*"5" + 0.021*"many" + 0.020*"footage" + 0.019*"reveal" + 0.019*"especially" + 0.018

# By games

In [30]:
df['subreddit'] = df['subreddit'].astype(str)

In [32]:
lda_by_game = {}

for game, group in df.groupby('subreddit'):
    print(f"\n Training LDA for {game}...")

    # Tokenize cleaned text
    texts = group['text'].astype(str).apply(lambda x: x.split()).tolist()


    # Create dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=20,
        random_state=42,
        passes=10,
        alpha='auto'
    )

    # Store results
    lda_by_game[game] = {
        'model': lda_model,
        'corpus': corpus,
        'dictionary': dictionary
    }

    # Print top topics
    print(f"Top topics for {game}:")
    topics = lda_model.print_topics(num_words=10)
    for i, topic in enumerate(topics):
        print(f"  Topic {i}: {topic[1]}")


🎮 Training LDA for callofduty...
Top topics for callofduty:
  Topic 0: 0.042*"remastered" + 0.038*"try" + 0.033*"new" + 0.033*"community" + 0.030*"doesnt" + 0.029*"get" + 0.029*"sound" + 0.027*"something" + 0.025*"place" + 0.023*"game"
  Topic 1: 0.093*"zombie" + 0.084*"war" + 0.042*"cold" + 0.041*"win" + 0.035*"world" + 0.028*"game" + 0.027*"support" + 0.022*"play" + 0.020*"solo" + 0.017*"die"
  Topic 2: 0.125*"player" + 0.061*"thing" + 0.043*"mobile" + 0.041*"dead" + 0.039*"post" + 0.031*"mission" + 0.028*"always" + 0.017*"run" + 0.017*"ive" + 0.016*"leave"
  Topic 3: 0.125*"warzone" + 0.085*"play" + 0.063*"pc" + 0.058*"2" + 0.037*"fix" + 0.033*"safe" + 0.023*"2019" + 0.021*"moment" + 0.020*"game" + 0.020*"never"
  Topic 4: 0.066*"gun" + 0.063*"mode" + 0.061*"work" + 0.058*"skin" + 0.041*"find" + 0.038*"glitch" + 0.031*"match" + 0.030*"operator" + 0.026*"new" + 0.026*"bundle"
  Topic 5: 0.072*"use" + 0.067*"go" + 0.054*"kill" + 0.040*"whats" + 0.039*"server" + 0.035*"let" + 0.028*"w

# NNM

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [37]:
nmf_by_year = {}

for year, group in df.groupby("year"):
    print(f"\nRunning NMF for year {year}...")

    texts = group["text"].dropna().astype(str).tolist()

    # TF-IDF vectorizer
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
    tfidf = vectorizer.fit_transform(texts)

    # Fit NMF model
    nmf = NMF(n_components=20, random_state=42)
    nmf.fit(tfidf)

    # Store for later use
    nmf_by_year[year] = {
        "model": nmf,
        "vectorizer": vectorizer,
        "tfidf": tfidf
    }

    # Print top words for each topic
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        print(f"Topic {topic_idx + 1}: {' | '.join(top_words)}")


Running NMF for year 2020...
Topic 1: game | thing | help | roblox | free | video | release | need | win | run
Topic 2: make | minecraft | meme | block | video | map | hope | little | roblox | work
Topic 3: like | feel | hope | gta | really | 2020 | character | real | csgo | lol
Topic 4: na | gon | wan | duo | people | support | chill | group | lf | coach
Topic 5: new | character | happy | map | update | come | add | lunar | ability | need
Topic 6: know | dont | want | need | happen | people | post | thing | fix | right
Topic 7: play | people | want | tell | able | fun | feel | start | rank | beta
Topic 8: genshin | impact | beta | close | cbt2 | gameplay | live | wallpaper | final | character
Topic 9: im | try | really | sorry | proud | happy | sure | work | pretty | start
Topic 10: think | guy | cool | come | pretty | title | id | work | saw | skin
Topic 11: valorant | gameplay | release | agent | csgo | ability | riot | wallpaper | trailer | weapon
Topic 12: time | long | say | sta

# By game

In [39]:
nmf_by_game = {}

for game, group in df.groupby("subreddit"):
    print(f"\n🎮 Running NMF for {game}...")

    docs = group["text"].dropna().astype(str).tolist()

    # TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = vectorizer.fit_transform(docs)

    # NMF model
    nmf = NMF(n_components=20, random_state=42)
    nmf.fit(tfidf)

    # Store model and vectorizer
    nmf_by_game[game] = {
        "model": nmf,
        "vectorizer": vectorizer,
        "tfidf": tfidf
    }

    # Display top words per topic
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        print(f"Topic {topic_idx + 1}: {' | '.join(top_words)}")


🎮 Running NMF for callofduty...
Topic 1: game | video | different | pas | hate | favorite | look | crash | ive | love
Topic 2: duty | vanguard | mobile | franchise | activision | world | tribute | favorite | update | come
Topic 3: ops | black | series | release | cold | 2022 | spec | screen | year | ago
Topic 4: play | safe | rank | want | xbox | let | right | multiplayer | people | friend
Topic 5: like | look | really | player | feel | tho | character | people | lobby | doesnt
Topic 6: warfare | modern | infinite | remastered | advance | 2019 | multiplayer | 2007 | xbox | fix
Topic 7: warzone | camo | player | kill | better | hacker | moment | lag | bug | cheater
Topic 8: know | dont | fix | want | whats | true | na | anybody | gon | yall
Topic 9: best | whats | map | multiplayer | series | right | list | way | opinion | villain
Topic 10: new | map | season | update | drop | player | favorite | idea | skin | im
Topic 11: ghost | reboot | map | version | sequel | xbox | skin | cosplay