In [82]:
import pandas as pd
from bertopic import BERTopic

In [83]:
# 1. Load the dataset
df = pd.read_csv("preprocessed_redditData-removedtopic.csv")

In [84]:
# 2. Prepare inputs for BERTopic
texts = df['text'].astype(str).tolist()
timestamps = df['year'].astype(str).tolist()  # Treat year as string labels

In [85]:
# 4. Fit BERTopic model
topic_model = BERTopic(verbose=True)
topics = topic_model.fit_transform(texts)

2025-07-17 23:00:42,283 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2000/2000 [02:08<00:00, 15.59it/s]
2025-07-17 23:02:54,607 - BERTopic - Embedding - Completed ✓
2025-07-17 23:02:54,608 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-17 23:03:14,072 - BERTopic - Dimensionality - Completed ✓
2025-07-17 23:03:14,074 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-17 23:03:38,080 - BERTopic - Cluster - Completed ✓
2025-07-17 23:03:38,095 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-17 23:03:39,547 - BERTopic - Representation - Completed ✓


In [86]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,22187,-1_game_im_play_would,"[game, im, play, would, pc, like, try, house, ...",[finish game incredible gameplay awesome story...
1,0,1227,0_roblox_robux_studio_moderation,"[roblox, robux, studio, moderation, robloxs, a...","[new roblox game, new roblox, roblox]"
2,1,700,1_gta_vi_online_gta6,"[gta, vi, online, gta6, iv, sa, trilogy, missi...","[gta v, gta 6, gta v gta v gta v]"
3,2,511,2_sim_pregnant_alien_sims4,"[sim, pregnant, alien, sims4, teen, baby, sims...","[anyone know sim, new sim make, new sim]"
4,3,473,3_mod_modding_author_modders,"[mod, modding, author, modders, modder, modpac...","[use mod, help mod, mod like]"
...,...,...,...,...,...
1052,1051,10,1051_lantern_rite_lanternritepng_wanwood,"[lantern, rite, lanternritepng, wanwood, parch...","[wish liyue lantern festival, lantern rite, fa..."
1053,1052,10,1052_sort_sorter_comparators_ord,"[sort, sorter, comparators, ord, mysticats, va...","[way sort 2 item, unity tacticslike isometric ..."
1054,1053,10,1053_steve_alex_attractive_bocwdoes,"[steve, alex, attractive, bocwdoes, haines, ga...",[everyone show sexy steve alex fan art heres m...
1055,1054,10,1054_skse_skse64_sksetrampolineh80_16353,"[skse, skse64, sksetrampolineh80, 16353, sse, ...","[skse cant even load startup menu, pc sse impo..."


In [87]:
# 5. Get topics per year (class)
topics_by_year = topic_model.topics_per_class(
    docs=texts,
    classes=timestamps,
    global_tuning=True
)

6it [00:03,  1.60it/s]


In [88]:
# 6. Extract top 20 topics per year
top20_by_year = {}
for yr in sorted(df['year'].unique()):
    df_year = topics_by_year[topics_by_year['Class'] == str(yr)]
    top20 = df_year.sort_values('Frequency', ascending=False).head(20)
    top20_by_year[yr] = top20

In [89]:
print(topics_by_year.columns)

Index(['Topic', 'Words', 'Frequency', 'Class'], dtype='object')


In [90]:
# 7. Print results 
for yr, df_topics in top20_by_year.items():
    print(f"\nTop 20 topics for {yr}")
    print(df_topics[['Topic', 'Words', 'Frequency']])


Top 20 topics for 2020
    Topic                                             Words  Frequency
0      -1         csgo, valorant, gameplay, overwatch, like       3954
1       0            roblox, studio, robux, robloxs, filter        116
18     17           impact, genshin, walkthrough, cbt2, pax         93
9       8                          euw, na, lf, chill, norm         67
3       2             sim, sims, pregnant, simtown, simself         66
79     78              beta, close, email, register, access         66
8       7         dragon, dragonborn, ender, alduin, breton         64
2       1              gta, online, nutshell, vi, sarcastic         60
6       5              skin, prestige, trading, mvp, league         54
46     45           meme, effort, meepcity, sfs, unoahbogue         53
70     69               cyberpunk, 2077, projekt, cd, 2077s         50
7       6     map, pixelsmash, highground, minimap, yurodds         50
5       4  minecraft, minecrafts, motivate, disk, wat

In [91]:
df = df.dropna(subset=['text'])
df = df[df['text'].str.strip() != ""]

In [92]:
# Get unique game categories
games = df['subreddit'].unique()

In [93]:
# Container for results
top20_by_game = {}


In [94]:
for game in games:
    print(f"\n🎮 Processing game: {game}")
    game_df = df[df['subreddit'] == game]

    texts = game_df['text'].tolist()

    # Fit BERTopic
    topic_model = BERTopic()
    topics = topic_model.fit_transform(texts)

    # Get topic info
    topic_info = topic_model.get_topic_info()
    top20 = topic_info.head(21)

    top20_by_game[game] = top20


🎮 Processing game: callofduty

🎮 Processing game: gaming

🎮 Processing game: genshinimpact

🎮 Processing game: gta

🎮 Processing game: leagueoflegends

🎮 Processing game: minecraft

🎮 Processing game: overwatch

🎮 Processing game: roblox

🎮 Processing game: sims

🎮 Processing game: skyrim

🎮 Processing game: valorant


In [95]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1909,-1_game_play_team_im,"[game, play, team, im, like, get, valorant, cs...",[hey everyone past couple month hardstuck iron...
1,0,277,0_valorant_lore_new_news,"[valorant, lore, new, news, professional, game...","[get valorant, play valorant, valorant]"
2,1,120,1_agent_harbor_teaser_new,"[agent, harbor, teaser, new, 14, egg, easter, ...","[help agent, best agent, new agent]"
3,2,116,2_ace_fastest_second_first,"[ace, fastest, second, first, insane, lobby, e...","[2 second ace, second ace, first ace]"
4,3,105,3_rank_guess_reset_act,"[rank, guess, reset, act, distribution, stack,...","[guess rank please, guess rank, rank]"
...,...,...,...,...,...
111,110,12,110_console_xbox_crossplatform_embrace,"[console, xbox, crossplatform, embrace, uncapb...","[full game valorant come console, click wrong ..."
112,111,11,111_tactical_shooter_gunability_evolves,"[tactical, shooter, gunability, evolves, chart...","[valorant serious tactical shooter, valorant d..."
113,112,11,112_murder_kill_ahah_killrecord,"[murder, kill, ahah, killrecord, multikills, b...","[bo5 kill record smash, steel murder former bo..."
114,113,11,113_window_future_dink_kisser,"[window, future, dink, kisser, diary, iconic, ...","[yay predict future, probably best mid air din..."


In [100]:
for game in games:
    print(f"\n🎮 Processing game: {game}")
    game_df = df[df['subreddit'] == game]

    texts = game_df['text'].tolist()

    # Fit BERTopic
    topic_model = BERTopic()
    topics = topic_model.fit_transform(texts)

    # Get topic info
    topic_info = topic_model.get_topic_info()
    top20 = topic_info.head(22)

    top20_by_game[game] = top20


🎮 Processing game: callofduty

🎮 Processing game: gaming

🎮 Processing game: genshinimpact

🎮 Processing game: gta

🎮 Processing game: leagueoflegends

🎮 Processing game: minecraft

🎮 Processing game: overwatch

🎮 Processing game: roblox

🎮 Processing game: sims

🎮 Processing game: skyrim

🎮 Processing game: valorant


In [101]:
# Print top 20 topics per game (fixed column names)
for game, topic_df in top20_by_game.items():
    print(f"\n👑 Top 20 Topics for {game.capitalize()}")
    print(topic_df[['Topic', 'Name', 'Count']])


👑 Top 20 Topics for Callofduty
    Topic                                   Name  Count
0      -1                   -1_find_game_one_get   1756
1       0            0_warzone_lobby_zone_hacker    248
2       1            1_duty_call_mobile_timeline    158
3       2                 2_zombie_map_camos_fan    139
4       3                     3_xbox_safe_pc_ps5    137
5       4                  4_ops_black_cold_year    130
6       5      5_modern_warfare_siege_remastered    123
7       6          6_camo_unlock_challenge_shift     74
8       7     7_activision_email_account_support     73
9       8        8_map_underrate_design_crossmap     68
10      9         9_clip_video_subscribe_youtube     65
11     10     10_kill_killstreak_execution_count     55
12     11        11_memory_nostalgia_remember_og     55
13     12         12_ghost_lore_sequel_underrate     51
14     13                  13_mwr_mwii_mwiii_mw1     50
15     14             14_favorite_game_hate_best     47
16     15       