In [10]:
import pandas as pd
from bertopic import BERTopic

In [11]:
# 1. Load the dataset
df = pd.read_csv("preprocessed_redditData-removedtopic.csv")

In [12]:
# 2. Prepare inputs for BERTopic
texts = df['text'].astype(str).tolist()
timestamps = df['year'].astype(str).tolist()  # Treat year as string labels

In [20]:
# Get unique game categories
games = df['subreddit'].unique()
years = df['year'].unique()

In [13]:
# 4. Fit BERTopic model
topic_model = BERTopic(verbose=True)
topics = topic_model.fit_transform(texts)

2025-07-18 16:58:34,699 - BERTopic - Embedding - Transforming documents to embeddings.


Batches: 100%|██████████| 2000/2000 [01:44<00:00, 19.22it/s]
2025-07-18 17:00:22,791 - BERTopic - Embedding - Completed ✓
2025-07-18 17:00:22,792 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-18 17:01:03,864 - BERTopic - Dimensionality - Completed ✓
2025-07-18 17:01:03,866 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-18 17:01:08,807 - BERTopic - Cluster - Completed ✓
2025-07-18 17:01:08,819 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-18 17:01:09,841 - BERTopic - Representation - Completed ✓


In [14]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,23000,-1_game_play_im_would,"[game, play, im, would, pc, ive, like, start, ...",[start play first time day ago id like try win...
1,0,1267,0_roblox_robux_studio_moderation,"[roblox, robux, studio, moderation, robloxs, h...","[roblox game dont feel like roblox, look roblo..."
2,1,499,1_mod_modding_author_modders,"[mod, modding, author, modders, modder, modpac...","[im look mod, get mod, mod]"
3,2,489,2_gta_vi_online_gta6,"[gta, vi, online, gta6, iv, sa, trilogy, nutsh...","[gta, gta v gta v gta v, gta v]"
4,3,408,3_genshin_impact_cbt2_walkthrough,"[genshin, impact, cbt2, walkthrough, cbt3, hsr...","[genshin impact character, genshin impact, gen..."
...,...,...,...,...,...
1041,1040,10,1040_trilogy_hd_capt_definitive,"[trilogy, hd, capt, definitive, patrick, og, b...","[original trilogy, remember trilogy, og trilog..."
1042,1041,10,1041_flip_naw_upside_accidental,"[flip, naw, upside, accidental, pandemic, unex...","[bro flip happen game, flip, well cool back flip]"
1043,1042,10,1042_maintenance_jan_0th_smthn,"[maintenance, jan, 0th, smthn, hampsters, main...","[maintenance update continue across weekend, i..."
1044,1043,10,1043_cc_apartment_desk_functional,"[cc, apartment, desk, functional, shabby, favo...",[exterior apartment complex ive build use base...


In [15]:
# 5. Get topics per year (class)
topics_by_year = topic_model.topics_per_class(
    docs=texts,
    classes=timestamps,
    global_tuning=True
)

6it [00:02,  2.37it/s]


In [31]:
# 6. Extract top 20 topics
top20_by_game = {}
for game in games:
    print(f"\n🎮 Processing game: {game}")
    game_df = df[df['subreddit'] == game]

    texts = game_df['text'].tolist()

    # Fit BERTopic
    topic_model = BERTopic()
    topics = topic_model.fit_transform(texts)

    # Get topic info
    topic_info = topic_model.get_topic_info()
    top20 = topic_info.head(22)

    top20_by_game[game] = top20


🎮 Processing game: callofduty

🎮 Processing game: gaming

🎮 Processing game: genshinimpact

🎮 Processing game: gta

🎮 Processing game: leagueoflegends

🎮 Processing game: minecraft

🎮 Processing game: overwatch

🎮 Processing game: roblox

🎮 Processing game: sims

🎮 Processing game: skyrim

🎮 Processing game: valorant


In [17]:
print(topics_by_year.columns)

Index(['Topic', 'Words', 'Frequency', 'Class'], dtype='object')


In [35]:
# 7. Print results 
for game, df_topics in top20_by_game.items():
    print(f"\nTop 20 topics for {game}")
    print(df_topics[['Topic', 'Name', 'Count']])


Top 20 topics for callofduty
    Topic                                   Name  Count
0      -1                   -1_find_get_game_one   1863
1       0    0_warzone_zone_streamer_battlefield    200
2       1                1_duty_call_mobile_best    159
3       2                   2_zombie_mob_map_fan    121
4       3           3_ghost_cosplay_riley_reboot    117
5       4                   4_ops_black_cold_war    114
6       5             5_modern_warfare_siege_six    114
7       6     6_activision_account_email_support     72
8       7       7_camo_unlock_challenge_prestige     69
9       8        8_map_underrate_design_crossmap     69
10      9         9_memory_og_nostalgia_remember     64
11     10                10_end_move_finish_stop     58
12     11     11_kill_killstreak_execution_count     54
13     12                   12_guy_right_fuck_um     53
14     13                  13_mwr_mwii_mwiii_mw1     52
15     14              14_xbox_360_data_nowadays     48
16     15         

# For Each Game per year

In [19]:
df = df.dropna(subset=['text'])
df = df[df['text'].str.strip() != ""]

In [21]:
# Container for results
top_topics_by_game_year = {}

In [22]:
for game in games:
    for year in years:
        print(f"\n🎮 Processing: {game} | Year: {year}")
        subset = df[(df['subreddit'] == game) & (df['year'] == year)]

        texts = subset['text'].dropna().astype(str).tolist()

        topic_model = BERTopic()
        topics, _ = topic_model.fit_transform(texts)

        topic_info = topic_model.get_topic_info()
        top_topics = topic_info.head(20)  # Top 20 topics

        top_topics_by_game_year[(game, year)] = top_topics


🎮 Processing: callofduty | Year: 2020

🎮 Processing: callofduty | Year: 2021

🎮 Processing: callofduty | Year: 2022

🎮 Processing: callofduty | Year: 2023

🎮 Processing: callofduty | Year: 2024

🎮 Processing: callofduty | Year: 2025

🎮 Processing: gaming | Year: 2020

🎮 Processing: gaming | Year: 2021

🎮 Processing: gaming | Year: 2022

🎮 Processing: gaming | Year: 2023

🎮 Processing: gaming | Year: 2024

🎮 Processing: gaming | Year: 2025

🎮 Processing: genshinimpact | Year: 2020

🎮 Processing: genshinimpact | Year: 2021

🎮 Processing: genshinimpact | Year: 2022

🎮 Processing: genshinimpact | Year: 2023

🎮 Processing: genshinimpact | Year: 2024

🎮 Processing: genshinimpact | Year: 2025

🎮 Processing: gta | Year: 2020

🎮 Processing: gta | Year: 2021

🎮 Processing: gta | Year: 2022

🎮 Processing: gta | Year: 2023

🎮 Processing: gta | Year: 2024

🎮 Processing: gta | Year: 2025

🎮 Processing: leagueoflegends | Year: 2020

🎮 Processing: leagueoflegends | Year: 2021

🎮 Processing: leagueofl

In [23]:
# Print top 20 topics per game per year
for (game, year), topic_df in top_topics_by_game_year.items():
    print(f"\nTop 20 Topics for {game.capitalize()} in {year}")
    print(topic_df[['Topic', 'Name', 'Count']].head(20))


Top 20 Topics for Callofduty in 2020
    Topic                                Name  Count
0      -1                -1_get_time_like_one    420
1       0                 0_mean_tho_like_guy    136
2       1             1_zombie_ops_black_duty     86
3       2       2_warzone_lobby_new_chernobyl     43
4       3              3_shot_shoot_hit_knife     32
5       4    4_ghost_remember_memory_shepherd     30
6       5                5_win_solo_first_duo     23
7       6            6_gulag_fresh_squad_back     23
8       7    7_kill_disable_triple_killstreak     23
9       8  8_juggernaut_riot_shield_juggernog     20
10      9         9_cheater_cheat_report_game     19
11     10           10_death_sound_voice_work     18
12     11               11_god_soap_rip_mason     16
13     12             12_bunker_blend_mp7_key     16
14     13            13_game_far_gameplay_noc     15
15     14            14_meme_gold_make_mobile     15
16     15         15_step_vehicle_cargo_crate     13
17     1

# LDA

In [24]:
texts = df['text'].dropna().astype(str).apply(lambda x: x.split()).tolist()
df['year'] = df['year'].astype(int)


In [25]:
from gensim import corpora
from gensim.models import LdaModel

# Store models and topics per year
lda_by_game_year = {}
# Loop through each year
for game in games:
    for year in years:
        subset = df[(df['subreddit'] == game) & (df['year'] == year)]

        print(f"\n🧠 Training LDA for {game} | Year {year} ({len(subset)} posts)...")
        
        if len(subset) < 10:
            print("  ⚠️ Skipped due to too few posts")
            continue
        
        # Tokenize cleaned text (assuming already preprocessed)
        texts = subset['text'].astype(str).apply(lambda x: x.split()).tolist()
        
        # Create dictionary and corpus
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        
        # Train LDA
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=20,
            random_state=42,
            passes=10,
            alpha='auto',
            per_word_topics=True
        )
        
        # Store results
        lda_by_game_year[(game, year)] = {
            'model': lda_model,
            'corpus': corpus,
            'dictionary': dictionary
        }
        
        # Display top topics
        topics = lda_model.print_topics(num_words=10)
        for i, topic in enumerate(topics):
            print(f"🔹 Topic {i}: {topic[1]}")


🧠 Training LDA for callofduty | Year 2020 (982 posts)...
🔹 Topic 0: 0.012*"happy" + 0.012*"love" + 0.012*"yes" + 0.011*"one" + 0.011*"update" + 0.009*"squad" + 0.009*"game" + 0.009*"way" + 0.009*"car" + 0.009*"ah"
🔹 Topic 1: 0.019*"think" + 0.017*"game" + 0.015*"year" + 0.015*"u" + 0.015*"would" + 0.014*"thing" + 0.013*"5" + 0.012*"get" + 0.011*"go" + 0.010*"ive"
🔹 Topic 2: 0.037*"one" + 0.016*"old" + 0.014*"day" + 0.012*"game" + 0.012*"year" + 0.011*"every" + 0.011*"play" + 0.010*"try" + 0.009*"guy" + 0.008*"find"
🔹 Topic 3: 0.018*"make" + 0.016*"kill" + 0.016*"warzone" + 0.012*"gulag" + 0.012*"get" + 0.010*"gas" + 0.009*"see" + 0.009*"truck" + 0.009*"wife" + 0.009*"mask"
🔹 Topic 4: 0.030*"like" + 0.015*"get" + 0.015*"zombie" + 0.010*"ever" + 0.010*"felt" + 0.010*"drop" + 0.008*"game" + 0.008*"warfare" + 0.008*"didnt" + 0.008*"hit"
🔹 Topic 5: 0.020*"game" + 0.018*"know" + 0.011*"get" + 0.011*"want" + 0.009*"guy" + 0.009*"see" + 0.008*"play" + 0.007*"cheater" + 0.007*"duty" + 0.007*"c

# By games

In [26]:
df['subreddit'] = df['subreddit'].astype(str)

In [27]:
lda_by_game = {}

for game, group in df.groupby('subreddit'):
    print(f"\n Training LDA for {game}...")

    # Tokenize cleaned text
    texts = group['text'].astype(str).apply(lambda x: x.split()).tolist()


    # Create dictionary and corpus
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=5, no_above=0.5)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Train LDA model
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=20,
        random_state=42,
        passes=10,
        alpha='auto'
    )

    # Store results
    lda_by_game[game] = {
        'model': lda_model,
        'corpus': corpus,
        'dictionary': dictionary
    }

    # Print top topics
    print(f"Top topics for {game}:")
    topics = lda_model.print_topics()
    for i, topic in enumerate(topics):
        print(f"  Topic {i}: {topic[1]}")


 Training LDA for callofduty...
Top topics for callofduty:
  Topic 0: 0.042*"remastered" + 0.038*"try" + 0.033*"new" + 0.033*"community" + 0.030*"doesnt" + 0.029*"get" + 0.029*"sound" + 0.027*"something" + 0.025*"place" + 0.023*"game"
  Topic 1: 0.093*"zombie" + 0.084*"war" + 0.042*"cold" + 0.041*"win" + 0.035*"world" + 0.028*"game" + 0.027*"support" + 0.022*"play" + 0.020*"solo" + 0.017*"die"
  Topic 2: 0.125*"player" + 0.061*"thing" + 0.043*"mobile" + 0.041*"dead" + 0.039*"post" + 0.031*"mission" + 0.028*"always" + 0.017*"run" + 0.017*"ive" + 0.016*"leave"
  Topic 3: 0.125*"warzone" + 0.085*"play" + 0.063*"pc" + 0.058*"2" + 0.037*"fix" + 0.033*"safe" + 0.023*"2019" + 0.021*"moment" + 0.020*"game" + 0.020*"never"
  Topic 4: 0.066*"gun" + 0.063*"mode" + 0.061*"work" + 0.058*"skin" + 0.041*"find" + 0.038*"glitch" + 0.031*"match" + 0.030*"operator" + 0.026*"new" + 0.026*"bundle"
  Topic 5: 0.072*"use" + 0.067*"go" + 0.054*"kill" + 0.040*"whats" + 0.039*"server" + 0.035*"let" + 0.028*"wa

# NNM

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [29]:
nmf_by_game_year = {}

for game in games:
    for year in years:
        subset = df[(df['subreddit'] == game) & (df['year'] == year)]
        print(f"\n🔍 Running NMF for {game} | Year {year} ({len(subset)} posts)...")

        if len(subset) < 10:
            print("  ⚠️ Skipped due to too few posts")
            continue

        texts = subset["text"].dropna().astype(str).tolist()

        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
        tfidf = vectorizer.fit_transform(texts)

        # Fit NMF model
        nmf = NMF(n_components=20, random_state=42)
        nmf.fit(tfidf)

        # Store results
        nmf_by_game_year[(game, year)] = {
            "model": nmf,
            "vectorizer": vectorizer,
            "tfidf": tfidf
        }

        # Print top words for each topic
        feature_names = vectorizer.get_feature_names_out()
        for topic_idx, topic in enumerate(nmf.components_):
            top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
            print(f"🔹 Topic {topic_idx + 1}: {' | '.join(top_words)}")


🔍 Running NMF for callofduty | Year 2020 (982 posts)...
🔹 Topic 1: like | really | veteran | movie | drop | plan | math | cool | land | waw
🔹 Topic 2: game | love | dlc | ive | cheater | aw | single | really | lobby | history
🔹 Topic 3: zombie | meme | boy | mad | nostalgia | recently | door | strong | hurt | average
🔹 Topic 4: warzone | hacker | new | today | map | lobby | fun | mode | life | snipe
🔹 Topic 5: make | meme | friend | finally | campaign | aw | let | clear | squad | tried
🔹 Topic 6: win | dad | squad | gas | clutch | really | solo | new | duo | 50
🔹 Topic 7: black | ops | war | cold | duty | trailer | look | nutshell | let | hacker
🔹 Topic 8: time | try | problem | long | simple | maybe | stair | line | gun | ray
🔹 Topic 9: good | ghost | memory | felt | feel | sir | juggernaut | doesnt | piece | really
🔹 Topic 10: guy | use | leave | stop | seriously | ill | shield | new | riot | drop
🔹 Topic 11: know | dont | want | feel | boy | man | 2012 | launch | makarov | saw
🔹 To

# By game

In [30]:
nmf_by_game = {}

for game, group in df.groupby("subreddit"):
    print(f"\n🎮 Running NMF for {game}...")

    docs = group["text"].dropna().astype(str).tolist()

    # TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = vectorizer.fit_transform(docs)

    # NMF model
    nmf = NMF(n_components=20, random_state=42)
    nmf.fit(tfidf)

    # Store model and vectorizer
    nmf_by_game[game] = {
        "model": nmf,
        "vectorizer": vectorizer,
        "tfidf": tfidf
    }

    # Display top words per topic
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(nmf.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        print(f"Topic {topic_idx + 1}: {' | '.join(top_words)}")


🎮 Running NMF for callofduty...
Topic 1: game | video | different | pas | hate | favorite | look | crash | ive | love
Topic 2: duty | vanguard | mobile | franchise | activision | world | tribute | favorite | update | come
Topic 3: ops | black | series | release | cold | 2022 | spec | screen | year | ago
Topic 4: play | safe | rank | want | xbox | let | right | multiplayer | people | friend
Topic 5: like | look | really | player | feel | tho | character | people | lobby | doesnt
Topic 6: warfare | modern | infinite | remastered | advance | 2019 | multiplayer | 2007 | xbox | fix
Topic 7: warzone | camo | player | kill | better | hacker | moment | lag | bug | cheater
Topic 8: know | dont | fix | want | whats | true | na | anybody | gon | yall
Topic 9: best | whats | map | multiplayer | series | right | list | way | opinion | villain
Topic 10: new | map | season | update | drop | player | favorite | idea | skin | im
Topic 11: ghost | reboot | map | version | sequel | xbox | skin | cosplay