In [1]:
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# 1. Load the dataset
df = pd.read_csv("preprocessed_redditData.csv")

In [8]:
# 2. Prepare inputs for BERTopic
texts = df['text'].astype(str).tolist()
timestamps = df['year'].astype(str).tolist()  # Treat year as string labels

In [9]:
# 4. Fit BERTopic model
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(texts)

2025-07-15 04:13:25,190 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2007/2007 [00:17<00:00, 116.38it/s]
2025-07-15 04:13:46,215 - BERTopic - Embedding - Completed ✓
2025-07-15 04:13:46,216 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-15 04:14:03,038 - BERTopic - Dimensionality - Completed ✓
2025-07-15 04:14:03,040 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-15 04:14:06,317 - BERTopic - Cluster - Completed ✓
2025-07-15 04:14:06,329 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-15 04:14:07,381 - BERTopic - Representation - Completed ✓


In [10]:
# 5. Get topics per year (class)
topics_by_year = topic_model.topics_per_class(
    docs=texts,
    classes=timestamps,
    global_tuning=True
)

6it [00:02,  2.00it/s]


In [11]:
# 6. Extract top 20 topics per year
top20_by_year = {}
for yr in sorted(df['year'].unique()):
    df_year = topics_by_year[topics_by_year['Class'] == str(yr)]
    top20 = df_year.sort_values('Frequency', ascending=False).head(20)
    top20_by_year[yr] = top20

In [12]:
print(topics_by_year.columns)

Index(['Topic', 'Words', 'Frequency', 'Class'], dtype='object')


In [13]:
# 7. Print results 
for yr, df_topics in top20_by_year.items():
    print(f"\n👑 Top 20 topics for {yr}")
    print(df_topics[['Topic', 'Words', 'Frequency']])


👑 Top 20 topics for 2020
      Topic                                              Words  Frequency
2692     -1             csgo, valorant, character, would, like       3906
2693      0                    sims, sim, marley, family, teen        249
2699      6            genshin, impact, cbt2, walkthrough, cbt        124
2695      2  skyrim, together, morrowind, skyrimtogether, r...        121
2694      1             roblox, studio, robux, robloxs, filter        101
2701      8                       mw2, mw3, mw, shepherd, soap         78
2696      3                  cod, cod2, crop, harrier, trigram         67
2779     86                 beta, close, email, access, closed         62
2709     16     dragon, dragonborn, ender, fossil, dragonspine         58
2754     61                cyberpunk, 2077, projekt, 2077s, cd         53
2702      9                             bo2, bo, boi, bo3, bos         52
2708     15  valorant, runeterra, valorants, janky, uthetru...         51
2698      5 