In [1]:
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Load the dataset
df = pd.read_csv("./reddit_2020_2024.csv")

In [3]:
# 2. Data cleaning: remove missing and duplicate cleaned texts
df = df.dropna(subset=['cleaned'])
df = df.drop_duplicates(subset=['cleaned'])

In [4]:
# 3. Prepare inputs for BERTopic
texts = df['cleaned'].tolist()
timestamps = df['year'].astype(str).tolist()  # Treat year as string labels


In [5]:
# 4. Fit BERTopic model
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(texts)

2025-07-13 13:16:39,184 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 871/871 [00:07<00:00, 113.45it/s]
2025-07-13 13:16:54,082 - BERTopic - Embedding - Completed ✓
2025-07-13 13:16:54,082 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-13 13:17:30,577 - BERTopic - Dimensionality - Completed ✓
2025-07-13 13:17:30,579 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-13 13:17:33,851 - BERTopic - Cluster - Completed ✓
2025-07-13 13:17:33,857 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-13 13:17:34,325 - BERTopic - Representation - Completed ✓


In [6]:
# 5. Get topics per year (class)
topics_by_year = topic_model.topics_per_class(
    docs=texts,
    classes=timestamps,
    global_tuning=True
)

5it [00:01,  4.89it/s]


In [None]:
# 6. Extract top 20 topics per year
top20_by_year = {}
for yr in sorted(df['year'].unique()):
    df_year = topics_by_year[topics_by_year['Class'] == str(yr)]
    top20 = df_year.sort_values('Frequency', ascending=False).head(20)
    top20_by_year[yr] = top20

In [9]:
print(topics_by_year.columns)

Index(['Topic', 'Words', 'Frequency', 'Class'], dtype='object')


In [10]:
# 7. Print results 
for yr, df_topics in top20_by_year.items():
    print(f"\n👑 Top 20 topics for {yr}")
    print(df_topics[['Topic', 'Words', 'Frequency']])


👑 Top 20 topics for 2020
     Topic                                          Words  Frequency
772     -1                            to, and, on, it, of       3427
773      0                    sims, sim, her, my, husband        147
775      2               cod, duty, warfare, call, modern        126
774      1             cc, house, built, apartment, build        123
782      9               na, looking, friends, duo, norms        110
776      3         roblox, studio, robux, account, filter         98
786     13                        mw2, mw3, mw, soap, cqb         77
781      8   skyrim, together, morrowind, enderal, beyond         74
784     11   valorant, knife, valorants, glitchpop, notes         66
780      7              her, she, baby, toddler, children         62
794     21          minecraft, worlds, mojang, made, maze         59
810     37            cyberpunk, 2077, cd, projekt, 2077s         55
778      5             gta, online, andreas, vi, nutshell         55
789     