In [1]:
import pandas as pd
from bertopic import BERTopic

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 1. Load the dataset
df = pd.read_csv("preprocessed_redditData-removedtopic.csv")

In [4]:
# 2. Prepare inputs for BERTopic
texts = df['text'].astype(str).tolist()
timestamps = df['year'].astype(str).tolist()  # Treat year as string labels

In [5]:
# 4. Fit BERTopic model
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(texts)

2025-07-17 09:43:57,858 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 2000/2000 [00:17<00:00, 116.50it/s]
2025-07-17 09:44:23,002 - BERTopic - Embedding - Completed ✓
2025-07-17 09:44:23,002 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-17 09:45:06,281 - BERTopic - Dimensionality - Completed ✓
2025-07-17 09:45:06,283 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-17 09:45:11,468 - BERTopic - Cluster - Completed ✓
2025-07-17 09:45:11,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-17 09:45:12,739 - BERTopic - Representation - Completed ✓


In [6]:
# 5. Get topics per year (class)
topics_by_year = topic_model.topics_per_class(
    docs=texts,
    classes=timestamps,
    global_tuning=True
)

6it [00:02,  2.03it/s]


In [7]:
# 6. Extract top 20 topics per year
top20_by_year = {}
for yr in sorted(df['year'].unique()):
    df_year = topics_by_year[topics_by_year['Class'] == str(yr)]
    top20 = df_year.sort_values('Frequency', ascending=False).head(20)
    top20_by_year[yr] = top20

In [8]:
print(topics_by_year.columns)

Index(['Topic', 'Words', 'Frequency', 'Class'], dtype='object')


In [9]:
# 7. Print results 
for yr, df_topics in top20_by_year.items():
    print(f"\nTop 20 topics for {yr}")
    print(df_topics[['Topic', 'Words', 'Frequency']])


Top 20 topics for 2020
      Topic                                              Words  Frequency
4034     -1                         csgo, would, im, run, game       3926
4083     48                 beta, close, closed, access, email         82
4041      6                           euw, na, chill, lf, norm         71
4045     10    dragon, dragonborn, ender, defeat, dragonsreach         62
4037      2  map, geoculuses, pixelsmash, interactable, bac...         59
4064     29                meme, effort, meepcity, sfs, kahoot         57
4036      1               skin, prestige, trading, mvp, league         53
4035      0              mod, modding, loverslab, idle, modder         49
4112     77             cyberpunk, projekt, cd, witcher, delay         48
4039      4                   ps5, playstation, ps4, sony, ps3         45
4046     11     warzone, missile, warchild, 60274513, dropshot         39
4154    119     wallpaper, background, v200609, desktop, phone         39
4042      7   

In [None]:
#lda
from gensim import corpora
from gensim.models import LdaModel
from gensim.utils import simple_preprocess