In [1]:
# imports and load the preproccessed data
import pandas as pd
from ast import literal_eval
from gensim import corpora, models
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models
import pyLDAvis
df = pd.read_csv("../preprocessed.csv")

In [2]:
# Convert the precossed_body into a list of token
df['tokens'] = df['preprocessed_body'].apply(literal_eval)

In [3]:
# Create the bag of words
dictionary = corpora.Dictionary(df['tokens'])
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=2000)
corpus = [dictionary.doc2bow(text) for text in df['tokens']]
bow_data = []
# Display the first 20 rows of BoW
for doc_idx, bow in enumerate(corpus[:20]):  
    word_freqs = [(dictionary[word_id], freq) for word_id, freq in bow]
    bow_data.append({'doc': doc_idx+1, 'words': word_freqs})
bow_df = pd.DataFrame(bow_data)
bow_df

Unnamed: 0,doc,words
0,1,"[(another, 1), (cost, 1), (course, 1), (cultur..."
1,2,"[(course, 1), (thats, 1), (thing, 1), (dont, 1..."
2,3,"[(well, 1), (real, 1), (actually, 1), (almost,..."
3,4,"[(culture, 1), (like, 1), (lot, 1), (often, 1)..."
4,5,"[(probably, 1), (global, 1), (warm, 1), (due, ..."
5,6,"[(like, 2), (might, 1), (need, 1), (say, 1), (..."
6,7,"[(another, 1), (answer, 1), (car, 1), (inequal..."
7,8,"[(like, 1), (people, 1), (try, 1), (week, 1), ..."
8,9,"[(cost, 1), (go, 2), (far, 1), (addition, 1), ..."
9,10,"[(like, 1), (come, 1), (defend, 1), (force, 1)..."


In [4]:
# Create thhe LDA with 3 topics
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=10
)
# Print the topics
topics = lda_model.print_topics(-1, num_words=10)
for i, topic in topics:
    words = topic.replace('"', '').replace('*', '').replace('+', ',').split(',')
    clean_words = [w.strip() for w in words if w.strip()]
    print(f"Topic {i+1}: {', '.join(clean_words)}")

Topic 1: 0.009people, 0.009would, 0.009government, 0.008trump, 0.008country, 0.007money, 0.007make, 0.007policy, 0.006get, 0.006need
Topic 2: 0.014year, 0.009go, 0.009global, 0.008would, 0.007cause, 0.007human, 0.006warm, 0.006time, 0.006use, 0.006world
Topic 3: 0.020people, 0.016dont, 0.013say, 0.013think, 0.013like, 0.010make, 0.010im, 0.010get, 0.009thing, 0.009one


In [5]:
# Display the topics by using pyLDAvis 
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis 

In [6]:
# Evaluation metrics using Coherence score
coherence_model = CoherenceModel(model=lda_model, texts=df['tokens'], dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()
print(f"Coherence Score: {coherence:.4f}")

Coherence Score: 0.3991
