In [1]:
# imports and load the preproccessed data
import pandas as pd
from ast import literal_eval
from gensim import corpora, models
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models
import pyLDAvis
df = pd.read_csv("../preprocessed.csv")

In [2]:
# Convert the precossed_body into a list of token
df['tokens'] = df['preprocessed_body'].apply(literal_eval)

In [3]:
# Create the bag of words
dictionary = corpora.Dictionary(df['tokens'])
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=2000)
corpus = [dictionary.doc2bow(text) for text in df['tokens']]
bow_data = []
# Display the first 20 rows of BoW
for doc_idx, bow in enumerate(corpus[:20]):  
    word_freqs = [(dictionary[word_id], freq) for word_id, freq in bow]
    bow_data.append({'doc': doc_idx+1, 'words': word_freqs})
bow_df = pd.DataFrame(bow_data)
bow_df

Unnamed: 0,doc,words
0,1,"[(another, 1), (cost, 1), (course, 1), (cultur..."
1,2,"[(course, 1), (reverse, 1), (threat, 1), (week..."
2,3,"[(almost, 1), (argue, 1), (debate, 2), (false,..."
3,4,"[(culture, 1), (often, 1), (state, 1), (access..."
4,5,"[(global, 1), (warm, 1), (flood, 1), (sea, 1)]"
5,6,"[(almost, 2), (scientist, 1), (time, 1), (true..."
6,7,"[(another, 1), (answer, 1), (bird, 1), (car, 1..."
7,8,"[(people, 1), (week, 1), (argument, 1), (denie..."
8,9,"[(cost, 1), (far, 1), (addition, 1), (clean, 1..."
9,10,"[(defend, 1), (force, 1), (full, 1), (mention,..."


In [4]:
# Create thhe LDA with 3 topics
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=3,
    random_state=42,
    passes=10
)
# Print the topics
topics = lda_model.print_topics(-1, num_words=10)
for i, topic in topics:
    words = topic.replace('"', '').replace('*', '').replace('+', ',').split(',')
    clean_words = [w.strip() for w in words if w.strip()]
    print(f"Topic {i+1}: {', '.join(clean_words)}")

Topic 1: 0.016year, 0.012global, 0.009cause, 0.008warm, 0.008human, 0.007water, 0.007energy, 0.007world, 0.007increase, 0.007time
Topic 2: 0.015people, 0.010government, 0.010trump, 0.010country, 0.008money, 0.008policy, 0.007tax, 0.007issue, 0.007vote, 0.007work
Topic 3: 0.028people, 0.010science, 0.008time, 0.006scientist, 0.006bad, 0.006good, 0.005world, 0.005issue, 0.005life, 0.005work


In [5]:
# Display the topics by using pyLDAvis 
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
vis 

In [6]:
# Evaluation metrics using Coherence score, Perplexity, and Topic Diversity
coherence_model = CoherenceModel(model=lda_model, texts=df['tokens'], dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()
print(f"Coherence Score: {coherence:.4f}")

perplexity = lda_model.log_perplexity(corpus)
print(f"Perplexity: {perplexity:.4f}")

def topic_diversity(model, topk=10):
    topic_words = [word for topic_id in range(model.num_topics)
                   for word, _ in model.show_topic(topic_id, topk)]
    unique_words = set(topic_words)
    return len(unique_words) / (model.num_topics * topk)
print("Topic Diversity:", topic_diversity(lda_model))


Coherence Score: 0.4876
Perplexity: -7.0714
Topic Diversity: 0.8333333333333334


In [7]:
# Save the results in a csv file 
bow_features_list = []
lda_topics_list = []

for doc_bow in corpus:
    # Make a short BoW sample vector
    bow_vector = [0]*6
    for word_id, freq in doc_bow:
        if word_id < 6:
            bow_vector[word_id] = freq
    bow_features_list.append(str(bow_vector)) 
    
    # Get LDA topic distribution
    topic_dist = lda_model.get_document_topics(doc_bow, minimum_probability=0)
    topic_vector = [round(prob, 2) for _, prob in sorted(topic_dist)]
    lda_topics_list.append(str(topic_vector))

#  Add to existing DataFrame
df['BoW Features'] = bow_features_list
df['LDA Topics'] = lda_topics_list
# Save to CSV
df.to_csv("../LDA_Result.csv", index=False)