# Topic Modeling

This notebook applies Latent Dirichlet Allocation (LDA) topic modeling to identify recurring themes in retail reviews.


In [None]:
import pandas as pd
import numpy as np
from gensim import corpora, models
from gensim.models import LdaModel
from wordcloud import WordCloud
import matplotlib.pyplot as plt

print("Libraries imported successfully!")


## Load Processed Data


In [None]:
# Read the preprocessed reviews, then keep only the rows whose
# 'processed_text' value is present and has at least one character.
df = pd.read_csv('../data/processed_reviews.csv')
text_column = df['processed_text']
has_text = text_column.notna() & (text_column.str.len() > 0)
df = df[has_text]
print(f"Loaded {len(df)} processed reviews")
df.head()


## Prepare Corpus


In [None]:
# Tokenize: split each non-empty processed review on whitespace.
texts = df['processed_text'].tolist()
tokenized_texts = [review.split() for review in texts if review]

# Build the vocabulary, then prune it with gensim's filter_extremes:
# no_below=2 drops terms found in fewer than 2 documents, and
# no_above=0.5 drops terms found in more than 50% of the documents.
dictionary = corpora.Dictionary(tokenized_texts)
dictionary.filter_extremes(no_below=2, no_above=0.5)

# Convert every token list into its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, tokenized_texts))

print(f"Dictionary size: {len(dictionary)}")
print(f"Corpus size: {len(corpus)}")


## Train LDA Model


In [None]:
# Number of latent topics the model is asked to find.
num_topics = 5

# Hyperparameters are gathered in one mapping so they are easy to inspect.
# random_state pins the seed, passes=10 sets the number of training passes
# over the corpus, and alpha='auto' asks gensim to learn the document-topic
# prior from the data.
lda_settings = {
    'corpus': corpus,
    'id2word': dictionary,
    'num_topics': num_topics,
    'random_state': 42,
    'passes': 10,
    'alpha': 'auto',
    'per_word_topics': True,
}
lda_model = LdaModel(**lda_settings)

print(f"LDA model trained with {num_topics} topics")


## Extract Topic Keywords


In [None]:
import re  # no longer used by this cell; kept in case other cells rely on it

# Request structured (word, probability) pairs from gensim instead of
# regex-scraping the human-readable string returned by print_topics():
# that string's layout is a display format, and any term that itself
# contains a double quote would be split incorrectly by the old pattern.
# num_topics=-1 selects every topic; num_words=10 keeps the ten
# highest-probability terms of each one.
topics = []
for topic_id, word_probs in lda_model.show_topics(
    num_topics=-1, num_words=10, formatted=False
):
    top_words = [word for word, _prob in word_probs]
    topics.append({
        'topic_id': topic_id,
        'keywords': ', '.join(top_words),
        'top_words': top_words,
    })

# Fix the column set explicitly so the frame has the same schema even if
# no topics were returned.
topic_df = pd.DataFrame(topics, columns=['topic_id', 'keywords', 'top_words'])
print("Extracted Topics:")
for entry in topics:
    print(f"\nTopic {entry['topic_id']}: {entry['keywords']}")

topic_df


## Create Word Cloud


In [None]:
import os

# Flatten the top words of every topic into one space-separated string,
# which is the input format WordCloud.generate() expects.
all_keywords = ' '.join(' '.join(words) for words in topic_df['top_words'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_keywords)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Topic Word Cloud', fontsize=16)
plt.tight_layout()

# savefig does not create missing directories, so make sure the target
# folder exists before writing the image.
os.makedirs('../results/visuals', exist_ok=True)
plt.savefig('../results/visuals/topic_wordcloud.png', dpi=300, bbox_inches='tight')
plt.show()


## Save Results


In [None]:
import os

# to_csv does not create missing directories, so make sure the output
# folder exists before writing the topic table.
os.makedirs('../results', exist_ok=True)
topic_df.to_csv('../results/topic_keywords.csv', index=False)
print("Topic keywords saved to ../results/topic_keywords.csv")
