In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation

# Load the pre-computed TF-IDF matrix; its columns are the vocabulary terms
# (read back as `feature_names` below).
# NOTE(review): LDA is a model over term *counts*; scikit-learn's topic-extraction
# example fits it on raw counts (CountVectorizer), not TF-IDF weights. If the
# printed topics look like noise, consider re-fitting on counts -- TODO confirm.
tfidf_df = pd.read_parquet("tfidf_yelp_reviews.parquet")
tfidf_matrix = tfidf_df.to_numpy()  # pandas recommends .to_numpy() over .values

# Tunable settings, named here instead of being buried in the constructor call
num_topics = 10  # number of topics to extract; you can tune this
n_top_words = 10  # how many words to show per topic
RANDOM_STATE = 42  # fixed seed so repeated runs give the same topics
MAX_ITER = 200  # increase if needed

# Initialize and fit LDA model
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=RANDOM_STATE,
    learning_method='batch',  # you can change to 'online' for large datasets
    max_iter=MAX_ITER,
)
W = lda_model.fit_transform(tfidf_matrix)  # document-topic distribution: (n_docs, num_topics)
H = lda_model.components_  # topic-term matrix: (num_topics, n_terms)

# Extract feature names
feature_names = tfidf_df.columns

# Check the shapes rather than hard-coding them in comments, which go stale
# as soon as num_topics or the input file changes.
n_docs, n_terms = tfidf_matrix.shape
assert W.shape == (n_docs, num_topics), W.shape
assert H.shape == (num_topics, len(feature_names)), H.shape

# Print top words for each topic
for topic_idx, topic_weights in enumerate(H):
    # argsort is ascending, so reverse it and keep the n_top_words heaviest terms
    top_word_indices = topic_weights.argsort()[::-1][:n_top_words]
    top_words = [feature_names[i] for i in top_word_indices]
    print(f"Topic #{topic_idx}:")
    print("  " + " ".join(top_words))
    print("-" * 50)

# Assign each document its most probable topic (column index of the row maximum of W)
doc_topics = np.argmax(W, axis=1)

# To attach the result to a DataFrame of the original documents, that frame
# must have its rows in the same order as tfidf_df:
# original_df['dominant_topic'] = doc_topics


Topic #0:
  gelati kaldi beiler whopper paesano insomnia kolaches matzo ube caje
--------------------------------------------------
Topic #1:
  shelly bimbo rasoi uzbek enforcing bono teller giovanni nikki alisha
--------------------------------------------------
Topic #2:
  ed dc dv8 luna manger mccoy lemp arby anyplace tbbc
--------------------------------------------------
Topic #3:
  omakase halo hinoki siu ming hunan washoe iztaccihuatl yummm delilah
--------------------------------------------------
Topic #4:
  sampan wegman finney twistee nibbled coffeeshop riverbench thy elsie evos
--------------------------------------------------
Topic #5:
  vault naughty alway spectacle phuong bobacup sardi josephine galaxy distrito
--------------------------------------------------
Topic #6:
  certificate goo lechon taffy snacking gabby culver nova layover pierogis
--------------------------------------------------
Topic #7:
  gay woody rous marte greg verti fil honky tonk molly
-----------

In [2]:
import pandas as pd

# Persist the most probable topic of every document, keyed by row position
doc_topic_df = pd.DataFrame(
    {
        'document_index': range(len(doc_topics)),
        'dominant_topic': doc_topics,
    }
)
doc_topic_df.to_parquet("document_topics.parquet", index=False)

# Persist the highest-weighted words of each topic as one comma-separated string
topics_data = [
    {
        "topic": topic_no,
        "top_words": ", ".join(
            feature_names[term_pos]
            for term_pos in weights.argsort()[::-1][:n_top_words]
        ),
    }
    for topic_no, weights in enumerate(H)
]
topics_df = pd.DataFrame(topics_data)
topics_df.to_parquet("lda_topics.parquet", index=False)

# Persist the full document-topic probability matrix W, one column per topic
topic_columns = [f"topic_{i}" for i in range(num_topics)]
W_df = pd.DataFrame(W, columns=topic_columns)
W_df.to_parquet("document_topic_distribution.parquet", index=False)

print("Results saved successfully!")


Results saved successfully!
