In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Path of the cleaned review CSV; it is relative, so it resolves against the
# notebook's current working directory.
DATA_PATH = "Downloads/cleaned_review_data.csv"

# Load the reviews and preview the first rows as the cell output.
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,category,rating,label,text,tokens,joined_text
0,0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"['love', 'well', 'made', 'sturdy', 'comfortabl...",love well made sturdy comfortable love itvery ...
1,1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"['love', 'great', 'upgrade', 'original', 'ive'...",love great upgrade original ive mine couple year
2,2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"['pillow', 'saved', 'back', 'love', 'look', 'f...",pillow saved back love look feel pillow
3,3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"['missing', 'information', 'use', 'great', 'pr...",missing information use great product price
4,4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"['nice', 'set', 'good', 'quality', 'set', 'two...",nice set good quality set two month


In [3]:
# Drop the unnamed columns shown in the preview above (presumably index
# columns saved by earlier to_csv calls -- confirm against the CSV).
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for an already-removed column.
data = data.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [4]:
# Select the 'joined_text' column; it is the only input to both vectorizers below.
texts = data.loc[:, 'joined_text']

In [5]:
# Count missing values per column (isna is the canonical name for isnull).
data.isna().sum()

category       0
rating         0
label          0
text           1
tokens         0
joined_text    1
dtype: int64

In [6]:
# Build the corpus straight from the source column and drop missing entries.
# Deriving it from `data` (instead of rebinding `texts` from itself) means the
# result does not depend on whatever `texts` currently holds in the kernel.
# NOTE: `texts` keeps the original row labels, so it no longer has the same
# length as `data`; align on the index when joining results back.
texts = data['joined_text'].dropna()

In [7]:
# TF-IDF vectorization for K-Means: English stop words are removed and the
# vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
# Learn the vocabulary/IDF weights and build the document-term matrix in one pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

In [8]:
# Apply K-Means
# num_clusters is reused further down as the number of LDA topics.
num_clusters = 5
kmeans_model = KMeans(
    n_clusters=num_clusters,
    # Pinned explicitly: scikit-learn changed the default from 10 to 'auto'
    # in 1.4 (and warns about it on 1.2-1.3), so leaving it implicit makes the
    # clustering depend on the installed version.
    n_init=10,
    random_state=42,
)
kmeans_model.fit(tfidf_matrix)



In [9]:
# Extract Top Words for Each Cluster
def get_kmeans_topics(cluster_centers, terms, n_top_words=10):
    """Return the highest-weighted terms of every cluster centre.

    Parameters
    ----------
    cluster_centers : iterable of 1-D arrays
        One weight vector per cluster (e.g. the rows of
        ``KMeans.cluster_centers_``). Each vector must provide ``argsort()``,
        and position ``i`` in a vector is looked up as ``terms[i]``.
    terms : sequence
        Vocabulary indexed by feature position.
    n_top_words : int, default 10
        Number of terms to keep per cluster. ``0`` yields an empty list for
        every cluster; values larger than the vocabulary return all terms.

    Returns
    -------
    list of list
        One list per cluster, ordered from highest to lowest weight.
    """
    topics = []
    for center in cluster_centers:
        # argsort() is ascending, so read it backwards from the end. Unlike
        # ``[-n_top_words:][::-1]`` this slice is empty for n_top_words == 0
        # (``[-0:]`` would select the whole vocabulary), and it is the same
        # form get_lda_topics uses.
        top_indices = center.argsort()[:-n_top_words - 1:-1]
        topics.append([terms[i] for i in top_indices])
    return topics

In [10]:
# Top terms per K-Means cluster, ranked by centroid weight over the TF-IDF vocabulary.
kmeans_topics = get_kmeans_topics(
    cluster_centers=kmeans_model.cluster_centers_,
    terms=tfidf_vectorizer.get_feature_names_out(),
)

In [11]:
# Count vectorization for LDA: raw term counts, using the same stop-word list
# and 5000-feature cap as the TF-IDF vectorizer.
count_vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)
# Learn the vocabulary and build the document-term count matrix in one pass.
count_matrix = count_vectorizer.fit_transform(texts)

In [12]:
# Apply LDA
# The number of topics is tied to num_clusters so it matches the K-Means
# cluster count; the seed is fixed for repeatable fits.
lda_model = LatentDirichletAllocation(
    random_state=42,
    n_components=num_clusters,
)
lda_model.fit(count_matrix)

In [13]:
# Extract LDA Topics
def get_lda_topics(lda_model, feature_names, n_top_words=10):
    """Return the highest-weighted words of every LDA topic.

    Parameters
    ----------
    lda_model : object
        Fitted model exposing ``components_``; iterating it yields one weight
        vector per topic, each providing ``argsort()``.
    feature_names : sequence
        Vocabulary indexed by feature position.
    n_top_words : int, default 10
        Number of words to keep per topic.

    Returns
    -------
    list of list
        One list per topic, ordered from highest to lowest weight.
    """
    def _top_words(weights):
        # Descending rank order, then keep the leading n_top_words positions.
        ranked = weights.argsort()[::-1][:n_top_words]
        return [feature_names[idx] for idx in ranked]

    return [_top_words(row) for row in lda_model.components_]

In [14]:
# Top words per LDA topic, looked up in the count vectorizer's vocabulary.
lda_topics = get_lda_topics(
    lda_model=lda_model,
    feature_names=count_vectorizer.get_feature_names_out(),
)

In [15]:
# Print Results
# One line per K-Means cluster, numbered from 1.
print("K-Means Topics:")
for line in (f"Cluster {i+1}: {topic}" for i, topic in enumerate(kmeans_topics)):
    print(line)

K-Means Topics:
Cluster 1: ['dog', 'love', 'food', 'small', 'great', 'product', 'treat', 'cat', 'toy', 'bought']
Cluster 2: ['great', 'work', 'good', 'like', 'use', 'little', 'nice', 'quality', 'product', 'easy']
Cluster 3: ['movie', 'good', 'acting', 'watch', 'story', 'great', 'love', 'film', 'action', 'like']
Cluster 4: ['love', 'fit', 'size', 'son', 'great', 'bought', 'comfortable', 'little', 'shoe', 'wear']
Cluster 5: ['book', 'read', 'story', 'character', 'series', 'author', 'enjoyed', 'good', 'reading', 'developed']


In [16]:
# One line per LDA topic, numbered from 1; the leading newline separates this
# listing from the previous output.
print("\nLDA Topics:")
for line in (f"Topic {i+1}: {topic}" for i, topic in enumerate(lda_topics)):
    print(line)


LDA Topics:
Topic 1: ['movie', 'good', 'like', 'time', 'film', 'great', 'story', 'acting', 'life', 'watch']
Topic 2: ['book', 'story', 'read', 'character', 'good', 'love', 'series', 'great', 'author', 'enjoyed']
Topic 3: ['dog', 'love', 'cat', 'work', 'great', 'use', 'product', 'like', 'food', 'good']
Topic 4: ['like', 'use', 'work', 'time', 'good', 'great', 'dont', 'make', 'really', 'water']
Topic 5: ['great', 'love', 'little', 'bought', 'good', 'fit', 'size', 'nice', 'quality', 'small']
