In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
data  = pd.read_csv("Downloads/cleaned_review_data.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,category,rating,label,text,tokens,joined_text
0,0,Home_and_Kitchen_5,5.0,CG,love this well made sturdy and very comfortab...,"['love', 'well', 'made', 'sturdy', 'comfortabl...",love well made sturdy comfortable love itvery ...
1,1,Home_and_Kitchen_5,5.0,CG,love it a great upgrade from the original ive...,"['love', 'great', 'upgrade', 'original', 'ive'...",love great upgrade original ive mine couple year
2,2,Home_and_Kitchen_5,5.0,CG,this pillow saved my back i love the look and ...,"['pillow', 'saved', 'back', 'love', 'look', 'f...",pillow saved back love look feel pillow
3,3,Home_and_Kitchen_5,1.0,CG,missing information on how to use it but it is...,"['missing', 'information', 'use', 'great', 'pr...",missing information use great product price
4,4,Home_and_Kitchen_5,5.0,CG,very nice set good quality we have had the set...,"['nice', 'set', 'good', 'quality', 'set', 'two...",nice set good quality set two month


In [4]:
data.drop(columns = 'Unnamed: 0', inplace = True)

In [5]:
texts = data['joined_text']

In [6]:
data.isnull().sum()

category       0
rating         0
label          0
text           1
tokens         0
joined_text    1
dtype: int64

In [7]:
texts = texts.dropna()

In [8]:
# Tf - Idf vecotization for K-means
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

In [9]:
# Apply K-Means
num_clusters = 5
kmeans_model = KMeans(n_clusters=num_clusters, random_state=42)
kmeans_model.fit(tfidf_matrix)



In [10]:
# Extract Top Words for Each Cluster
def get_kmeans_topics(cluster_centers, terms, n_top_words=10):
    topics = []
    for cluster_idx, cluster in enumerate(cluster_centers):
        top_indices = cluster.argsort()[-n_top_words:][::-1]
        topics.append([terms[i] for i in top_indices])
    return topics

In [11]:
kmeans_topics = get_kmeans_topics(kmeans_model.cluster_centers_, tfidf_vectorizer.get_feature_names_out())

In [12]:
# Count Vectorization for LDA
count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)
count_matrix = count_vectorizer.fit_transform(texts)

In [13]:
# Apply LDA
lda_model = LatentDirichletAllocation(n_components=num_clusters, random_state=42)
lda_model.fit(count_matrix)

In [14]:
# Extract LDA Topics
def get_lda_topics(lda_model, feature_names, n_top_words=10):
    topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(top_words)
    return topics

In [15]:
lda_topics = get_lda_topics(lda_model, count_vectorizer.get_feature_names_out())

In [16]:
# Print Results
print("K-Means Topics:")
for i, topic in enumerate(kmeans_topics):
    print(f"Cluster {i+1}: {topic}")

K-Means Topics:
Cluster 1: ['dog', 'love', 'food', 'small', 'great', 'product', 'treat', 'cat', 'toy', 'bought']
Cluster 2: ['great', 'work', 'good', 'like', 'use', 'little', 'nice', 'quality', 'product', 'easy']
Cluster 3: ['movie', 'good', 'acting', 'watch', 'story', 'great', 'love', 'film', 'action', 'like']
Cluster 4: ['love', 'fit', 'size', 'son', 'great', 'bought', 'comfortable', 'little', 'shoe', 'wear']
Cluster 5: ['book', 'read', 'story', 'character', 'series', 'author', 'enjoyed', 'good', 'reading', 'developed']


In [17]:
print("\nLDA Topics:")
for i, topic in enumerate(lda_topics):
    print(f"Topic {i+1}: {topic}")


LDA Topics:
Topic 1: ['movie', 'good', 'like', 'time', 'film', 'great', 'story', 'acting', 'life', 'watch']
Topic 2: ['book', 'story', 'read', 'character', 'good', 'love', 'series', 'great', 'author', 'enjoyed']
Topic 3: ['dog', 'love', 'cat', 'work', 'great', 'use', 'product', 'like', 'food', 'good']
Topic 4: ['like', 'use', 'work', 'time', 'good', 'great', 'dont', 'make', 'really', 'water']
Topic 5: ['great', 'love', 'little', 'bought', 'good', 'fit', 'size', 'nice', 'quality', 'small']


In [1]:
# LDA topics and their top words
lda_topics = {
    "Topic 1": ['movie', 'good', 'like', 'time', 'film', 'great', 'story', 'acting', 'life', 'watch'],
    "Topic 2": ['book', 'story', 'read', 'character', 'good', 'love', 'series', 'great', 'author', 'enjoyed'],
    "Topic 3": ['dog', 'love', 'cat', 'work', 'great', 'use', 'product', 'like', 'food', 'good'],
    "Topic 4": ['like', 'use', 'work', 'time', 'good', 'great', 'dont', 'make', 'really', 'water'],
    "Topic 5": ['great', 'love', 'little', 'bought', 'good', 'fit', 'size', 'nice', 'quality', 'small']
}

# names for each topic
topic_names = {
    "Topic 1": "Movies and Entertainment",
    "Topic 2": "Books and Literature",
    "Topic 3": "Pets and Products",
    "Topic 4": "Everyday Use and Experiences",
    "Topic 5": "Shopping and Fashion"
}

#topics with their assigned names
for topic, keywords in lda_topics.items():
    print(f"{topic} ({topic_names[topic]}):")
    print(f"  Keywords: {', '.join(keywords)}")
    print()


Topic 1 (Movies and Entertainment):
  Keywords: movie, good, like, time, film, great, story, acting, life, watch

Topic 2 (Books and Literature):
  Keywords: book, story, read, character, good, love, series, great, author, enjoyed

Topic 3 (Pets and Products):
  Keywords: dog, love, cat, work, great, use, product, like, food, good

Topic 4 (Everyday Use and Experiences):
  Keywords: like, use, work, time, good, great, dont, make, really, water

Topic 5 (Shopping and Fashion):
  Keywords: great, love, little, bought, good, fit, size, nice, quality, small



## Sentiment Analysis using Pre-trained model

In [25]:
from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Perform sentiment analysis
data["Sentiment"] = data["text"].apply(lambda x: sentiment_pipeline(x)[0]['label'])

print(data)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


RuntimeError: Failed to import transformers.models.distilbert.modeling_tf_distilbert because of the following error (look up to see its traceback):
module 'inspect' has no attribute 'ArgSpec'

In [19]:
!pip install --upgrade transformers keras tensorflow


Collecting keras
  Downloading keras-3.7.0-py3-none-any.whl (1.2 MB)
                                              0.0/1.2 MB ? eta -:--:--
                                              0.0/1.2 MB ? eta -:--:--
                                              0.0/1.2 MB ? eta -:--:--
                                              0.0/1.2 MB ? eta -:--:--
                                              0.0/1.2 MB ? eta -:--:--
                                              0.0/1.2 MB 65.6 kB/s eta 0:00:19
                                              0.0/1.2 MB 65.6 kB/s eta 0:00:19
                                              0.0/1.2 MB 65.6 kB/s eta 0:00:19
                                              0.0/1.2 MB 65.6 kB/s eta 0:00:19
                                              0.0/1.2 MB 65.6 kB/s eta 0:00:19
     -                                        0.0/1.2 MB 54.6 kB/s eta 0:00:22
     -                                        0.0/1.2 MB 54.6 kB/s eta 0:00:22
     -                 

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
    data = self.__fp.read(amt)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib\http\client.py", line 466, in read
    s = self.fp.read(amt)
        ^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib

In [28]:
!pip install tensorflow==2.13 keras==2.13.1


Collecting tensorflow==2.13
  Using cached tensorflow-2.13.0-cp311-cp311-win_amd64.whl (1.9 kB)
Collecting keras==2.13.1
  Downloading keras-2.13.1-py3-none-any.whl (1.7 MB)
                                              0.0/1.7 MB ? eta -:--:--
                                              0.0/1.7 MB ? eta -:--:--
                                              0.0/1.7 MB 165.2 kB/s eta 0:00:11
                                              0.0/1.7 MB 165.2 kB/s eta 0:00:11
                                              0.0/1.7 MB 165.2 kB/s eta 0:00:11
                                              0.0/1.7 MB 140.9 kB/s eta 0:00:12
                                              0.0/1.7 MB 140.9 kB/s eta 0:00:12
                                              0.0/1.7 MB 140.9 kB/s eta 0:00:12
                                              0.0/1.7 MB 140.9 kB/s eta 0:00:12
     -                                        0.1/1.7 MB 105.0 kB/s eta 0:00:16
     -                                      

ERROR: THESE PACKAGES DO NOT MATCH THE HASHES FROM THE REQUIREMENTS FILE. If you have updated the package versions, please update the hashes. Otherwise, examine the package contents carefully; someone may have tampered with them.
    tensorflow-intel==2.13.0 from https://files.pythonhosted.org/packages/2f/2f/3c84f675931ce3bcbc7e23acbba1e5d7f05ce769adab48322de57a9f5928/tensorflow_intel-2.13.0-cp311-cp311-win_amd64.whl (from tensorflow==2.13):
        Expected sha256 b3a12aefc0bbacdb0132674bc3fed03a70c14c91d48b00d13fcefee15d868e36
             Got        8c4d9576f06055a4229ce277b9f5286bc12ad5eb1b8fcb21465f9e950256654f




     -----                                 40.4/276.6 MB 141.3 kB/s eta 0:27:51
     -----                                 40.4/276.6 MB 141.3 kB/s eta 0:27:51
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                                 40.4/276.6 MB 140.6 kB/s eta 0:28:01
     -----                             

In [21]:
!pip uninstall keras

^C


In [22]:
!pip install --upgrade tensorflow


Collecting tensorflow
  Using cached tensorflow-2.18.0-cp311-cp311-win_amd64.whl (7.5 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp311-cp311-win_amd64.whl (390.2 MB)
                                              0.0/390.2 MB ? eta -:--:--
                                              0.0/390.2 MB ? eta -:--:--
                                              0.0/390.2 MB ? eta -:--:--
                                              0.0/390.2 MB ? eta -:--:--
                                              0.0/390.2 MB ? eta -:--:--
                                              0.0/390.2 MB ? eta -:--:--
                                              0.0/390.2 MB ? eta -:--:--
                                            0.0/390.2 MB 109.5 kB/s eta 0:59:23
                                            0.0/390.2 MB 109.5 kB/s eta 0:59:23
                                            0.0/390.2 MB 109.5 kB/s eta 0:59:23
                              

ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ^^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 90, in read
    data = self.__fp.read(amt)
           ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib\http\client.py", line 466, in read
    s = self.fp.read(amt)
        ^^^^^^^^^^^^^^^^^
  File "C:\Users\adith\.conda\include\New folder\Lib

                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                             3.9/390.2 MB 61.8 kB/s eta 1:44:14
                                        

In [26]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-classification", model="LiYuan/amazon-review-sentiment-analysis")

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
module 'inspect' has no attribute 'ArgSpec'

In [27]:
!pip show Tensorflow

Name: tensorflow
Version: 2.17.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: C:\Users\adith\.conda\include\New folder\Lib\site-packages
Requires: tensorflow-intel
Required-by: 
