<a href="https://colab.research.google.com/github/Marvin2798/roboreviews-project-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import libraries

In [1]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, Trainer, TrainingArguments

import nltk
nltk.download('stopwords')

  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Load data

In [6]:
dataset_directory = "/root/.cache/kagglehub/datasets/datafiniti/consumer-reviews-of-amazon-products/versions/5/"
filenames = [
    "/content/archive (1).zip",
    "/content/archive (2).zip",
    "/content/archive (3).zip"
]

dataframes = []
for file in filenames:
    file_path = os.path.join(dataset_directory, file)
    if os.path.exists(file_path):
        df = pd.read_csv(file_path, encoding='utf-8')
        dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True).drop_duplicates()
print("Combined Dataset Info:")
print(combined_df.info())
combined_df.to_csv("combined_dataset.csv", index=False)

  df = pd.read_csv(file_path, encoding='utf-8')


Combined Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 67897 entries, 0 to 67991
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    67897 non-null  object 
 1   dateAdded             33237 non-null  object 
 2   dateUpdated           33237 non-null  object 
 3   name                  61137 non-null  object 
 4   asins                 67895 non-null  object 
 5   brand                 67897 non-null  object 
 6   categories            67897 non-null  object 
 7   primaryCategories     33237 non-null  object 
 8   imageURLs             33237 non-null  object 
 9   keys                  67897 non-null  object 
 10  manufacturer          67897 non-null  object 
 11  manufacturerNumber    33237 non-null  object 
 12  reviews.date          67858 non-null  object 
 13  reviews.dateSeen      67897 non-null  object 
 14  reviews.didPurchase   10 non-null     object 
 15  r

Data preprocessing

In [7]:
keywords = ['smart speaker', 'Alexa', 'Siri', 'Google Home', 'Bose', 'Sonos', 'Bluetooth Speaker']
smart_speakers = combined_df[combined_df.apply(
    lambda row: row.astype(str).str.contains('|'.join(keywords), case=False, na=False).any(),
    axis=1
)].drop_duplicates()

print("Filtered Smart Speaker Reviews:")
print(smart_speakers.head())
smart_speakers.to_csv("smart_speaker_reviews.csv", index=False)

Filtered Smart Speaker Reviews:
                        id             dateAdded           dateUpdated  \
352   AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
677   AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
989   AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
1186  AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
1267  AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   

                                                   name  \
352   AmazonBasics AAA Performance Alkaline Batterie...   
677   AmazonBasics AAA Performance Alkaline Batterie...   
989   AmazonBasics AAA Performance Alkaline Batterie...   
1186  AmazonBasics AAA Performance Alkaline Batterie...   
1267  AmazonBasics AAA Performance Alkaline Batterie...   

                      asins         brand  \
352   B00QWO9P0O,B00LH3DMUO  Amazonbasics   
677   B00QWO9P0O,B00LH3DMUO  Amazonbasics   
989   B00QWO9P0O,B00LH3DMUO  Amaz

data cleaning

In [8]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

smart_speakers['cleaned_text'] = smart_speakers['reviews.text'].apply(clean_text)


In [9]:
smart_speakers['cleaned_text'] = smart_speakers['reviews.text'].apply(clean_text)

smart_speakers.to_csv("smart_speaker_reviews_cleaned.csv", index=False)

print("Smart Speaker Reviews with Cleaned Text:")
print(smart_speakers.head())


Smart Speaker Reviews with Cleaned Text:
                        id             dateAdded           dateUpdated  \
352   AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
677   AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
989   AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
1186  AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   
1267  AVpgNzjwLJeJML43Kpxn  2015-10-30T08:59:32Z  2019-04-25T09:08:16Z   

                                                   name  \
352   AmazonBasics AAA Performance Alkaline Batterie...   
677   AmazonBasics AAA Performance Alkaline Batterie...   
989   AmazonBasics AAA Performance Alkaline Batterie...   
1186  AmazonBasics AAA Performance Alkaline Batterie...   
1267  AmazonBasics AAA Performance Alkaline Batterie...   

                      asins         brand  \
352   B00QWO9P0O,B00LH3DMUO  Amazonbasics   
677   B00QWO9P0O,B00LH3DMUO  Amazonbasics   
989   B00QWO9P0O,B00LH3D

Sentiment Labels

In [20]:
# 3. Create Sentiment Labels (based on ratings if available)
def assign_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

if 'reviews.rating' in df.columns:
    df['sentiment'] = df['reviews.rating'].apply(assign_sentiment)

# Check if the sentiment labels are assigned correctly
print(df['sentiment'].value_counts())

sentiment
positive    32316
neutral      1499
negative      845
Name: count, dtype: int64


In [None]:
print(value_counts())


Split Data

In [11]:
train, test = train_test_split(smart_speakers, test_size=0.2, random_state=42)
print("Training Set Size:", train.shape)
print("Testing Set Size:", test.shape)

Training Set Size: (9189, 29)
Testing Set Size: (2298, 29)


Sentiment classification

In [12]:
# Initialize HuggingFace tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Tokenize the training and testing data
train_encodings = tokenizer(list(train['cleaned_text']), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(test['cleaned_text']), truncation=True, padding=True, max_length=256)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# prompt: inference from model on the train_encondings

import numpy as np

# Assuming 'model' is your trained sentiment classification model and 'train_encodings' are the tokenized training data.

# Create a pipeline for inference
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Perform inference on a subset of the training data for demonstration (adjust as needed)
sample_size = min(len(train_encodings['input_ids']), 10)  # Infer on the first 10 samples
train_texts = list(train['cleaned_text'])[10:45]
results = classifier(train_texts)

# Print the results
for i, result in enumerate(results):
  print(f"Text: {train_texts[i]}")
  print(f"Predicted Label: {result['label']}, Score: {result['score']}")

Text: one good item bought 2017 thanksgiving deals
Predicted Label: LABEL_0, Score: 0.38369858264923096
Text: finally decided try echo system months months debate purchased echo plus echo dot plus easily handles main two floors dot handles upstairs pluss speakers incredible size agree made right choice get plus
Predicted Label: LABEL_0, Score: 0.3760433495044708
Text: alexa really works sceptical first part family im cooking three different things go microwave stovetop outside grill need pulled heat different times problem ask alexa set timers item go along awesome time watch movie alexa turn living room lights lights dim confirms ok nice little bit warm alexa set temperature 69 airconditioner comes magical nephew like cooking alexa open dominos asks order choice within seconds dominos order paid arrives time truly one coolest devices ever purchased almost forgot anytime want hear piece music loved past call pull right listen near bose quality sound magic
Predicted Label: LABEL_0, Scor

In [35]:
# Create a balanced dataset (where neutral size = positive size = negative size)
# Fine tune the model (loss and the accuracy, 3 epoch first and increase if necessary)
# Save this model and MAKE SURE you download it!!! You dont want to lose it
# Evaluate (ie scikitlearn classification report)

{'LABEL_0'}

In [33]:
train['sentiment'][10:45]

Unnamed: 0,sentiment
29799,positive
29453,positive
60792,positive
56108,positive
20680,positive
56028,positive
21109,neutral
64587,positive
26789,positive
31488,positive


clustering

In [15]:
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sentence_transformer.encode(smart_speakers['cleaned_text'].tolist())

#TODO : try different n_clusters values -> find the best one using the elbow method / fine tuning
kmeans = KMeans(n_clusters=4, random_state=42)
smart_speakers['cluster'] = kmeans.fit_predict(embeddings)

print(smart_speakers['cluster'].value_counts())


#evaluation
# you need add metrics to evluate yuur clustering
# kmeans = sihluete score (but double check the sikit learn db and other extra metrics in here)
# text = entropy value for the clustering (should be low for the same cluster and hight between clusters)
# buld average representation of the cluster (either with the embdded vector or by chossing the words that are apprearing more frequently)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

cluster
2    3532
1    3482
3    2665
0    1808
Name: count, dtype: int64


In [19]:
print(smart_speakers['cluster'].unique())

[1 3 0 2]


verify the data

In [18]:
print(smart_speakers.columns)


Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs',
       'reviews.dateAdded', 'reviews.userCity', 'reviews.userProvince',
       'cleaned_text', 'sentiment', 'cluster'],
      dtype='object')
