In [1]:
# Step 1: Install necessary packages
!pip install sklearn
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[

In [2]:
# Step 2: Load and preprocess the dataset
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import spacy

# Load spacy model for sentence splitting
nlp = spacy.load('en_core_web_sm')

# Load the dataset
data = pd.read_csv('/content/pre_processed_data.csv')

# Basic data cleaning
data.dropna(subset=['content', 'rating'], inplace=True)
data['rating'] = pd.to_numeric(data['rating'], errors='coerce')

# Define a function to categorize ratings into sentiment classes
def categorize_sentiment(rating):
    if rating in [4, 5]:
        return 'Positive'
    elif rating == 3:
        return 'Neutral'
    else:
        return 'Negative'

# Apply the function to create a new 'sentiment' column
data['sentiment'] = data['rating'].apply(categorize_sentiment)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,sno,product,title,content,date,author,rating,category,sentiment
0,1,Intel Core I5 12400F 12 Gen Generation Desktop...,great processor,processor power efficient develop android apps...,01-04-2024,ARUN,5,i5,Positive
1,2,Intel Core I5 12400F 12 Gen Generation Desktop...,got,try get check ranking detail section buying an...,01-05-2024,Ayush,5,i5,Positive
2,3,Intel Core I5 12400F 12 Gen Generation Desktop...,highest price performance processor gaming,upgrade performance gain huge use cooler maste...,01-03-2024,Hruaia,5,i5,Positive
3,4,Intel Core I5 12400F 12 Gen Generation Desktop...,nice processor,best aaaaaaaa gaming multitasking processor pr...,01-04-2024,Mr.GeAr,5,i5,Positive
4,5,Intel Core I5 12400F 12 Gen Generation Desktop...,title,run well productivity task core cpu minimum re...,01-01-2024,Joy Mukherjee,5,i5,Positive


In [3]:
# Step 3: Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf_vectorizer.fit_transform(data['content'])


In [4]:
# Step 4: Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(X)

# Add the cluster labels to the data
data['cluster'] = dbscan.labels_

# Display the first few rows of the dataset with cluster labels
data.head()


Unnamed: 0,sno,product,title,content,date,author,rating,category,sentiment,cluster
0,1,Intel Core I5 12400F 12 Gen Generation Desktop...,great processor,processor power efficient develop android apps...,01-04-2024,ARUN,5,i5,Positive,-1
1,2,Intel Core I5 12400F 12 Gen Generation Desktop...,got,try get check ranking detail section buying an...,01-05-2024,Ayush,5,i5,Positive,-1
2,3,Intel Core I5 12400F 12 Gen Generation Desktop...,highest price performance processor gaming,upgrade performance gain huge use cooler maste...,01-03-2024,Hruaia,5,i5,Positive,-1
3,4,Intel Core I5 12400F 12 Gen Generation Desktop...,nice processor,best aaaaaaaa gaming multitasking processor pr...,01-04-2024,Mr.GeAr,5,i5,Positive,-1
4,5,Intel Core I5 12400F 12 Gen Generation Desktop...,title,run well productivity task core cpu minimum re...,01-01-2024,Joy Mukherjee,5,i5,Positive,-1


In [5]:
# Step 5: Analyze the clusters
for i in set(dbscan.labels_):
    cluster_reviews = data[data['cluster'] == i]['content']
    print(f"Cluster {i} reviews:")
    print(cluster_reviews.head())
    print()


Cluster 0 reviews:
11     good product
355    good product
771    good product
815    good product
824    good product
Name: content, dtype: object

Cluster 1 reviews:
29             top
37             top
43             top
731            top
1542    everything
Name: content, dtype: object

Cluster 2 reviews:
75      work great
475     work great
584     work great
1688    work great
1808    work great
Name: content, dtype: object

Cluster 3 reviews:
782          thank
1477    thank much
1836    good thank
1867    thank much
2021    good thank
Name: content, dtype: object

Cluster 4 reviews:
784     awesome product
925     awesome product
959     awesome product
1780    awesome product
2129    awesome product
Name: content, dtype: object

Cluster 5 reviews:
841     good processor
849     good processor
884     good processor
1050    good processor
1335    good processor
Name: content, dtype: object

Cluster 6 reviews:
932     working great
941     great working
1435    working great
1

In [6]:
# Step 6: Assign new reviews to the closest cluster
def assign_cluster(review, vectorizer, dbscan_model):
    review_vector = vectorizer.transform([review])
    distances = dbscan_model.fit(review_vector)
    cluster = dbscan_model.labels_[0]
    return cluster

# Example usage
new_review = "The product quality is decent but not as expected. It works fine for basic tasks but struggles with more demanding applications. Overall, it's an average product."
assigned_cluster = assign_cluster(new_review, tfidf_vectorizer, dbscan)
print(f"Assigned Cluster: {assigned_cluster}")


Assigned Cluster: -1
