### Uploading the Dataset 

In [1]:
#importing dataset
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving productReviewShopee.csv to productReviewShopee.csv
User uploaded file "productReviewShopee.csv" with length 2243721 bytes


In [2]:
!pip install TextBlob



### Importing required Libraries

In [0]:
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import gensim
import string
from nltk.tokenize import word_tokenize
import nltk
from collections import OrderedDict
from textblob import TextBlob
from collections import defaultdict

### Loading the Dataset

In [4]:
product_data=pd.read_csv("./productReviewShopee.csv")
product_data.describe()
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

### 2-1 Initial Text Pre-Processing

In [0]:
def clean_comments(text):
  if (type(text)==str):
    text=[x.lower() for x in text.split(" ")]
    text=[re.sub(r'[^a-zA-Z#]','',x) for x in text]
    stop_words=set(stopwords.words("english"))
    text=[x for x in text if x not in stop_words]
    text=list(filter(None,[x for x in text]))
    text=[x for x in text if len(x)>3]
    text=" ".join(sorted(set(text), key=text.index))
    return text
  else:
    return ""

In [0]:
product_data['comments']=product_data['comments'].apply(clean_comments)
product_data=product_data[product_data.comments!=""]

### 2-2 Most Frequent and Rare Words Removal

In [0]:
comments=product_data['comments'].values.tolist()
dictionary=[]
for x in comments:
  for y in x.split():
    dictionary.append(y)
    
comments_dictionary=defaultdict(int)
for word in dictionary:
  comments_dictionary[word]+=1

comments_dictionary=sorted(comments_dictionary.items(), key=lambda kv:kv[1],reverse=True)

rare_words=[]
for (key,value) in comments_dictionary:
  if value<=5:
    rare_words.append(key)

In [0]:
most_freq=pd.Series(" ".join(product_data['comments']).split()).value_counts()[:10]
most_freq=list(most_freq.index)
product_data['comments']=product_data['comments'].apply(lambda x: " ".join([y for y in x.split() if y not in most_freq]))

#less_freq=pd.Series(" ".join(product_data['comments']).split()).value_counts()[-10:]
#less_freq=list(less_freq.index)
product_data['comments']=product_data['comments'].apply(lambda x: " ".join([y for y in x.split() if y not in rare_words]))

product_data['comments']=product_data['comments'].apply(lambda x: " ".join([y for y in x.split() if y!="baht"]))


### 2-3 Lemmatization & Stemming the Text

In [0]:
def lemmatize_stem_text(text):
  lemmatizer=WordNetLemmatizer()
  stemmer=PorterStemmer()
  text=[lemmatizer.lemmatize(x,pos="v") for x in text.split()]
  text=[stemmer.stem(x) for x in text]
  text=" ".join([x for x in text])
  return text

### 2-4 Removing duplicates, NAN rows and rows with no comments

In [10]:
product_data['comments']=product_data['comments'].apply(lemmatize_stem_text)
product_data= product_data[pd.notnull(product_data['comments'])]
product_data=product_data.dropna()
product_data=product_data.drop_duplicates()
product_data=product_data[product_data.comments!=""]
product_data.describe()

Unnamed: 0,rating
count,6374.0
mean,4.166144
std,1.231106
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


### 2-5 Changing titles to general category 

In [0]:
def change_titles(text):
  if text=="Borderless panty":
    text="general panty"
  elif text=="Cheapest 🔥‼ ️ ready to send Borderless panties The fabric is not thin, soft, comfortable to wear, soft fabric, comfortable to order.":
    text="soft panty"
  elif text=="Cheapest!! Borderless panties Some fabrics, made to order, special, quality guaranteed":
    text="special panty"
  elif text=="Heroin Polo shirts ":
    text="heroin polo shirt"
  elif text=="Mini bag-xiaoyang02-y19 shoulder bag":
    text="y19 shoulder bag"
  elif text=="Minimal polo shirt":
    text="minimal polo shirt"
  elif text=="New cute shoulder bag Cc-152":
    text="cc-152 shoulder bag"
  elif text=="Polo shirt":
    text="simple polo shirt"
  elif text=="Polo shirt, couple shirt, team shirt, fashion shirt, The King Lion [new customer use code NEWPLAY0004, discount 80 baht]":
    text="special polo shirt"
  elif text=="🔥 Mini shoulder bag (M-688)":
    text="m-688 shoulder bag"
  else:
    text="UNK"
  return text

In [0]:
product_data['title']=product_data['title'].apply(change_titles)

### 3-1 Giving sentiment using polarity measure by TextBlob Library

In [13]:

product_data['sentiment'] = product_data['comments'].apply(lambda x: TextBlob(x).sentiment[0] )
product_data[['comments','sentiment']].head()

Unnamed: 0,comments,sentiment
0,give star order today sorri,0.0
1,order discount piec worth leav small cabl leng...,0.016667
2,small cute compact sash look like littl fragil...,0.25
3,size larg stitch wrong line like lose order,-0.5
4,compar okay send slowli thing send wrong color...,0.0


In [0]:
data=product_data['comments'].values.tolist()

In [15]:
data[0]

'give star order today sorri'

### 4-1 TF-IDF for feature generation using penta-gram model as it gives higher semantic relation

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(lowercase=False,ngram_range=(2,5),norm=None)
x = vectorizer.fit_transform(data)
x = x.toarray()

### 4-2 Applying Kmeans Clustering for generation of text clusters 

In [17]:
#Applying Kmeans clustering for language generation
from sklearn.cluster import KMeans
kmeans1 = KMeans(n_clusters=2, init='random', max_iter=100, n_init=1,verbose=1)
kmeans1.fit(x)

Initialization complete
start iteration
done sorting
end inner loop
Iteration 0, inertia 6362090.9406880345
start iteration
done sorting
end inner loop
Iteration 1, inertia 6362090.9406880345
center shift 0.000000e+00 within tolerance 1.540048e-06


KMeans(algorithm='auto', copy_x=True, init='random', max_iter=100, n_clusters=2,
       n_init=1, n_jobs=None, precompute_distances='auto', random_state=None,
       tol=0.0001, verbose=1)

In [18]:
cluster_values=list()
order_centroids = kmeans1.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(2):
  print("Cluster %d:" % i, end='')
  cluster=list()
  for ind in order_centroids[i, :10]:
    print(' %s' % terms[ind], end='')
    cluster.append(terms[ind])
  print()
  cluster_values.append(cluster)

Cluster 0: dont line afraid lose wear lose wear heavi item strap afraid lose wear heavi afraid lose wear afraid lose cute quiet reason dont line cute quiet reason dont cute quiet reason cute quiet reason dont
Cluster 1: ship compani provid ship compani provid ship comfort wear beauti color provid servic transport compani soft comfort exactli order long time


In [0]:
 def assign_cluster(text):
    y=vectorizer.transform([text])
    prediction=kmeans1.predict(y)
    return prediction[0]
      

In [20]:
product_data['generated_sentiment']=product_data['comments'].apply(assign_cluster)
product_data.head(5)

Unnamed: 0,title,rating,date,comments,product_option,sentiment,generated_sentiment
0,m-688 shoulder bag,4,2019-05-22,give star order today sorri,Pink:,0.0,1
1,m-688 shoulder bag,5,2019-07-18,order discount piec worth leav small cabl leng...,Black,0.016667,1
2,m-688 shoulder bag,5,2019-07-02,small cute compact sash look like littl fragil...,Tau,0.25,1
3,m-688 shoulder bag,1,2018-12-15,size larg stitch wrong line like lose order,Black,-0.5,1
4,m-688 shoulder bag,1,2019-07-24,compar okay send slowli thing send wrong color...,Black,0.0,1


In [0]:
def change_sentiment(text):
  if text>0.0:
    label=0
  else:
    label=1
  return label

### 4-3 Accuracy of model

In [22]:
product_data.sentiment=product_data.sentiment.apply(change_sentiment)
print ("Accuracy of Kmeans Clustering = "+str((np.sum(product_data.sentiment==product_data.generated_sentiment)/len(product_data.sentiment))*100))

Accuracy of Kmeans Clustering = 59.11515531848133
