# **Review Similarity Using Word2Vec**

#### **A review similarity system that uses Word2Vec embeddings and cosine similarity to compare the semantic meaning of customer reviews.**

In [46]:
!pip install gensim
!pip install python-Levenshtein



#### **Connect Google Drive**

In [47]:
# Colab-only: mount Google Drive at /content/drive so the dataset and the saved
# model paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### **Import the Libraries**

In [48]:
import gensim
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#### **Load the dataset**

In [49]:
# lines=True: the file is parsed as JSON Lines (one JSON record per line).
df = pd.read_json("/content/drive/MyDrive/Colab Notebooks/NLP Projects/NLP Dataset/Cell_Phones_and_Accessories_5.json",lines=True)

In [50]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [51]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(194439, 9)

#### **Text Preprocessing**

In [52]:
# Turn every raw review into a list of lowercase tokens; the result is a Series
# of token lists sharing df's index.
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)

In [53]:
# Inspect the tokenized reviews (one list of tokens per review).
review_text

Unnamed: 0,reviewText
0,"[they, look, good, and, stick, good, just, don..."
1,"[these, stickers, work, like, the, review, say..."
2,"[these, are, awesome, and, make, my, phone, lo..."
3,"[item, arrived, in, great, time, and, was, in,..."
4,"[awesome, stays, on, and, looks, great, can, b..."
...,...
194434,"[works, great, just, like, my, original, one, ..."
194435,"[great, product, great, packaging, high, quali..."
194436,"[this, is, great, cable, just, as, good, as, t..."
194437,"[really, like, it, becasue, it, works, well, w..."


#### **Word2Vec Model Training**

In [54]:
# Create an untrained Word2Vec model: no corpus is passed here, so the
# vocabulary is built and training is run explicitly in the following cells.
#   window    - maximum distance between the current and the predicted word
#   min_count - words with a total frequency below this value are ignored
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2
)

In [55]:
# Scan the tokenized reviews to build the model's vocabulary;
# progress_per controls how often progress is logged during the scan.
model.build_vocab(review_text, progress_per=1000)

In [56]:
# Train on the same corpus used for build_vocab. total_examples and epochs are
# taken from the model itself (no epochs value was passed to the constructor,
# so the library default applies).
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs
)

(61505568, 83868975)

In [57]:
# Persist the trained model to Google Drive (requires the Drive mount above).
model.save("/content/drive/MyDrive/Colab Notebooks/NLP Projects/Models/Cell_Phones_and_Accessories_Reviews_Model.model")

In [58]:
# Nearest neighbours of "bad" in the learned embedding space, with their scores.
model.wv.most_similar("bad")

[('terrible', 0.7092544436454773),
 ('shabby', 0.6483732461929321),
 ('horrible', 0.5990914106369019),
 ('good', 0.5817903876304626),
 ('awful', 0.5745896697044373),
 ('okay', 0.534587562084198),
 ('legit', 0.5308262705802917),
 ('funny', 0.5194070339202881),
 ('sad', 0.5005893111228943),
 ('ok', 0.49931105971336365)]

#### **Word Similarity Analysis**

In [59]:
# Similarity between the embeddings of two near-synonyms.
model.wv.similarity(w1="cheap", w2="inexpensive")

np.float32(0.5119761)

In [60]:
# Similarity between two positive-sentiment words.
model.wv.similarity(w1="great", w2="good")

np.float32(0.7768655)

In [61]:
# Similarity between words of opposite sentiment, for contrast with the cell above.
model.wv.similarity(w1="great", w2="worst")

np.float32(0.09689658)

#### **Review Similarity Using Cosine Similarity**

In [62]:
def review_vector_conversion(review, model):
  """Average the word embeddings of a review's in-vocabulary tokens.

  Args:
    review: Iterable of token strings (e.g. the output of
      gensim.utils.simple_preprocess).
    model: Object exposing ``wv`` (supporting ``word in wv`` and ``wv[word]``)
      and ``vector_size``; in this notebook, the trained Word2Vec model.

  Returns:
    A 1-D array of length ``model.vector_size``: the element-wise mean of the
    vectors of all tokens found in ``model.wv``. If no token is in the
    vocabulary (or the review is empty) an all-zero float32 vector is returned.
  """
  keyed_vectors = model.wv
  # Out-of-vocabulary tokens are skipped rather than raising KeyError.
  known_vectors = [keyed_vectors[word] for word in review if word in keyed_vectors]

  if not known_vectors:
    # Use float32 so the fallback has the same dtype as the averaged
    # embeddings; the default float64 would give mixed-dtype review vectors.
    return np.zeros(model.vector_size, dtype=np.float32)
  return np.mean(known_vectors, axis=0)

In [63]:
# One averaged embedding per review, keyed by the same index as review_text.
review_vectors = review_text.apply(lambda x: review_vector_conversion(x, model))

In [64]:
# cosine_similarity works on 2-D inputs, so each review vector is wrapped in a
# one-row list; the result is a 1x1 matrix.
cosine_similarity([review_vectors[0]], [review_vectors[1]])

array([[0.47541314]], dtype=float32)

In [65]:
def compare_reviews(review1 ,review2) :
  """Print the cosine similarity between two raw review strings.

  Both strings are tokenized with gensim.utils.simple_preprocess, converted to
  averaged embeddings with review_vector_conversion using the notebook-level
  ``model``, and compared with cosine similarity. The score is printed;
  nothing is returned.
  """
  token_lists = [gensim.utils.simple_preprocess(text) for text in (review1, review2)]
  first_vec, second_vec = [review_vector_conversion(tokens, model) for tokens in token_lists]

  # cosine_similarity yields a 1x1 matrix for two single-row inputs.
  similarity_score = cosine_similarity([first_vec], [second_vec])[0][0]
  print(f"Similarity between the reviews : {similarity_score}")


In [66]:
# Two positive reviews phrased with different words.
compare_reviews("great product quality","excellent item quality")

Similarity between the reviews : 0.9160928726196289


In [67]:
# Opposite sentiment, same structure as the previous pair.
# NOTE(review): both reviews contain "quality", which presumably pulls the
# averaged vectors together regardless of sentiment - confirm before relying on
# this score as a sentiment signal.
compare_reviews("great product quality","poor item quality")

Similarity between the reviews : 0.792166531085968


In [68]:
# Two negative reviews of different lengths.
compare_reviews("bad product quality","worst quality")

Similarity between the reviews : 0.7115164995193481
