# Gensim Word2Vec

In [None]:
!unzip dataset.zip

In [2]:
import glob
import pandas as pdn
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Reading the input files and appending the reviews in one dataframe.

reviews = []

df = None

for f in glob.glob('main_product/*.csv'):
  temp = pd.read_csv(f, index_col=False, usecols=['reviews'])
  if df is None:
    df = temp
  else:
    df = pd.concat([df, temp])

In [4]:
df.shape

(11929, 1)

In [5]:
reviews = list(df['reviews'])

In [6]:
stop_words = ['i','me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", 
                "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                'she', "she's", 'her', 'hers', 'herself', 'them', 'their', 'theirs', 'themselves', 'who', 'whom',
                'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
                'did', 'doing', 'a', 'an', 'the', 'and', 's', 't', 'd', 'll', 'm', 'o', 're', 've', 'y', 'in', 'ma', 'it']

Steps:<br>
1) Remove non alphabetical characters.<br>
2) Lowercase the string.<br>
3) Remove excess spaces, if any.<br>
4) Split the text string into a list of strings.<br>
5) Lemmatize and remove stop words.<br>

In [7]:
lemma = WordNetLemmatizer()
def textProcessing(inp):
  inp = re.sub(r"[^a-zA-z ]", '', inp)
  inp = inp.lower()
  inp = re.sub(r" +", ' ', inp)
  inp_split = inp.split()
  nostop = [lemma.lemmatize(word) for word in inp_split if word not in stop_words]
  return nostop

In [8]:
data = [textProcessing(str(inp)) for inp in reviews]

In [9]:
model = Word2Vec(data, size=250, sg = 1)

In [10]:
print("Words similar to 'price' are: ", model.wv.most_similar('price', topn=50), "\n\n")

Words similar to 'price' are:  [('value', 0.8818212747573853), ('money', 0.8491735458374023), ('paid', 0.8382959365844727), ('cost', 0.8359091281890869), ('priced', 0.8351467847824097), ('premium', 0.8291002511978149), ('reliability', 0.8275336027145386), ('performance', 0.8247945308685303), ('fair', 0.8229182362556458), ('durability', 0.8167815208435059), ('pricy', 0.8159868717193604), ('category', 0.8140328526496887), ('class', 0.8133774399757385), ('deserves', 0.8117427825927734), ('considering', 0.8079848885536194), ('dollar', 0.8077625036239624), ('pricey', 0.8076850771903992), ('willing', 0.8073707222938538), ('pricei', 0.8033735752105713), ('imo', 0.8024996519088745), ('functionality', 0.8024208545684814), ('inferior', 0.8019040822982788), ('reccomend', 0.8014135956764221), ('pay', 0.8001075983047485), ('deal', 0.799735963344574), ('wise', 0.7994759678840637), ('opinion', 0.7993313074111938), ('paying', 0.7985329031944275), ('sold', 0.7965651750564575), ('lightit', 0.79601317644

In [11]:
print("Words similar to 'price' are: ", model.wv.most_similar('battery', topn=50), "\n\n")

Words similar to 'price' are:  [('shelf', 0.7770299911499023), ('pair', 0.7588521838188171), ('batter', 0.7584606409072876), ('batt', 0.747565507888794), ('ion', 0.7466127872467041), ('batts', 0.7450363636016846), ('lithiumion', 0.7429304718971252), ('forever', 0.7426040768623352), ('rechargables', 0.7421103715896606), ('common', 0.7416403889656067), ('alkaline', 0.7415514588356018), ('protected', 0.7395178079605103), ('specialized', 0.738003134727478), ('proper', 0.7378340363502502), ('alkalines', 0.7358019351959229), ('disposable', 0.7342775464057922), ('cell', 0.7338579893112183), ('freshly', 0.7329918742179871), ('nonrechargeable', 0.7311081290245056), ('readily', 0.7298227548599243), ('special', 0.7264997959136963), ('charger', 0.7247888445854187), ('extend', 0.7238486409187317), ('lithium', 0.7236818671226501), ('separately', 0.720176637172699), ('required', 0.7168635725975037), ('weird', 0.7167133688926697), ('replaceable', 0.7156758904457092), ('greatly', 0.7151648998260498), (

In [12]:
model.similarity("price","sale")

  """Entry point for launching an IPython kernel.


0.76690876

In [13]:
model.similarity("price","sturdy")

  """Entry point for launching an IPython kernel.


0.6144402

In [14]:
model.similarity("rechargable","battery")

  """Entry point for launching an IPython kernel.


0.66370153