**Word2Vec**

Convert each word into a numeric vector that captures its semantic meaning, learned from the contexts (neighbouring words) in which the word appears.

In [1]:
!pip install gensim
# !pip install python-Levenshtein

INFO: pip is looking at multiple versions of numpy to determine which version is compatible with other requirements. This could take a while.
INFO: pip is looking at multiple versions of gensim to determine which version is compatible with other requirements. This could take a while.

The conflict is caused by:
    gensim 4.1.2 depends on numpy>=1.17.0
    scipy 1.9.1 depends on numpy<1.25.0 and >=1.18.5

To fix this you could try to:
1. loosen the range of package versions you've specified
2. remove package versions to allow pip attempt to solve the dependency conflict



ERROR: Cannot install gensim==4.1.2 and scipy==1.9.1 because these package versions have conflicting dependencies.
ERROR: ResolutionImpossible: for help visit https://pip.pypa.io/en/latest/topics/dependency-resolution/#dealing-with-dependency-conflicts


In [2]:
# import required libraries
import gensim
import pandas as pd



In [3]:
# Load the reviews file into a DataFrame. `lines=True` tells pandas the file
# is line-delimited JSON (one record per line) rather than a single document.
df = pd.read_json(
    "Cell_Phones_and_Accessories_5.json",
    lines=True,
)

# Peek at the first three rows to confirm the columns loaded as expected.
df.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"


In [4]:
# Tokenize every review with gensim's `simple_preprocess`.
# For each review this yields a list of lowercase word tokens with punctuation
# stripped and very short tokens (e.g. "I", "a") dropped.
# NOTE: stop words are NOT removed -- tokens such as "they", "and" and "the"
# are kept and therefore take part in training.
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [5]:
# Print the token lists of the first ten reviews.
# `head(10)` restricts the loop to ten rows; iterating the whole Series and
# filtering on the index would visit every review just to skip almost all of them.
for index, tokens in enumerate(review_text.head(10)):
    print(f"List of index {index}: {tokens}\n")

List of index 0: ['they', 'look', 'good', 'and', 'stick', 'good', 'just', 'don', 'like', 'the', 'rounded', 'shape', 'because', 'was', 'always', 'bumping', 'it', 'and', 'siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating', 'just', 'won', 'buy', 'product', 'like', 'this', 'again']

List of index 1: ['these', 'stickers', 'work', 'like', 'the', 'review', 'says', 'they', 'do', 'they', 'stick', 'on', 'great', 'and', 'they', 'stay', 'on', 'the', 'phone', 'they', 'are', 'super', 'stylish', 'and', 'can', 'share', 'them', 'with', 'my', 'sister']

List of index 2: ['these', 'are', 'awesome', 'and', 'make', 'my', 'phone', 'look', 'so', 'stylish', 'have', 'only', 'used', 'one', 'so', 'far', 'and', 'have', 'had', 'it', 'on', 'for', 'almost', 'year', 'can', 'you', 'believe', 'that', 'one', 'year', 'great', 'quality']

List of index 3: ['item', 'arrived', 'in', 'great', 'time', 'and', 'was', 'in', 'perfect', 'condition', 'however', 'ordered', 'these', 'buttons', 'because', 'they', 'were',

Training the Word2Vec Model


In [6]:
# Create an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary and the weights are filled in by the following cells.
model = gensim.models.Word2Vec(
    workers=4,    # number of worker threads used during training
    min_count=2,  # ignore words that occur fewer than 2 times in the whole corpus
    window=10,    # context window: up to 10 words on each side of the target word
)

In [7]:
# Build the model's vocabulary from the tokenized reviews.
# `progress_per=1000` controls how often progress is reported while scanning
# (every 1000 reviews) -- it does not affect the resulting vocabulary.
model.build_vocab(review_text, progress_per=1000)

In [8]:
# Train the model on the tokenized reviews.
# `total_examples` and `epochs` are taken from the model itself so they stay
# consistent with the corpus scanned by `build_vocab` and the model's own
# epoch setting, instead of being hard-coded here.
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(61506253, 83868975)

In [9]:
# Sanity check: list the words whose learned vectors are closest to "time".
# The result is a list of (word, similarity score) pairs, best match first.
model.wv.most_similar("time")

[('day', 0.6674665808677673),
 ('night', 0.5648826360702515),
 ('footstep', 0.5599106550216675),
 ('penny', 0.5552220344543457),
 ('morning', 0.5199440121650696),
 ('moment', 0.5119136571884155),
 ('weekend', 0.478390097618103),
 ('cycle', 0.47252070903778076),
 ('minute', 0.4657999873161316),
 ('mins', 0.44832199811935425)]

In [10]:
# Nearest neighbours of "great" -- expected to surface other positive adjectives.
model.wv.most_similar("great")

[('fantastic', 0.8566185832023621),
 ('wonderful', 0.8261293172836304),
 ('good', 0.7917861342430115),
 ('awesome', 0.7523834109306335),
 ('perfect', 0.723669707775116),
 ('fabulous', 0.7136659026145935),
 ('excellent', 0.7020270824432373),
 ('nice', 0.682478129863739),
 ('terrific', 0.6758182048797607),
 ('amazing', 0.6699962615966797)]

In [11]:
# Nearest neighbours of "product" in the learned embedding space.
model.wv.most_similar("product")

[('item', 0.8794766664505005),
 ('products', 0.5259015560150146),
 ('company', 0.5047276020050049),
 ('seller', 0.5039994716644287),
 ('vendor', 0.4982939064502716),
 ('value', 0.4964480400085449),
 ('workmanship', 0.4771156907081604),
 ('transaction', 0.4650908410549164),
 ('prodcut', 0.4529708921909332),
 ('accessory', 0.44575241208076477)]