# Word2Vec in gensim; Similarity of Words

In [6]:
import gensim
import pandas as pd

In [2]:
# Read the reviews file in JSON Lines format (lines=True: one JSON record per
# line) into a DataFrame, then preview the first two rows.
df = pd.read_json(
    path_or_buf="Cell_Phones_and_Accessories_5.json",
    lines=True,
)
df.head(n=2)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"


In [3]:
# (rows, columns) of the loaded frame; each row is one review record.
df.shape

(194439, 9)

In [4]:
# Inspect the raw text of the first review before any preprocessing.
df["reviewText"][0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [9]:
# Try the tokenizer on one sentence: the result is lowercased, punctuation is
# removed, and very short tokens are dropped (note that "I" and the "t" of
# "don't" do not appear in the output).
gensim.utils.simple_preprocess(
    "They look good and stick good! I just don't like the rounded shape"
)

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape']

In [10]:
# Tokenize every review; the result is a Series holding one list of tokens
# per review, which is the "list of sentences" shape Word2Vec consumes below.
tokenized_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
tokenized_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [11]:
# Create an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and training is run explicitly in the following cells.
model = gensim.models.Word2Vec(
    window=10,  # context window: up to 10 words before and 10 words after the target word
    min_count=2,  # ignore words that occur fewer than 2 times in the whole corpus
    workers=4  # number of worker threads used for training
)

In [12]:
# Scan the tokenized reviews once to build the vocabulary; this has to happen
# before train() because the model was created without a corpus.
# progress_per controls how often progress is reported via gensim's logging.
model.build_vocab(tokenized_text, progress_per=1000)

In [13]:
# Number of training epochs the model is configured with; it was not set in the
# constructor above, so this is gensim's default.
model.epochs

5

In [14]:
# Number of sentences (here: reviews) counted by build_vocab; it should match
# the number of rows in df.
model.corpus_count

194439

In [15]:
# Train on the tokenized reviews, reusing the sentence count gathered by
# build_vocab and the model's configured number of epochs.
# The call returns a pair of word counts -- presumably (words actually used for
# training after vocabulary filtering/downsampling, raw words seen over all
# epochs); confirm against the gensim documentation.
model.train(
    tokenized_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61506902, 83868975)

In [16]:
# Optional: uncomment the next line to save the trained model to disk for later reuse.
# model.save("./word2vec_model")

In [18]:
# Ten nearest neighbours of "bad" by cosine similarity. Note that "good" ranks
# near the top: antonyms are used in very similar contexts, and context is what
# Word2Vec learns from.
model.wv.most_similar("bad")

[('terrible', 0.6694045066833496),
 ('shabby', 0.5996960401535034),
 ('good', 0.5845207571983337),
 ('awful', 0.5808196663856506),
 ('horrible', 0.5800839066505432),
 ('poor', 0.5340906977653503),
 ('crappy', 0.5280666351318359),
 ('okay', 0.5232283473014832),
 ('sad', 0.5149849057197571),
 ('ok', 0.5111637115478516)]

In [19]:
# Cosine similarity between two near-synonyms.
model.wv.similarity(w1="cheap", w2="inexpensive")

0.5370709

In [20]:
# Another synonym pair, for comparison with cheap/inexpensive.
model.wv.similarity(w1="good", w2="great")

0.78430796

In [21]:
# Sanity check: a word compared with itself gives the maximum similarity, 1.0.
model.wv.similarity(w1="good", w2="good")

1.0

In [24]:
# Antonyms still score high: the similarity reflects shared context (both words
# describe product quality in the same kinds of sentences), not shared meaning.
model.wv.similarity(w1="good", w2="bad")

0.5845207

In [25]:
# An unrelated word as a contrast case; the similarity drops below zero.
model.wv.similarity(w1="good", w2="tree")

-0.26968098