In [3]:
import gensim #gensim is a NLP library
import pandas as pd

### Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [11]:
# The file is read with lines=True, i.e. one JSON record per line.
df = pd.read_json("Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


### We want to train a word2vec model using only the review text, so we will focus on the `reviewText` column

In [12]:
df.shape  # (number of rows, number of columns)

(194439, 9)

In [13]:
# Look at the raw text of the review stored under row label 0.
df.loc[0, "reviewText"]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

# 1. Preprocessing Data

We will begin by preprocessing the data: converting all words to lowercase, removing punctuation and extra whitespace, and dropping very short tokens. All of this can be done using the **gensim** library.

In [14]:
# Run the tokenizer on a single review to see what kind of tokens it produces.
gensim.utils.simple_preprocess(
    "They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"
)

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [15]:
# Tokenize every review; each element of the resulting Series is a list of tokens.
review_text = df["reviewText"].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

### Now we will initialize a gensim model

In [19]:
# Create an untrained Word2Vec model; the vocabulary and weights are filled in
# by the build_vocab() and train() calls that follow.
model = gensim.models.Word2Vec(
    window=10,    # context window: up to 10 words before and 10 words after the target word
    min_count=2,  # ignore words that occur fewer than 2 times in the whole corpus
    workers=4,    # number of worker threads used to train the model
)

# Build Vocabulary

##### Building the vocabulary means collecting the list of unique words in the corpus

In [20]:
# Scan the tokenized reviews to build the vocabulary; progress_per controls
# how often progress is reported while scanning.
model.build_vocab(review_text, progress_per=1000)

In [21]:
model.epochs  # number of training passes; not set in the constructor call, so this is the library default

5

In [22]:
model.corpus_count  # number of reviews counted while building the vocabulary (matches the row count of df)

194439

### Train the Word2Vec Model

In [23]:
# Train on the same tokenized reviews, reusing the example count and epoch
# count already recorded on the model.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(61505857, 83868975)

# Save the Model
Save the model so that it can be reused in other applications

In [24]:
# Write the trained model to a file in the current working directory.
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

### Finding Similar Words and Similarity between words
https://radimrehurek.com/gensim/models/word2vec.html

In [26]:
# Ten nearest neighbours of "bad" in the learned vector space.
model.wv.most_similar(positive=["bad"], topn=10)

[('shabby', 0.6951686143875122),
 ('terrible', 0.660505473613739),
 ('good', 0.5901959538459778),
 ('horrible', 0.5894935131072998),
 ('okay', 0.5409718751907349),
 ('awful', 0.5249417424201965),
 ('sad', 0.5241829752922058),
 ('keen', 0.513152539730072),
 ('poor', 0.5114068984985352),
 ('funny', 0.5088415741920471)]

In [27]:
# Similarity score between two near-synonyms.
model.wv.similarity("cheap", "inexpensive")

0.53702664

In [28]:
# Two positive adjectives that are used in similar contexts.
model.wv.similarity("great", "good")

0.77942073

In [29]:
# An adjective against a noun: the score comes out close to zero.
model.wv.similarity("great", "product")

-0.052851617

In [30]:
# Another pair of positive adjectives.
model.wv.similarity("great", "nice")

0.68947595

In [31]:
# Sanity check: a word compared with itself scores 1.0.
model.wv.similarity("great", "great")

1.0

### Further Reading
You can read about gensim more at https://radimrehurek.com/gensim/models/word2vec.html

Explore other Datasets related to Amazon Reviews: http://jmcauley.ucsd.edu/data/amazon/