In [2]:
import gensim
import pandas as pd

In [3]:
# Sports and Outdoors reviews file: gzipped JSON with one record per line
reviews_url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz'
data = pd.read_json(reviews_url, lines=True)
data.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [4]:
# Dimensions of the loaded reviews DataFrame as a (rows, columns) tuple
data.shape

(296337, 9)

### Preprocessing & Tokenization
- The first step in any data science task is cleaning the data. For NLP, this typically means converting all words to lower case, trimming whitespace, and removing punctuation. We do the same here.

- We can also remove stop words such as 'and', 'or', 'is', 'the', 'a', 'an', and reduce words to their root forms (e.g. 'running' to 'run'). These optional steps are not applied in this notebook.

In [5]:
# Tokenize every review with gensim's simple_preprocess, yielding one
# list of lowercase tokens per review
raw_reviews = data['reviewText']
review_text = raw_reviews.apply(gensim.utils.simple_preprocess)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

### Training the Word2Vec Model
- Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before and 10 words after the current word are used as context. The `min_count` parameter is a word-frequency threshold: with `min_count=3`, words that occur fewer than 3 times in the whole corpus are ignored.

- `workers` sets how many worker threads are used for training.

In [6]:
# Create an untrained Word2Vec model; no corpus is passed here, so the
# vocabulary is built and training is run in later cells
w2v_settings = {
    'window': 10,     # context window size
    'min_count': 3,   # minimum corpus frequency for a word to be kept
    'workers': 4,     # number of training threads
}
model = gensim.models.Word2Vec(**w2v_settings)

### Build the Vocabulary

In [7]:
# Scan the tokenized reviews to collect the model's vocabulary;
# progress_per controls how often progress is logged
model.build_vocab(
    review_text,
    progress_per=1000,
)

### Train the Word2Vec Model

In [8]:
# Train on the tokenized reviews, using the model's own corpus_count
# and epochs attributes for the example count and number of passes
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91208821, 121496535)

### Save the Model
- Save the model so that it can be reused in other applications

In [9]:
# Persist the trained model to disk so it can be loaded elsewhere
model_path = './word2vec.model'
model.save(model_path)

### Finding Similar Words and Similarity between words

In [12]:
# Nearest neighbours of "happy" among the learned word vectors
word_vectors = model.wv
word_vectors.most_similar('happy')

[('pleased', 0.95903480052948),
 ('satisfied', 0.9395011067390442),
 ('impressed', 0.8282353281974792),
 ('unhappy', 0.7698390483856201),
 ('satisified', 0.7645992040634155),
 ('thrilled', 0.7334235310554504),
 ('delighted', 0.7175753116607666),
 ('dissapointed', 0.6747905015945435),
 ('grateful', 0.6719464063644409),
 ('satified', 0.6707915663719177)]

In [16]:
# Similarity score between the vectors for "happy" and "water"
model.wv.similarity(
    w1='happy',
    w2='water',
)

-0.07584594

- The similarity score above is close to 0 (slightly negative), which indicates that the model does not consider 'happy' and 'water' to be similar.