In [1]:
import gensim # an NLP library
import pandas as pd

Reading and Exploring the Dataset

The dataset we are using here is a subset of Amazon reviews from the Cell Phones & Accessories category. The data is stored in JSON Lines format (one JSON object per line) and can be read using pandas with `lines=True`.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Cell_Phones_and_Accessories_5.json.gz

In [5]:
# Load the reviews into a DataFrame. lines=True parses the file as JSON Lines,
# i.e. every line of the file is treated as one JSON object.
# The path uses a forward slash so the cell runs on Windows, macOS and Linux alike
# (a backslash separator only resolves on Windows).
df = pd.read_json("Excels/Cell_Phones_and_Accessories_5.json", lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


In [6]:
# (number of reviews, number of columns) in the loaded frame
df.shape

(194439, 9)

In [8]:
# Full, untruncated text of the review at index label 0
df['reviewText'][0]

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [11]:
# Run gensim's tokenizer on one review to see what it produces.
gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again") 
# simple_preprocess converts a document into a list of lowercase tokens:
# punctuation is dropped, and tokens shorter than 2 or longer than 15 characters
# (the default min_len / max_len) are discarded. That length filter is why
# "I", "a" and the "t" of "don't" / "won't" are missing from the output below.

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [13]:
# Tokenize every review; the result is a Series holding one list of tokens per review.
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text

0         [they, look, good, and, stick, good, just, don...
1         [these, stickers, work, like, the, review, say...
2         [these, are, awesome, and, make, my, phone, lo...
3         [item, arrived, in, great, time, and, was, in,...
4         [awesome, stays, on, and, looks, great, can, b...
                                ...                        
194434    [works, great, just, like, my, original, one, ...
194435    [great, product, great, packaging, high, quali...
194436    [this, is, great, cable, just, as, good, as, t...
194437    [really, like, it, becasue, it, works, well, w...
194438    [product, as, described, have, wasted, lot, of...
Name: reviewText, Length: 194439, dtype: object

In [14]:
# Configure the Word2Vec model. No corpus is passed here, so nothing is trained yet;
# the vocabulary is built and the model is trained in the following cells.
model = gensim.models.Word2Vec(
    window=10, # maximum distance between the target word and a context word (up to 10 words on each side)
    min_count=2, # ignore words that occur fewer than 2 times in the whole corpus (a word-frequency cutoff, not a sentence-length rule)
    workers=2, # number of worker threads used for training
)  

In [15]:
# Scan the tokenized reviews once to build the vocabulary.
# progress_per only sets how often gensim logs a progress message during the scan;
# no progress bar is drawn, and nothing is shown unless Python logging is enabled at INFO level.
model.build_vocab(review_text,progress_per=1000)

In [16]:
model.epochs # number of training passes over the corpus; the default is 5

5

In [17]:
model.corpus_count # number of sentences (here: reviews) counted by build_vocab; equals the row count of df

194439

In [18]:
# Train on the tokenized reviews. train() needs the corpus size and the number of epochs
# passed explicitly, so the values already stored on the model are reused.
# Per the gensim docs the returned tuple is (effective word count, raw word count).
model.train(review_text,total_examples=model.corpus_count,epochs=model.epochs) 

(61508774, 83868975)

In [19]:
# Persist the trained model to disk so it can be reloaded later without retraining.
model.save("./word2vec-amazon-cell-accessories-reviews-short.model")

In [21]:
# model.wv is the model's trained word vectors ("wv" = word vectors, not "Word2Vec").
# most_similar returns the words whose vectors have the highest cosine similarity
# to the vector for 'bad' (10 results by default), each paired with its similarity score.
model.wv.most_similar('bad')

[('terrible', 0.6388096213340759),
 ('shabby', 0.6222387552261353),
 ('good', 0.5758284330368042),
 ('horrible', 0.5655933618545532),
 ('crappy', 0.5245072245597839),
 ('funny', 0.5209336280822754),
 ('okay', 0.5196484327316284),
 ('legit', 0.5152790546417236),
 ('awful', 0.5119503736495972),
 ('disappointing', 0.5048129558563232)]

In [23]:
# Cosine similarity between the learned vectors of two near-synonyms
model.wv.similarity(w1='cheap',w2='inexpensive')

0.5167894

In [24]:
# Cosine similarity between the learned vectors of 'great' and 'good'
model.wv.similarity(w1='great',w2='good')

0.7848578

# EXERCISE

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

Training the Word2Vec Model

Train the model on the reviews. Use a window of size 10, i.e. up to 10 words before the current word and 10 words after it. Only words that occur at least 2 times in the whole corpus should be kept in the vocabulary; configure this using the `min_count` parameter.

The `workers` parameter defines how many CPU threads are used for training.

In [1]:
import gensim 
import pandas as pd

In [3]:
# Load the Sports & Outdoors reviews. lines=True reads the file as JSON Lines
# (one JSON object per line).
df = pd.read_json("Excels/Sports_and_Outdoors_5.json",lines=True)
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [4]:
# (number of reviews, number of columns) in the Sports & Outdoors frame
df.shape

(296337, 9)

In [5]:
# Full, untruncated text of the review at index label 0
df['reviewText'][0]

'This came in on time and I am veru happy with it, I haved used it already and it makes taking out the pins in my glock 32 very easy'

# Simple Preprocessing & Tokenization

In [6]:
# Run gensim's tokenizer on one sample sentence to see what it produces.
# NOTE(review): this hard-coded sentence is not the Sports & Outdoors review displayed by
# df['reviewText'][0] in the previous cell -- consider passing that value instead.
gensim.utils.simple_preprocess("They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again") 
# simple_preprocess converts a document into a list of lowercase tokens:
# punctuation is dropped, and tokens shorter than 2 or longer than 15 characters
# (the default min_len / max_len) are discarded. That length filter is why
# "I", "a" and the "t" of "don't" / "won't" are missing from the output below.

['they',
 'look',
 'good',
 'and',
 'stick',
 'good',
 'just',
 'don',
 'like',
 'the',
 'rounded',
 'shape',
 'because',
 'was',
 'always',
 'bumping',
 'it',
 'and',
 'siri',
 'kept',
 'popping',
 'up',
 'and',
 'it',
 'was',
 'irritating',
 'just',
 'won',
 'buy',
 'product',
 'like',
 'this',
 'again']

In [8]:
# Apply the tokenizer to each review, producing a Series of token lists (one list per row).
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

# Training the Word2Vec Model


In [10]:
# Set up the Word2Vec hyperparameters; the corpus is supplied later via build_vocab / train.
model = gensim.models.Word2Vec(
    window=10, # context window: up to 10 words before and 10 words after the target word
    min_count=2, # drop words whose total frequency in the corpus is below 2 (this is about word counts, not sentence length)
    workers=2 # number of worker threads used for training
)

# Build Vocabulary


In [11]:
# Build the vocabulary from the tokenized reviews.
# progress_per=1000 controls how frequently progress is written to gensim's logger;
# it produces no visible output unless logging is configured.
model.build_vocab(review_text,progress_per=1000)

# Train the Word2Vec Model


In [16]:
model.corpus_count # number of sentences/documents (here: reviews) in the corpus, as counted by build_vocab

296337

In [12]:
# Train the model on the tokenized reviews, reusing the corpus size recorded by build_vocab
# and the epoch count stored on the model (train() requires both to be passed explicitly).
model.train(review_text,total_examples=model.corpus_count,epochs=model.epochs)

(91341083, 121496535)

In [14]:
# Save the trained model to disk so it can be loaded again later instead of being retrained.
model.save("./Sports_and_Outdoors_5.model")

# Finding Similar Words and Similarity between words


In [15]:
# Words whose vectors are closest (by cosine similarity) to the vector for "awful"
model.wv.most_similar("awful")

[('terrible', 0.6722495555877686),
 ('horrible', 0.6610020995140076),
 ('overpowering', 0.5920022130012512),
 ('overwhelming', 0.5905032753944397),
 ('unpleasant', 0.5866462588310242),
 ('ugly', 0.5856055617332458),
 ('insane', 0.559581995010376),
 ('horrendous', 0.5569956302642822),
 ('enormous', 0.552455484867096),
 ('admirable', 0.5367516279220581)]

In [17]:
# Cosine similarity between the learned vectors of "good" and "great"
model.wv.similarity(w1="good", w2="great")


0.7881181

In [18]:
# Cosine similarity between the learned vectors of "slow" and "steady"
model.wv.similarity(w1="slow", w2="steady")


0.39352134