In [1]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install gensim
# %pip install python-Levenshtein

In [1]:
import gensim
import pandas as pd

### Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Sports & Outdoors category. The data is stored as a line-delimited JSON file (one review record per line), which pandas can read with `pd.read_json(..., lines=True)`.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz

In [4]:
# Parse the reviews file as line-delimited JSON (lines=True: one record per line).
df = pd.read_json("Sports_and_Outdoors_5.json", lines=True)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [5]:
# Show the (rows, columns) dimensions of the loaded reviews DataFrame.
df.shape

(296337, 9)

In [8]:
# Tokenize every review: simple_preprocess turns each text into a list of
# lowercase word tokens. The result is a Series of token lists, one per review.
review_text = df['reviewText'].apply(gensim.utils.simple_preprocess)

In [9]:
# Peek at the token lists of the first five reviews.
review_text.head(5)

0    [this, came, in, on, time, and, am, veru, happ...
1    [had, factory, glock, tool, that, was, using, ...
2    [if, you, don, have, punch, or, would, like, t...
3    [this, works, no, better, than, any, punch, yo...
4    [purchased, this, thinking, maybe, need, speci...
Name: reviewText, dtype: object

In [11]:
# Create a Word2Vec model without a corpus; the vocabulary is built and the
# model is trained in separate steps.
#   window=5    -> maximum distance between the current and predicted word
#   min_count=2 -> ignore words whose total frequency is below 2
model = gensim.models.Word2Vec(window=5, min_count=2)

In [14]:
# Display the model object (its repr) to confirm it was created.
model

<gensim.models.word2vec.Word2Vec at 0x149fe1fd840>

In [15]:
# Build the model's vocabulary from the tokenized reviews.
# progress_per=1000 sets how often progress is reported while scanning the corpus.
model.build_vocab(review_text, progress_per=1000)

In [25]:
# Train on the tokenized reviews, using the corpus size and epoch count
# already stored on the model object.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91334714, 121496535)

In [22]:
# model.save("./Exercise.model")  # uncomment to persist the trained model to disk

In [26]:
# List the ten words whose vectors are closest to "awful" (topn=10 is the default).
model.wv.most_similar("awful", topn=10)

[('horrible', 0.742700457572937),
 ('terrible', 0.7405649423599243),
 ('overwhelming', 0.6695225238800049),
 ('ugly', 0.6345702409744263),
 ('unpleasant', 0.6078774929046631),
 ('horrendous', 0.5991968512535095),
 ('overpowering', 0.5910825133323669),
 ('extraordinary', 0.5797730088233948),
 ('authentic', 0.5788567662239075),
 ('horrid', 0.5609738230705261)]

In [24]:
# Similarity score between the learned vectors for "good" and "bad".
model.wv.similarity("good", "bad")

0.524984

#### Initialize the model

In [12]:
# Create a Word2Vec model without a corpus and bind it to `model`;
# the vocabulary is built and the model is trained in separate steps.
#   window=10   -> maximum distance between the current and predicted word
#   min_count=2 -> ignore words whose total frequency is below 2
#   workers=4   -> number of worker threads used for training
model = gensim.models.Word2Vec(window=10, min_count=2, workers=4)


#### Build Vocabulary

In [13]:
# Build the model's vocabulary from the tokenized reviews.
# progress_per=1000 sets how often progress is reported while scanning the corpus.
model.build_vocab(review_text, progress_per=1000)

#### Train the Word2Vec Model

In [14]:
# Train on the tokenized reviews, using the corpus size and epoch count
# already stored on the model object.
model.train(
    review_text,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

(91337426, 121496535)

### Finding Similar Words and Similarity between words
https://radimrehurek.com/gensim/models/word2vec.html

In [19]:
# List the ten words whose vectors are closest to "awful" (topn=10 is the default).
model.wv.most_similar("awful", topn=10)

[('horrible', 0.710124671459198),
 ('terrible', 0.7084428071975708),
 ('ugly', 0.6775537729263306),
 ('overwhelming', 0.6621577143669128),
 ('authentic', 0.6380588412284851),
 ('unpleasant', 0.6188114285469055),
 ('horrendous', 0.6016426682472229),
 ('unfortunate', 0.5886159539222717),
 ('unbelievable', 0.5829745531082153),
 ('unusual', 0.5810099840164185)]

In [16]:
# Similarity score between the learned vectors for "good" and "great".
model.wv.similarity("good", "great")

0.77770394

In [17]:
# Similarity score between the learned vectors for "slow" and "steady".
model.wv.similarity("slow", "steady")

0.34121108