# Importing Libraries

In [27]:
import gensim
import pandas as pd

# Dataset
The Yelp dataset is a subset of Yelp's businesses, reviews, and user data, made available for personal, educational, and academic purposes. The full dataset contains 6,990,280 reviews.

In [28]:
 %time yelp_reviews = pd.read_json("G:\yelp_academic_dataset_review.json", encoding = 'ISO-8859-1', lines=True, nrows=100000) 

Wall time: 744 ms


In [29]:
# Preview the first five reviews to check which columns were loaded.
yelp_reviews.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [30]:
# Report, column by column, whether any value is missing.
yelp_reviews.isna().any(axis=0)

review_id      False
user_id        False
business_id    False
stars          False
useful         False
funny          False
cool           False
text           False
date           False
dtype: bool

In [31]:
# Turn every review text into a list of lowercase word tokens;
# the result is a Series with one token list per review.
doc = yelp_reviews["text"].apply(gensim.utils.simple_preprocess)

In [32]:
# Inspect the token list produced for the review with index label 0.
doc.loc[0]

['if',
 'you',
 'decide',
 'to',
 'eat',
 'here',
 'just',
 'be',
 'aware',
 'it',
 'is',
 'going',
 'to',
 'take',
 'about',
 'hours',
 'from',
 'beginning',
 'to',
 'end',
 'we',
 'have',
 'tried',
 'it',
 'multiple',
 'times',
 'because',
 'want',
 'to',
 'like',
 'it',
 'have',
 'been',
 'to',
 'it',
 'other',
 'locations',
 'in',
 'nj',
 'and',
 'never',
 'had',
 'bad',
 'experience',
 'the',
 'food',
 'is',
 'good',
 'but',
 'it',
 'takes',
 'very',
 'long',
 'time',
 'to',
 'come',
 'out',
 'the',
 'waitstaff',
 'is',
 'very',
 'young',
 'but',
 'usually',
 'pleasant',
 'we',
 'have',
 'just',
 'had',
 'too',
 'many',
 'experiences',
 'where',
 'we',
 'spent',
 'way',
 'too',
 'long',
 'waiting',
 'we',
 'usually',
 'opt',
 'for',
 'another',
 'diner',
 'or',
 'restaurant',
 'on',
 'the',
 'weekends',
 'in',
 'order',
 'to',
 'be',
 'done',
 'quicker']

# Training Word2vec

In [33]:
# Create the model WITHOUT a corpus first: passing `doc` to the constructor would
# already train it (with the default number of epochs), and the explicit train()
# call below would then train the same model a second time on top of that.
#   size=150      -> dimensionality of the word vectors
#   window=10     -> context window around each target word
#   min_count=2   -> ignore words seen fewer than 2 times
#   workers=10    -> number of training threads
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`.
model = gensim.models.Word2Vec(size=150, window=10, min_count=2, workers=10)

# Scan the tokenised reviews once to collect the vocabulary.
model.build_vocab(doc)

# Single training run of 10 epochs; the returned tuple is
# (effective word count, raw word count) summed over all epochs.
model.train(doc, total_examples=model.corpus_count, epochs=10)

(72129411, 95117840)

# Let's see what we can do with the model

## Finding similar words

**Looking for words similar to the word "awful"**

In [34]:
# Ten words whose vectors are most similar to "awful".
model.wv.most_similar(positive=["awful"], topn=10)

[('terrible', 0.8593562245368958),
 ('horrible', 0.8489099144935608),
 ('horrendous', 0.7160866260528564),
 ('lousy', 0.7073488235473633),
 ('poor', 0.6838691234588623),
 ('disgusting', 0.6769656538963318),
 ('abysmal', 0.6426184177398682),
 ('gross', 0.6106832027435303),
 ('sucked', 0.6006918549537659),
 ('subpar', 0.5950561761856079)]

**Looking for words similar to the word "food"**

In [35]:
# Ten words whose vectors are most similar to "food".
model.wv.most_similar(positive=["food"], topn=10)

[('cuisine', 0.5357833504676819),
 ('meal', 0.5206096768379211),
 ('meals', 0.47635331749916077),
 ('sushi', 0.4658252000808716),
 ('restaurant', 0.4470158815383911),
 ('fare', 0.4332970976829529),
 ('salsa', 0.4281695783138275),
 ('pizza', 0.4265521466732025),
 ('dishes', 0.40064430236816406),
 ('grub', 0.39823710918426514)]

## Finding similarity between two words

**Similarity between related words**

In [36]:
# Similarity score between two near-synonyms.
model.wv.similarity("great", "good")

0.68801594

**Similarity between opposite words** (antonyms tend to appear in similar contexts, so the score is lower but not close to zero)

In [37]:
# Similarity score between two words with opposite meanings.
model.wv.similarity("dirty", "clean")

0.39876875

## Finding analogies

In [38]:
def analogy(x1, x2, y1):
    """Complete the analogy "x1 is to x2 as y1 is to ...".

    Queries the notebook-level ``model`` for the words most similar to
    ``y1 + x2 - x1`` and returns the top-ranked word.

    Args:
        x1: First word of the reference pair (subtracted from the query).
        x2: Second word of the reference pair (added to the query).
        y1: Word for which the analogous partner is wanted.

    Returns:
        The word of the best-scoring ``(word, score)`` match.
    """
    neighbours = model.wv.most_similar(negative=[x1], positive=[y1, x2])
    top_match = neighbours[0]
    return top_match[0]

In [39]:
# "japan" is to "japanese" as "india" is to ...?
analogy(x1='japan', x2='japanese', y1='india')

'indian'

In [40]:
# "tall" is to "tallest" as "long" is to ...?
analogy(x1='tall', x2='tallest', y1='long')

'longest'

## Finding the odd one out

In [45]:
# Which word does not belong with the others in this list?
# The string is split on whitespace into the list of candidate words.
model.wv.doesnt_match("computer laptop france".split())

'france'

In [42]:
# Same odd-one-out query, this time on a list of meal-related words.
model.wv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'