## Library Imports

In [1]:
import ast
import numpy as np
import multiprocessing
from time import time
from collections import defaultdict

import pandas as pd
import spacy
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Data Processing

In [2]:
# Load the Zomato restaurant CSV (path is relative to the notebook) into a DataFrame.
rtr_df = pd.read_csv("../zomato/zomato.csv")

In [3]:
# Preview the first rows; the `reviews_list` column is the one processed below.
rtr_df.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


#### Converting String to List

In [None]:
# Toy illustration of the indexing used on `reviews_list` below:
# a list whose elements are (rating, review_text) tuples.
a_dummy = [(4, "lets see how to fetch the review")]

a_dummy[0]     # -> (4, "lets see how to fetch the review")
a_dummy[0][1]  # -> "lets see how to fetch the review"


# Same idea with a flat list: index 0 is the first element.
a = [1,2,3,4,5]
a[0]           # -> 1

In [4]:
# Every `reviews_list` cell holds the string repr of a list of
# (rating, review_text) tuples. Parse it and keep the text of the first review.
raw_review_list = rtr_df["reviews_list"]
invalid_cnt = 0
review_list = []
for review in raw_review_list:
    try:
        review_text = ast.literal_eval(review)[0][1]
    except (ValueError, SyntaxError, TypeError, IndexError):
        # Unparseable literal, non-string cell, or a list with no reviews.
        # Only these expected failures are counted; anything else (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        invalid_cnt += 1
    else:
        review_list.append(review_text)
valid_cnt = len(review_list)

print (valid_cnt)
print (invalid_cnt)


44122
7595


In [5]:
# Removing Rated
review_list = [review.replace("RATED\n","") for review in review_list]
review_list = [review.lower() for review in review_list]

In [7]:
# Show the first six cleaned reviews together with their positions.
# Slicing bounds the loop, so no separate counter/break is needed.
for idx,review in enumerate(review_list[:6]):
    print (idx)
    print (review)

0
  a beautiful place to dine in.the interiors take you back to the mughal era. the lightings are just perfect.we went there on the occasion of christmas and so they had only limited items available. but the taste and service was not compromised at all.the only complaint is that the breads could have been better.would surely like to come here again.
1
  had been here for dinner with family. turned out to be a good choose suitable for all ages of people. can try this place. we liked the most was their starters. service is good. prices are affordable. will recommend this restaurant for early dinner. the place is little noisy.
2
  ambience is not that good enough and it's not a pocket friendly cafe and the quantity is not that good and desserts are too good enough ??..
3
  great food and proper karnataka style full meals. been there twice and was fully satisfied.. will give 5 stars if it's well managed............
4
  very good restaurant in neighbourhood. buffet system is properly arrang

> spaCy model for lemmatization

In [8]:
# Load the `en_core_web_sm` spaCy pipeline with the 'ner' component disabled;
# only token lemmas are read from the parsed docs below.
nlp = spacy.load('en_core_web_sm',disable=['ner'])

In [10]:
def preprocess_review(doc):
    """Return the lemmas of every token in `doc`, joined by single spaces.

    `doc` is any iterable of objects exposing a `lemma_` attribute
    (here: a spaCy Doc). An empty iterable yields an empty string.
    """
    return " ".join(token.lemma_ for token in doc)

In [11]:
# Converting each word of a sentence to its lemmatized form.
# `n_threads` is intentionally not passed: it is deprecated (and ignored) since
# spaCy 2.1 and is not accepted by `nlp.pipe` in later major versions.
start_time = time()
review_text = [ preprocess_review(doc) for doc in nlp.pipe(review_list,batch_size=1000)]

# Elapsed wall-clock time in minutes.
print (round((time()-start_time)/60,2))

3.2


In [12]:
# One-column frame ("Review") holding the lemmatized review strings.
review_data_frame = pd.DataFrame({"Review":review_text})

In [14]:
# Only the last expression of a cell is displayed, so print the shape explicitly.
print (review_data_frame.shape)
# (44122, 1)
review_data_frame.head()

Unnamed: 0,Review
0,a beautiful place to dine in.the interior t...
1,have be here for dinner with family . turn ...
2,ambience be not that good enough and -PRON-...
3,great food and proper karnataka style full ...
4,very good restaurant in neighbourhood . buf...


In [None]:
# De-duplicate into a new frame. The source must be `review_data_frame`
# (`review_data` does not exist before this cell); the original frame is left
# untouched and is what the following cells keep using.
review_data  = review_data_frame.drop_duplicates()
review_data.shape
# Unique - (13730, 1)

In [17]:
# Show the first lemmatized review, then whitespace-tokenise every review
# so each one becomes a list of tokens.
print (review_data_frame["Review"][0])
phraser_sent = list(map(str.split, review_data_frame["Review"]))

print (phraser_sent[0])

   a beautiful place to dine in.the interior take -PRON- back to the mughal era . the lighting be just perfect.we go there on the occasion of christmas and so -PRON- have only limited item available . but the taste and service be not compromise at all.the only complaint be that the bread could have be better.would surely like to come here again .
['a', 'beautiful', 'place', 'to', 'dine', 'in.the', 'interior', 'take', '-PRON-', 'back', 'to', 'the', 'mughal', 'era', '.', 'the', 'lighting', 'be', 'just', 'perfect.we', 'go', 'there', 'on', 'the', 'occasion', 'of', 'christmas', 'and', 'so', '-PRON-', 'have', 'only', 'limited', 'item', 'available', '.', 'but', 'the', 'taste', 'and', 'service', 'be', 'not', 'compromise', 'at', 'all.the', 'only', 'complaint', 'be', 'that', 'the', 'bread', 'could', 'have', 'be', 'better.would', 'surely', 'like', 'to', 'come', 'here', 'again', '.']


In [20]:
# Build a gensim Phrases model from the tokenised reviews
# (min_count=30; progress is reported every 100 sentences).
phrases = Phrases(phraser_sent,min_count=30,progress_per=100)

In [21]:
# Wrap the Phrases model in a Phraser; it is applied to the sentences below.
phrase_gram = Phraser(phrases)

In [35]:
# Apply the phraser to the tokenised reviews and count how often each
# resulting token occurs across the whole corpus.
sentences = phrase_gram[phraser_sent]

word_freq = defaultdict(int)
for word in (token for sent in sentences for token in sent):
    word_freq[word] += 1
        

In [27]:
# Collect the vocabulary entries that contain "_" (e.g. garlic_bread).
underscored = [token for token in word_freq if "_" in token]
review_phrases = list(set(underscored))

In [36]:
# Print just the first collected phrase (nothing if the list is empty).
for word in review_phrases[:1]:
    print (word)

@chef_bb


## Word2Vec Training 

In [37]:
# Number of CPUs reported by the OS; used to size the Word2Vec worker pool below.
cores = multiprocessing.cpu_count()
print (cores)

12


In [38]:
# Model Definition
# Leave two cores free for the rest of the system, but never request fewer than
# one worker: `cores - 2` would be <= 0 on machines with one or two cores.
w2v_model = Word2Vec(min_count=20,window=2,size=300,sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=max(1, cores-2))

In [39]:
# Build the model vocabulary from the phrased sentences and report the
# elapsed wall-clock time in minutes.
start_time = time()
w2v_model.build_vocab(sentences, progress_per=100)
elapsed_minutes = (time() - start_time) / 60

print(round(elapsed_minutes, 2))

0.11


In [40]:
# Train for 30 epochs over the corpus counted during build_vocab and report
# the elapsed wall-clock time in minutes.
start_time = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - start_time) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_minutes))

Time to train the model: 3.17 mins


In [43]:
# Words the trained model ranks as most similar to "bread", with their scores.
w2v_model.wv.most_similar(positive=["bread"])

[('puffs', 0.34401941299438477),
 ('pita', 0.3427869975566864),
 ('flaky', 0.3411823809146881),
 ('sandwich', 0.3360087275505066),
 ('cheese', 0.3156045973300934),
 ('omlette', 0.2941094636917114),
 ('garlic_bread', 0.29402774572372437),
 ('cheesy', 0.28658920526504517),
 ('toast', 0.28091931343078613),
 ('brown_bread', 0.280399888753891)]

In [44]:
# Similarity score between the vectors of "paneer" and "naan".
w2v_model.wv.similarity("paneer", 'naan')

0.32222542

In [45]:
# Similarity score between the vectors of "paneer" and "chicken".
w2v_model.wv.similarity("paneer", 'chicken')

0.5097032

In [None]:
# Similarity score between "waiter" and the detected phrase "paneer_tikka".
w2v_model.wv.similarity("waiter", 'paneer_tikka')

In [None]:
import numpy as np
def word_dist_visual(model, word,mode):
    """Plot a 2-D t-SNE map of `word` and its neighbours in the embedding.

    Parameters
    ----------
    model : trained Word2Vec-style model exposing `model.wv` with
        `most_similar` and list-based vector lookup.
    word : str
        Query word (plotted in blue).
    mode : {"pos", "neg", "all"}
        "pos" adds the words returned by `most_similar([word])` (green),
        "neg" adds those returned by `most_similar(negative=[word])` (red),
        "all" adds both.

    Raises
    ------
    ValueError
        If `mode` is not one of the three supported values.
    """
    if mode not in ("pos", "neg", "all"):
        raise ValueError("mode must be 'pos', 'neg' or 'all', got {!r}".format(mode))

    word_labels = [word]
    color_list  = ['blue']

    # Only query the neighbour lists that the selected mode actually plots.
    if mode in ("pos", "all"):
        for neighbour, _score in model.wv.most_similar([word]):
            word_labels.append(neighbour)
            color_list.append('limegreen')

    if mode in ("neg", "all"):
        for neighbour, _score in model.wv.most_similar(negative=[word]):
            word_labels.append(neighbour)
            color_list.append('red')

    # One lookup for all labels: returns an (n_words, vector_size) array, so
    # the embedding width is not hard-coded and no repeated np.append is needed.
    arrays = model.wv[word_labels]
    n_samples, n_features = arrays.shape

    # PCA down to (at most) 10 dimensions before t-SNE; n_components may not
    # exceed the number of samples or features.
    reduc = PCA(n_components=min(10, n_samples, n_features)).fit_transform(arrays)

    # t-SNE requires perplexity < n_samples; "pos"/"neg" yield fewer than 15 points.
    t_sne_cmp = TSNE(n_components=2, random_state=0, perplexity=min(15, n_samples - 1)).fit_transform(reduc)

    # Sets everything up to plot
    df = pd.DataFrame({'x': t_sne_cmp[:, 0],
                       'y': t_sne_cmp[:, 1],
                       'words': word_labels,
                       'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(10, 10)

    # Basic plot, drawn on the axes created above
    p1 = sns.regplot(data=df,x="x",y="y",fit_reg=False,marker="o",scatter_kws={'s': 40,'facecolors': df['color']},ax=ax)

    # Adds annotations one by one with a loop
    for row in df.itertuples(index=False):
        p1.text(row.x,row.y,'  ' + row.words.title(),horizontalalignment='left',verticalalignment='bottom', size=15,color=row.color,weight='normal')

    ax.set_title('t-SNE visualization for {}'.format(word.title()))

In [None]:
# Plot 'sambar' together with both its most-similar and negative neighbours.
word_dist_visual(w2v_model, 'sambar',"all")