In [13]:
import sqlite3
import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Loading Dataset

In [2]:
# Load the preprocessed reviews table from the local SQLite file
# (presumably written by the text-preprocessing notebook -- see the message below).
if os.path.isfile('final.sqlite'):
    conn = sqlite3.connect('final.sqlite')
    try:
        final = pd.read_sql_query('select * from reviews', conn)
    finally:
        # Always release the database handle, even when the query fails
        # (e.g. the `reviews` table is missing).
        conn.close()
else:
    # NOTE: `final` stays undefined on this path, so the cells below will fail
    # until final.sqlite is available.
    print('Please run Text Preprocessing code file')

# Word2Vec

In [3]:
# Method-1 -> Using Google News Word2Vec vectors

# This section uses a model pretrained by Google. The original notes quote
# both 3.3GB and 1.9GB for the file (TODO confirm -- possibly extracted vs.
# downloaded size). Once loaded into memory it occupies ~9GB, so please run
# this step only if you have > 12GB of RAM.

# The loaded object is keyed by word: model[word] returns that word's vector.
# NOTE(review): only words in the pretrained vocabulary are available; the
# words of our own corpus are not guaranteed to be present.

# To use this code snippet, download "GoogleNews-vectors-negative300.bin"
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

if os.path.isfile('GoogleNews-vectors-negative300.bin'):
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
else:
    # Without the file `model` is never defined, so say so here instead of
    # letting the next cell fail with an unexplained NameError.
    print('GoogleNews-vectors-negative300.bin not found; download it before running the Method-1 cells')

In [4]:
# Quick look at the pretrained vectors: one raw embedding, one pairwise
# similarity, and the nearest neighbours of a word.
computer_vec = model['computer']
print("the vector representation of word 'computer' : \n", computer_vec)

woman_man_sim = model.similarity('woman', 'man')  # partially similar
print("the similarity between the words 'woman' and 'man' : \n", woman_man_sim)

woman_neighbours = model.most_similar('woman')
print("the most similar words to 'woman'", woman_neighbours)

the vector representation of word 'computer' : 
 [ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777

In [5]:
# Method-2 -> Train your own Word2Vec model using your own text corpus
# Split every cleaned review on whitespace so each review becomes a list of words.
list_of_sent = [review.split() for review in final['CleanedText'].values]

In [6]:
# Show the first cleaned review next to its tokenised form, separated by a
# row of asterisks.
print(
    final['CleanedText'][0],
    "********************************************************",
    list_of_sent[0],
    sep="\n",
)

bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better
********************************************************
['bought', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', 'labrador', 'finicki', 'appreci', 'product', 'better']


In [7]:
# Train a Word2Vec model on the tokenised reviews.
# min_count=2 keeps only words that occurred at least 2 times;
# vector_size=50 is the length of each learned word vector.
w2v_model = Word2Vec(
    sentences=list_of_sent,
    vector_size=50,
    min_count=2,
    workers=4,
)

In [8]:
# Words kept in the trained model's vocabulary (those that met min_count).
w2v_words = list(w2v_model.wv.index_to_key)
vocab_size = len(w2v_words)
first_words = w2v_words[:50]
print("number of words that occured minimum 2 times : ", vocab_size)
print("sample words : ", first_words)

number of words that occured minimum 2 times :  19065
sample words :  ['like', 'tast', 'flavor', 'good', 'love', 'one', 'product', 'great', 'use', 'tri', 'coffe', 'tea', 'food', 'get', 'make', 'would', 'dog', 'buy', 'eat', 'time', 'realli', 'dont', 'amazon', 'much', 'order', 'price', 'drink', 'also', 'bag', 'littl', 'find', 'best', 'even', 'well', 'chocol', 'ive', 'store', 'better', 'treat', 'box', 'cup', 'day', 'mix', 'recommend', 'look', 'sugar', 'first', 'give', 'year', 'sweet']


In [9]:
# Ten nearest neighbours of the stemmed word 'tasti' in the model trained above.
w2v_model.wv.most_similar('tasti', topn=10)

[('delici', 0.8055939078330994),
 ('yummi', 0.7851197123527527),
 ('satisfi', 0.6936739683151245),
 ('hearti', 0.6652274131774902),
 ('good', 0.6376382112503052),
 ('nice', 0.6251686811447144),
 ('terrif', 0.6196783781051636),
 ('dens', 0.6193222403526306),
 ('fantast', 0.6180269718170166),
 ('nutriti', 0.6091058254241943)]

In [10]:
# Ten nearest neighbours of 'like'. In the recorded output, opposites such as
# 'dislik' and 'hate' also rank highly.
w2v_model.wv.most_similar('like', topn=10)

[('dislik', 0.7316311597824097),
 ('weird', 0.7020769715309143),
 ('okay', 0.6741054058074951),
 ('prefer', 0.6582061052322388),
 ('gross', 0.6522704362869263),
 ('odd', 0.6448681950569153),
 ('think', 0.6437071561813354),
 ('appeal', 0.636507511138916),
 ('hate', 0.6267186999320984),
 ('funni', 0.6081758141517639)]

# Avg Word2Vec

In [11]:
# Average Word2Vec
# Compute the average word2vec vector for each review.
vec_dim = w2v_model.wv.vector_size  # length of each word vector, read from the model instead of hard-coding 50
# Dict keyed by vocabulary word: membership tests are O(1), unlike scanning
# the `w2v_words` list once per token.
w2v_vocab = w2v_model.wv.key_to_index

sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sent):  # for each review/sentence
    sent_vec = np.zeros(vec_dim)
    cnt_words = 0  # number of words in this review that have a trained vector
    for word in sent:  # for each word in the review/sentence
        if word in w2v_vocab:
            sent_vec += w2v_model.wv[word]
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words  # taking the average
    # A review with no in-vocabulary word keeps the all-zero vector.
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|██████████| 87773/87773 [02:28<00:00, 591.46it/s] 


87773
50


# TFIDF-Word2Vec

In [15]:
# Fit TF-IDF on the cleaned reviews.
# NOTE: this rebinds `model`, which earlier cells use for the Google News vectors.
model = TfidfVectorizer()
tf_idf_matrix = model.fit_transform(final['CleanedText'].values)
# Dictionary with word as key and its idf as value.
# Built from vocabulary_ (word -> column index) and idf_ (idf per column)
# instead of get_feature_names(), which newer scikit-learn releases removed.
dictionary = {word: model.idf_[col] for word, col in model.vocabulary_.items()}

In [17]:
# TF-IDF weighted Word2Vec
# tfidf words/col-names in column order. Derived from vocabulary_ because
# get_feature_names() was removed from newer scikit-learn releases.
tfidf_feat = sorted(model.vocabulary_, key=model.vocabulary_.get)
# tf_idf_matrix is a sparse matrix with row=sentence, col=word and cell_val=tfidf

vec_dim = w2v_model.wv.vector_size  # length of each word vector, read from the model instead of hard-coding 50
# Dict keyed by vocabulary word: O(1) membership instead of scanning the
# `w2v_words` list once per token.
w2v_vocab = w2v_model.wv.key_to_index

tfidf_sent_vectors = []  # the tfidf-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sent):  # for each review/sentence
    sent_vec = np.zeros(vec_dim)
    weight_sum = 0  # running sum of the tf-idf weights applied in this review
    for word in sent:  # for each word in a review/sentence
        # A word needs both a trained vector and an idf value. The reviews
        # were tokenised with str.split(), while TfidfVectorizer uses its own
        # token pattern, so a word may be missing from `dictionary`; skip it
        # rather than raise KeyError.
        if word in w2v_vocab and word in dictionary:
            vec = w2v_model.wv[word]
            # Instead of looking the value up in tf_idf_matrix, compute it directly:
            # sent.count(word)/len(sent) = tf of the word in the current review
            # dictionary[word]           = idf of the word over the whole corpus
            tf_idf = (sent.count(word) / len(sent)) * dictionary[word]
            sent_vec += vec * tf_idf
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum  # normalise by the total weight
    # A review with no usable word keeps the all-zero vector.
    tfidf_sent_vectors.append(sent_vec)


  0%|          | 0/87773 [00:00<?, ?it/s]
  0%|          | 52/87773 [00:00<02:50, 515.98it/s]
  0%|          | 127/87773 [00:00<02:34, 568.32it/s]
  0%|          | 206/87773 [00:00<02:21, 619.49it/s]
  0%|          | 289/87773 [00:00<02:11, 667.50it/s]
  0%|          | 402/87773 [00:00<01:54, 759.99it/s]
  1%|          | 488/87773 [00:00<01:51, 779.49it/s]
  1%|          | 565/87773 [00:00<01:53, 765.69it/s]
  1%|          | 661/87773 [00:00<01:47, 813.61it/s]
  1%|          | 743/87773 [00:00<01:52, 776.90it/s]
  1%|          | 824/87773 [00:01<01:51, 782.60it/s]
  1%|          | 937/87773 [00:01<01:41, 854.84it/s]
  1%|          | 1025/87773 [00:01<01:43, 840.20it/s]
  1%|▏         | 1123/87773 [00:01<01:39, 870.11it/s]
  1%|▏         | 1217/87773 [00:01<01:37, 888.04it/s]
  1%|▏         | 1307/87773 [00:01<01:42, 847.07it/s]
  2%|▏         | 1393/87773 [00:01<02:01, 708.79it/s]
  2%|▏         | 1469/87773 [00:01<02:07, 679.48it/s]
  2%|▏         | 1541/87773 [00:01<02:07, 676.04it/

 14%|█▍        | 12162/87773 [00:17<01:37, 774.38it/s]
 14%|█▍        | 12242/87773 [00:17<01:53, 667.06it/s]
 14%|█▍        | 12327/87773 [00:18<01:46, 710.01it/s]
 14%|█▍        | 12402/87773 [00:18<02:05, 602.44it/s]
 14%|█▍        | 12468/87773 [00:18<02:49, 444.22it/s]
 14%|█▍        | 12523/87773 [00:18<02:53, 433.99it/s]
 14%|█▍        | 12574/87773 [00:18<02:55, 429.45it/s]
 14%|█▍        | 12639/87773 [00:18<02:37, 477.36it/s]
 15%|█▍        | 12737/87773 [00:18<02:13, 563.43it/s]
 15%|█▍        | 12845/87773 [00:19<01:54, 656.98it/s]
 15%|█▍        | 12954/87773 [00:19<01:40, 744.72it/s]
 15%|█▍        | 13042/87773 [00:19<01:50, 678.54it/s]
 15%|█▍        | 13121/87773 [00:19<01:49, 679.50it/s]
 15%|█▌        | 13197/87773 [00:19<01:55, 645.23it/s]
 15%|█▌        | 13310/87773 [00:19<01:40, 740.31it/s]
 15%|█▌        | 13397/87773 [00:19<01:36, 774.53it/s]
 15%|█▌        | 13481/87773 [00:19<01:42, 724.65it/s]
 15%|█▌        | 13559/87773 [00:20<01:42, 720.56it/s]
 16%|█▌   

 29%|██▉       | 25801/87773 [00:35<01:09, 887.76it/s]
 30%|██▉       | 25925/87773 [00:35<01:03, 968.80it/s]
 30%|██▉       | 26027/87773 [00:35<01:05, 938.40it/s]
 30%|██▉       | 26152/87773 [00:35<01:00, 1012.44it/s]
 30%|██▉       | 26258/87773 [00:35<01:00, 1016.23it/s]
 30%|███       | 26363/87773 [00:35<00:59, 1024.61it/s]
 30%|███       | 26468/87773 [00:35<01:00, 1015.89it/s]
 30%|███       | 26573/87773 [00:35<00:59, 1022.97it/s]
 30%|███       | 26688/87773 [00:36<00:58, 1053.11it/s]
 31%|███       | 26795/87773 [00:36<01:06, 917.12it/s] 
 31%|███       | 26891/87773 [00:36<01:10, 862.76it/s]
 31%|███       | 26982/87773 [00:36<01:09, 874.07it/s]
 31%|███       | 27072/87773 [00:36<01:12, 833.49it/s]
 31%|███       | 27205/87773 [00:36<01:04, 937.16it/s]
 31%|███       | 27324/87773 [00:36<01:00, 999.19it/s]
 31%|███▏      | 27430/87773 [00:36<01:02, 967.37it/s]
 31%|███▏      | 27531/87773 [00:36<01:01, 977.70it/s]
 31%|███▏      | 27632/87773 [00:37<01:01, 979.43it/s]
 32

 45%|████▍     | 39284/87773 [00:51<01:19, 608.65it/s]
 45%|████▍     | 39349/87773 [00:52<01:22, 585.92it/s]
 45%|████▍     | 39411/87773 [00:52<01:24, 572.91it/s]
 45%|████▌     | 39500/87773 [00:52<01:15, 641.33it/s]
 45%|████▌     | 39573/87773 [00:52<01:12, 664.26it/s]
 45%|████▌     | 39683/87773 [00:52<01:03, 752.71it/s]
 45%|████▌     | 39765/87773 [00:52<01:02, 770.05it/s]
 45%|████▌     | 39870/87773 [00:52<00:57, 836.42it/s]
 46%|████▌     | 39977/87773 [00:52<00:53, 888.96it/s]
 46%|████▌     | 40071/87773 [00:52<00:57, 835.80it/s]
 46%|████▌     | 40159/87773 [00:53<01:02, 763.22it/s]
 46%|████▌     | 40266/87773 [00:53<00:56, 833.56it/s]
 46%|████▌     | 40361/87773 [00:53<00:55, 856.28it/s]
 46%|████▌     | 40450/87773 [00:53<00:55, 849.47it/s]
 46%|████▌     | 40555/87773 [00:53<00:52, 897.10it/s]
 46%|████▋     | 40648/87773 [00:53<00:52, 896.96it/s]
 46%|████▋     | 40752/87773 [00:53<00:50, 934.42it/s]
 47%|████▋     | 40848/87773 [00:53<00:52, 885.47it/s]
 47%|████▋

 60%|██████    | 52869/87773 [01:08<00:48, 720.70it/s]
 60%|██████    | 52943/87773 [01:08<00:49, 706.20it/s]
 60%|██████    | 53015/87773 [01:08<00:52, 663.84it/s]
 60%|██████    | 53083/87773 [01:09<00:52, 655.62it/s]
 61%|██████    | 53153/87773 [01:09<00:52, 665.04it/s]
 61%|██████    | 53228/87773 [01:09<00:50, 686.66it/s]
 61%|██████    | 53298/87773 [01:09<00:53, 648.59it/s]
 61%|██████    | 53374/87773 [01:09<00:50, 677.09it/s]
 61%|██████    | 53454/87773 [01:09<00:48, 708.37it/s]
 61%|██████    | 53536/87773 [01:09<00:46, 735.11it/s]
 61%|██████    | 53648/87773 [01:09<00:41, 816.55it/s]
 61%|██████    | 53749/87773 [01:09<00:39, 860.30it/s]
 61%|██████▏   | 53839/87773 [01:09<00:40, 840.81it/s]
 61%|██████▏   | 53941/87773 [01:10<00:38, 870.07it/s]
 62%|██████▏   | 54030/87773 [01:10<00:38, 875.53it/s]
 62%|██████▏   | 54131/87773 [01:10<00:37, 893.31it/s]
 62%|██████▏   | 54222/87773 [01:10<00:37, 885.94it/s]
 62%|██████▏   | 54342/87773 [01:10<00:35, 948.39it/s]
 62%|█████

 76%|███████▌  | 66630/87773 [01:25<00:24, 856.12it/s]
 76%|███████▌  | 66717/87773 [01:25<00:25, 841.08it/s]
 76%|███████▌  | 66838/87773 [01:25<00:22, 924.21it/s]
 76%|███████▋  | 66934/87773 [01:25<00:22, 920.12it/s]
 76%|███████▋  | 67029/87773 [01:26<00:25, 812.97it/s]
 76%|███████▋  | 67115/87773 [01:26<00:31, 656.49it/s]
 77%|███████▋  | 67189/87773 [01:26<00:30, 678.12it/s]
 77%|███████▋  | 67301/87773 [01:26<00:26, 768.06it/s]
 77%|███████▋  | 67388/87773 [01:26<00:25, 794.45it/s]
 77%|███████▋  | 67478/87773 [01:26<00:24, 819.53it/s]
 77%|███████▋  | 67568/87773 [01:26<00:24, 840.40it/s]
 77%|███████▋  | 67681/87773 [01:26<00:22, 910.29it/s]
 77%|███████▋  | 67787/87773 [01:27<00:21, 943.30it/s]
 77%|███████▋  | 67885/87773 [01:27<00:23, 829.16it/s]
 77%|███████▋  | 67973/87773 [01:27<00:25, 765.33it/s]
 78%|███████▊  | 68054/87773 [01:27<00:26, 744.54it/s]
 78%|███████▊  | 68141/87773 [01:27<00:25, 776.68it/s]
 78%|███████▊  | 68223/87773 [01:27<00:24, 787.45it/s]
 78%|█████

 91%|█████████ | 79714/87773 [01:42<00:11, 681.82it/s]
 91%|█████████ | 79789/87773 [01:42<00:11, 678.66it/s]
 91%|█████████ | 79889/87773 [01:42<00:10, 749.89it/s]
 91%|█████████ | 80006/87773 [01:42<00:09, 838.91it/s]
 91%|█████████▏| 80099/87773 [01:42<00:08, 861.05it/s]
 91%|█████████▏| 80191/87773 [01:43<00:08, 847.05it/s]
 91%|█████████▏| 80306/87773 [01:43<00:08, 919.15it/s]
 92%|█████████▏| 80403/87773 [01:43<00:08, 878.80it/s]
 92%|█████████▏| 80495/87773 [01:43<00:08, 831.21it/s]
 92%|█████████▏| 80608/87773 [01:43<00:07, 901.20it/s]
 92%|█████████▏| 80702/87773 [01:43<00:09, 742.66it/s]
 92%|█████████▏| 80823/87773 [01:43<00:08, 836.97it/s]
 92%|█████████▏| 80926/87773 [01:43<00:07, 885.17it/s]
 92%|█████████▏| 81022/87773 [01:44<00:07, 899.43it/s]
 92%|█████████▏| 81117/87773 [01:44<00:07, 894.11it/s]
 93%|█████████▎| 81210/87773 [01:44<00:07, 902.55it/s]
 93%|█████████▎| 81313/87773 [01:44<00:06, 935.41it/s]
 93%|█████████▎| 81416/87773 [01:44<00:06, 957.28it/s]
 93%|█████