In [12]:
import sqlite3
import os
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from tqdm import tqdm

# Loading Dataset

In [2]:
if os.path.isfile('final.sqlite'):
    conn = sqlite3.connect('final.sqlite')
    final = pd.read_sql_query('select * from reviews', conn)
    conn.close()
else:
    print('Please run Text Preprocessing code file')

# Word2Vec

In [3]:
# Method-1 -> Using Google News Word2Vectors

# In this project we are using a pretrained model by Google
# its 3.3GB file, once you load this into your memory it occupies
# ~9GB, so please do this step only if you have > 12GB of RAM.

# It contains all our courpus words as keys and  model[word] as values.

# To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# it's 1.9GB in size.

if os.path.isfile('GoogleNews-vectors-negative300.bin'):
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [4]:
print("the vector representation of word 'computer' : \n", model['computer'])
print("the similarity between the words 'woman' and 'man' : \n", model.similarity('woman', 'man')) # partially similar
print("the most similar words to 'woman'", model.most_similar('woman'))

the vector representation of word 'computer' : 
 [ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777

In [5]:
# Method-2 -> Train your own Word2Vec model using your own text corpus
list_of_sent = []
for sent in final['CleanedText'].values:
    list_of_sent.append(sent.split())

In [6]:
print(final['CleanedText'][0])
print("********************************************************")
print(list_of_sent[0])

bought sever vital can dog food product found good qualiti product look like stew process meat smell better labrador finicki appreci product better
********************************************************
['bought', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', 'labrador', 'finicki', 'appreci', 'product', 'better']


In [7]:
# min_count = 2 considers only words that occured atleast 2 times
w2v_model = Word2Vec(list_of_sent, min_count=2, vector_size=50, workers=4)

In [8]:
w2v_words = list(w2v_model.wv.index_to_key)
print("number of words that occured minimum 2 times : ", len(w2v_words))
print("sample words : ", w2v_words[:50])

number of words that occured minimum 5 times :  19065
sample words :  ['like', 'tast', 'flavor', 'good', 'love', 'one', 'product', 'great', 'use', 'tri', 'coffe', 'tea', 'food', 'get', 'make', 'would', 'dog', 'buy', 'eat', 'time', 'realli', 'dont', 'amazon', 'much', 'order', 'price', 'drink', 'also', 'bag', 'littl', 'find', 'best', 'even', 'well', 'chocol', 'ive', 'store', 'better', 'treat', 'box', 'cup', 'day', 'mix', 'recommend', 'look', 'sugar', 'first', 'give', 'year', 'sweet']


In [9]:
w2v_model.wv.most_similar('tasti')

[('delici', 0.7905284762382507),
 ('yummi', 0.7689010500907898),
 ('terrif', 0.685287356376648),
 ('satisfi', 0.6795302629470825),
 ('hearti', 0.6712507009506226),
 ('good', 0.6307939291000366),
 ('nice', 0.6246684193611145),
 ('fantast', 0.6041843295097351),
 ('crunchi', 0.5921370983123779),
 ('fill', 0.5820397734642029)]

In [10]:
w2v_model.wv.most_similar('like')

[('dislik', 0.6997739672660828),
 ('weird', 0.6994861364364624),
 ('okay', 0.6726734042167664),
 ('appeal', 0.661604642868042),
 ('prefer', 0.6582334637641907),
 ('think', 0.6539950370788574),
 ('good', 0.6336985230445862),
 ('hate', 0.6330476999282837),
 ('enjoy', 0.6313024759292603),
 ('remind', 0.6099268794059753)]

# Avg Word2Vec

In [13]:
# Average Word2Vec
# compute average word2vec for each review
sent_vectors = [] # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sent): # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of length 50
    cnt_words = 0 # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in rview/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words # taking average
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|██████████| 87773/87773 [01:18<00:00, 1117.55it/s]


87773
50
