In [1]:
import pandas as pd
import string
import re
import time

In [2]:
df = pd.read_csv('AmazonReviews.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [3]:
text = df.Text.to_list()
text[0]

'I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'

In [4]:
text2 = [re.sub(r'[^\w\s]','', x).lower().split() for x in text]
text2[:5]

[['i',
  'have',
  'bought',
  'several',
  'of',
  'the',
  'vitality',
  'canned',
  'dog',
  'food',
  'products',
  'and',
  'have',
  'found',
  'them',
  'all',
  'to',
  'be',
  'of',
  'good',
  'quality',
  'the',
  'product',
  'looks',
  'more',
  'like',
  'a',
  'stew',
  'than',
  'a',
  'processed',
  'meat',
  'and',
  'it',
  'smells',
  'better',
  'my',
  'labrador',
  'is',
  'finicky',
  'and',
  'she',
  'appreciates',
  'this',
  'product',
  'better',
  'than',
  'most'],
 ['product',
  'arrived',
  'labeled',
  'as',
  'jumbo',
  'salted',
  'peanutsthe',
  'peanuts',
  'were',
  'actually',
  'small',
  'sized',
  'unsalted',
  'not',
  'sure',
  'if',
  'this',
  'was',
  'an',
  'error',
  'or',
  'if',
  'the',
  'vendor',
  'intended',
  'to',
  'represent',
  'the',
  'product',
  'as',
  'jumbo'],
 ['this',
  'is',
  'a',
  'confection',
  'that',
  'has',
  'been',
  'around',
  'a',
  'few',
  'centuries',
  'it',
  'is',
  'a',
  'light',
  'pillowy',

In [5]:
tokens = text2[:200000]
len(tokens)

200000

In [6]:
import gensim
from gensim.models.callbacks import CallbackAny2Vec
from gensim.models import Word2Vec

In [7]:
class Callback(CallbackAny2Vec):
    
    def __init__(self):
        self.epoch = 0
        
    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        elif self.epoch % 10 == 0:        
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [8]:
w2v = Word2Vec(size = 300,        # embedding size
               window = 15,       # max distance between target and context word 
               min_count = 2,     # ignore words with freq lower than this
               workers = 4,       # threads
               sg = 1,            # 1 for skip-gram, 0 for CBOW
               negative = 5,      # negative sampling number
               sample = 1e-5)     # hyper-parameter to downsample high freq word

In [9]:
w2v.build_vocab(tokens)

In [10]:
start = time.time()
w2v.train(tokens, total_examples=w2v.corpus_count, epochs=101,
         report_delay=1, compute_loss=True, callbacks=[Callback()])
end = time.time()

print('Time cost: ' + str(end-start))

Loss after epoch 0: 19041792.0
Loss after epoch 10: 705472.0
Loss after epoch 20: 734552.0
Loss after epoch 30: 657536.0
Loss after epoch 40: 600680.0
Loss after epoch 50: 526160.0
Loss after epoch 60: 449072.0
Loss after epoch 70: 369360.0
Loss after epoch 80: 286224.0
Loss after epoch 90: 219496.0
Loss after epoch 100: 168920.0
Time cost: 1710.249447107315


In [13]:
w2v.save('w2v(amazon).model')

In [16]:
reloaded_w2c = Word2Vec.load('w2v(amazon).model')
words = list(reloaded_w2c.wv.vocab)
print('Vocab size: ' + str(len(words)))

Vocab size: 66720


In [20]:
words[:100]

['i',
 'have',
 'bought',
 'several',
 'of',
 'the',
 'vitality',
 'canned',
 'dog',
 'food',
 'products',
 'and',
 'found',
 'them',
 'all',
 'to',
 'be',
 'good',
 'quality',
 'product',
 'looks',
 'more',
 'like',
 'a',
 'stew',
 'than',
 'processed',
 'meat',
 'it',
 'smells',
 'better',
 'my',
 'labrador',
 'is',
 'finicky',
 'she',
 'appreciates',
 'this',
 'most',
 'arrived',
 'labeled',
 'as',
 'jumbo',
 'salted',
 'peanutsthe',
 'peanuts',
 'were',
 'actually',
 'small',
 'sized',
 'unsalted',
 'not',
 'sure',
 'if',
 'was',
 'an',
 'error',
 'or',
 'vendor',
 'intended',
 'represent',
 'confection',
 'that',
 'has',
 'been',
 'around',
 'few',
 'centuries',
 'light',
 'pillowy',
 'citrus',
 'gelatin',
 'with',
 'nuts',
 'in',
 'case',
 'filberts',
 'cut',
 'into',
 'tiny',
 'squares',
 'then',
 'liberally',
 'coated',
 'powdered',
 'sugar',
 'mouthful',
 'heaven',
 'too',
 'chewy',
 'very',
 'flavorful',
 'highly',
 'recommend',
 'yummy',
 'treat',
 'you',
 'are',
 'familiar'

In [23]:
w1 = 'several'
print('Top 3 similar words:')
reloaded_w2c.wv.most_similar(positive = w1, topn = 3)
# need more training, need remove stopwords

Top 3 similar words:


[('have', 0.7322862148284912),
 ('been', 0.6838940382003784),
 ('ive', 0.6773158311843872)]

In [21]:
print('Similarity between A and B:')
reloaded_w2c.wv.similarity(w1='better', w2='good')

Similarity between A and B:


0.77047503

In [None]:
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    vectors = [] # positions in vector space
    labels = [] # keep track of words to label our data again later
    for word in model.wv.vocab:
        vectors.append(model.wv[word])
        labels.append(word)

    # convert both lists into numpy vectors for reduction
    vectors = np.asarray(vectors)
    #labels = np.asarray(labels)

    # reduce using t-SNE
    vectors = np.asarray(vectors)
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(reloaded_w2c)

In [None]:
import matplotlib.pyplot as plt
import random

def plot_with_matplotlib(x_vals, y_vals, labels):


    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    
    
    indices = list(range(len(labels)))
    #selected_indices = random.sample(indices, 25)
    selected_indices=[]
    index = labels.index("cell")
    selected_indices.append(index)
    index = labels.index("phone")
    selected_indices.append(index)
    index = labels.index("noise")
    selected_indices.append(index)
    index = labels.index("cancellation")
    selected_indices.append(index)
    index = labels.index("charger")
    selected_indices.append(index)
    index = labels.index("charge")
    selected_indices.append(index)
    index = labels.index("poor")
    selected_indices.append(index)
    index = labels.index("bad")
    selected_indices.append(index)
    
    
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))



plot_function = plot_with_matplotlib


plot_function(x_vals, y_vals, labels)