In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import sklearn
import nltk
from gensim.models import Word2Vec

%matplotlib inline

In [2]:
empath = pd.read_pickle('empath')

empath.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime,categories,brand,healthScore,danger
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",5,2013-10-11,[[Grocery & Gourmet Food]],,0.0,0.0
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",5,2012-12-06,[[Grocery & Gourmet Food]],,0.0,
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,1,2013-12-02,[[Grocery & Gourmet Food]],,0.0,
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,3,2011-06-12,[[Grocery & Gourmet Food]],,0.0,0.0
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,5,2012-03-24,[[Grocery & Gourmet Food]],,0.0,


In [3]:
empath = empath.drop(labels=['unixReviewTime', 'categories', 'brand', 'overall'], axis=1)

empath.head()

Unnamed: 0,reviewerID,asin,reviewText,healthScore,danger
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",0.0,0.0
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",0.0,
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,0.0,
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,0.0,0.0
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,0.0,


In [4]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

stop_words = set(stopwords.words('english')) 

stemmer = PorterStemmer()

def process_text(sentence):
    token_words = nltk.word_tokenize(sentence)
    no_stopwords = [word.lower() for word in token_words if word not in stop_words and not \
                    all(c in string.punctuation for c in word) and not len(word) < 2]
    return [stemmer.stem(word) for word in no_stopwords]

print(process_text('I ordered spongbob slippers and I got John'))

['order', 'spongbob', 'slipper', 'got', 'john']


In [5]:
empath['tokenReviews'] = empath['reviewText'].apply(lambda x : process_text(x))

empath.head()

Unnamed: 0,reviewerID,asin,reviewText,healthScore,danger,tokenReviews
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",0.0,0.0,"[no, sugar, gmo, garbag, filler, come, store, ..."
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",0.0,,"[thi, absolut, undisput, favorit, tea, right, ..."
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,0.0,,"[order, spongbob, slipper, got, john, cena, ha..."
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,0.0,0.0,"[the, cart, fine, work, purpos, bought, farmer..."
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,0.0,,"[thi, product, archer, farm, best, drink, mix,..."


In [6]:
size=100

model = Word2Vec(empath['tokenReviews'].values, min_count=1, size=size)

print(model)

Word2Vec(vocab=510787, size=100, alpha=0.025)


In [7]:
model.save('model.bin')

In [8]:
def token_to_vectors(x):
    arr = []
    for word in x:
        if word in model.wv.vocab:
            arr.append(model.wv[word])
    if (len(arr) == 0):
        return np.zeros((size,))
    return np.array(arr).mean(axis=0)

print(token_to_vectors(['gmo', 'undisputed']))

[-1.4299864e+00  6.1419249e-01 -6.7280042e-01 -4.7057580e-02
 -1.9351022e+00  1.3490531e+00  4.4879012e+00  1.5074408e+00
 -5.2873306e+00  7.6964438e-01 -2.9895549e+00 -1.8226904e+00
 -3.1845052e+00  6.5057032e-02  2.0344546e+00  3.3011502e-01
 -8.9194840e-01  2.4288208e+00 -7.8935671e-01  2.2583289e+00
 -8.4587640e-01  1.7959960e+00 -7.1261251e-01  1.7581066e+00
  9.3076175e-01 -4.1530335e-01  2.4760461e+00  8.2634765e-01
  2.1033955e+00 -2.3181653e+00 -5.5255210e-01  1.4868140e+00
 -1.3535790e-01 -1.5631964e+00 -9.5714867e-01  1.0214260e+00
  1.7677661e+00 -2.1691422e+00  3.0528378e+00  3.3635335e+00
  3.4955209e-01  3.0244169e+00 -1.8882445e+00 -1.7928773e+00
  2.1530716e+00 -1.7817794e-01  3.2175586e-01 -1.3825505e+00
  2.7364223e+00  3.5132888e-01 -2.0208201e+00  6.0491118e+00
 -6.2275511e-01  6.0901594e-01  3.5119581e+00  2.5449550e+00
  2.1350789e+00 -1.4994820e+00 -3.0934546e+00  1.9576004e+00
 -3.1513623e-03  5.5247908e+00 -1.9490995e+00 -2.4808099e+00
 -1.8262484e+00 -1.02238

In [9]:
empath['feature'] = empath['tokenReviews'].apply(lambda x : token_to_vectors(x))

empath.head()

Unnamed: 0,reviewerID,asin,reviewText,healthScore,danger,tokenReviews,feature
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",0.0,0.0,"[no, sugar, gmo, garbag, filler, come, store, ...","[-0.34659338, -0.34547496, -0.25911003, -0.037..."
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",0.0,,"[thi, absolut, undisput, favorit, tea, right, ...","[0.25467676, -0.6140808, -0.3436196, -0.129033..."
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,0.0,,"[order, spongbob, slipper, got, john, cena, ha...","[-0.4425382, -0.038782705, 0.17269209, -0.1097..."
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,0.0,0.0,"[the, cart, fine, work, purpos, bought, farmer...","[0.10807245, -0.47217086, -0.37766388, 0.14615..."
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,0.0,,"[thi, product, archer, farm, best, drink, mix,...","[0.32985213, -1.4198517, -1.0735645, 0.8576383..."


In [11]:
train = empath[~empath['danger'].isna()]

train.head()

Unnamed: 0,reviewerID,asin,reviewText,healthScore,danger,tokenReviews,feature
0,A1ZQZ8RJS1XVTX,657745316,"No sugar, no GMO garbage, no fillers that come...",0.0,0.0,"[no, sugar, gmo, garbag, filler, come, store, ...","[-0.34659338, -0.34547496, -0.25911003, -0.037..."
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,0.0,0.0,"[the, cart, fine, work, purpos, bought, farmer...","[0.10807245, -0.47217086, -0.37766388, 0.14615..."
6,A3LZA698SQPCXE,1453060464,My wife picked some of this up on sale. I usu...,0.0,0.0,"[my, wife, pick, sale, usual, drink, crystal, ...","[-0.45956272, -0.0620397, -0.018273983, -0.152..."
19,A2IPE2KFGTZMI3,3295000018,"Fast and hot, did I mention hot, well it's hot",0.0,0.0,"[fast, hot, mention, hot, well, 's, hot]","[-0.59457356, -1.7851154, -0.10007, -1.5215352..."
23,A2H05FXY2BXUAN,3295000018,High quality! worth every penny you pay for it...,0.0,0.0,"[high, qualiti, worth, everi, penni, pay, the,...","[-0.7141961, -0.40475655, 0.48530835, -0.63792..."


In [14]:
from sklearn.model_selection import train_test_split

stratify_parameter = train.danger

X = np.array(train['feature'].tolist())

X_train, X_test, y_train, y_test = train_test_split(X, train['danger'].astype(int).values, \
                                                    test_size=0.1, random_state=10, \
                                                   stratify=stratify_parameter)

In [15]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=100, random_state=10)
classifier.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=10, verbose=0,
                       warm_start=False)

In [21]:
from sklearn.metrics import accuracy_score

y_pred = classifier.predict(X_test)

print("Accuracy on validation set : {0:.3f}".format(accuracy_score(y_test, y_pred)))

Accuracy on validation set : 0.912


In [22]:
test = empath[empath['danger'].isna()]

test.head()

Unnamed: 0,reviewerID,asin,reviewText,healthScore,danger,tokenReviews,feature
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",0.0,,"[thi, absolut, undisput, favorit, tea, right, ...","[0.25467676, -0.6140808, -0.3436196, -0.129033..."
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,0.0,,"[order, spongbob, slipper, got, john, cena, ha...","[-0.4425382, -0.038782705, 0.17269209, -0.1097..."
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,0.0,,"[thi, product, archer, farm, best, drink, mix,...","[0.32985213, -1.4198517, -1.0735645, 0.8576383..."
5,A3DTB6RVENLQ9Q,1453060375,Don't buy this item - rip off at this price. ...,0.0,,"[do, n't, buy, item, rip, price, my, bad, mist...","[-0.046405952, 0.04730592, -1.1957499, 1.58346..."
7,A2XZPK86YY9R6G,1453060782,I bought these on sale (2 for $4) at my local ...,0.0,,"[bought, sale, local, supermarket, probabl, ov...","[-0.038239032, -0.6970307, -0.2797609, 0.05108..."


In [24]:
X_testing_set = np.array(test['feature'].tolist())

test_pred = classifier.predict(X_testing_set)

MemoryError: Unable to allocate array with shape (1152954, 100) and data type float64