In [1]:
#preprocess the .json file
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

df = pd.read_json('reviews.json', lines=True)
data = df[["reviewText", "overall"]]

In [2]:
data.head()

Unnamed: 0,reviewText,overall
0,"This came in on time and I am veru happy with it, I haved used it already and it makes taking ou...",5
1,"I had a factory Glock tool that I was using for my Glock 26, 27, and 17. I've since lost it and...",5
2,"If you don't have a 3/32 punch or would like to have one in your Glock bag, this is okay. The b...",4
3,"This works no better than any 3/32 punch you would find at the hardware store. Actually, I think...",4
4,I purchased this thinking maybe I need a special tool to easily pop off my base plates for my ma...,4


In [3]:
# the preprocess_documents function in gensim automatically stem
# which is not desired
# thus this customized gensim_strip function

#from gensim.parsing.preprocessing import preprocess_documents
from gensim.parsing.preprocessing import strip_tags
from gensim.parsing.preprocessing import strip_short
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_numeric
from gensim.parsing.preprocessing import strip_non_alphanum
from gensim.parsing.preprocessing import strip_multiple_whitespaces

def gensim_strip(input_data):
    input_data = input_data.lower()
    input_data = strip_tags(input_data)
    input_data = strip_short(input_data, minsize = 3)
    input_data = strip_punctuation(input_data)
    input_data = strip_numeric(input_data)
    input_data = strip_non_alphanum(input_data)
    input_data = strip_multiple_whitespaces(input_data)
    return input_data

In [4]:
import spacy

# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

In [5]:
# preprocess text using strip and lemmatization
def tokenvec(line):
    line = gensim_strip(line)
    doc = nlp(line)
    doc = [token.lemma_ for token in doc]
    return [token for token in doc if len(token) >= 3]

In [12]:
l = []
for index, row in data.iterrows():
    l.append(tokenvec(row['reviewText']))
data['tokenized review'] = l
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['tokenized review'] = l


Unnamed: 0,reviewText,overall,tokenized review
0,"This came in on time and I am veru happy with it, I haved used it already and it makes taking ou...",5,"[this, come, time, and, veru, happy, with, have, use, already, and, make, take, out, the, pin, g..."
1,"I had a factory Glock tool that I was using for my Glock 26, 27, and 17. I've since lost it and...",5,"[have, factory, glock, tool, that, use, for, glock, and, since, lose, and, have, need, another, ..."
2,"If you don't have a 3/32 punch or would like to have one in your Glock bag, this is okay. The b...",4,"[you, don, have, punch, would, like, have, one, your, glock, bag, this, okay, the, butt, end, ha..."
3,"This works no better than any 3/32 punch you would find at the hardware store. Actually, I think...",4,"[this, work, well, than, any, punch, you, would, find, the, hardware, store, actually, think, yo..."
4,I purchased this thinking maybe I need a special tool to easily pop off my base plates for my ma...,4,"[purchase, this, thinking, maybe, need, special, tool, easily, pop, off, base, plate, for, magaz..."


In [13]:
df = data[['overall', 'tokenized review']]
df.head()

Unnamed: 0,overall,tokenized review
0,5,"[this, come, time, and, veru, happy, with, have, use, already, and, make, take, out, the, pin, g..."
1,5,"[have, factory, glock, tool, that, use, for, glock, and, since, lose, and, have, need, another, ..."
2,4,"[you, don, have, punch, would, like, have, one, your, glock, bag, this, okay, the, butt, end, ha..."
3,4,"[this, work, well, than, any, punch, you, would, find, the, hardware, store, actually, think, yo..."
4,4,"[purchase, this, thinking, maybe, need, special, tool, easily, pop, off, base, plate, for, magaz..."


In [24]:
df.to_csv('reviews cleaned.csv', index=False) 

In [36]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['tokenized review'], df['overall'] , test_size=0.4)

In [37]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [38]:
w2v_model.wv.index_to_key[:20]

['the',
 'and',
 'for',
 'this',
 'have',
 'you',
 'that',
 'with',
 'but',
 'not',
 'they',
 'use',
 'well',
 'get',
 'one',
 'can',
 'very',
 'good',
 'great',
 'like']

In [39]:
# Find the most similar words to "great" based on word vectors from our trained model
w2v_model.wv.most_similar('great')

[('fantastic', 0.8327916264533997),
 ('excellent', 0.7735987305641174),
 ('good', 0.7623379230499268),
 ('awesome', 0.7607489228248596),
 ('wonderful', 0.6825358867645264),
 ('wonderfully', 0.6691790819168091),
 ('nice', 0.6552520990371704),
 ('perfect', 0.6474815011024475),
 ('terrific', 0.6224292516708374),
 ('amazing', 0.6089505553245544)]

In [40]:
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [41]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [42]:
for i in range(5):
    print(len(X_train_vect[i]), len(X_train_vect_avg[i]))

18 100
43 100
101 100
90 100
21 100


In [43]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [44]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [45]:
y_test[:20]

286550    3
249500    5
272298    4
248675    5
214103    3
253136    5
228542    5
102010    5
120387    3
200542    4
133867    5
40369     5
142116    5
190640    5
149930    4
281714    4
145249    4
78847     5
178192    5
156567    5
Name: overall, dtype: int64

In [46]:
y_pred[:20]

array([5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5])

In [47]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6457164550554688