In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

In [2]:
# Constant definitions
REVIEW_FILE='Grocery_and_Gourmet_Food.json'
META_FILE ='meta_Grocery_and_Gourmet_Food.json'
DATA_LOCATION='data/'
PICKLE_LOCATION='processed-data/'

In [3]:
def load_pickle(pickle_name):
    with open(PICKLE_LOCATION+pickle_name, 'rb') as file:
        print('loading pickle...')
        return pickle.load(file)

In [4]:
def save_pickle(result, pickle_name):
    with open(PICKLE_LOCATION+pickle_name, 'wb') as file:
        print('saving pickle...')
        pickle.dump(result, file)

In [5]:
def simplify_filename(file):
    if file == 'Grocery_and_Gourmet_Food.json':
         return 'reviews'
    elif file == 'meta_Grocery_and_Gourmet_Food.json':
        return 'meta'
    else:
        return file

def read_file(file, cluster=False):
    file_simple = simplify_filename(file)
    
    try:
        return load_pickle(file_simple)
    except (FileNotFoundError, EOFError) as e:
        file_df = pd.read_json(DATA_LOCATION+file, lines=True)
        if not cluster:
            save_pickle(file_df, file_simple)
        return file_df

In [6]:
%time reviews = read_file(REVIEW_FILE)

loading pickle...
CPU times: user 6.91 s, sys: 2.26 s, total: 9.17 s
Wall time: 9.19 s


In [7]:
%time meta = read_file(META_FILE)

loading pickle...
CPU times: user 2.83 s, sys: 1.52 s, total: 4.35 s
Wall time: 4.35 s


**Reviews formatting and extraction of categories**

In [8]:
def format_reviews(reviews):
    reviews["overall"] = reviews["overall"].astype(int)
    reviews["verified"] = reviews["verified"].astype(bool)
    
    # Cast reviewTime to date
    reviews["reviewTime"] = reviews["reviewTime"].str.replace("^0", "")
    reviews["reviewTime"] = pd.to_datetime(reviews["reviewTime"], format="%m %d, %Y")
    
    # Cast vote to int, while changing NaN values to 0
    reviews.loc[reviews["vote"].isnull(), "vote"] = "0"
    reviews["vote"] = reviews["vote"].str.replace(",", "") # Thousands are separated with commas, we remove them. e.g. 1,881 = 1881
    reviews["vote"] = reviews["vote"].astype(int)
    
    # transform nan values in reviewText and summary to empty string
    reviews.loc[reviews['reviewText'].isnull(), 'reviewText'] = ''
    reviews.loc[reviews['summary'].isnull(), 'reviewText'] = ''

In [9]:
reviews.head(3)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,5,True,"06 4, 2013",ALP49FBWT4I7V,1888861614,Lori,Very pleased with my purchase. Looks exactly l...,Love it,1370304000,,,
1,4,True,"05 23, 2014",A1KPIZOCLB9FZ8,1888861614,BK Shopper,Very nicely crafted but too small. Am going to...,Nice but small,1400803200,,,
2,4,True,"05 9, 2014",A2W0FA06IYAYQE,1888861614,daninethequeen,still very pretty and well made...i am super p...,"the ""s"" looks like a 5, kina",1399593600,,,


In [10]:
all_food_categories = pd.DataFrame(meta['category'].explode().unique(), columns=['category'])
format_reviews(reviews)

In [11]:
display(reviews.head(3))
reviews.describe()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,5,True,2013-06-04,ALP49FBWT4I7V,1888861614,Lori,Very pleased with my purchase. Looks exactly l...,Love it,1370304000,0,,
1,4,True,2014-05-23,A1KPIZOCLB9FZ8,1888861614,BK Shopper,Very nicely crafted but too small. Am going to...,Nice but small,1400803200,0,,
2,4,True,2014-05-09,A2W0FA06IYAYQE,1888861614,daninethequeen,still very pretty and well made...i am super p...,"the ""s"" looks like a 5, kina",1399593600,0,,


Unnamed: 0,overall,unixReviewTime,vote
count,5074160.0,5074160.0,5074160.0
mean,4.314708,1446592000.0,0.8295479
std,1.249303,62278390.0,12.06719
min,1.0,961372800.0,0.0
25%,4.0,1416096000.0,0.0
50%,5.0,1456790000.0,0.0
75%,5.0,1491782000.0,0.0
max,5.0,1538870000.0,12174.0


**Meta formatting**

In [12]:
def format_date(s):
    try:
        return pd.to_datetime(s)
    except:
        return pd.NaT

def format_meta(meta):
    # Not formated: also_view and also_bought, image and five last (feature, ...)
    # Map the list representation of category and description to string
    meta['category'] = meta['category'].map(lambda cat_list: ' '.join(cat_list))
    meta.loc[meta['description'].isnull(), 'description'] = meta.loc[meta['description'].isnull(), 'description'].map(lambda _: [])
    meta['description'] = meta['description'].map(lambda descr_list: ' '.join(descr_list))
    
    meta.loc[meta['title'].isnull(), 'title'] = meta.loc[meta['title'].isnull(), 'description'].map(lambda _: [])
    meta.loc[meta['brand'].isnull(), 'brand'].map(lambda _: '')
    meta.loc[meta['rank'].isnull(), 'rank'] = meta.loc[meta['rank'].isnull(), 'rank'].map(lambda _: '')
    meta.loc[meta['main_cat'].isnull(), 'main_cat'] = meta.loc[meta['main_cat'].isnull(), 'main_cat'].map(lambda _: [])
    
    # Verification: meta[(meta['price'].isnull()) | (meta['price'].str.contains('^\$'))].size - meta.size
    meta.loc[~meta['description'].isnull(), 'description'].str.replace('$', '', regex=False)
    
    # Cast date column to date format
    meta.loc[:, 'date'] = meta['date'].map(format_date)

In [13]:
format_meta(meta)

In [None]:
display(meta.isnull().sum())

category             0
description          0
title                0
brand            11422
rank                 0
also_view       166197
main_cat             0
price           155073
asin                 0
also_buy        203877
image           136995
date            277650
feature         270835
details          13886
similar_item    286953
tech1           286465
fit             287205
dtype: int64

**Lemmtization of the reviewText for analysis**

In [15]:
reviews[["summary", "reviewText"]]

Unnamed: 0,summary,reviewText
0,Love it,Very pleased with my purchase. Looks exactly l...
1,Nice but small,Very nicely crafted but too small. Am going to...
2,"the ""s"" looks like a 5, kina",still very pretty and well made...i am super p...
3,Would recommend this to a friend!,"I got this for our wedding cake, and it was ev..."
4,Topper,It was just what I want to put at the top of m...
...,...,...
5074155,Exceclent product,"Love this product, very fresh, complete full f..."
5074156,Good taste but small quantity for the price us...,Good taste but small quantity for the expensiv...
5074157,See Comments below,Unlike some of the bad reviews regarding aroma...
5074158,Love!!,I absolutely love this vanilla bean paste. I h...


In [9]:
import string
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger');

# Common verbs https://eslgrammar.org/list-of-verbs/
common_verbs = np.array(['Accept', 'Guess', 'Achieve', 'Harass', 'Add', 'Hate', 'Admire', 'Hear', 'Admit', 'Help', 'Adopt', 'Hit', 'Advise', 'Hope', 'Agree', 'Identify', 'Allow', 'Interrupt', 'Announce', 'Introduce', 'Appreciate', 'Irritate', 'Approve', 'Jump', 'Argue', 'Keep', 'Arrive', 'Kick', 'Ask', 'Kiss', 'Assist', 'Laugh', 'Attack', 'Learn', 'Bake', 'Leave', 'Bathe', 'Lend', 'Be', 'Lie', 'Beat', 'Like', 'Become', 'Listen', 'Beg', 'Lose', 'Behave', 'Love', 'Bet', 'Make', 'Boast', 'Marry', 'Boil', 'Measure', 'Borrow', 'Meet', 'Breathe', 'Move', 'Bring', 'Murder', 'Build', 'Obey', 'Burn', 'Offend', 'Bury', 'Offer', 'Buy', 'Open', 'Call', 'Paint', 'Catch', 'Pay', 'Challenge', 'Pick', 'Change', 'Play', 'Cheat', 'Pray', 'Chew', 'Print', 'Choose', 'Pull', 'Clap', 'Punch', 'Clean', 'Punish', 'Collect', 'Purchase', 'Compare', 'Push', 'Complain', 'Quit', 'Confess', 'Race', 'Confuse', 'Read', 'Construct', 'Relax', 'Control', 'Remember', 'Copy', 'Reply', 'Count', 'Retire', 'Create', 'Rub', 'Cry', 'See', 'Damage', 'Select', 'Dance', 'Sell', 'Deliver', 'Send', 'Destroy', 'Sing', 'Disagree', 'Snore', 'Drag', 'Stand', 'Drive', 'Stare', 'Drop', 'Start', 'Earn', 'Stink', 'Eat', 'Study', 'Employ', 'Sweep', 'Encourage', 'Swim', 'Enjoy', 'Take', 'Establish', 'Talk', 'Estimate', 'Teach', 'Exercise', 'Tear', 'Expand', 'Tell', 'Explain', 'Thank', 'Fear', 'Travel', 'Feel', 'Type', 'Fight', 'Understand', 'Find', 'Use', 'Fly', 'Visit', 'Forget', 'Wait', 'Forgive', 'Walk', 'Fry', 'Want', 'Gather', 'Warn', 'Get', 'Wed', 'Give', 'Weep', 'Glow', 'Wink', 'Greet', 'Worry', 'Grow', 'Write', 'Yell'])

stopwords = np.array([remove_punctuation(w) for w in stopwords.words('english')] ()

def remove_punctuation(s):
    punct_chars = string.punctuation
    translator_punct = s.maketrans(punct_chars, ' ' * len(punct_chars))
    
    return s.translate(translator_punct)

def remove_stopwords(s):
    words = np.array(s.split())
    words_filtered = words[~np.in1d(words, stopwords)]
    
    return ' '.join(words_filtered)

def transform_tag(tag):
    tag_prefix = tag[0].lower()
    if tag_prefix in ['n', 'r', 'v', 'j']:
        return 'a' if tag_prefix == 'j' else tag_prefix
    else:
        return 'n'

def lemmatize(t):
    t = t.lower()
    t = remove_punctuation(t)
    t = remove_stopwords(t)
    t_token = word_tokenize(t)
    t_tags = pos_tag(t_token)
    t_lemm = [lemmatizer.lemmatize(w, transform_tag(t)) for w, t in t_tags]
    return t_lemm

[nltk_data] Downloading package punkt to /home/jules/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jules/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jules/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [35]:
print('Example of lemmatizing pipeline')
testpd = pd.DataFrame(np.array([['I tried to stop some words to see  the lemmatizing. \nWords word!! alias-alias to and'], 
                                ['Here we\'ll see.']]), columns=['raw'])
testpd['lemmatized'] = testpd['raw'].apply(lemmatize)
testpd

Example of lemmatizing pipeline


Unnamed: 0,raw,lemmatized
0,I tried to stop some words to see the lemmati...,"[i, try, stop, word, see, lemmatizing, word, w..."
1,Here we'll see.,"[here, see]"


In [36]:
print(reviews.size)
sample_size = reviews.size / 1200
%time reviews_sample = reviews.loc[:sample_size, :] #test
%time reviews_sample["reviewTextLemma"] = reviews_sample['reviewText'].apply(lemmatize)
reviews_sample

60889920
CPU times: user 9.92 ms, sys: 66 µs, total: 9.98 ms
Wall time: 8.32 ms
CPU times: user 1min 34s, sys: 881 ms, total: 1min 35s
Wall time: 1min 35s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style,reviewTextLemma
0,5,True,2013-06-04,ALP49FBWT4I7V,1888861614,Lori,Very pleased with my purchase. Looks exactly l...,Love it,1370304000,0,,,"[very, pleased, purchase, look, exactly, like,..."
1,4,True,2014-05-23,A1KPIZOCLB9FZ8,1888861614,BK Shopper,Very nicely crafted but too small. Am going to...,Nice but small,1400803200,0,,,"[very, nicely, craft, small, be, go, add, flow..."
2,4,True,2014-05-09,A2W0FA06IYAYQE,1888861614,daninethequeen,still very pretty and well made...i am super p...,"the ""s"" looks like a 5, kina",1399593600,0,,,"[still, pretty, well, make, super, picky, list..."
3,5,True,2014-04-20,A2PTZTCH2QUYBC,1888861614,Tammara,"I got this for our wedding cake, and it was ev...",Would recommend this to a friend!,1397952000,0,,,"[i, get, wed, cake, everything, even, person, ..."
4,4,True,2014-04-16,A2VNHGJ59N4Z90,1888861614,LaQuinta Alexander,It was just what I want to put at the top of m...,Topper,1397606400,0,,,"[it, i, want, put, top, wedding, cake, i, love..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50737,5,True,2013-02-23,A8Z0BFDOBQV0,B0001JXDXG,Michelle Fenner,I used this all the time making my dipped choc...,Paramount Crystals is a must have for chocolate,1361577600,3,,"{'Color:': ' 8 Ounce', 'Package Quantity:': ' 1'}","[i, use, time, make, dipped, chocolate, oreo, ..."
50738,5,True,2013-01-17,A3JO8O7F3JBWZD,B0001JXDXG,Timotheos,Wow! these things really work. They thin out ...,great stuff!,1358380800,4,,"{'Color:': ' White', 'Package Quantity:': ' 1'}","[wow, thing, really, work, they, thin, candy, ..."
50739,4,True,2012-12-06,A29RXW2S5Q53G8,B0001JXDXG,Lilly,Didn't get to use for what I wanted for but gl...,Great,1354752000,0,,"{'Color:': ' White', 'Package Quantity:': ' 1'}","[didn, get, use, i, want, glad, i, future, pla..."
50740,5,True,2012-11-05,AJ5UMONR0JE2C,B0001JXDXG,Amazon Customer,These do a wonderful job of thinning candy mel...,Wish I would have bought long ago.,1352073600,7,,"{'Color:': ' White', 'Package Quantity:': ' 1'}","[these, wonderful, job, thin, candy, melt, eas..."


In [43]:
from gensim import corpora, models
import gensim

texts = reviews_sample["reviewTextLemma"].to_list()

# turn our tokenized documents into a id <-> term dictionary
%time dictionary = corpora.Dictionary(texts)
    
# convert tokenized documents into a document-term matrix
%time corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
%time ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)
print("4 min 21 sec")
topic_words = []
for t in range(ldamodel.num_topics):
    words = [ldamodel.id2word[i] for i, _ in ldamodel.get_topic_terms(t)]
    topic_words.append(words)
for idx, topic in ldamodel.print_topics(-1):
    print("Topic: {}\nWords: {}".format(idx, topic))
    print("\n")

4 min 21 sec
Topic: 0
Words: 0.139*"good" + 0.098*"taste" + 0.057*"fresh" + 0.052*"very" + 0.032*"buy" + 0.032*"like" + 0.024*"tasty" + 0.024*"these" + 0.022*"flake" + 0.022*"flavor"


Topic: 1
Words: 0.117*"i" + 0.014*"get" + 0.013*"find" + 0.013*"use" + 0.012*"buy" + 0.011*"store" + 0.010*"one" + 0.009*"order" + 0.009*"amazon" + 0.009*"time"


Topic: 2
Words: 0.078*"yeast" + 0.046*"bread" + 0.026*"cheese" + 0.020*"delicious" + 0.016*"mustard" + 0.015*"flavor" + 0.014*"licorice" + 0.013*"cinnamon" + 0.013*"recipe" + 0.012*"ginger"


Topic: 3
Words: 0.142*"great" + 0.135*"product" + 0.066*"price" + 0.051*"excellent" + 0.037*"good" + 0.028*"quality" + 0.025*"fast" + 0.022*"link" + 0.018*"shipping" + 0.015*"delivery"


Topic: 4
Words: 0.050*"tomato" + 0.048*"paste" + 0.030*"product" + 0.020*"vanilla" + 0.020*"taste" + 0.017*"use" + 0.017*"ingredient" + 0.016*"organic" + 0.015*"powder" + 0.013*"sugar"


Topic: 5
Words: 0.061*"chocolate" + 0.042*"use" + 0.041*"best" + 0.041*"make" + 0.038*

In [69]:
%time
a = np.array(['1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09'])
%time print(np.array(filter(lambda x:len(x) > 1, a)))
def panda_filter(nparr):
    t = pd.DataFrame(a)
    return t.loc[t[0].str.len() > 1, :].to_numpy()
%time print(panda_filter)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs
<filter object at 0x7f6857b1beb0>
CPU times: user 64 µs, sys: 0 ns, total: 64 µs
Wall time: 67.5 µs
<function panda_filter at 0x7f68573975e0>
CPU times: user 1.31 ms, sys: 17 µs, total: 1.32 ms
Wall time: 1.15 ms


In [73]:
%time
a = reviews_sample['reviewTextLemma'].to_numpy()
%time print(np.array(filter(lambda x:len(x) > 1, a)))
def filter_word_unit(words):
    # Panda for efficiency
    words_pd = pd.DataFrame(words)
    display(words_pd)
    words_pd.loc[words_pd[0].str.len() > 1, :].to_numpy()
    return words
%time print(filter_word_unit(a))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.48 µs
<filter object at 0x7f6857abd880>
CPU times: user 77 µs, sys: 1 µs, total: 78 µs
Wall time: 73.7 µs


Unnamed: 0,0
0,1
1,2
2,aowidnawd
3,4
4,09
...,...
155,1
156,2
157,aowidnawd
158,4


['1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2'
 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4'
 '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2'
 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4'
 '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2'
 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4'
 '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2'
 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4'
 '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2'
 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4'
 '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2'
 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4'
 '09' '1' '2' 'aowidnawd' '4' '09' '1' '2' 'aowidnawd' '4' '09']
CPU times: user 12.1 ms, sys: 0 ns, total: 12.1 ms
Wall ti

In [66]:
a = np.array(['1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09','1', '2', 'aowidnawd', '4', '09'])
t = pd.DataFrame(a)
display(t)

t.loc[t[0].str.len() > 1, :]

Unnamed: 0,0
0,1
1,2
2,aowidnawd
3,4
4,09
...,...
155,1
156,2
157,aowidnawd
158,4


Unnamed: 0,0
2,aowidnawd
4,09
7,aowidnawd
9,09
12,aowidnawd
...,...
149,09
152,aowidnawd
154,09
157,aowidnawd
