In [15]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

In [40]:
# Constant definitions
REVIEW_FILE='Grocery_and_Gourmet_Food.json'
META_FILE ='meta_Grocery_and_Gourmet_Food.json'
DATA_LOCATION='data/'
PICKLE_LOCATION='processed-data/'

In [90]:
def load_pickle(pickle_name):
    with open(PICKLE_LOCATION+pickle_name, 'rb') as file:
        print('loading pickle...')
        return pickle.load(file)

In [42]:
def save_pickle(result, pickle_name):
    with open(PICKLE_LOCATION+pickle_name, 'wb') as file:
        pickle.dump(result, file)

In [91]:
def read_file(file, cluster=False):
    if file == 'Grocery_and_Gourmet_Food.json':
        file_simple = 'reviews'
    elif file == 'meta_Grocery_and_Gourmet_Food.json':
        file_simple = 'meta'
    else:
        file_simple = file
    
    try:
        return load_pickle(file_simple)
    except (FileNotFoundError, EOFError) as e:
        file_df = pd.read_json(DATA_LOCATION+file, lines=True)
        if not cluster:
            save_pickle(file_df, file_simple)
        return file_df

In [44]:
%time a = read_file(REVIEW_FILE)
%time b = read_file(REVIEW_FILE)

data/Grocery_and_Gourmet_Food.json
CPU times: user 1min 2s, sys: 12.3 s, total: 1min 15s
Wall time: 1min 18s
CPU times: user 8.28 s, sys: 1.01 s, total: 9.29 s
Wall time: 9.31 s


In [76]:
def format_reviews(reviews):
    reviews["overall"] = reviews["overall"].astype(int)
    reviews["verified"] = reviews["verified"].astype(bool)
    
    # Cast reviewTime to date
    reviews["reviewTime"] = reviews["reviewTime"].str.replace("^0", "")
    reviews["reviewTime"] = pd.to_datetime(reviews["reviewTime"], format="%m %d, %Y")
    
    # Cast vote to int, while changing NaN values to 0
    reviews.loc[reviews["vote"].isnull(), "vote"] = "0"
    reviews["vote"] = reviews["vote"].str.replace(",", "") # Thousands are separated with commas, we remove them. e.g. 1,881 = 1881
    reviews["vote"] = reviews["vote"].astype(int)
    
    # transform nan values in reviewText and summary to empty string
    reviews.loc[reviews['reviewText'].isnull(), 'reviewText'] = ''

In [None]:
reviews = read_file(REVIEW_FILE)
meta = read_file(META_FILE)

In [50]:
reviews.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,5,True,"06 4, 2013",ALP49FBWT4I7V,1888861614,Lori,Very pleased with my purchase. Looks exactly l...,Love it,1370304000,,,
1,4,True,"05 23, 2014",A1KPIZOCLB9FZ8,1888861614,BK Shopper,Very nicely crafted but too small. Am going to...,Nice but small,1400803200,,,
2,4,True,"05 9, 2014",A2W0FA06IYAYQE,1888861614,daninethequeen,still very pretty and well made...i am super p...,"the ""s"" looks like a 5, kina",1399593600,,,
3,5,True,"04 20, 2014",A2PTZTCH2QUYBC,1888861614,Tammara,"I got this for our wedding cake, and it was ev...",Would recommend this to a friend!,1397952000,,,
4,4,True,"04 16, 2014",A2VNHGJ59N4Z90,1888861614,LaQuinta Alexander,It was just what I want to put at the top of m...,Topper,1397606400,,,


In [77]:
format_reviews(reviews)

In [51]:
reviews.head().isnull()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,False,False,False,False,False,False,False,False,False,True,True,True
1,False,False,False,False,False,False,False,False,False,True,True,True
2,False,False,False,False,False,False,False,False,False,True,True,True
3,False,False,False,False,False,False,False,False,False,True,True,True
4,False,False,False,False,False,False,False,False,False,True,True,True


In [52]:
reviews.describe()

Unnamed: 0,overall,unixReviewTime
count,5074160.0,5074160.0
mean,4.314708,1446592000.0
std,1.249303,62278390.0
min,1.0,961372800.0
25%,4.0,1416096000.0
50%,5.0,1456790000.0
75%,5.0,1491782000.0
max,5.0,1538870000.0


In [53]:
reviews[["summary", "reviewText"]]

Unnamed: 0,summary,reviewText
0,Love it,Very pleased with my purchase. Looks exactly l...
1,Nice but small,Very nicely crafted but too small. Am going to...
2,"the ""s"" looks like a 5, kina",still very pretty and well made...i am super p...
3,Would recommend this to a friend!,"I got this for our wedding cake, and it was ev..."
4,Topper,It was just what I want to put at the top of m...
...,...,...
5074155,Exceclent product,"Love this product, very fresh, complete full f..."
5074156,Good taste but small quantity for the price us...,Good taste but small quantity for the expensiv...
5074157,See Comments below,Unlike some of the bad reviews regarding aroma...
5074158,Love!!,I absolutely love this vanilla bean paste. I h...


**Lemmtization of the reviewText for analysis**

In [86]:
import string
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger');

def remove_punctuation(s):
    punct_chars = string.punctuation
    translator_punct = s.maketrans(punct_chars, ' ' * len(punct_chars))
    
    return s.translate(translator_punct)

def remove_stopwords(s):
    stopwords_np = np.array(stopwords.words('english'))
    words = np.array(s.split())
    words_filtered = words[~np.in1d(words, stopwords_np)]
    
    return ' '.join(words_filtered)

def transform_tag(tag):
    tag_prefix = tag[0].lower()
    if tag_prefix in ['n', 'r', 'v', 'j']:
        return 'a' if tag_prefix == 'j' else tag_prefix
    else:
        return 'n'

def lemmatize(t):
    t = remove_punctuation(t)
    t = remove_stopwords(t)
    t_token = word_tokenize(t)
    t_tags = pos_tag(t_token)
    print(t_tags)
    t_lemm = [lemmatizer.lemmatize(w, transform_tag(t)) for w, t in t_tags]
    return ' '.join(t_lemm)

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading wordnet: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno -3] Temporary failure in name resolution>


In [87]:
print('Example of lemmatizing pipeline')
testpd = pd.DataFrame(np.array([['I tried to stop some words to see  the lemmatizing. \nWords word!! alias-alias to and'], 
                                ['Here we\'ll see.']]), columns=['raw'])
testpd['lemmatized'] = testpd['raw'].apply(lemmatize)
testpd

Example of lemmatizing pipeline
[('I', 'PRP'), ('tried', 'VBD'), ('stop', 'JJ'), ('words', 'NNS'), ('see', 'VBP'), ('lemmatizing', 'JJ'), ('Words', 'NNP'), ('word', 'NN'), ('alias', 'NN'), ('alias', 'NN')]
[('Here', 'RB'), ('see', 'VB')]


Unnamed: 0,raw,lemmatized
0,I tried to stop some words to see the lemmati...,I try stop word see lemmatizing Words word ali...
1,Here we'll see.,Here see


In [None]:
# Very slow, didn't run yet
reviews["reviewTextLemma"] = reviews['reviewText'].apply(lemmatize)
reviews