In [1]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

In [2]:
# Location of the reviews dataset; read_reviews parses it with pd.read_json(..., lines=True).
REVIEWS_FILE = "./data/Grocery_and_Gourmet_Food.json"

In [3]:
def load_pickle(pickle_path):
    """Return the object stored in the pickle file at ``pickle_path``.

    Nothing is caught here: errors raised by ``open`` or ``pickle.load``
    (for instance ``FileNotFoundError`` for a missing file or ``EOFError``
    for an empty one) propagate to the caller.

    Note: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    handle = open(pickle_path, 'rb')
    try:
        payload = pickle.load(handle)
    finally:
        # Always release the file handle, even when unpickling fails.
        handle.close()
    return payload

In [4]:
def save_pickle(result, pickle_path):
    """Serialize ``result`` with pickle and write it to ``pickle_path``.

    Missing parent directories of ``pickle_path`` are created first, so a
    cache location such as ``./temp/reviews`` works on a fresh checkout
    instead of raising ``FileNotFoundError``. An existing file at
    ``pickle_path`` is overwritten.

    Parameters
    ----------
    result : object
        Any picklable object.
    pickle_path : str or path-like
        Destination file.

    Returns
    -------
    None
    """
    from pathlib import Path

    target = Path(pickle_path)
    # For a bare file name the parent is "."; exist_ok makes that a no-op.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'wb') as file:
        pickle.dump(result, file)

In [5]:
def read_reviews(path=REVIEWS_FILE, pickle_path="./temp/reviews", cluster=True):
    """Load the reviews DataFrame, preferring the pickled cache over the JSON file.

    Parameters
    ----------
    path : str
        JSON-lines file parsed with ``pd.read_json(path, lines=True)`` when
        the cache cannot be loaded.
    pickle_path : str
        Location of the pickled cache that is tried first.
    cluster : bool
        When True (the default) a freshly parsed frame is NOT written to the
        cache; pass False to store it at ``pickle_path`` for later runs.

    Returns
    -------
    pandas.DataFrame
        The cached object if it could be loaded, otherwise the parsed JSON.
    """
    try:
        # NOTE(security): unpickling can execute arbitrary code; only point
        # pickle_path at files produced by this notebook / a trusted source.
        return load_pickle(pickle_path)
    except (FileNotFoundError, EOFError):
        # No cache file (or an empty one): fall back to the raw dataset.
        # Honour the `path` argument instead of always reading REVIEWS_FILE.
        reviews = pd.read_json(path, lines=True)
        if not cluster:
            save_pickle(reviews, pickle_path)
        return reviews

In [6]:
def format_reviews(reviews):
    """Cast the raw review columns to proper dtypes, modifying ``reviews`` in place.

    Columns rewritten:

    * ``overall``    -> int
    * ``verified``   -> bool
    * ``reviewTime`` -> datetime, parsed from strings such as ``"06 4, 2013"``
    * ``vote``       -> int; missing values become 0 and thousands separators
      are removed (``"1,881"`` -> 1881)

    Parameters
    ----------
    reviews : pandas.DataFrame
        Frame holding the four columns above in their raw (string/float) form.
        Pass a copy if the unformatted data must be kept.

    Returns
    -------
    None
        The frame is updated in place.
    """
    reviews["overall"] = reviews["overall"].astype(int)
    reviews["verified"] = reviews["verified"].astype(bool)

    # Cast reviewTime to date.
    # "^0" is a regular expression (strip one leading zero of the month).
    # regex=True is spelled out because the default of Series.str.replace
    # differs between pandas versions; without it the pattern may be treated
    # as the literal text "^0" and silently match nothing.
    review_time = reviews["reviewTime"].str.replace("^0", "", regex=True)
    reviews["reviewTime"] = pd.to_datetime(review_time, format="%m %d, %Y")

    # Cast vote to int, while changing NaN values to 0.
    # Thousands are separated with commas, we remove them. e.g. 1,881 = 1881
    # (the comma is a literal character, hence regex=False).
    vote_text = reviews["vote"].fillna("0").str.replace(",", "", regex=False)
    reviews["vote"] = vote_text.astype(int)

In [7]:
# Load the reviews; read_reviews tries the pickle cache first and falls back to the JSON file.
reviews = read_reviews()
# Keep a separate copy of the unformatted data so later cells can start again from it.
reviews_raw = reviews.copy()

In [8]:
# Preview the first rows of the unformatted frame.
reviews_raw.head()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,1888861614,,5,Very pleased with my purchase. Looks exactly l...,"06 4, 2013",ALP49FBWT4I7V,Lori,,Love it,1370304000,True,
1,1888861614,,4,Very nicely crafted but too small. Am going to...,"05 23, 2014",A1KPIZOCLB9FZ8,BK Shopper,,Nice but small,1400803200,True,
2,1888861614,,4,still very pretty and well made...i am super p...,"05 9, 2014",A2W0FA06IYAYQE,daninethequeen,,"the ""s"" looks like a 5, kina",1399593600,True,
3,1888861614,,5,"I got this for our wedding cake, and it was ev...","04 20, 2014",A2PTZTCH2QUYBC,Tammara,,Would recommend this to a friend!,1397952000,True,
4,1888861614,,4,It was just what I want to put at the top of m...,"04 16, 2014",A2VNHGJ59N4Z90,LaQuinta Alexander,,Topper,1397606400,True,


In [9]:
# Start from a fresh copy of the raw frame: format_reviews assigns to the columns
# of the frame it receives, so this keeps reviews_raw untouched and the cell re-runnable.
reviews = reviews_raw.copy()
%time format_reviews(reviews)

CPU times: user 27.3 s, sys: 619 ms, total: 27.9 s
Wall time: 27.8 s


In [10]:
# Preview the first rows again, now that format_reviews has run.
reviews.head()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,1888861614,,5,Very pleased with my purchase. Looks exactly l...,2013-06-04,ALP49FBWT4I7V,Lori,,Love it,1370304000,True,0
1,1888861614,,4,Very nicely crafted but too small. Am going to...,2014-05-23,A1KPIZOCLB9FZ8,BK Shopper,,Nice but small,1400803200,True,0
2,1888861614,,4,still very pretty and well made...i am super p...,2014-05-09,A2W0FA06IYAYQE,daninethequeen,,"the ""s"" looks like a 5, kina",1399593600,True,0
3,1888861614,,5,"I got this for our wedding cake, and it was ev...",2014-04-20,A2PTZTCH2QUYBC,Tammara,,Would recommend this to a friend!,1397952000,True,0
4,1888861614,,4,It was just what I want to put at the top of m...,2014-04-16,A2VNHGJ59N4Z90,LaQuinta Alexander,,Topper,1397606400,True,0


In [11]:
# Null mask for the first five rows (True marks a missing value).
reviews.head().isnull()

Unnamed: 0,asin,image,overall,reviewText,reviewTime,reviewerID,reviewerName,style,summary,unixReviewTime,verified,vote
0,False,True,False,False,False,False,False,True,False,False,False,False
1,False,True,False,False,False,False,False,True,False,False,False,False
2,False,True,False,False,False,False,False,True,False,False,False,False
3,False,True,False,False,False,False,False,True,False,False,False,False
4,False,True,False,False,False,False,False,True,False,False,False,False


In [12]:
# Summary statistics of the formatted frame.
reviews.describe()

Unnamed: 0,overall,unixReviewTime,vote
count,5074160.0,5074160.0,5074160.0
mean,4.314708,1446592000.0,0.8295479
std,1.249303,62278390.0,12.06719
min,1.0,961372800.0,0.0
25%,4.0,1416096000.0,0.0
50%,5.0,1456790000.0,0.0
75%,5.0,1491782000.0,0.0
max,5.0,1538870000.0,12174.0
