## Overview
1. load Home and Kitchen data
2. select reviewers who left at least 15 reviews (9,860 reviewers)
3. light preprocessing

In [30]:
import gzip
import nltk
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import re

from tqdm import tqdm

pd.set_option('display.max_colwidth', 600)
%matplotlib inline

In [2]:
def parse(path):
    """Yield one parsed record per line of a gzip-compressed file.

    Args:
        path: path to a gzip file containing one dict literal per line.

    Yields:
        The object produced by evaluating each line (a dict for review dumps).
    """
    # The context manager closes the handle when the generator finishes or is
    # discarded; the previous version left the file open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() runs arbitrary code found in the file. Only feed
            # this trusted dumps; switch to json.loads / ast.literal_eval if the
            # file format permits it.
            yield eval(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Each record is keyed by its 0-based position in the file, and that
    position becomes the row label of the resulting frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the Home and Kitchen review dump into a DataFrame (one row per parsed record).
data = getDF('reviews_Home_and_Kitchen.json.gz')

## 1. Preprocessing

* 개인 당 리뷰 수가 n개 이상인 경우 (keep only reviewers who wrote at least n reviews)
* 시간 순 정렬 (sort the reviews chronologically)

In [3]:
def left_reviewerId_over_n(data, n):
    """Return the reviewer IDs that appear at least ``n`` times in ``data.reviewerID``.

    Args:
        data: object exposing a ``reviewerID`` iterable (e.g. a DataFrame column).
        n: minimum number of occurrences (inclusive).

    Returns:
        list of qualifying IDs, in the iteration order of the tally dict.
    """
    review_counts = {}
    for reviewer in data.reviewerID:
        review_counts[reviewer] = review_counts.get(reviewer, 0) + 1
    return [reviewer for reviewer, total in review_counts.items() if total >= n]

In [4]:
# Review-count threshold passed to left_reviewerId_over_n (reviewers with >= n reviews are kept).
n = 15

In [5]:
# IDs of the reviewers who wrote at least n reviews.
reviewer_over_n = left_reviewerId_over_n(data, n)

In [6]:
print("reviewer whose number of reviews is over n: {}".format(len(reviewer_over_n))) # number of reviewers

reviewer whose number of reviews is over n: 9860


In [7]:
# Keep only the rows written by the selected reviewers.  The explicit copy
# makes data_over_n an independent frame, so the later sort and column
# assignment do not operate on a slice of `data` (SettingWithCopyWarning).
data_over_n = data[data.reviewerID.isin(reviewer_over_n)].copy()

In [8]:
# The unfiltered frame is no longer needed; unbind the name so its memory can be reclaimed.
del data

In [9]:
# Order the reviews chronologically.  Rebinding the sorted result instead of
# using inplace=True avoids mutating a frame that was derived by filtering
# another one (SettingWithCopyWarning) and keeps the cell safe to re-run.
data_over_n = data_over_n.sort_values('unixReviewTime')

In [10]:
# Count the distinct product IDs (asin) present in the filtered frame.
unique_items = set(data_over_n.asin)
print("number of items in data_over_n: {}".format(len(unique_items)))

number of items in data_over_n: 81221


### 1.1 text word indexing

In [11]:
# Use the documented import path.  `nltk.wordnet` only resolves because of
# star-imports inside the nltk package and is not a supported location.
lemma = nltk.stem.WordNetLemmatizer()

In [12]:
# Noise patterns removed from every review, compiled once instead of on each
# iteration.  Raw strings replace the original non-raw literals ('\*+',
# '\&\#\d+\;'), which contain invalid string escape sequences and trigger
# warnings on recent Python versions.  Order is preserved from the original.
_NOISE_SUBSTITUTIONS = [
    (re.compile(r'\*+'), ''),       # runs of asterisks
    (re.compile(r'~+'), ''),        # runs of tildes
    (re.compile(r'-+'), ''),        # hyphens/dashes (this also joins hyphenated words)
    (re.compile(r'&quot;'), ''),    # HTML-escaped double quotes
    (re.compile(r'&#\d+;'), ' '),   # numeric HTML character references
    (re.compile(r'_+'), ''),        # runs of underscores
    # insert a space after a period glued to the next word ("good.It" -> "good. It")
    (re.compile(r'([a-zA-Z]+)\.([a-zA-Z])'), r'\1. \2'),
]

stemmed_review = []
for review in tqdm(data_over_n.reviewText):
    for pattern, replacement in _NOISE_SUBSTITUTIONS:
        review = pattern.sub(replacement, review)
    # NOTE(review): lemmatize() is given the whole review as if it were one
    # word, so multi-word text presumably comes back unchanged.  Left as-is
    # because later cells match on the exact cleaned text; confirm whether
    # per-token lemmatization was intended.
    stemmed = lemma.lemmatize(review)
    stemmed_review.append(stemmed)

100%|██████████| 244352/244352 [00:32<00:00, 7571.86it/s] 


In [13]:
# Hand-collected (pattern, replacement) fixes for run-together or misspelled
# words found in the corpus.  They are applied in this order to every review.
#
# Changes relative to the original cell:
#   * the list is rebuilt in one pass instead of rotating it with
#     pop(0)/append, which was quadratic in the number of reviews;
#   * 'ightweight' is anchored with \b so it no longer fires inside the
#     correctly spelled "lightweight" (which became "llight weight");
#   * two replacement typos are corrected: 'films overall' -> 'flimsy overall'
#     and 'pepper dirctions cook' -> 'pepper directions cook'.
_KNOWN_TYPO_FIXES = [
    ('theprimula', 'the primula'),
    ('thicknessit', 'thickness it'),
    ('perfmance', 'performance'),
    ('brakfakst', 'breakfast'),
    ('readingsand', 'readings and'),
    ('appletinis', 'appletin is'),
    ('makerhas', 'maker has'),
    ('durablegood', 'durable good'),
    ('spacenegative', 'space negative'),
    ('beveragecons', 'beverage cons'),
    ('naturalnearby', 'natural nearby'),
    ('rugfeatured', 'rug featured'),
    ('cookingmeals', 'cooking meals'),
    ('againstallgrain', 'against all grain'),
    ('updatefirst', 'update first'),
    ('lotif', 'lot if'),
    ('sheetcakes', 'sheet cakes'),
    ('modelsrating', 'models rating'),
    ('versatilitycons', 'versatility cons'),
    ('frustrationcons', 'frustrations cons'),
    ('valueeasy', 'value easy'),
    ('puttoppings', 'put toppings'),
    ("don'tworry", "don't worry"),
    ('betterbut', 'better but'),
    ('fightersnice', 'fighters nice'),
    ('buttwhile', 'but while'),
    ('colderthan', 'colder than'),
    ('curtainsconsits', 'curtains cons its'),
    ('moppingalthough', 'mopping although'),
    ('receiversshelf', 'receivers shelf'),
    ('oldoak', 'old oak'),
    ('wasworried', 'was worried'),
    ('myaroma', 'my aroma'),
    ('thereplaicing', 'the replacing'),
    ('pleasingb', 'pleasing'),
    ('noisesturn', 'noises turn'),
    ('sanitizebuild', 'sanitize build'),
    ('andpulling', 'and pulling'),
    ('extractorinstead', 'extractor instead'),
    ('tocome', 'to come'),
    ('peppercombine', 'pepper combine'),
    # word-boundary anchor: only repair a word that really starts with "ightweight"
    (r'\bightweight', 'light weight'),
    ("ratingi've", "rating i've"),
    ('pilllowcase', 'pillow case'),
    ('toughbiggreenand', 'tough big green and'),
    ('smartypants', 'smarty pants'),
    ('metalworker', 'metal worker'),
    ('quiterealistic', 'quite realistic'),
    ('flimsyoverall', 'flimsy overall'),
    ('shortervictorinox', 'shorter victorinox'),
    ('hungryhappenings', 'hungry happenings'),
    ('portableairshop', 'portable air shop'),
    ('elementis', 'elements'),
    ('designunder', 'design under'),
    ('adjustableinexpensivestainless', 'adjustable inexpensive stainless'),
    ('minutesbefore', 'minutes before'),
    ('breadbaking', 'bread baking'),
    ('somewhathigher', 'somewhat higher'),
    ('remotewith', 'remote with'),
    ('teapotthat', 'teapot that'),
    ('dishwashersafe', 'dishwasher safe'),
    ('paperthe', 'paper the'),
    ('grillpanthe', 'grill pan the'),
    ('setbonjour', 'set bonjour'),
    ('thewaiting', 'the waiting'),
    ('buyinghttp', 'buying http'),
    ('spacemy', 'space my'),
    ('asomething', 'a something'),
    ('mybonjour', 'my bonjour'),
    ('newlodge', 'new lodge'),
    ('lbsgross', 'lbs gross'),
    ('cleanupgive', 'clean up give'),
    ('teaupdate', 'tea update'),
    ('thefrench', 'the french'),
    ('eggwater', 'egg water'),
    ('rollerto', 'roller to'),
    ('thesecons', 'these cons'),
    ('fridgethe', 'fridge the'),
    ('overpricedthese', 'over priced these'),
    ('freshmakes', 'fresh makes'),
    ('slowwwlllllyy', 'slowly'),
    ('12below', '12 below'),
    ('installedto', 'installed to'),
    ('bagscons', 'bags cons'),
    ('vacuumwhcih', 'vacuum which'),
    ('ergonomicsfolds', 'ergonomics folds'),
    ('stationboth', 'station both'),
    ('gotthe', 'got the'),
    ('batteriescons', 'batteries cons'),
    ('bakerhttp', 'baker http'),
    ('getsmy', 'gets my'),
    ('myhoover', 'my hoover'),
    ('tacoproper', 'taco proper'),
    ("it'sonly", "it's only"),
    ('steelmeasures', 'steel measures'),
    ('technologypanasonic', 'technology panasonic'),
    ('spiritsdate', 'spirits date'),
    ('knowwhen', 'know when'),
    ('toturn', 'to turn'),
    ('gettingit', 'getting it'),
    ('filtersbrush', 'filters brush'),
    ('onereason', 'one reason'),
    ('gradeproduct', 'grade product'),
    ('handlesthe', 'handles the'),
    ('capthere', 'cap there'),
    ('warrantyyou', 'warranty you'),
    ('veggiesthere', 'veggies there'),
    ('deepupdated', 'deep updated'),
    ('betterthis', 'better this'),
    ('filtersthis', 'filters this'),
    ('sohelpful', 'so helpful'),
    ('absolutelya', 'absolutely a'),
    ('bladefor', 'blade for'),
    ('beachcamera', 'beach camera'),
    ('teawith', 'tea with'),
    ('properlyconsneeds', 'properly cons needs'),
    ('paintedcons', 'painted cons'),
    ('offlaking', 'off laking'),
    ('theset', 'the set'),
    ('thiseven', 'this even'),
    ('bitsthere', 'bits there'),
    ('siloconvert', 'silo convert'),
    ('riiiiight', 'right'),
    ('nicelywhen', 'nicely when'),
    ('easethis', 'ease this'),
    ('struggleoverall', 'struggle overall'),
    ('tubesuction', 'tube suction'),
    ('steamthe', 'steam the'),
    ('mustardsalt', 'mustard salt'),
    ('withbrownie', 'with brownie'),
    ('pricecons', 'price cons'),
    ('arrivedthe', 'arrived the'),
    ('asharpening', 'a sharpening'),
    ('andforefinger', 'and forefinger'),
    ('downnaturally', 'down naturally'),
    ('underneathsummary', 'underneath summary'),
    ('jusssst', 'just'),
    ('shoulderbags', 'shoulder bags'),
    ('pepperdirectionscook', 'pepper directions cook'),
    ('homemadecookiesandmore', 'homemade cookies and more'),
    ('favoritewest', 'favorite west'),
]
_COMPILED_TYPO_FIXES = [
    (re.compile(pattern), replacement) for pattern, replacement in _KNOWN_TYPO_FIXES
]


def _fix_known_typos(review):
    """Apply every known typo fix to one review, in table order."""
    for pattern, replacement in _COMPILED_TYPO_FIXES:
        review = pattern.sub(replacement, review)
    return review


# Position i of the new list still corresponds to position i of the old one.
stemmed_review = [_fix_known_typos(review) for review in stemmed_review]

In [14]:
# add '.' to review
# Four reviews have almost no sentence punctuation, so they are swapped for
# hand-punctuated versions.  Each entry is (prefix identifying the raw review,
# corrected text).  The matching entry is written back by index assignment;
# the original popped and re-inserted elements of the list while iterating
# over it.  Line breaks inside the corrected texts are written as \n.
_MANUAL_CORRECTIONS = [
    (
        'I recently upgraded from aGE Spacemaker XLwith the purchase of an LG overtherange microwave',
        "I recently upgraded from aGE Spacemaker XL with the purchase of an LG overtherange microwave 2 cubic ft., black, item number: LMV208SB. the Good attractive to look at, matches my kitchen decor perfectly. large LCD display is easy to see and read. includes many advanced and basic settings. has an 'EZON' feature which allows you to start cooking instantly with the press of this button. has dedicated buttons for popcorn and hot dogs. door opens on its own, there isn't another button that you need to press like on other microwaves for the door. the price was exceptional and with Amazn's free shipping. I defy anyone to find another retailer that can match their price. comes with a 1year warranty and if LG is anything like GE (I have no idea) they won't bat an eye if you call them for service even when the appliance is a few years old More/Less. Arrows allow you to subtract or add time while the microwave is in operation. tells you when to turn over food, when you're defrosting. extra large cooking capacity, and with the wrack it nearly doubles in size 1100 watts of power. the Bad it is incredibly heavy and bulky (nearly 70 pounds), have at least 2 strong men who can lift it when you install it. the door makes a loud clunking sound when it closes. there isn't a 'quick cook' button like GE had, the closest this has is EZOn, however that automatically defaults to 30 seconds the unit. doesn't constantly keep beeping to remind you that your food is ready. sometimes foods tend to get slightly soggy when taken off the 'hold warm' setting. the rotation cannot be turned off if it is set to keep warm. it's doing a number on my plastic dishes that are microwave safe, they're slightly bending, however since they're so inexpensive I do not mind too much. the Ugly it is a beast to clean/keep clean. it is a fingerprint/dust magnet picks up streaks very quickly. it can pick up scratches very, very easily. \nthe Exceptional the Soften button allows you to soften butter, icecream, cream cheese and even frozen juice with the cooking sensors. the microwave cooks everything to the perfect temp and then shuts off automatically, this is great for TV dinners. with the Reheat button you can quickly heat up anything and the microwave will bring it to the perfect temp without any effort on your part dedicated. defrost settings for vegetables, beef and chicken with the Hold Warm button. I can now leave whatever I was cooking in it for up to 90 minutes and it will stay nice and hot, this'll save on power because I won't have to turn on the oven for this. able to automatically turn on the Hold Warm feature to follow another cooking function. EZOn feature can be extended in 30 second increments just by pressing it. Custom Cook can be programmed to cook any single time you use the most often (my only complaint about this is you still need to press the Start/Enter button). Things to Remember never use a brillo pad or abrasive sponge on this as it could scratch very easily. although I don't know why you would want to, never turn the microwave oven on if nothing is inside it. when you first get it, sometimes it makes slight crackling sounds as it's in operation, these are normal when this arrives. it will look HUGE in the manufacturer's box, don't worry that's just how it looks in the box. If anyone has any suggestions on how to keep this looking perfect and nice and shiny and glossy please let me know.",
    ),
    (
        'This Microwave oven gets my highest rating even though Amazon only allows for five stars',
        "This Microwave oven gets my highest rating even though Amazon only allows for five stars. the oven is extremely fast and has the largest capacity for the price, also the quietest microwave oven I have ever heard or owned. I am amazed at the speed of cooking and reheating it can do and Amazon has the best and lowest price of any website I researched on. oven has a clock which was not stated on the description on the amazon page where it is located on. there is however downfall to this microwave oven which I was very surprised to. have witnessed for such an innovator of this type of oven they are known for their carousel that it did not have a luud beep when the oven is finished cooking or reheating. you really have to be in a room that is very quiet to hear this beeping sound for when the oven is finished. but all in all great oven great price and lots of cooking and reheating settings and loads of interior space. I was even able to put a Visions cookware pot in there without damaging the sides of the oven while the turntable is in motion which I thought I would not be able to do prior to receiving the oven. I did have a concern that I would not be able to use this particular pot in there because of damage it could cause. but I was pleasantly surprised I was able to put it in this oven If you own a lower wattage microwave oven and you purchase this one. you will also need to adjust your times by at least half the amount you already use on your current oven. I also love the minute plus setting, if you make a mistake in your time you can press this button to add one more extra minute. everytime until you reach the desired time, it is very handy if you are not sure just how much more time you need with your cooking and don't want to have to go all through the steps again to reset and start over. this button is so easy. just one press and you get an extra minute. \nI completely recommend this Microwave oven ot everyone, I just hope it last twenty years like my old Amana microwave did.",
    ),
    (
        'this was very easy to put togetheri like the suction on itit is much better than',
        "this was very easy to put together. i like the suction on itit is much better than my other dirt devil. it is easy to use pretty light weight. it glides across the floor easily. there are a few things i do not like. wish it had a light on it for when i am going under things. i am so used to every other machine i have has one but this one does not. the wand is a bit klunky to take out to use it. i keep hitting myself on the plastic part that holds the hose when it is upright. as for the dirt cup i have yet to find any machine that i do not get a bit dirty when i have to empty it. this one if i just remember to dump it every single time i use it. then it is not so bad but when it gets clogged up and i have to stick my hand in there to twist it apart and let the clog get out then it is a huge mess. it is a bit odd where the cord comes out from the machine too close to the bottom of it and i have to keep picking it up to be sure not to run over it. seems like it could have been up above the on button. that is another issue with this machine. you have to put your foot in front of it to flip it down to use it. why not just have the down pedal in the back? we rarely use the dusting part of it. i was so excited to try this feature because we have butternut walls and ceilings all over our house to my dismay this feature is kind of a pain and just easier to get out my dust mopi give it three stars because there are simply too many downsides to overlook and i hope they improve on these things. overall the suction is really good so for daily cleaning of my carpets it is great. i just have learned to not use the wand much or the dusting feature",
    ),
    (
        'have been using this mattress for a month nowlove it!one thing it will be hard to get used to not having',
        "have been using this mattress for a month now love it! one thing it will be hard to get used to not having to flip it though. i admit i am a flipper and our last mattress was over ten years old and i was always flipping it hoping to move the sags around. but this one i do not think will end up sagging like the other one did. it has a higher coil count than my last one did. so far this one i have no complaints of any kind for being able to get a good nights sleep. I am able to sleep and not wake up with a back achei did have to get new sheets though. thought i already had ones for deep pockets but they were not deep enough for this oneone tiny complaint i do have. it is harder to change the sheets. the material on the box spring and bottom of the mattress stick to each other instead of kind of sliding around and i wish the fabric on the boxspring was not so thin or fabric paper like. oh our older boxspring it was more like a waterproof tarp type material and not a single rip in it after ten years. this one came with a couple small tears in it so when i change the sheets have to lift the mattress up and move it around since it does not slide back into place at all. it is a softer bed just like it says plushso if you are looking for firm this is not going to be that bed for you.",
    ),
]

# Wrapping the list (not the enumerate object) lets tqdm show a total.
for i, review in enumerate(tqdm(stemmed_review)):
    for prefix, corrected in _MANUAL_CORRECTIONS:
        if review.startswith(prefix):
            stemmed_review[i] = corrected
            break  # first matching prefix wins, like the original elif chain

244352it [00:00, 430637.82it/s]


In [15]:
# Whitespace that ends a sentence: it must follow '.', '?', '!', ';', a digit
# or ')', but must not follow something shaped like "e.g." (\w\.\w.) or a
# capitalised two-letter abbreviation such as "Mr." ([A-Z][a-z]\.).
_SENTENCE_BOUNDARY = re.compile(r'(?<=\.|\?|\!|\;|[\d\)])(?<!\w\.\w.)(?<![A-Z][a-z]\.)\s')


def split_to_sent(reviewText):
    """
    reviewText to list of sentences

    Args:
        reviewText (str): cleaned review text.
    Returns:
        list of str: the text split at the sentence-ending whitespace
        described by ``_SENTENCE_BOUNDARY``.
    """
    # A lowercase run glued to an uppercase run is treated as a missing
    # sentence break ("greatThe" -> "great. The").  The replacement used to be
    # '\\1\. \\2', which emitted a literal backslash ("great\. The").
    reviewText = re.sub(r'([a-z]+)([A-Z]+)', r'\1. \2', reviewText)
    # drop stray " . ." sequences
    reviewText = re.sub(r'\s\.\s\.', '', reviewText)
    # collapse each pair of whitespace characters into one space
    reviewText = re.sub(r'\s\s', ' ', reviewText)
    return _SENTENCE_BOUNDARY.split(reviewText)

In [16]:
# want to know sent_len distribution
max_sent_len = 0
for review in tqdm(stemmed_review):
    sents = split_to_sent(review)
    for sent in sents:
        if len(sent) > max_sent_len:
            max_sent_len = len(sent)

100%|██████████| 244352/244352 [00:37<00:00, 6495.93it/s] 


In [17]:
# Overwrite reviewText with the cleaned strings.  The list is assigned by
# position, so it must still be in the row order in which
# data_over_n.reviewText was iterated when stemmed_review was built.
data_over_n['reviewText'] = stemmed_review

In [18]:
# Display the frame's (rows, columns) shape after the column update.
data_over_n.shape

(244352, 9)

In [19]:
# Report the longest sentence length (in characters) found by the scan.
print("maximum length of sentence in review: {}".format(max_sent_len))

maximum length of sentence in review: 1408


In [20]:
def sent_to_mtx(sentence, row_size=1000):
    """
    Encode a sentence as a one-hot character matrix.

    The sentence is lower-cased and cut to at most ``row_size`` characters.
    Row i of the result has a 1 in the column of the i-th character's first
    position in the alphabet; characters outside the alphabet (a space, for
    example) and unused trailing rows stay all-zero.

    Returns
        np.ndarray of float32 with shape (row_size, len(alphabet))
    """
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}\n"
    clipped = sentence.lower()[:row_size]
    char_mtx = np.zeros([row_size, len(alphabet)], dtype=np.float32)
    for row, char in enumerate(clipped):
        col = alphabet.find(char)  # -1 when the character is not in the alphabet
        if col >= 0:
            char_mtx[row, col] = 1
    return char_mtx

In [26]:
def convert_to_input(data_over_n):
    """
    convert original review data to tensorflow input data
    Args
        data_over_n (dataframe): Amazon review data; must contain the columns
            reviewerID, asin, reviewText, unixReviewTime and reviewTime
    Returns
        input_ (dict): reviewerID as key, list of related attributes (asin, reviewText) are values
            (one dict per review, in the row order of ``data_over_n``)
    """
    input_ = dict()
    # Fields are read by column name.  The previous positional indices
    # (i[1], i[2], i[5], i[8], i[9]) silently picked the wrong values whenever
    # the frame's column order differed from the raw dump.
    for row in data_over_n.itertuples(index=False):
        input_.setdefault(row.reviewerID, []).append({
            'asin': row.asin,
            'reviewText': row.reviewText,
            'unixReviewTime': row.unixReviewTime,
            'reviewTime': row.reviewTime,
        })
    return input_

In [27]:
# Group the cleaned reviews by reviewer.  NOTE: this rebinds the name `data`,
# which earlier in the notebook referred to the raw DataFrame (since deleted).
data = convert_to_input(data_over_n)

In [29]:
# Sanity check: every reviewer in the frame must appear as a key in the grouped
# dict.  A mismatch now raises; the original check printed nothing on failure,
# so the pickle would still have been written.
n_reviewers_grouped = len(set(data.keys()))
n_reviewers_frame = len(set(data_over_n.reviewerID))
if n_reviewers_grouped != n_reviewers_frame:
    raise ValueError(
        "reviewer count mismatch: {} grouped vs {} in data_over_n".format(
            n_reviewers_grouped, n_reviewers_frame))
print("preprocessing is well done!")

preprocessing is well done!


In [31]:
# Serialize the grouped reviews to data.pkl in the working directory.
with open('data.pkl', 'wb') as pkl_file:
    pickle.dump(data, pkl_file)