In [1]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def parse(path):
    """Yield one decoded record per non-blank line of a gzip-compressed file.

    Each line is expected to contain a Python literal (the file is read in
    binary mode and every line is handed to ``eval``).

    Args:
        path: location of the ``.gz`` file to read.

    Yields:
        Whatever object each line evaluates to.
    """
    # The context manager releases the file handle when the generator is
    # exhausted, closed, or garbage-collected; the bare gzip.open() leaked it.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # A blank line (e.g. a trailing newline) would make eval() raise
            # SyntaxError, so skip it.
            if not l.strip():
                continue
            # SECURITY NOTE(review): eval() executes arbitrary code found in the
            # file. Only use this on files from a trusted source; if every line
            # is a plain literal, ast.literal_eval would be a safer parser.
            yield eval(l)

def get_df(path):
    """Read every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order and that number becomes
    the row label of the returned frame.
    """
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [3]:
# Name of the dataset; it is substituted into the directory and file names below.
DATASET = 'Grocery_and_Gourmet_Food'
# NOTE(review): SUB_PATH is not referenced by any cell visible in this notebook - confirm it is still needed.
SUB_PATH = "tada"
# Local directory that holds the raw downloaded files for this dataset.
RAW_PATH = os.path.join('./', DATASET)
# File names of the interaction (review) data and of the item metadata.
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

# Load Data

1. Load interaction data and item metadata
2. Filter out unneeded items
3. Calculate basic statistics

In [4]:
# Download the raw files if they are not already present locally.

def download_if_missing(file_name, description):
    """Fetch ``file_name`` from snap.stanford.edu into RAW_PATH unless it exists.

    Args:
        file_name: name of the remote file; it is saved under the same name.
        description: human-readable label used in the progress message.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        FileNotFoundError: if the ``curl`` executable is not installed.
    """
    target = os.path.join(RAW_PATH, file_name)
    if os.path.exists(target):
        return
    print('Downloading {} into {}'.format(description, RAW_PATH))
    url = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/' + file_name
    try:
        # Argument list + cwd instead of a shell string: no quoting problems and
        # no shell involved. --fail makes curl return an error on HTTP 4xx/5xx
        # instead of saving the error page under the target name.
        subprocess.check_call(['curl', '--fail', '-O', url], cwd=RAW_PATH)
    except subprocess.CalledProcessError:
        # Remove a partial file so the existence check above does not treat a
        # broken download as complete on the next run.
        if os.path.exists(target):
            os.remove(target)
        raise


# Portable replacement for shelling out to `mkdir`.
os.makedirs(RAW_PATH, exist_ok=True)
download_if_missing(DATA_FILE, 'interaction data')
download_if_missing(META_FILE, 'item metadata')

In [5]:
# Load the interaction file into a DataFrame (one row per record in the file).
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1VEELTKS8NLZB,616719923X,Amazon Customer,"[0, 0]",Just another flavor of Kit Kat but the taste i...,4.0,Good Taste,1370044800,"06 1, 2013"
1,A14R9XMZVJ6INB,616719923X,amf0001,"[0, 1]",I bought this on impulse and it comes from Jap...,3.0,"3.5 stars, sadly not as wonderful as I had hoped",1400457600,"05 19, 2014"
2,A27IQHDZFQFNGG,616719923X,Caitlin,"[3, 4]",Really good. Great gift for any fan of green t...,4.0,Yum!,1381190400,"10 8, 2013"
3,A31QY5TASILE89,616719923X,DebraDownSth,"[0, 0]","I had never had it before, was curious to see ...",5.0,Unexpected flavor meld,1369008000,"05 20, 2013"
4,A2LWK003FFMCI5,616719923X,Diana X.,"[1, 2]",I've been looking forward to trying these afte...,4.0,"Not a very strong tea flavor, but still yummy ...",1369526400,"05 26, 2013"


In [7]:
# Reduce the reviews to unique (user, item, timestamp) triples in chronological
# order; ties on time are broken by user_id and then item_id.
out_df = (
    data_df[['reviewerID', 'asin', 'unixReviewTime']]
    .rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'unixReviewTime': 'time'})
    .drop_duplicates(subset=['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id', 'item_id'])
)
out_df.head()

Unnamed: 0,user_id,item_id,time
35,A1KXONFPU2XQ5K,B00004S1C5,965779200
136,A23GFTVIETX7DS,B0000CH39R,1068249600
171,A281NPSIMI1C2R,B0000DBN1H,1073433600
55,A3M174IC0VXOS2,B0000537AF,1075593600
1413,A218J1WI08045B,B0001EQN88,1082073600


In [8]:
# Re-encode the raw string ids as consecutive integers starting at 1,
# assigned in sorted order of the original ids.

uids = sorted(out_df['user_id'].unique())
iids = sorted(out_df['item_id'].unique())
user2id = {raw_id: new_id for new_id, raw_id in enumerate(uids, start=1)}
item2id = {raw_id: new_id for new_id, raw_id in enumerate(iids, start=1)}

out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
# Replace the row labels left over from sorting with a clean 0..n-1 range.
out_df = out_df.reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,2177,3,965779200
1,4161,18,1068249600
2,4698,23,1073433600
3,10146,6,1075593600
4,3915,126,1082073600


In [27]:
# Find the user with the fewest interactions; on ties the first user in
# group (sorted user_id) order wins because the comparison is strict.
min_idx, min_v = None, np.inf
for uid, n_rows in out_df.groupby('user_id').size().items():
    if n_rows < min_v:
        min_idx, min_v = uid, n_rows

In [28]:
# Id of the first user found with the fewest interactions.
min_idx

3

In [29]:
# Smallest number of interactions that any single user has.
min_v

5

In [34]:
# Preparation for the leave-one-out split: for every user, collect the set of
# all items they interacted with, so negative sampling can reject seen items.

clicked_item_set = {
    uid: set(rows['item_id'].values.tolist())
    for uid, rows in out_df.groupby('user_id')
}

In [30]:
def generate_dev_test(data_df, n_splits=4, n_neg=None, n_items=None, clicked=None):
    """Peel each user's last interactions off into held-out sets with negatives.

    In every round the last remaining row of each user is moved out of
    ``data_df`` and paired with ``n_neg`` randomly drawn item ids that the
    user has not interacted with. The caller is expected to pass rows ordered
    so that a user's last row is the one to hold out (e.g. sorted by time).

    Args:
        data_df: interactions with a ``user_id`` column; rows are removed by
            index label, so the index is assumed to be unique.
        n_splits: number of hold-out rounds (this used to be hard-coded to 4).
        n_neg: negatives sampled per held-out row; defaults to the
            notebook-level ``NEG_ITEMS``.
        n_items: size of the item id space, ids being ``1..n_items``; defaults
            to ``len(iids)`` from the notebook.
        clicked: mapping ``user_id -> set of item ids`` the user interacted
            with; defaults to the notebook-level ``clicked_item_set``.

    Returns:
        ``(result_dfs, data_df)``: a list with one DataFrame per round (the
        first holds each user's most recent row), each carrying an extra
        ``neg_items`` column of lists, and the frame of rows that remain.

    Raises:
        ValueError: if a user has interacted with every item in
            ``1..n_items``, so no negative can be drawn (the rejection loop
            would otherwise never terminate).

    Note:
        Sampling uses NumPy's global random state; call ``np.random.seed``
        beforehand for reproducible negatives.
    """
    # Fall back to the notebook globals only when the caller gave no value, so
    # existing calls ``generate_dev_test(data_df)`` behave exactly as before.
    if n_neg is None:
        n_neg = NEG_ITEMS
    if n_items is None:
        n_items = len(iids)
    if clicked is None:
        clicked = clicked_item_set

    result_dfs = []
    for _ in range(n_splits):
        # Last remaining row per user; users with no rows left simply drop out.
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), n_neg))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked[uid]
            # The cheap length test short-circuits the full scan for nearly all users.
            if n_neg > 0 and len(user_clicked) >= n_items and all(
                    item in user_clicked for item in range(1, n_items + 1)):
                raise ValueError(
                    'user {} has interacted with all {} items; '
                    'cannot sample negatives'.format(uid, n_items))
            for j in range(n_neg):
                # Rejection sampling: redraw until the item is unseen by this user.
                while neg_items[i, j] in user_clicked:
                    neg_items[i, j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [32]:
# Number of negative items sampled for each held-out interaction in generate_dev_test.
NEG_ITEMS = 10

In [35]:
# Set aside each user's first row (out_df is sorted by time, so this is the
# earliest interaction) so it is never handed to generate_dev_test.
leave_df = out_df.groupby('user_id').head(1)
# NOTE(review): this rebinds data_df, which earlier held the raw review table.
data_df = out_df.drop(leave_df.index)

# NOTE(review): generate_dev_test returns a list of held-out DataFrames (one
# per round), so test_df is a list here, not a single frame.
test_df, data_df = generate_dev_test(data_df)
# NOTE(review): the disabled lines below reference dev_df, which is not defined
# in any visible cell - fix or delete before re-enabling.
#train_df = pd.concat([leave_df, data_df]).sort_index()

#len(train_df), len(dev_df), len(test_df)

In [39]:
# Sanity check on user 3, reported above as the user with the minimum of 5
# interactions: one row went to leave_df and four were removed by
# generate_dev_test, so no rows are expected to remain.
data_df[data_df.user_id == 3]

Unnamed: 0,user_id,item_id,time
