In [18]:
import os
import gzip
import json
import pandas as pd
import numpy as np
import pickle

from config import RAW_DIR, PRE_DIR, RES_DIR
from utils.data_porter import read_from_csv, save_to_csv

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression statement in a cell, not only
# the last one; later cells that list several bare expressions rely on this.
InteractiveShell.ast_node_interactivity = "all" 

ImportError: ignored

# 1 Get Raw Data

In [None]:
# Locations of the two raw input files under RAW_DIR.
# Despite the `_dir` suffix, both names hold file paths, not directories.
df_rating_dir = os.path.join(RAW_DIR, 'Video_Games.csv')
gz_review_dir = os.path.join(RAW_DIR, 'Video_Games_5.json.gz')

In [None]:
# Load the ratings file with explicit column names, convert the epoch-second
# 'TimeStamp' column to datetimes, and order the rows chronologically.
rating_data = (
    read_from_csv(
        df_rating_dir,
        header=None,
        names=['ProductID', 'ReviewerID', 'Rating', 'TimeStamp'],
    )
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame['TimeStamp'], unit='s'))
    .sort_values(by='TimeStamp')
)

In [None]:
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        path: Location of a gzip file holding one JSON document per line.

    Yields:
        The result of ``json.loads`` for each line, in file order.

    The file is opened inside a ``with`` block so the handle is closed once
    the generator is exhausted or closed, instead of being left open.
    """
    with gzip.open(path, 'rb') as gz_file:
        for line in gz_file:
            yield json.loads(line)

def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the stream and the mapping
    is passed to ``DataFrame.from_dict(..., orient='index')``, so each record
    becomes one row labelled with that position.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

# Read all reviews, set 'reviewTime' to datetimes derived from the epoch-second
# 'unixReviewTime' column (replacing any existing 'reviewTime' values), and
# order the rows chronologically.
review_data = (
    getDF(gz_review_dir)
    .assign(reviewTime=lambda frame: pd.to_datetime(frame['unixReviewTime'], unit='s'))
    .sort_values(by='reviewTime')
)

# 2 Data Preprocessing

### Only keep data in '2000-01-01'~'2017-12-09'

In [None]:
# Keep rows whose timestamp lies between '2000-01-01' and '2017-12-09'
# (both bounds inclusive), then renumber each frame's index from 0.
rating_data = rating_data[
    rating_data['TimeStamp'].between('2000-01-01', '2017-12-09')
].reset_index(drop=True)
review_data = review_data[
    review_data['reviewTime'].between('2000-01-01', '2017-12-09')
].reset_index(drop=True)

In [None]:
# Preview the first three rows of each frame.
rating_data.head(3)
review_data.head(3)

review_data
- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product
 
rating_data
- This dataset includes no metadata or review text, only (item, user, rating, timestamp) tuples.

### Join Review data & Rating data on 'ProductID'

In [None]:
# review_data = review_data.rename(columns={'asin':'ProductID'})
# data = rating_data.merge(review_data, how='outer', on='ProductID')
# data = data.reset_index(drop=True)
# data.head(5)

### Filter reviewer & product 

In [None]:
def filters(data, filtered_var, base_var, threshold):
    '''
    Keep only the rows whose ``filtered_var`` value occurs often enough.

    For each distinct value of ``filtered_var``, the non-null entries of
    ``base_var`` are counted; a row is kept when the count for its
    ``filtered_var`` value is >= ``threshold``.
    The returned frame keeps the original index labels.
    '''
    per_group_count = data.groupby(filtered_var)[base_var].count()
    kept_values = per_group_count[per_group_count >= threshold].index
    return data[data[filtered_var].isin(kept_values)]

In [None]:
# Row count before the activity filters; used for the retention ratio below.
before_filtering = rating_data.shape[0]

# Keep reviewers with at least 5 ratings, then products with at least 20
# ratings among the remaining rows. The filters run once, in this order, so
# dropping products can leave some reviewers with fewer than 5 ratings.
rating_data = (
    rating_data
    .pipe(filters, 'ReviewerID', 'TimeStamp', 5)
    .pipe(filters, 'ProductID', 'TimeStamp', 20)
    .reset_index(drop=True)
)
after_filtering = rating_data.shape[0]
rating_data.head(5)

# Fraction of rows that survived both filters.
after_filtering/before_filtering

### Save the data to PRE_DIR

In [None]:
# Write the filtered ratings frame to 'rating_data.pkl' under PRE_DIR.
with open(os.path.join(PRE_DIR, 'rating_data.pkl'), 'wb') as f:
    pickle.dump(rating_data, f)

In [None]:
# Write the date-filtered reviews frame to 'review_data.pkl' under PRE_DIR.
with open(os.path.join(PRE_DIR, 'review_data.pkl'), 'wb') as f:
    pickle.dump(review_data, f)