# Notebook Purpose
This notebook confirms that the downloaded datasets are consistent with the baseline paper and explores the contents of those datasets.

In [2]:
# imports
import bisect

import numpy as np
import pandas as pd

# Amazon 1996-2014 Toys and Games Review Data

This dataset contains a list of reviews, each with the following metadata:
- reviewerID: the ID of the user
- asin: the ID of the product reviewed
- reviewerName: the username of the user
- helpful: the helpfulness votes, expressed as [helpful votes, total votes]
- reviewText: body text of the review
- overall: the rating given in the review, out of 5
- summary: the header (summary text) of the review
- unixReviewTime: time of the review as a Unix timestamp (seconds)
- reviewTime: time of the review as the raw recorded string (e.g. "11 24, 2012")

In [4]:
# Raw Toys and Games reviews, stored as JSON Lines (one review object per line).
# Each path is declared right next to the frame that is read from it.
amazon_5core_path = '../datasets/raw/reviews_Toys_and_Games_5.json'
amazon_5_df = pd.read_json(amazon_5core_path, lines=True)

amazon_1core_path = '../datasets/raw/reviews_Toys_and_Games.json'
amazon_1_df = pd.read_json(amazon_1core_path, lines=True)

In [5]:
# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in display() only adds a stray "None" to the output.
amazon_5_df.info()
amazon_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167597 entries, 0 to 167596
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   reviewerID      167597 non-null  object
 1   asin            167597 non-null  object
 2   reviewerName    166759 non-null  object
 3   helpful         167597 non-null  object
 4   reviewText      167597 non-null  object
 5   overall         167597 non-null  int64 
 6   summary         167597 non-null  object
 7   unixReviewTime  167597 non-null  int64 
 8   reviewTime      167597 non-null  object
dtypes: int64(2), object(7)
memory usage: 11.5+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2252771 entries, 0 to 2252770
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   reviewerID      object
 1   asin            object
 2   reviewerName    object
 3   helpful         object
 4   reviewText      object
 5   overall         int64 
 6   summary         object
 7   unixReviewTime  int64 
 8   reviewTime      object
dtypes: int64(2), object(7)
memory usage: 154.7+ MB


None

# Preprocessed Datasets
Here is where we prune datasets to fit certain needs. We would like the following types of datasets:
- 5core: User, Item, Rating
- 1core: User, Item, Rating

In [6]:
# Supplemental functions
def index_item(arr, i):
    """Return the position of ``i`` in the sorted sequence ``arr``.

    Parameters
    ----------
    arr : sequence
        Indexable, sized sequence (e.g. list or 1-D NumPy array) sorted in
        ascending order.
    i : object
        Value to look up; must be comparable with the elements of ``arr``.

    Returns
    -------
    int
        An index ``k`` with ``arr[k] == i`` (the leftmost one if ``i`` occurs
        more than once).

    Raises
    ------
    ValueError
        If ``i`` does not occur in ``arr`` (this includes an empty ``arr``).
        A missing item is reported explicitly instead of looping forever or
        silently returning ``None``.
    """
    # bisect_left yields the insertion point; it is a hit only if that slot
    # exists and actually holds the requested value.
    pos = bisect.bisect_left(arr, i)
    if pos == len(arr) or arr[pos] != i:
        raise ValueError(f'Fatal error: item not found: {i!r}')
    return pos

In [7]:
### Supplemental arrays

# User and item lookup tables: the sorted unique raw IDs. The position of an ID
# in its table serves as a dense integer ID, which makes calculations easier.
U_5core = np.sort(amazon_5_df['reviewerID'].unique())
I_5core = np.sort(amazon_5_df['asin'].unique())
print(f'5core userbase: {len(U_5core)}; top 5: {U_5core[:5]}')
print(f'5core itemabse: {len(I_5core)}; top 5: {I_5core[:5]}')


### User item rating dataframe for Amazon 5-core

# Map every raw ID to its position in the sorted lookup table in one vectorised
# pass. Index.get_indexer returns -1 for labels it cannot find, so the
# assertion below guarantees that every review was mapped.
user_idx = pd.Index(U_5core).get_indexer(amazon_5_df['reviewerID'])
item_idx = pd.Index(I_5core).get_indexer(amazon_5_df['asin'])
assert (user_idx >= 0).all() and (item_idx >= 0).all(), 'unmapped reviewerID/asin'

# Keep only user, item and rating; .assign builds a new frame, so neither the
# raw frame nor a column slice of it is mutated.
UIR_5core_df = amazon_5_df[['reviewerID', 'asin', 'overall']].assign(
    reviewerID=user_idx,
    asin=item_idx,
)

# Global mean rating over all 5-core reviews.
G_b = UIR_5core_df['overall'].mean()
print(f'Global avg: {G_b}')

# x holds (user index, item index) pairs and y the matching ratings; the table
# sizes and the global mean are stored alongside them.
UIR_5core_in = UIR_5core_df[['reviewerID', 'asin']].to_numpy(np.int64)
UIR_5core_out = UIR_5core_df['overall'].to_numpy(np.float64)
np.savez_compressed('../datasets/processed/UIR_5core.npz', x = UIR_5core_in, y = UIR_5core_out, U_size = len(U_5core), I_size = len(I_5core), G_b = G_b)

5core userbase: 19412; top 5: ['A012468118FTQAINEI0OQ' 'A0182108CPDLPRCXQUZQ' 'A026961431MGW0616BRS3'
 'A034597326Z83X79S50FI' 'A04295422T2ZG087R17FX']
5core itemabse: 11924; top 5: ['0439893577' '048645195X' '0545496470' '0615444172' '0670010936']
Global avg: 4.356307093802395


In [9]:
# Persist the raw-ID lookup tables. They are cast to fixed-width unicode
# first: object-dtype arrays would be pickled by NumPy and could then only be
# read back with np.load(..., allow_pickle=True).
np.savez_compressed(
    '../datasets/processed/UI_lists.npz',
    u = np.asarray(U_5core, dtype=str),
    i = np.asarray(I_5core, dtype=str),
)


In [12]:
# Sanity check: report which container type holds the item lookup table.
item_table_type = type(I_5core)
print(item_table_type)

<class 'numpy.ndarray'>


In [17]:
# Additional 5-core review dumps (JSON Lines), one frame per product category.
# Each path is declared right next to the frame that is read from it.
amazon_beauty_path = '../datasets/raw/reviews_Beauty_5.json'
beauty_df = pd.read_json(amazon_beauty_path, lines=True)

amazon_games_path = '../datasets/raw/reviews_Video_Games_5.json'
games_df = pd.read_json(amazon_games_path, lines=True)

amazon_apps_path = '../datasets/raw/reviews_Apps_for_Android_5.json'
apps_df = pd.read_json(amazon_apps_path, lines=True)

amazon_health_path = '../datasets/raw/reviews_Health_and_Personal_Care_5.json'
health_df = pd.read_json(amazon_health_path, lines=True)

In [18]:
# Print the number of distinct users, then distinct items, for each extra
# category (order: Beauty, Video Games, Apps for Android, Health).
# NOTE: U_5core / I_5core are rebound on every pass, so after this cell they
# hold the Health values, not the Toys and Games tables built earlier.
for category_df in (beauty_df, games_df, apps_df, health_df):
    U_5core = category_df['reviewerID'].unique()
    I_5core = category_df['asin'].unique()
    print(len(U_5core))
    print(len(I_5core))

22363
12101
24303
10672
87271
13209
38609
18534


In [81]:
# Show the Video Games reviews ordered by reviewer so each user's rows are adjacent.
games_df.sort_values('reviewerID')

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
209657,A00263941WP7WCIL7AKWL,B008OSDGV0,Andy,"[11, 11]",A highly known skylander in all of the skyland...,5,Trigger Happy Review,1353715200,"11 24, 2012"
209669,A00263941WP7WCIL7AKWL,B008OSDHD2,Andy,"[0, 1]","Chill is a water skylander, which means she ca...",5,Chill review,1352160000,"11 6, 2012"
210035,A00263941WP7WCIL7AKWL,B008SBZF4Y,Andy,"[15, 18]","Crusher is both strong and sturdy, but slow. E...",5,Crusher review,1352160000,"11 6, 2012"
209886,A00263941WP7WCIL7AKWL,B008SBZD82,Andy,"[1, 3]","Drobot, the new lightcore character from the t...",1,Lightcore Drobot Review,1353801600,"11 25, 2012"
209713,A00263941WP7WCIL7AKWL,B008OSDHZK,Andy,"[16, 19]","Pop Fizz, an alchemist in potions is powerful....",5,Pop Fizz review,1352246400,"11 7, 2012"
...,...,...,...,...,...,...,...,...,...
197757,AZZTC2OYVNE2Q,B006W41X1S,Wouter,"[0, 0]","Quick service, product as described no more no...",5,Cool,1356566400,"12 27, 2012"
186737,AZZTC2OYVNE2Q,B0050SYX8W,Wouter,"[0, 1]","My kids wanted this game badly, but now it is ...",4,Its ok...,1356566400,"12 27, 2012"
216012,AZZTC2OYVNE2Q,B00BD9OLW0,Wouter,"[1, 2]","What can i say about this, it is a serious of ...",5,For my 7 year old.,1365811200,"04 13, 2013"
175808,AZZTC2OYVNE2Q,B004PAGJOC,Wouter,"[0, 0]",He seems to like it and is still playing it to...,4,My 8 year old's game,1365811200,"04 13, 2013"


In [13]:
# Summary statistics of the Video Games ratings column.
games_df.loc[:, 'overall'].describe()

count    231780.000000
mean          4.086397
std           1.202330
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: overall, dtype: float64