# Notebook Purpose
This notebook confirms that the downloaded datasets are consistent with the baseline paper and explores the contents of those datasets.

In [52]:
# imports
import numpy as np
import pandas as pd

# Amazon 1996-2014 Toys and Games Review Data

This dataset contains a list of reviews, each with the following metadata:
- reviewerID: the ID of the user
- asin: the ID of the product reviewed
- reviewerName: the username of the user
- helpful: the helpfulness votes, expressed as [helpful votes, total votes] (e.g. [2, 3] means 2 of 3 voters found the review helpful)
- reviewText: body text of the review
- overall: the star rating given in the review, out of 5
- summary: the header (summary text) of the review
- unixReviewTime: time of the review as a Unix timestamp (integer)
- reviewTime: time of the review as the raw recorded string

In [5]:
# 5-core variant of the Toys and Games reviews (the smaller of the two files).
# Both files are JSON Lines: one review record per line, hence lines=True.
amazon_5core_path = '../data/raw/reviews_Toys_and_Games_5.json'
amazon_5_df = pd.read_json(amazon_5core_path, lines=True)

# Full Toys and Games review dump (referred to as "1core" in this notebook).
amazon_1core_path = '../data/raw/reviews_Toys_and_Games.json'
amazon_1_df = pd.read_json(amazon_1core_path, lines=True)

In [6]:
# DataFrame.info() prints its report itself and returns None, so call it
# directly: wrapping it in display() only appends a stray "None" to the output.
amazon_5_df.info()
amazon_1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167597 entries, 0 to 167596
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   reviewerID      167597 non-null  object
 1   asin            167597 non-null  object
 2   reviewerName    166759 non-null  object
 3   helpful         167597 non-null  object
 4   reviewText      167597 non-null  object
 5   overall         167597 non-null  int64 
 6   summary         167597 non-null  object
 7   unixReviewTime  167597 non-null  int64 
 8   reviewTime      167597 non-null  object
dtypes: int64(2), object(7)
memory usage: 11.5+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2252771 entries, 0 to 2252770
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   reviewerID      object
 1   asin            object
 2   reviewerName    object
 3   helpful         object
 4   reviewText      object
 5   overall         int64 
 6   summary         object
 7   unixReviewTime  int64 
 8   reviewTime      object
dtypes: int64(2), object(7)
memory usage: 154.7+ MB


None

# Preprocessed Datasets
Here is where we prune datasets to fit certain needs. We would like the following types of datasets:
- 5core: User, Item, Rating
- 1core: User, Item, Rating

In [41]:
# Supplemental functions
def index_item(arr, i):
    """Return the position of ``i`` in the sorted sequence ``arr`` (binary search).

    Parameters
    ----------
    arr : sequence
        Indexable sequence sorted in ascending order.
    i : object
        Item to look up; must be comparable with the elements of ``arr``.

    Returns
    -------
    int
        An index ``k`` such that ``arr[k] == i``.

    Raises
    ------
    ValueError
        If ``i`` is not present in ``arr`` (including when ``arr`` is empty).
    """
    lower = 0
    upper = len(arr) - 1
    # Search the closed interval [lower, upper]. Every pass discards `mid`
    # itself, so the interval strictly shrinks and the loop always terminates,
    # even when the item is absent.
    while lower <= upper:
        mid = lower + (upper - lower) // 2
        if arr[mid] == i:
            return mid
        if arr[mid] > i:
            upper = mid - 1
        else:
            lower = mid + 1
    # Fail loudly instead of printing and returning None, which would silently
    # put missing values into any ID column built from this lookup.
    raise ValueError(f'Fatal error: item not found: {i!r}')

In [74]:
### Supplemental arrays

# User and Item database; use these to give an ID within range to make calculations easier.
# Each array holds the distinct raw IDs in ascending order, so an ID's position
# in the array serves as its dense integer ID.
U_5core = amazon_5_df['reviewerID'].unique()
I_5core = amazon_5_df['asin'].unique()
U_5core.sort()
I_5core.sort()
print(f'5core userbase: {len(U_5core)}; top 5: {U_5core[:5]}')
print(f'5core itembase: {len(I_5core)}; top 5: {I_5core[:5]}')


### User item rating dataframe for Amazon 5-core

# Keep only the related variables; select first, then copy, so only three
# columns are duplicated instead of the whole frame.
UIR_5core_df = amazon_5_df[['reviewerID', 'asin', 'overall']].copy()

# Map raw IDs to their positions in the sorted lookup arrays. Every value in
# the column is present in its lookup array, so the left insertion point
# returned by searchsorted is exactly the index of the match. This is one
# vectorised call per column instead of a Python-level search per row.
user_idx = np.searchsorted(U_5core, UIR_5core_df['reviewerID'].to_numpy())
item_idx = np.searchsorted(I_5core, UIR_5core_df['asin'].to_numpy())

# Round-trip check: the dense IDs must map back to the original raw IDs.
assert np.array_equal(U_5core[user_idx], amazon_5_df['reviewerID'].to_numpy())
assert np.array_equal(I_5core[item_idx], amazon_5_df['asin'].to_numpy())

UIR_5core_df['reviewerID'] = user_idx
UIR_5core_df['asin'] = item_idx

# Global mean rating over all 5-core reviews.
G_b = UIR_5core_df['overall'].mean()
print(f'Global avg: {G_b}')

# x: (user index, item index) pairs; y: the matching ratings as floats.
UIR_5core_in = UIR_5core_df[['reviewerID', 'asin']].to_numpy(np.int64)
UIR_5core_out = UIR_5core_df['overall'].to_numpy(np.float64)
np.savez_compressed('../data/UIR_5core.npz', x = UIR_5core_in, y = UIR_5core_out, U_size = len(U_5core), I_size = len(I_5core), G_b = G_b)

5core userbase: 19412; top 5: ['A012468118FTQAINEI0OQ' 'A0182108CPDLPRCXQUZQ' 'A026961431MGW0616BRS3'
 'A034597326Z83X79S50FI' 'A04295422T2ZG087R17FX']
5core itemabse: 11924; top 5: ['0439893577' '048645195X' '0545496470' '0615444172' '0670010936']
Global avg: 4.356307093802395
