# Notebook Purpose
This notebook confirms that the downloaded datasets are consistent with the baseline paper and explores the contents of those datasets.

In [59]:
# imports
import numpy as np
import pandas as pd

# Amazon 1996-2014 Data

This dataset contains a list of reviews, each with the following metadata:
- reviewerID: the ID of the user
- asin: the ID of the product reviewed
- reviewerName: the username of the user
- helpful: the helpfulness votes, expressed as [helpful votes, total votes]
- reviewText: body text of the review
- overall: the star rating given in the review, out of 5
- summary: the header (summary text) of the review
- unixReviewTime: time of the review as a Unix timestamp
- reviewTime: time of the review as the raw date string (e.g. "11 24, 2012")

We utilize the categories of Video Games, Android Applications, and Health & Personal Care.

In [60]:
# Each raw file is read with lines=True (one JSON record per line).
# Every path is kept in its own variable, next to the frame loaded from it.

# Video Games
amazon_games_path = '../datasets/raw/reviews_Video_Games_5.json'
games_df = pd.read_json(amazon_games_path, lines=True)

# Apps for Android
amazon_apps_path = '../datasets/raw/reviews_Apps_for_Android_5.json'
apps_df = pd.read_json(amazon_apps_path, lines=True)

# Health & Personal Care
amazon_health_path = '../datasets/raw/reviews_Health_and_Personal_Care_5.json'
health_df = pd.read_json(amazon_health_path, lines=True)

# Preprocessed Datasets
Here is where we prune datasets to fit certain needs. We would like the following types of datasets:
- ratings: User, Item, Rating
- user reviews: User, Text
- item reviews: Item, Text

In [67]:
# Supplemental functions
# assumes arr is sorted, and item i is in arr
def index_item(arr, i):
    """Return the position of ``i`` in the ascending-sorted sequence ``arr``.

    Uses binary search, so ``arr`` must already be sorted and must support
    ``len()`` and integer indexing (a list or a 1-D numpy array both work).

    Parameters:
        arr: sorted sequence to search.
        i: value to locate; must be comparable with the elements of ``arr``.

    Returns:
        An index ``k`` such that ``arr[k] == i``.

    Raises:
        ValueError: if ``i`` is not present in ``arr`` (including empty ``arr``).
    """
    lower = 0
    upper = len(arr) - 1
    # `lower <= upper` also covers the empty sequence (upper == -1).
    while lower <= upper:
        mid = lower + (upper - lower) // 2
        if arr[mid] == i:
            return mid
        if arr[mid] > i:
            # Exclude mid itself: keeping it in the window (upper = mid) stalls
            # the search forever once lower == upper and the item is absent.
            upper = mid - 1
        else:
            lower = mid + 1
    # Fail loudly rather than returning None, which would silently corrupt
    # the compressed ID columns built from this lookup.
    raise ValueError(f'Fatal error: item not found: {i!r}')

def combine_strings(row):
    """Return the row's review body followed by its summary, separated by one space.

    ``row`` is any mapping-like object exposing 'reviewText' and 'summary'
    (e.g. a DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
    """
    body = row['reviewText']
    headline = row['summary']
    return body + ' ' + headline

In [79]:
### Supplemental arrays

def generate_ratings_and_ids(df, keyword):
    """Compress reviewer/item IDs to dense integers and save the rating triples.

    Reads the 'reviewerID', 'asin' and 'overall' columns of ``df`` (``df`` is
    not modified) and writes two archives under ``../datasets/processed/``:

    - ``{keyword}_ratings.npz``: ``x`` (int64, one [user index, item index]
      row per review), ``y`` (float64 ratings), ``U_size``, ``I_size`` and the
      global mean rating ``G_b``.
    - ``{keyword}_UI_lists.npz``: ``u`` and ``i``, the sorted distinct reviewer
      and item IDs; position k in these arrays is the integer index k in ``x``.

    Parameters:
        df: review frame with 'reviewerID', 'asin' and 'overall' columns.
        keyword: dataset tag used in the printed summary and output file names.
    """
    # np.unique returns the sorted distinct IDs plus, for every row, the position
    # of that row's ID in the sorted array. That position is the compressed
    # integer ID, obtained without a per-row Python search.
    U, user_idx = np.unique(df['reviewerID'].to_numpy(), return_inverse=True)
    I, item_idx = np.unique(df['asin'].to_numpy(), return_inverse=True)
    print(f'User count for {keyword}: {len(U)}')
    print(f'Item count for {keyword}: {len(I)}')

    # Global average rating over all reviews.
    G_b = df['overall'].mean()
    print(f'Global avg for {keyword}: {G_b}')

    X = np.column_stack((user_idx, item_idx)).astype(np.int64)
    y = df['overall'].to_numpy(np.float64)
    np.savez_compressed(f'../datasets/processed/{keyword}_ratings.npz', x = X, y = y, U_size = len(U), I_size = len(I), G_b = G_b)
    np.savez_compressed(f'../datasets/processed/{keyword}_UI_lists.npz', u = U, i = I)

def _join_reviews_by(df, key):
    """Return one row per distinct ``key`` value with that group's texts concatenated.

    The result has the columns ``key``, 'reviewText' and 'summary'; within each
    group the strings are joined with a single space in their original row order.
    Rows are sorted by ``key`` and carry a fresh 0..n-1 index.
    """
    grouped = df.groupby(key)
    joined = grouped['reviewText'].apply(lambda texts: " ".join(texts)).reset_index()
    # Both aggregations come from the same grouping, so their rows line up by position.
    joined['summary'] = grouped['summary'].apply(lambda texts: " ".join(texts)).reset_index()['summary']
    # reset_index returns a new frame; its result has to be kept to take effect.
    return joined.sort_values(by = key).reset_index(drop = True)


def generate_reviews(df, keyword):
    """Write per-user and per-item concatenated review text to CSV.

    Reads the 'reviewerID', 'asin', 'reviewText' and 'summary' columns of ``df``
    and writes, under ``../datasets/processed/``:

    - ``{keyword}_U_reviews.csv``: one row per reviewer ID.
    - ``{keyword}_I_reviews.csv``: one row per item ID.

    Each row holds the ID, all of its review bodies joined by spaces, and all of
    its summaries joined by spaces.

    Parameters:
        df: review frame with the columns listed above.
        keyword: dataset tag used in the output file names.
    """
    item_reviews_df = _join_reviews_by(df, 'asin')
    user_reviews_df = _join_reviews_by(df, 'reviewerID')

    user_reviews_df.to_csv(f'../datasets/processed/{keyword}_U_reviews.csv', escapechar = '\\')
    item_reviews_df.to_csv(f'../datasets/processed/{keyword}_I_reviews.csv', escapechar = '\\')

def generate_datasets(df, keyword):
    """Produce every processed artefact for one dataset.

    Runs the ratings/ID export first and the review-text export second, passing
    the same ``df`` and ``keyword`` to each.
    """
    for export in (generate_ratings_and_ids, generate_reviews):
        export(df, keyword)

In [80]:
# Build the processed files for each category, in the same order as before:
# games, then apps, then health.
for reviews_frame, dataset_tag in (
    (games_df, 'games'),
    (apps_df, 'apps'),
    (health_df, 'health'),
):
    generate_datasets(reviews_frame, dataset_tag)


User count for games: 24303
Item count for games: 10672
Global avg for games: 4.086396582966606
User count for apps: 87271
Item count for apps: 13209
Global avg for apps: 3.968930999539138
User count for health: 38609
Item count for health: 18534
Global avg for health: 4.274957774537685


In [12]:
# Debug check of the container type of I_5core.
# NOTE(review): in the visible cells I_5core is only assigned further down the
# notebook, so this cell raises NameError on a fresh kernel (Restart & Run All).
print(type(I_5core))

<class 'numpy.ndarray'>


In [5]:
# NOTE(review): this cell repeats the raw-data load from earlier in the notebook
# (same three paths, same read_json calls); running it re-reads all three files.

# Video Games
amazon_games_path = '../datasets/raw/reviews_Video_Games_5.json'
games_df = pd.read_json(amazon_games_path, lines=True)

# Apps for Android
amazon_apps_path = '../datasets/raw/reviews_Apps_for_Android_5.json'
apps_df = pd.read_json(amazon_apps_path, lines=True)

# Health & Personal Care
amazon_health_path = '../datasets/raw/reviews_Health_and_Personal_Care_5.json'
health_df = pd.read_json(amazon_health_path, lines=True)

In [18]:
# Print the number of distinct reviewers, then distinct items, for each frame.
# U_5core / I_5core are overwritten on every pass, so after the loop they hold
# the values for health_df, exactly as with the unrolled version.
# NOTE(review): beauty_df is not loaded by any visible cell; confirm where it
# is defined before relying on Restart & Run All.
for reviews_frame in (beauty_df, games_df, apps_df, health_df):
    U_5core = reviews_frame['reviewerID'].unique()
    I_5core = reviews_frame['asin'].unique()
    print(len(U_5core))
    print(len(I_5core))

22363
12101
24303
10672
87271
13209
38609
18534


In [81]:
# Display the games reviews ordered by reviewer ID. The sorted frame is only
# shown, not assigned, so games_df itself keeps its original row order.
games_df.sort_values(by = 'reviewerID')

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
209657,A00263941WP7WCIL7AKWL,B008OSDGV0,Andy,"[11, 11]",A highly known skylander in all of the skyland...,5,Trigger Happy Review,1353715200,"11 24, 2012"
209669,A00263941WP7WCIL7AKWL,B008OSDHD2,Andy,"[0, 1]","Chill is a water skylander, which means she ca...",5,Chill review,1352160000,"11 6, 2012"
210035,A00263941WP7WCIL7AKWL,B008SBZF4Y,Andy,"[15, 18]","Crusher is both strong and sturdy, but slow. E...",5,Crusher review,1352160000,"11 6, 2012"
209886,A00263941WP7WCIL7AKWL,B008SBZD82,Andy,"[1, 3]","Drobot, the new lightcore character from the t...",1,Lightcore Drobot Review,1353801600,"11 25, 2012"
209713,A00263941WP7WCIL7AKWL,B008OSDHZK,Andy,"[16, 19]","Pop Fizz, an alchemist in potions is powerful....",5,Pop Fizz review,1352246400,"11 7, 2012"
...,...,...,...,...,...,...,...,...,...
197757,AZZTC2OYVNE2Q,B006W41X1S,Wouter,"[0, 0]","Quick service, product as described no more no...",5,Cool,1356566400,"12 27, 2012"
186737,AZZTC2OYVNE2Q,B0050SYX8W,Wouter,"[0, 1]","My kids wanted this game badly, but now it is ...",4,Its ok...,1356566400,"12 27, 2012"
216012,AZZTC2OYVNE2Q,B00BD9OLW0,Wouter,"[1, 2]","What can i say about this, it is a serious of ...",5,For my 7 year old.,1365811200,"04 13, 2013"
175808,AZZTC2OYVNE2Q,B004PAGJOC,Wouter,"[0, 0]",He seems to like it and is still playing it to...,4,My 8 year old's game,1365811200,"04 13, 2013"


In [13]:


# Add a column holding the review body, one space, then the summary.
# Column-wise concatenation gives the same strings as calling combine_strings
# on every row via DataFrame.apply(axis=1), without the per-row Python call.
games_df['text+summary'] = games_df['reviewText'] + ' ' + games_df['summary']

In [36]:
# One row per item: all of its review bodies joined with the '&&&' separator,
# ordered by item ID.
games_item_review_only_df = (
    games_df
    .groupby('asin')['reviewText']
    .apply(lambda texts: "&&&".join(texts))
    .reset_index()
    .sort_values(by = 'asin')
)
games_item_review_only_df

Unnamed: 0,asin,reviewText
0,0700099867,Installing the game was a struggle (because of...
1,6050036071,"Works good, however is not ""like a new"" with a..."
2,7100027950,"Great game! I love the storyline and graphics,..."
3,7293000936,"While the product is what it is described as, ..."
4,8176503290,I enjoyed the first Hawx game. However they le...
...,...,...
10667,B00JQ8YH6A,The Her Interactive Nancy Drew brand appears t...
10668,B00JQHU9RC,I personally do not like the game or the feel ...
10669,B00JXW6GE0,Vendor sent me a 'for review' unit so that I c...
10670,B00KAI3KW2,"Bought an Xbox One during 4th of July weekend,..."


In [48]:
# NOTE(review): this import sits outside the imports cell at the top of the
# notebook; consider moving it there so dependencies are visible in one place.
from sklearn.feature_extraction.text import TfidfVectorizer

# One document per item: the joined review text built in the previous cell.
games_item_review_only_list = games_item_review_only_df['reviewText'].tolist()
# Decode input as UTF-8 and lowercase it before tokenising.
vectorizer = TfidfVectorizer(encoding="utf-8", lowercase=True)
# Learn the vocabulary and IDF weights, then return the TF-IDF matrix
# (one row per document in the list above).
X = vectorizer.fit_transform(games_item_review_only_list)
