# Notebook Purpose
This notebook confirms that the downloaded datasets are consistent with the baseline paper and explores the contents of those datasets.

In [1]:
# imports
import numpy as np
import pandas as pd

# Amazon 1996-2014 Data

This dataset contains a list of reviews, each with the following metadata:
- reviewerID: the ID of the user
- asin: the ID of the product reviewed
- reviewerName: the username of the user
- helpful: the helpfulness votes, expressed as [helpful votes, total votes]
- reviewText: body text of the review
- overall: the star rating given in the review, out of 5
- summary: the header (summary text) of the review
- unixReviewTime: time of the review as a Unix timestamp
- reviewTime: raw time recorded

We utilize the categories of Video Games, Android Applications, and Health & Personal Care.

In [6]:
# Locations of the raw review files; each file holds one JSON object per line.
amazon_games_path = '../datasets/raw/reviews_Video_Games_5.json'
amazon_apps_path = '../datasets/raw/reviews_Apps_for_Android_5.json'
amazon_health_path = '../datasets/raw/reviews_Health_and_Personal_Care_5.json'

# Load every category into its own frame.
games_df, apps_df, health_df = [
    pd.read_json(raw_path, lines=True)
    for raw_path in (amazon_games_path, amazon_apps_path, amazon_health_path)
]

# Cross-category frames: rows are stacked as-is, so the per-file row labels repeat.
games_apps_df = pd.concat([games_df, apps_df], axis=0)
games_health_df = pd.concat([games_df, health_df], axis=0)
all_df = pd.concat([games_df, apps_df, health_df], axis=0)

# Row counts, printed in the same order the frames were built.
for frame in (games_df, apps_df, health_df, games_apps_df, games_health_df, all_df):
    print(len(frame))

231780
752937
346355
984717
578135
1331072


# Preprocessed Datasets
Here is where we prune datasets to fit certain needs. We would like the following types of datasets:
- ratings: User, Item, Rating
- user reviews: User, Text
- item reviews: Item, Text

In [3]:
# Supplemental functions
# assumes arr is sorted, and item i is in arr
def index_item(arr, i):
    """Return the position of ``i`` in the ascending-sorted sequence ``arr``.

    Iterative binary search over ``arr``.

    Parameters
    ----------
    arr : sequence
        Indexable, sized sequence (list, numpy array, ...) sorted in ascending
        order. The result is only meaningful if ``arr`` really is sorted.
    i : object
        Value to locate; must be comparable with the elements of ``arr``.

    Returns
    -------
    int
        An index ``k`` such that ``arr[k] == i``.

    Raises
    ------
    ValueError
        If ``i`` is not present in ``arr`` (this includes an empty ``arr``).
    """
    lower = 0
    upper = len(arr) - 1
    while lower <= upper:
        mid = lower + (upper - lower) // 2
        if arr[mid] == i:
            return mid
        if arr[mid] > i:
            # arr[mid] is already ruled out; dropping it from the range is what
            # guarantees the search interval shrinks on every iteration.
            upper = mid - 1
        else:
            lower = mid + 1
    # Reaching this point means the interval is empty: fail loudly instead of
    # silently returning None to the caller.
    raise ValueError(f'Fatal error: item not found: {i!r}')

def combine_strings(row):
    """Return the row's review body followed by a single space and its summary.

    ``row`` is any mapping-like object exposing ``'reviewText'`` and ``'summary'``.
    """
    body_with_gap = row['reviewText'] + ' '
    return body_with_gap + row['summary']

In [4]:
### Supplemental arrays

def generate_ratings_and_ids(df, keyword):
    """Encode reviewer/item IDs as dense integers and save the rating data.

    Parameters
    ----------
    df : pandas.DataFrame
        Review frame with at least the columns ``reviewerID``, ``asin`` and
        ``overall``.
    keyword : str
        Prefix used in the printed messages and in the output file names.

    Side effects
    ------------
    Prints the user count, item count and global mean rating, then writes two
    compressed archives under ``../datasets/processed/``:

    * ``{keyword}_ratings.npz`` with ``x`` (int64 array of [user index, item
      index] pairs, one row per review), ``y`` (float64 ratings), ``U_size``,
      ``I_size`` and ``G_b`` (mean of ``overall``).
    * ``{keyword}_UI_lists.npz`` with ``u`` and ``i``, the sorted unique
      reviewer and item IDs; an ID's position in these arrays is its index in ``x``.
    """
    # np.unique returns the sorted distinct IDs together with, for every row,
    # the position of that row's ID in the sorted array. This replaces a
    # per-row Python binary search and needs no copy of the full frame.
    U, user_idx = np.unique(df['reviewerID'].to_numpy(), return_inverse=True)
    I, item_idx = np.unique(df['asin'].to_numpy(), return_inverse=True)
    print(f'User count for {keyword}: {len(U)}')
    print(f'Item count for {keyword}: {len(I)}')

    G_b = df['overall'].mean()
    print(f'Global avg for {keyword}: {G_b}')

    X = np.column_stack((user_idx, item_idx)).astype(np.int64)
    y = df['overall'].to_numpy(np.float64)
    np.savez_compressed(f'../datasets/processed/{keyword}_ratings.npz', x = X, y = y, U_size = len(U), I_size = len(I), G_b = G_b)
    np.savez_compressed(f'../datasets/processed/{keyword}_UI_lists.npz', u = U, i = I)

def _join_reviews_by(df, key):
    """Concatenate, per distinct ``key`` value, all review bodies and all summaries."""
    grouped = df.groupby(key)
    merged = grouped['reviewText'].apply(" ".join).to_frame()
    # Column assignment aligns on the group key, so the summaries cannot end up
    # attached to the wrong group.
    merged['summary'] = grouped['summary'].apply(" ".join)
    # groupby already orders the groups by key; reset_index turns the key back
    # into a regular column with a fresh 0..n-1 row index.
    return merged.reset_index()


def generate_reviews(df, keyword):
    """Write the per-user and per-item concatenated review texts to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Review frame with the columns ``reviewerID``, ``asin``, ``reviewText``
        and ``summary`` (the two text columns must contain strings).
    keyword : str
        Prefix used in the output file names.

    Side effects
    ------------
    Writes ``{keyword}_U_reviews.csv`` (one row per reviewer) and
    ``{keyword}_I_reviews.csv`` (one row per item) under
    ``../datasets/processed/``. Each file holds the row index, the grouping ID,
    and the space-joined ``reviewText`` and ``summary`` of that group, ordered
    by ID.
    """
    item_reviews_df = _join_reviews_by(df, 'asin')
    user_reviews_df = _join_reviews_by(df, 'reviewerID')

    user_reviews_df.to_csv(f'../datasets/processed/{keyword}_U_reviews.csv', escapechar = '\\')
    item_reviews_df.to_csv(f'../datasets/processed/{keyword}_I_reviews.csv', escapechar = '\\')

def generate_datasets(df, keyword):
    """Produce every processed artefact for one review frame.

    Runs the ratings/ID export first and the per-user / per-item review text
    export second, both with the same ``df`` and ``keyword``.
    """
    generate_ratings_and_ids(df=df, keyword=keyword)
    generate_reviews(df=df, keyword=keyword)

In [11]:
# Build the processed datasets for every single-category and combined frame.
for source_df, dataset_keyword in (
    (games_df, 'games'),
    (apps_df, 'apps'),
    (health_df, 'health'),
    (games_apps_df, 'games_apps'),
    (games_health_df, 'games_health'),
    (all_df, 'all'),
):
    generate_datasets(source_df, dataset_keyword)


User count for games: 24303
Item count for games: 10672
Global avg for games: 4.086396582966606
User count for apps: 87271
Item count for apps: 13209
Global avg for apps: 3.968930999539138
User count for health: 38609
Item count for health: 18534
Global avg for health: 4.274957774537685
User count for games_apps: 110680
Item count for games_apps: 23833
Global avg for games_apps: 3.9965797279827604
User count for games_health: 61028
Item count for games_health: 29206
Global avg for games_health: 4.1993617407698896
User count for all: 146140
Item count for all: 42367
Global avg for all: 4.069015800798153


In [None]:
# `I_5core` is otherwise only assigned in a later cell, so this cell raised
# NameError when the notebook was run top-to-bottom on a fresh kernel.
# Derive it here from the games frame before inspecting its type.
I_5core = games_df['asin'].unique()
print(type(I_5core))

In [None]:
# Re-read the three raw category files, replacing the frames loaded earlier.
amazon_games_path = '../datasets/raw/reviews_Video_Games_5.json'
amazon_apps_path = '../datasets/raw/reviews_Apps_for_Android_5.json'
amazon_health_path = '../datasets/raw/reviews_Health_and_Personal_Care_5.json'
games_df, apps_df, health_df = [
    pd.read_json(raw_path, lines=True)
    for raw_path in (amazon_games_path, amazon_apps_path, amazon_health_path)
]

In [None]:
# Print the number of distinct reviewers, then distinct items, for each category.
# NOTE(review): the original cell started with `beauty_df`, which is not created
# anywhere in the visible notebook and raised NameError on a fresh kernel; it is
# left out here. Re-add it to the tuple once that frame is actually loaded.
for category_df in (games_df, apps_df, health_df):
    U_5core = category_df['reviewerID'].unique()
    I_5core = category_df['asin'].unique()
    print(len(U_5core))
    print(len(I_5core))

In [None]:
# Display the games reviews ordered by reviewer ID (games_df itself is not modified).
games_df.sort_values('reviewerID')

In [None]:


# Vectorised column concatenation: for string columns this yields the same values as
# applying combine_strings to every row, without a Python-level call per row.
games_df['text+summary'] = games_df['reviewText'] + ' ' + games_df['summary']

In [None]:
# One row per item: all of its review bodies joined with the "&&&" separator,
# ordered by item ID.
games_item_review_only_df = (
    games_df
    .groupby('asin')['reviewText']
    .apply("&&&".join)
    .reset_index()
    .sort_values(by='asin')
)
games_item_review_only_df

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the per-item review documents (one document per item)
# and keep the resulting document-term matrix in X.
vectorizer = TfidfVectorizer(encoding="utf-8", lowercase=True)
games_item_review_only_list = games_item_review_only_df['reviewText'].tolist()
X = vectorizer.fit_transform(games_item_review_only_list)
