# Create Dataset

* From raw json files, create the reviews df
* From raw json files, create the items df
* Standardize strings across the items df
* Label encode the categorical columns

In [3]:
import csv
import os
import json
import glob
import re
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from unidecode import unidecode

# Silence every FutureWarning so it does not clutter the cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Directory that receives the CSV files written by this notebook.
save_to_dir = "../dataset/utility"
# makedirs creates missing parent directories too and is a no-op when the
# directory already exists, so there is no check-then-create race.
os.makedirs(save_to_dir, exist_ok=True)

def clean_str(x):
    """Normalise a free-text label into a lowercase, underscore-joined token.

    The text is transliterated with ``unidecode``, every ``'& '`` is dropped,
    each space becomes an underscore, and the result is lower-cased and
    stripped of surrounding whitespace. Finally the punctuation characters
    ``, - ? ( ) ~ * .`` are removed.

    Stripping happens after spaces were turned into underscores, so leading
    or trailing spaces in the input survive as underscores.
    """
    ascii_text = unidecode(x)
    without_ampersand = ascii_text.replace('& ', '')
    underscored = without_ampersand.replace(' ', '_')
    normalised = underscored.lower().strip()
    for unwanted in (',', '-', '?', '(', ')', '~', '*', '.'):
        normalised = normalised.replace(unwanted, '')
    return normalised

# Substrings that exclude a breadcrumb from the product feature columns.
# 'product' and 'included' are separate entries: without a comma between
# them Python silently concatenates the literals into 'productincluded'.
_IGNORED_BREADCRUMB_TOKENS = (
    'dimensions', 'country_of_origin', 'batteries_included',
    'weight', 'height', 'size', 'model', 'manufacturer',
    'specifications', 'voltage', 'volts', '12v', 'climate_pledge',
    'capacity', 'number_of_items', 'import', 'lxwxh', 'product',
    'included',
)

# Patterns for the scraped review fields, compiled once.
_RATING_RE = re.compile(r'(\d+\.\d+)')
_VOTES_RE = re.compile(r'(\d+)')
# The word boundary keeps "on" from matching inside a word such as
# "Indonesia" or "Lebanon".
_DATE_RE = re.compile(r'\bon (.+)$')
_ON_WORD_RE = re.compile(r'\bon\b')


def _breadcrumb_features(breadcrumbs):
    """Return ``{token: 1.0}`` for every token of the non-ignored breadcrumbs."""
    features = {}
    for bc in breadcrumbs:
        name = clean_str(bc['name'])
        if any(token in name for token in _IGNORED_BREADCRUMB_TOKENS):
            continue
        # A name without '_' splits into a single-element list, so both the
        # single-word and the multi-word case are handled here.
        for part in name.split('_'):
            features[part] = 1.0
    return features


def _parse_review(r, asin, product_name):
    """Flatten one raw review record into a row of the reviews table.

    NOTE(review): the parsing assumes ``reviewDate`` looks like
    "Reviewed in <location> on <date>" -- inferred from the string
    handling, confirm against the scraper output.
    """
    date_text = r['reviewDate']
    rating_match = _RATING_RE.findall(r['reviewRating'])
    votes_match = _VOTES_RE.findall(r['reviewVotes'])
    date_match = _DATE_RE.search(date_text)
    # Text before the standalone word "on", reduced to what follows the last
    # " in ", with every "the " removed and surrounding whitespace trimmed.
    location = _ON_WORD_RE.split(date_text)[0].split(
        ' in ')[-1].replace('the ', '').strip()

    return {
        'ASIN': asin,
        'ProductName': product_name,
        # Reviewer name plus the last dot-separated piece of the profile link.
        'reviewerID': r['reviewerName'] + '_'
                      + r['reviewerLink'].split('/')[-1].split('.')[-1],
        'reviewRating': float(rating_match[0]) if rating_match else np.nan,
        'reviewDate': date_match.group(1) if date_match else 'Unknown',
        'reviewLocation': location if location else 'Unknown',
        # Always an int, so the column does not mix strings and numbers.
        'reviewVotes': int(votes_match[0]) if votes_match else 0,
    }


def items_and_reviews_to_dataframe(json_data):
    """Flatten raw product records into reviews, items and ASIN-name tables.

    Records lacking a ``'body'`` entry, or whose body lacks ``'reviews'`` or
    ``'productInformation'``, are skipped. The ASIN is the lower-cased last
    path segment of ``body['canonicalUrl']``.

    Args:
        json_data: iterable of parsed product records.

    Returns:
        A tuple ``(all_reviews_df, all_items_df, asins_df)``:

        * ``all_reviews_df`` -- one row per review.
        * ``all_items_df`` -- one row per product record that has at least
          one review: ASIN, customerReview, brand and a ``1.0`` flag for
          each breadcrumb token.
        * ``asins_df`` -- one ``ASIN``/``name`` row per distinct ASIN,
          including products without reviews.

    Raises:
        KeyError: if a kept record has no ``'canonicalUrl'`` or a review
            lacks one of the fields read by the parser.
    """
    products = []
    reviews = []
    seen_asins = set()
    asin_product_mapping = []

    for product_data in json_data:
        if ('body' not in product_data
                or 'reviews' not in product_data['body']
                or 'productInformation' not in product_data['body']):
            continue

        body = product_data['body']
        reviews_data = body.get('reviews', [])
        product_name = body.get('name', 'Unknown Product')
        asin = body['canonicalUrl'].split('/')[-1].lower()

        if asin not in seen_asins:
            seen_asins.add(asin)
            asin_product_mapping.append({'ASIN': asin, 'name': product_name})

        # Products without reviews appear only in the ASIN mapping.
        if not reviews_data:
            continue

        # form product data
        customer_review = body.get('customerReview', 0)
        brand = clean_str(body.get('brand', 'Unknown brand'))
        if customer_review != 0:
            # Keep the leading number of the rating text.
            customer_review = float(customer_review.split(' ')[0])
        product = {
            'ASIN': asin,
            'customerReview': customer_review,
            'brand': brand,
        }
        product.update(_breadcrumb_features(body.get('breadCrumbs', [])))
        products.append(product)

        # form review data
        clean_name = clean_str(product_name)
        reviews.extend(
            _parse_review(r, asin, clean_name) for r in reviews_data)

    all_reviews_df = pd.DataFrame(reviews)
    all_items_df = pd.DataFrame(products)
    asins_df = pd.DataFrame(asin_product_mapping)
    return all_reviews_df, all_items_df, asins_df

def get_all_json_data(base_dir='../dataset/extracts/amazon'):
    """Load every ``*.json`` file found in the ``items`` folders under *base_dir*.

    The tree below *base_dir* is walked with ``os.walk``; for each directory
    that contains an ``items`` sub-directory, all JSON files in that
    sub-directory are parsed. Files that cannot be read or parsed are
    reported on stdout and skipped.

    Args:
        base_dir: root directory of the extracts. Defaults to the location
            used by this notebook.

    Returns:
        A list with the parsed content of each successfully loaded file.
    """
    all_json_data = []
    for root, dirs, _files in os.walk(base_dir):
        for category in dirs:
            items_path = os.path.join(root, category, 'items')
            if not os.path.exists(items_path):
                continue
            # Escape the directory part so characters such as '[' in a
            # folder name are not interpreted as glob wildcards.
            json_files = glob.glob(
                os.path.join(glob.escape(items_path), '*.json'))

            for json_file in tqdm(json_files, desc=f'Loading JSON Files in {category}'):
                try:
                    # JSON is UTF-8 by specification; do not rely on the
                    # platform default encoding (e.g. cp1252 on Windows).
                    with open(json_file, "r", encoding="utf-8") as f:
                        all_json_data.append(json.load(f))
                except json.JSONDecodeError:
                    print(f"Error loading JSON from file {json_file}: file is empty or not a valid JSON.")
                except Exception as e:
                    print(f"Unexpected error loading JSON from file {json_file}: {e}")
    return all_json_data

# Load the raw product records and flatten them into three dataframes.
all_json_data = get_all_json_data()
all_reviews_df, all_items_df, asins_df = items_and_reviews_to_dataframe(all_json_data)

# Drop exact duplicate rows (keeping the first occurrence) from the review
# and item tables before writing them out.
all_reviews_df = all_reviews_df.drop_duplicates(keep="first")
all_items_df = all_items_df.drop_duplicates(keep='first')

# Persist every table under save_to_dir without the dataframe index.
all_reviews_df.to_csv(f"{save_to_dir}/reviews.csv", index=False)
all_items_df.to_csv(f"{save_to_dir}/itemset_preprocessed.csv", index=False)
asins_df.to_csv(f"{save_to_dir}/asin_product_mapping.csv", index=False)

Loading JSON Files in action figures:   0%|          | 0/267 [00:00<?, ?it/s]

Loading JSON Files in action figures: 100%|██████████| 267/267 [00:00<00:00, 2566.72it/s]
Loading JSON Files in adventure novel: 100%|██████████| 257/257 [00:00<00:00, 4430.08it/s]
Loading JSON Files in air freshener: 100%|██████████| 222/222 [00:00<00:00, 3963.34it/s]
Loading JSON Files in air fryer: 100%|██████████| 211/211 [00:00<00:00, 3767.01it/s]
Loading JSON Files in air purifier: 100%|██████████| 198/198 [00:00<00:00, 3806.80it/s]
Loading JSON Files in baby bottle: 100%|██████████| 219/219 [00:00<00:00, 3649.16it/s]
Loading JSON Files in baby formula: 100%|██████████| 209/209 [00:00<00:00, 1990.03it/s]
Loading JSON Files in baby wipes: 100%|██████████| 246/246 [00:00<00:00, 4168.51it/s]
Loading JSON Files in bathroom: 100%|██████████| 48/48 [00:00<00:00, 3427.83it/s]
Loading JSON Files in battery: 100%|██████████| 272/272 [00:00<00:00, 3941.14it/s]
Loading JSON Files in bedding: 100%|██████████| 246/246 [00:00<00:00, 3236.08it/s]
Loading JSON Files in bedframe: 100%|██████████|

Error loading JSON from file ../dataset/extracts/amazon\stove\items\amazon_B07V7JNTLB.json: file is empty or not a valid JSON.


Loading JSON Files in stroller: 100%|██████████| 203/203 [00:00<00:00, 3382.60it/s]
Loading JSON Files in surveillance camera: 100%|██████████| 234/234 [00:00<00:00, 3491.76it/s]
Loading JSON Files in table: 100%|██████████| 285/285 [00:00<00:00, 3313.18it/s]
Loading JSON Files in tablet: 100%|██████████| 186/186 [00:00<00:00, 3381.08it/s]
Loading JSON Files in tampon: 100%|██████████| 248/248 [00:00<00:00, 4427.55it/s]
Loading JSON Files in television: 100%|██████████| 208/208 [00:00<00:00, 3780.94it/s]
Loading JSON Files in thriller novel: 100%|██████████| 265/265 [00:00<00:00, 3440.75it/s]
Loading JSON Files in tires: 100%|██████████| 231/231 [00:00<00:00, 4357.45it/s]
Loading JSON Files in tissue: 100%|██████████| 242/242 [00:00<00:00, 390.23it/s]
Loading JSON Files in toddler toy: 100%|██████████| 258/258 [00:00<00:00, 3793.26it/s]
Loading JSON Files in toilet: 100%|██████████| 265/265 [00:00<00:00, 3954.30it/s]
Loading JSON Files in toothbrush: 100%|██████████| 237/237 [00:00<00:

# Preprocess Userbase

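Keep only the top-N reviewers (those with at least 10 reviews, excluding generic reviewer IDs) and write their ratings as a reviewer × ASIN utility matrix.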

In [11]:
save_to_dir = "../dataset/utility"

# Reviewers with fewer reviews than this are discarded.
MIN_REVIEWS_PER_USER = 10
# The most frequent reviewer IDs are treated as generic (shared) names and
# removed; this is how many of them are dropped.
N_GENERIC_REVIEWER_IDS = 8

all_reviews_df = pd.read_csv(f"{save_to_dir}/reviews.csv")
all_reviews_df = all_reviews_df.replace(np.nan, '', regex=True)
# Only reviewerID, ASIN and reviewRating are needed for the utility matrix.
user_ratings_df = all_reviews_df.drop([
    "reviewVotes", "reviewLocation", "reviewDate",
    "ProductName"], axis=1, inplace=False)

# Attach a per-reviewer review count to every rating row; the merged frame
# is expected to expose it as a 'count' column, which is used below.
groupby_df = user_ratings_df.groupby('reviewerID')
freq = groupby_df['reviewerID'].value_counts()
groupby_df_freq = pd.merge(user_ratings_df, freq, on='reviewerID', how='left')
groupby_df_freq = groupby_df_freq.sort_values(['count'], ascending=False)

mask = groupby_df_freq["count"] >= MIN_REVIEWS_PER_USER
groupby_df_freq = groupby_df_freq.loc[mask]

topn_reviewers = pd.unique(groupby_df_freq["reviewerID"])

user_ratings_df.set_index("reviewerID", inplace=True)

# identify and remove generic reviewerID
generic_reviewerIDs = user_ratings_df.loc[topn_reviewers].groupby(
    'reviewerID').count().sort_values(
        'ASIN', ascending=False)[:N_GENERIC_REVIEWER_IDS].index.tolist()
generic_lookup = set(generic_reviewerIDs)
topn_reviewers = [r for r in topn_reviewers if r not in generic_lookup]
user_ratings_grouped_df = user_ratings_df.loc[topn_reviewers].groupby('reviewerID')

# One CSV column per ASIN seen in the reviews, plus the reviewer identifier.
columns = pd.unique(all_reviews_df['ASIN']).tolist()
columns.append("reviewerID")

# One row per remaining reviewer, mapping each reviewed ASIN to its rating
# (the last rating wins when a reviewer rated the same ASIN more than once).
rows = []
for reviewer_id, data in user_ratings_grouped_df:
    row = {'reviewerID': reviewer_id}
    row.update(zip(data['ASIN'], data['reviewRating']))
    rows.append(row)

fname = f"{save_to_dir}/utility_topn.csv"
# newline='' is required by the csv module; without it an extra blank line
# is written after every row on Windows.
with open(fname, 'w', encoding="utf-8", newline='') as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)

In [12]:
# Reload the utility matrix CSV (a reviewerID column plus one column per ASIN).
df_utility = pd.read_csv(f"{save_to_dir}/utility_topn.csv")

In [13]:
# Index the rows by reviewer so that the remaining columns are all ASINs.
df_utility.set_index('reviewerID', inplace=True)

In [14]:
# Check for ASIN discrepancies between the item set and the utility matrix.
# NOTE(review): df_unique is not defined in the cells shown here; presumably
# it is an ASIN-indexed item dataframe built in an earlier cell -- confirm.
asins_in_itemset = set(df_unique.index.tolist())
# With reviewerID as the index, the columns of df_utility are the ASINs.
asins_in_utility = set(df_utility.columns.tolist())

In [15]:
# Number of distinct index values (ASINs) in the item set.
len(asins_in_itemset)

33510

In [16]:
# Number of distinct columns (ASINs) in the utility matrix.
len(asins_in_utility)

33510