# Create Dataset

* From raw json files, create the reviews df
* From raw json files, create the items df
* Standardize strings across the items df
* Label encode the categorical columns

In [4]:
import os
import json
import glob
import re
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from unidecode import unidecode

warnings.simplefilter(action='ignore', category=FutureWarning)

save_to_dir = "../dataset/utility"
if not os.path.exists(save_to_dir):
    os.mkdir(save_to_dir)

def clean_str(x):
    forbidden_chars = [',', '-', '?', '(', ')',
                       '~', '*', '.', '!']
    x = unidecode(x)
    x = '_'.join(x.replace('& ', '').split(' '))
    x = x.lower().strip()
    for c in forbidden_chars:
        x = x.replace(c, '')
    return x

def items_and_reviews_to_dataframe(json_data):
    products = []
    reviews = []
    asins = []
    asin_product_mapping = []
    for product_data in json_data:
        product = {}
        if ('body' not in product_data or 'reviews' not in product_data['body']
            or 'productInformation' not in product_data['body']):
            continue

        reviews_data = product_data['body'].get('reviews', [])
        product_name = product_data['body'].get('name', 'Unknown Product')
        product_data = product_data['body']
        asin = product_data['canonicalUrl'].split('/')[-1].lower()

        if not reviews_data or len(reviews_data) == 0:
            continue
        
        ignore = ['dimensions', 'country_of_origin', 'batteries_included',
                  'weight', 'height', 'size', 'model', 'manufacturer',
                  'specifications', 'voltage', 'volts', '12v', 'climate_pledge',
                  'capacity', 'number_of_items', 'import', 'lxwxh', 'product'
                  'included']

        product['ASIN'] = asin

        if asin not in asins:
            asin_product_mapping.append({
                'ASIN': asin,
                'name': product_name
            })
            asins.append(asin)
        breadcrumbs = product_data.get('breadCrumbs', [])
        for bc in breadcrumbs:
            name = clean_str(bc['name'])
            flag = True
            for ig in ignore:
                if ig in name:
                    flag = False
            if flag and '_' in name:
                name_list = name.split("_")
                for n in name_list:
                    product[n] = 1.0
            elif flag:
                product[name] = 1.0
        
        products.append(product)

        review = {}
        for r in reviews_data:
            review['ASIN'] = asin
            review['ProductName'] = clean_str(product_name)
            review['reviewerID'] = r['reviewerName'] + '_' + r['reviewerLink'].split('/')[-1].split('.')[-1]
            reviewRating = re.findall(r'(\d+\.\d+)', r['reviewRating'])
            reviewLocation = r['reviewDate'].split('on')[0].split(' in ')[-1].replace('the ', '')
            reviewDate = re.findall(r'on (.+)$', r['reviewDate'])
            reviewVotes = re.findall(r'(\d+)', r['reviewVotes'])
            if reviewRating:
                review['reviewRating'] = float(reviewRating[0])
            else:
                review['reviewRating'] = np.nan
            if reviewDate:
                review['reviewDate'] = reviewDate[0]
            else:
                review['reviewDate'] = 'Unknown'
            if reviewLocation:
                review['reviewLocation'] = reviewLocation
            else:
                review['reviewLocation'] = 'Unknown'
            if reviewVotes:
                review['reviewVotes'] = reviewVotes[0]
            else:
                review['reviewVotes'] = 0
            reviews.append(review)
            review = {}
    all_reviews_df = pd.DataFrame(reviews)
    all_items_df = pd.DataFrame(products)
    asins_df = pd.DataFrame(asin_product_mapping)
    return all_reviews_df, all_items_df, asins_df

def get_all_json_data():
    base_dir = '../dataset/extracts/amazon'
    all_json_data = []
    for root, dirs, files in os.walk(base_dir):
        for dir in dirs:
            items_path = os.path.join(root, dir, 'items')
            if os.path.exists(items_path):
                json_files = glob.glob(os.path.join(items_path, '*.json'))
                
                for json_file in tqdm(json_files, desc=f'Loading JSON Files in {dir}'):
                    try:
                        with open(json_file, "r") as f:
                            all_json_data.append(json.load(f))
                    except json.JSONDecodeError:
                        print(f"Error loading JSON from file {json_file}: file is empty or not a valid JSON.")
                    except Exception as e:
                        print(f"Unexpected error loading JSON from file {json_file}: {e}")
    return all_json_data

all_json_data = get_all_json_data()

all_reviews_df, all_items_df, asins_df = items_and_reviews_to_dataframe(all_json_data)
all_reviews_df = all_reviews_df.replace(np.nan, '', regex=True)
all_reviews_df.drop_duplicates(keep="first", inplace=True)
all_reviews_df.to_csv(f"{save_to_dir}/reviews.csv")

all_items_df.drop_duplicates(keep='first', inplace=True)
all_items_df.fillna(0, inplace=True)

asins_df.to_csv(f"{save_to_dir}/asin_product_mapping.csv", index=False)

Loading JSON Files in microwave: 100%|██████████| 226/226 [00:00<00:00, 249.30it/s]
Loading JSON Files in facial toner: 100%|██████████| 254/254 [00:00<00:00, 4934.16it/s]
Loading JSON Files in lamp: 100%|██████████| 233/233 [00:00<00:00, 4981.38it/s]
Loading JSON Files in luggage: 100%|██████████| 247/247 [00:00<00:00, 4795.66it/s]
Loading JSON Files in bedroom: 100%|██████████| 48/48 [00:00<00:00, 5653.18it/s]
Loading JSON Files in feminine wash: 100%|██████████| 246/246 [00:00<00:00, 5460.26it/s]
Loading JSON Files in pc power supply: 100%|██████████| 215/215 [00:00<00:00, 4835.90it/s]
Loading JSON Files in razor: 100%|██████████| 240/240 [00:00<00:00, 5794.87it/s]
Loading JSON Files in tablet: 100%|██████████| 186/186 [00:00<00:00, 5261.80it/s]
Loading JSON Files in fantasy novel: 100%|██████████| 233/233 [00:00<00:00, 5098.91it/s]
Loading JSON Files in air fryer: 100%|██████████| 211/211 [00:00<00:00, 5427.67it/s]
Loading JSON Files in coffee maker: 100%|██████████| 231/231 [00:00

Error loading JSON from file ../dataset/extracts/amazon/stove/items/amazon_B07V7JNTLB.json: file is empty or not a valid JSON.


Loading JSON Files in pillow: 100%|██████████| 250/250 [00:00<00:00, 5274.05it/s]
Loading JSON Files in nonfiction novel: 100%|██████████| 247/247 [00:00<00:00, 5883.05it/s]
Loading JSON Files in playroom: 100%|██████████| 48/48 [00:00<00:00, 7204.13it/s]
Loading JSON Files in utensils: 100%|██████████| 248/248 [00:00<00:00, 5150.13it/s]
Loading JSON Files in car seat: 100%|██████████| 240/240 [00:00<00:00, 2527.88it/s]
Loading JSON Files in water flask: 100%|██████████| 304/304 [00:02<00:00, 107.94it/s]
Loading JSON Files in historical novel: 100%|██████████| 281/281 [00:00<00:00, 4936.85it/s]
Loading JSON Files in patio: 100%|██████████| 48/48 [00:00<00:00, 7548.24it/s]
Loading JSON Files in cpu cooler: 100%|██████████| 219/219 [00:00<00:00, 5115.18it/s]
Loading JSON Files in men sweater: 100%|██████████| 252/252 [00:00<00:00, 5193.85it/s]
Loading JSON Files in table: 100%|██████████| 285/285 [00:00<00:00, 5272.13it/s]
Loading JSON Files in women jeans: 100%|██████████| 251/251 [00:0

## Preprocess Itemset

Merge like-ASINs

In [5]:
all_items_df.reset_index(inplace=True)
vc = all_items_df['ASIN'].value_counts()
to_merge = vc.loc[lambda x: x > 1].index.tolist()

all_items_df.set_index('ASIN', inplace=True)
merge_dicts = []
for asin in to_merge:
    merged_row = all_items_df.loc[asin].sum()
    merged_row = merged_row.drop('level_0')
    d = merged_row.to_dict()
    d['ASIN'] = asin
    merge_dicts.append(d)
    all_items_df.drop(asin, inplace=True)
    
merged_df = pd.DataFrame(merge_dicts)
merged_df.set_index('ASIN', inplace=True)
all_items_df = pd.concat([all_items_df, merged_df])
all_items_df.to_csv(f"{save_to_dir}/itemset_preprocessed.csv")

# Preprocess Userbase

Get TopN reviewers only

In [6]:
user_ratings_df = all_reviews_df.drop([
    "reviewVotes", "reviewLocation", "reviewDate",
    "ProductName"], axis=1, inplace=False)
groupby_df = user_ratings_df.groupby('reviewerID')
freq = groupby_df['reviewerID'].value_counts()
groupby_df_freq = pd.merge(user_ratings_df, freq, on='reviewerID', how='left')
groupby_df_freq = groupby_df_freq.sort_values(['count'], ascending=False)

mask = groupby_df_freq["count"] >= 10
groupby_df_freq = groupby_df_freq.loc[mask]

topn_reviewers = pd.unique(groupby_df_freq["reviewerID"])

user_ratings_df.set_index("reviewerID", inplace=True)
user_ratings_grouped_df = user_ratings_df.loc[topn_reviewers].groupby('reviewerID')

generic_reviewerIDs = user_ratings_df.loc[topn_reviewers].groupby(
    'reviewerID').count().sort_values('ASIN', ascending=False)[:8].index.tolist()
topn_reviewers = [r for r in topn_reviewers if r not in generic_reviewerIDs]
user_ratings_grouped_df = user_ratings_df.loc[topn_reviewers].groupby('reviewerID')

rows = []
iter = 0
columns = all_items_df.index.tolist()
columns.append("reviewerID")

for index, data in user_ratings_grouped_df:
    row = {}
    row['reviewerID'] = index
    for ind, d in data.iterrows():
        row[d['ASIN']] = d['reviewRating']
    rows.append(row)
    iter += 1

df_utility = pd.DataFrame(rows, columns=columns)
df_utility.to_csv(f"{save_to_dir}/utility_topn.csv")

In [7]:
all_items_df.index.value_counts()

ASIN
b085fzxzj5    1
b0clszlptn    1
b0b655zqky    1
b07n87g812    1
b09nl2f183    1
             ..
b08jqfkxyk    1
b08f2d9hj9    1
b07cg2pgy6    1
b089kdcmdn    1
b08gqxnj4y    1
Name: count, Length: 33510, dtype: int64

In [8]:
df_utility.columns.value_counts()

b085fzxzj5    1
b0ccn96fml    1
b0ckpsml62    1
b0b655zqky    1
b07n87g812    1
             ..
b00kfcratc    1
b08jqfkxyk    1
b08f2d9hj9    1
b07cg2pgy6    1
reviewerID    1
Name: count, Length: 33511, dtype: int64

In [9]:
asins_df

Unnamed: 0,ASIN,name
0,b085fzxzj5,TOSHIBA ML-EM45PIT(BS) Countertop Microwave Ov...
1,b000iw9j20,Cuisinart CMW-100 1-Cubic-Foot Stainless Steel...
2,b0c6rc2k82,Magic Chef 1000 Watt Compact Small Microwave O...
3,b09xjd2znd,BLACK+DECKER Range Microwave with Top Mount Ai...
4,b07pqt144t,Panasonic NN-SN65KB Microwave Oven with Invert...
...,...,...
33505,b07byyjl71,Intel Celeron G4900T Processor 2.90 GHz Dual C...
33506,b0cscrwfgz,Intel Core i9-13900KF Gaming Desktop Processor...
33507,b086m8v695,Intel® Core™ i3-10300 Desktop Processor 4 Core...
33508,b07s6crlvd,Intel Core i7-9700 Desktop Processor 8 Cores u...
