# Create Dataset

* From raw json files, create the reviews df
* From raw json files, create the items df
* Standardize strings across the items df
* Label encode the categorical columns

In [154]:
import os
import json
import glob
import re
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from unidecode import unidecode

# Silence FutureWarning messages so the notebook output stays readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Directory that receives every CSV written by this notebook.
save_to_dir = "../dataset/utility"
# makedirs (unlike mkdir) also creates missing parent directories, and
# exist_ok=True removes the need for a separate exists() check.
os.makedirs(save_to_dir, exist_ok=True)

def clean_str(x):
    """Normalize a free-text label into a lowercase, underscore-joined token.

    The text is transliterated to ASCII with ``unidecode``, lowercased,
    stripped of every ``'& '`` sequence and of the punctuation characters
    listed in ``forbidden_chars``, and finally its whitespace-separated
    words are joined with ``_``.

    Args:
        x: The string to normalize.

    Returns:
        The normalized string, e.g. ``"Toys & Games"`` -> ``"toys_games"``.
    """
    forbidden_chars = ',-?()~*.!'
    x = unidecode(x).lower().replace('& ', '')
    # One translate() pass removes all forbidden characters at once.
    x = x.translate(str.maketrans('', '', forbidden_chars))
    # Split on whitespace *runs* only after the punctuation is gone, so that
    # padded input (" toys ") or removed punctuation ("sports - outdoors")
    # cannot leave leading, trailing or doubled underscores behind. Callers
    # that split the result on '_' would otherwise get empty tokens.
    return '_'.join(x.split())

# Splits "<where> on <date>" at the first stand-alone word "on"; the word
# boundary keeps names such as "Hong Kong" or "Lebanon" intact.
_REVIEW_DATE_RE = re.compile(r'^(?P<where>.*?)\bon (?P<when>.+)$')


def _parse_review(raw_review, asin, product_label):
    """Flatten one raw review dict into a row for the reviews DataFrame."""
    row = {
        'ASIN': asin,
        'ProductName': product_label,
        'reviewerID': (raw_review['reviewerName'] + '_'
                       + raw_review['reviewerLink'].split('/')[-1].split('.')[-1]),
    }

    rating = re.search(r'\d+\.\d+', raw_review['reviewRating'])
    row['reviewRating'] = float(rating.group()) if rating else np.nan

    # NOTE(review): assumes the raw date looks like
    # "Reviewed in the United States on January 1, 2024" - confirm against
    # the scraper output.
    date_match = _REVIEW_DATE_RE.match(raw_review['reviewDate'])
    if date_match:
        where = date_match.group('where')
        when = date_match.group('when').strip()
    else:
        where, when = raw_review['reviewDate'], ''
    location = where.split(' in ')[-1].replace('the ', '').strip()
    row['reviewDate'] = when or 'Unknown'
    row['reviewLocation'] = location or 'Unknown'

    # Always store an int (previously a str when present, the int 0 when
    # absent). Thousands separators are tolerated so "1,234" is not read
    # as 1.
    votes = re.search(r'\d[\d,]*', raw_review['reviewVotes'])
    row['reviewVotes'] = int(votes.group().replace(',', '')) if votes else 0
    return row


def items_and_reviews_to_dataframe(json_data):
    """Build the reviews, items and ASIN-mapping DataFrames from raw records.

    Records are skipped when they lack ``body``, ``body['reviews']``,
    ``body['productInformation']`` or ``body['canonicalUrl']``, or when
    their review list is empty.

    Args:
        json_data: Iterable of parsed product JSON documents.

    Returns:
        A tuple ``(all_reviews_df, all_items_df, asins_df)``:

        * ``all_reviews_df`` - one row per review with the columns ASIN,
          ProductName, reviewerID, reviewRating, reviewDate, reviewLocation
          and reviewVotes.
        * ``all_items_df`` - one row per processed product: its ASIN plus a
          ``1.0`` flag for every breadcrumb token that was kept.
        * ``asins_df`` - one row per distinct ASIN with the raw product name.
    """
    # Breadcrumbs whose cleaned name contains any of these substrings are
    # dropped. 'product' and 'included' are two separate entries: a missing
    # comma used to fuse them into 'productincluded', which matched nothing.
    ignored_terms = (
        'dimensions', 'country_of_origin', 'batteries_included',
        'weight', 'height', 'size', 'model', 'manufacturer',
        'specifications', 'voltage', 'volts', '12v', 'climate_pledge',
        'capacity', 'number_of_items', 'import', 'lxwxh', 'product',
        'included',
    )

    products = []
    reviews = []
    asin_product_mapping = []
    seen_asins = set()  # O(1) membership instead of scanning a list

    for record in json_data:
        if ('body' not in record or 'reviews' not in record['body']
                or 'productInformation' not in record['body']):
            continue

        body = record['body']
        reviews_data = body['reviews']
        canonical_url = body.get('canonicalUrl')
        # Without reviews there is nothing to rate; without a canonical URL
        # no ASIN can be derived.
        if not reviews_data or not canonical_url:
            continue

        product_name = body.get('name', 'Unknown Product')
        asin = canonical_url.split('/')[-1].lower()

        if asin not in seen_asins:
            seen_asins.add(asin)
            asin_product_mapping.append({'ASIN': asin, 'name': product_name})

        product = {'ASIN': asin}
        for crumb in body.get('breadCrumbs', []):
            crumb_name = clean_str(crumb['name'])
            if any(term in crumb_name for term in ignored_terms):
                continue
            # Every word of the breadcrumb becomes its own feature flag;
            # empty tokens (from doubled underscores) are skipped so no
            # column ends up with an empty name.
            for token in crumb_name.split('_'):
                if token:
                    product[token] = 1.0
        products.append(product)

        product_label = clean_str(product_name)
        reviews.extend(
            _parse_review(raw_review, asin, product_label)
            for raw_review in reviews_data
        )

    all_reviews_df = pd.DataFrame(reviews)
    all_items_df = pd.DataFrame(products)
    asins_df = pd.DataFrame(asin_product_mapping)
    return all_reviews_df, all_items_df, asins_df

def get_all_json_data(base_dir='../dataset/extracts/amazon'):
    """Load every ``<subdir>/items/*.json`` file found below ``base_dir``.

    The tree is walked recursively; for each directory that contains an
    ``items`` sub-directory, all ``*.json`` files inside it are parsed.
    Files that cannot be read or parsed are reported with ``print`` and
    skipped, so one bad file does not abort the whole load.

    Args:
        base_dir: Root directory to search. Defaults to the Amazon extracts
            directory used by this notebook.

    Returns:
        A list with one parsed JSON document per successfully loaded file.
    """
    all_json_data = []
    for root, dirs, _files in os.walk(base_dir):
        for subdir in dirs:
            items_path = os.path.join(root, subdir, 'items')
            if not os.path.isdir(items_path):
                continue
            json_files = glob.glob(os.path.join(items_path, '*.json'))

            for json_file in tqdm(json_files, desc=f'Loading JSON Files in {subdir}'):
                try:
                    # Explicit UTF-8: the platform default encoding is not
                    # UTF-8 everywhere, and JSON text normally is.
                    with open(json_file, "r", encoding="utf-8") as f:
                        all_json_data.append(json.load(f))
                except json.JSONDecodeError:
                    print(f"Error loading JSON from file {json_file}: file is empty or not a valid JSON.")
                except Exception as e:
                    # Deliberately broad: loading is best-effort per file.
                    print(f"Unexpected error loading JSON from file {json_file}: {e}")
    return all_json_data

# Load the raw extracts only when they are not already in memory. This
# keeps re-runs of the cell fast while still working on a fresh kernel,
# where `all_json_data` would otherwise be undefined.
if 'all_json_data' not in globals():
    all_json_data = get_all_json_data()

all_reviews_df, all_items_df, asins_df = items_and_reviews_to_dataframe(all_json_data)

# Reviews: blank out missing values, drop exact duplicate rows, then save.
all_reviews_df = all_reviews_df.replace(np.nan, '', regex=True)
all_reviews_df.drop_duplicates(keep="first", inplace=True)
all_reviews_df.to_csv(f"{save_to_dir}/reviews.csv")

# Items: drop exact duplicate rows; a missing feature flag means "absent" (0).
all_items_df.drop_duplicates(keep='first', inplace=True)
all_items_df.fillna(0, inplace=True)

asins_df.to_csv(f"{save_to_dir}/asin_product_mapping.csv", index=False)

## Preprocess Itemset

Merge rows that share the same ASIN into a single item row by summing their feature columns.

In [155]:
# Collapse rows that share an ASIN into one row per item by summing their
# feature columns. A grouped sum avoids three problems of a manual
# reset_index()/loop approach: the old positional index leaking into the
# itemset as a feature column, a KeyError when dropping a 'level_0' label
# that does not exist, and a KeyError when there are no duplicates at all.

# Allow the cell to be re-run after ASIN has already become the index.
if all_items_df.index.name == 'ASIN':
    all_items_df = all_items_df.reset_index()

is_duplicated = all_items_df['ASIN'].duplicated(keep=False)
to_merge = all_items_df.loc[is_duplicated, 'ASIN'].unique().tolist()

# sort=False keeps merged ASINs in order of first appearance.
merged_df = all_items_df[is_duplicated].groupby('ASIN', sort=False).sum()

# Untouched single-ASIN rows first, merged rows appended at the end.
all_items_df = pd.concat(
    [all_items_df[~is_duplicated].set_index('ASIN'), merged_df])
all_items_df.to_csv(f"{save_to_dir}/itemset_preprocessed.csv")

## Preprocess Userbase

Keep only frequent reviewers: those with at least 10 reviews, excluding the 8 reviewer IDs with the most reviews.

In [167]:
# Build the reviewer x item utility matrix for frequent reviewers.
MIN_REVIEWS = 10           # a reviewer needs at least this many reviews
N_GENERIC_REVIEWERS = 8    # the most prolific IDs are dropped as "generic"

user_ratings_df = all_reviews_df.drop(
    columns=["reviewVotes", "reviewLocation", "reviewDate", "ProductName"])

# Reviews per reviewer, most active first. Counting directly on the column
# does not depend on the 'count' result name that only pandas >= 2.0
# gives to value_counts().
review_counts = user_ratings_df['reviewerID'].value_counts()
topn_reviewers = review_counts[review_counts >= MIN_REVIEWS].index.tolist()

# NOTE(review): the highest-volume IDs are presumably shared or generic
# accounts rather than individual people - confirm before changing the
# cut-off. Ties at the cut-off are broken arbitrarily.
generic_reviewerIDs = topn_reviewers[:N_GENERIC_REVIEWERS]
topn_reviewers = [r for r in topn_reviewers if r not in generic_reviewerIDs]

user_ratings_df.set_index("reviewerID", inplace=True)
selected_ratings = user_ratings_df[user_ratings_df.index.isin(topn_reviewers)]
user_ratings_grouped_df = selected_ratings.groupby('reviewerID')

# One column per item in the itemset, plus the reviewer id as last column.
columns = all_items_df.index.tolist()
columns.append("reviewerID")

rows = []
for reviewer_id, reviewer_ratings in user_ratings_grouped_df:
    # ASIN -> rating; if a reviewer rated an item more than once, the last
    # rating wins.
    row = dict(zip(reviewer_ratings['ASIN'], reviewer_ratings['reviewRating']))
    row['reviewerID'] = reviewer_id
    rows.append(row)

# `columns` restricts the matrix to items in the itemset; items a reviewer
# did not rate stay NaN.
df_utility = pd.DataFrame(rows, columns=columns)
df_utility.to_csv(f"{save_to_dir}/utility_topn.csv")

In [158]:
# Duplicate check: list how often each ASIN occurs in the itemset index.
all_items_df.index.value_counts()

ASIN
b001f30182    1
b08p59knm4    1
b08p4mbk17    1
b08mwqqhcd    1
b08lg9g3jt    1
             ..
b0bnqymlyq    1
b0bmv2c6lv    1
b0bmpzd2xs    1
b0bml7h9kq    1
b0ctcy1wtr    1
Name: count, Length: 33510, dtype: int64

In [159]:
# Duplicate check: list how often each column label occurs in the utility matrix.
df_utility.columns.value_counts()

b001f30182    1
b08qtvhk6k    1
b08p58jxj9    1
b08p4mbk17    1
b08mwqqhcd    1
             ..
b0bq2m742p    1
b0bnqymlyq    1
b0bmv2c6lv    1
b0bmpzd2xs    1
reviewerID    1
Name: count, Length: 33511, dtype: int64

In [160]:
# Display the ASIN -> product name mapping.
asins_df

Unnamed: 0,ASIN,name
0,b001f30182,STAR WARS The Black Series Dark Trooper Toy 6-...
1,b001gn794k,"Avatar: The Last Airbender Prince Zuko 7"" Acti..."
2,b00askv7fe,Hasbro Marvel Ultimate Spider-man Titan Hero S...
3,b00hzsmwmy,Accoutrements Crazy Cat Lady Action Figure Mul...
4,b00ik8qpoy,Marvel Ultimate Spider-Man Titan Hero Series A...
...,...,...
33505,b07zt8vlv4,Bestisun Womens Long Sleeve Workout Tops Yoga ...
33506,b0cqy9mrmr,Maxbee Flared Leggings with Pockets for Women ...
33507,b0cjr8rplq,"TownCat Women’s Yoga Pants with Pockets, High ..."
33508,b0cr15gpvh,AUROLA Serpent Seamless Scrunch Workout Shorts...
