# Create Dataset

* From raw json files, create the reviews df
* From raw json files, create the items df
* Standardize strings across the items df
* Label encode the categorical columns

In [28]:
import os
import json
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# Directory that receives every CSV written by this notebook.
save_to_dir = "../dataset/utility"
# makedirs also creates missing parent directories (os.mkdir raises if
# ../dataset does not exist yet), and exist_ok=True removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_to_dir, exist_ok=True)

def items_and_reviews_to_dataframe(json_data):
    """Flatten scraped product documents into a reviews table and an items table.

    Each element of ``json_data`` is expected to be a dict with a ``'body'``
    dict. A document is skipped when ``'body'``, ``body['reviews']`` or
    ``body['productInformation']`` is missing, or when the review list is
    empty.

    Args:
        json_data: iterable of parsed product documents.

    Returns:
        ``(all_reviews_df, all_items_df)``:

        * ``all_reviews_df`` has one row per review. It keeps whichever of the
          known review fields are present, plus ``ASIN`` and ``ProductName``.
          ``reviewRating`` and ``reviewVotes`` are converted to numbers,
          ``reviewDate`` is split into ``Location`` / ``Date`` and
          ``reviewerLink`` is reduced to ``reviewerID``.
        * ``all_items_df`` has one row per product: ``ASIN``, ``name``, a
          column set to 1 for every breadcrumb name, and one column per
          product-information entry holding its value.

        Both frames are empty when no document qualifies.

    Raises:
        KeyError: if a qualifying document has no ``body['canonicalUrl']``.
    """
    columns_to_include = [
        "reviewerName",
        "ASIN",
        "reviewerLink",
        "reviewRating",
        "reviewDate",
        "reviewTitle",
        "reviewText",
        "reviewVotes",
        "reviewVerifiedPurchase",
    ]

    # Collect the pieces and build each frame once at the end; concatenating
    # inside the loop re-copies all previous rows on every iteration.
    review_frames = []
    item_records = []
    counter = 0

    for product_data in json_data:
        if 'body' not in product_data:
            continue
        body = product_data['body']
        if 'reviews' not in body or 'productInformation' not in body:
            continue

        reviews_data = body['reviews']
        product_name = body.get('name', 'Unknown Product')
        # The ASIN is the last path segment of the canonical product URL.
        asin = body['canonicalUrl'].split('/')[-1]

        print(f"adding {counter}: {asin}")
        counter += 1

        if not reviews_data:
            continue

        # A fresh dict per product: reusing one dict across iterations would
        # carry the breadcrumb flags and attributes of earlier products into
        # every later row.
        product = {'ASIN': asin, 'name': product_name}
        for crumb in body.get('breadCrumbs') or []:
            product[crumb['name']] = 1
        for info in body.get('productInformation') or []:  # categorical values
            product[info['name']] = info['value']
        item_records.append(product)

        reviews_df = pd.DataFrame.from_records(reviews_data)
        reviews_df['ASIN'] = asin
        present = [col for col in columns_to_include if col in reviews_df.columns]
        # .copy() so the assignments below modify an independent frame rather
        # than a slice of the raw one.
        reviews_df = reviews_df[present].copy()

        if 'reviewRating' in present:
            rating_text = reviews_df['reviewRating'].str.extract(r'(\d+\.\d+)', expand=False)
            reviews_df['reviewRating'] = pd.to_numeric(rating_text, errors='coerce')
        if 'reviewDate' in present:
            place_and_date = reviews_df['reviewDate'].str.extract(
                r'Reviewed in the ([\w\s]+) on (.+)$'
            )
            reviews_df['Location'] = place_and_date[0]
            reviews_df['Date'] = place_and_date[1]
        if 'reviewVotes' in present:
            votes_text = reviews_df['reviewVotes'].str.extract(r'(\d+)', expand=False)
            # Reviews without a vote count get 0.
            reviews_df['reviewVotes'] = (
                pd.to_numeric(votes_text, errors='coerce').fillna(0).astype(int)
            )
        if 'reviewerLink' in present:
            reviews_df['reviewerID'] = reviews_df['reviewerLink'].str.extract(
                r'.*amzn1.account.([^/]+)', expand=False
            )

        # The raw date and link are replaced by the parsed columns above.
        reviews_df = reviews_df.drop(columns=['reviewDate', 'reviewerLink'], errors='ignore')
        reviews_df['ProductName'] = product_name
        review_frames.append(reviews_df)

    if review_frames:
        all_reviews_df = pd.concat(review_frames, ignore_index=True)
    else:
        all_reviews_df = pd.DataFrame()
    if item_records:
        all_items_df = pd.DataFrame.from_records(item_records)
    else:
        all_items_df = pd.DataFrame()
    return all_reviews_df, all_items_df

In [None]:
def create_dataset(base_dir='../dataset/extracts/amazon'):
    """Load every product JSON document stored below ``base_dir``.

    The tree is walked recursively. For each directory that contains an
    ``items`` sub-directory, every ``*.json`` file inside ``items`` is parsed.
    Files that cannot be read or parsed are reported on stdout and skipped.

    Args:
        base_dir: root of the extract tree. Defaults to the path the notebook
            has always used.

    Returns:
        list: the parsed JSON documents, one entry per successfully loaded
        file. Empty when ``base_dir`` does not exist or holds no item files.
    """
    all_json_data = []
    for root, dirs, _files in os.walk(base_dir):
        for dir_name in dirs:
            items_path = os.path.join(root, dir_name, 'items')
            if not os.path.exists(items_path):
                continue

            # glob order depends on the filesystem; sort for reproducible output.
            json_files = sorted(glob.glob(os.path.join(items_path, '*.json')))

            for json_file in tqdm(json_files, desc=f'Loading JSON Files in {dir_name}'):
                try:
                    # Explicit encoding: the platform default is not UTF-8 everywhere.
                    with open(json_file, "r", encoding="utf-8") as f:
                        all_json_data.append(json.load(f))
                except json.JSONDecodeError:
                    print(f"Error loading JSON from file {json_file}: file is empty or not a valid JSON.")
                except Exception as e:
                    # Deliberate best effort: one unreadable file must not abort the load.
                    print(f"Unexpected error loading JSON from file {json_file}: {e}")
    # Hand the documents back to the caller; without this the list is discarded.
    return all_json_data

In [29]:
# all_json_data is not assigned anywhere at module scope, so obtain the raw
# product documents from create_dataset() here.
all_json_data = create_dataset()
if not all_json_data:
    # Stop before the CSVs below are overwritten with empty tables.
    raise RuntimeError(
        f"create_dataset() produced no product JSON documents; "
        f"refusing to overwrite the CSVs in {save_to_dir}"
    )

all_reviews_df, all_items_df = items_and_reviews_to_dataframe(all_json_data)
all_reviews_df.to_csv(f"{save_to_dir}/reviews.csv", index=False)
all_items_df.to_csv(f"{save_to_dir}/itemset_no_preprocessing.csv", index=False)
display(all_reviews_df.head(10))
display(all_items_df.head(10))

adding 0: B0B28M9QFH
adding 1: B01BYK3KBK
adding 2: B085FZXZJ5
adding 3: B000IW9J20
adding 4: B0C6RC2K82
adding 5: B09XJD2ZND
adding 6: B07PQT144T
adding 7: B086JP7FX9
adding 8: B07JFL43NX
adding 9: B08S1Z625G
adding 10: B0C44FVT2Q
adding 11: B0BWJZ2FHW
adding 12: B00QEU6EKU
adding 13: B0C61KBPC8
adding 14: B08K8T3W2V
adding 15: B00FPOHDLU
adding 16: B01739UNYI
adding 17: B0754F4WVX
adding 18: B0BNP5LLCH
adding 19: B08CM6BP3J
adding 20: B07MC8BC41
adding 21: B0C6NHVT5F
adding 22: B0BWZZMHPV
adding 23: B0854DYFF3
adding 24: B0CM3KP8N1
adding 25: B00QEU8LCE
adding 26: B07SCGY2H6
adding 27: B097B9LJCW
adding 28: B08WDN9KCH
adding 29: B07MY2XS3S
adding 30: B0CSMV3PKD
adding 31: B07R6NPY6R
adding 32: B0CCGFGLVG
adding 33: B00H8BFGPI
adding 34: B086H1H5CK
adding 35: B0CJVRB42N
adding 36: B00785MVRA
adding 37: B0061BH488
adding 38: B00F2QFX5O
adding 39: B0BT1T4PC3
adding 40: B081ZS7VSM
adding 41: B07VNT3L14
adding 42: B0BNP2T3CQ
adding 43: B074278JDB
adding 44: B09T9M8432
adding 45: B01N23XS8

## Merge ASINs into one row

This handles the case where an ASIN appears in several categories: its rows are merged into a single row.

In [None]:
# Rows whose ASIN occurs more than once in all_items_df.
duplicates = all_items_df[all_items_df.duplicated('ASIN', keep=False)]

# Comma-separated category names per duplicated ASIN. Only computed when a
# 'category' column exists; otherwise an empty frame with the same columns.
if 'category' in duplicates.columns:
    df_categories = (
        duplicates.groupby('ASIN')['category']
        .apply(lambda x: ', '.join(x.dropna().astype(str).unique()))
        .reset_index(name='Categories')
    )
else:
    df_categories = pd.DataFrame(columns=['ASIN', 'Categories'])

# Feature table keyed by ASIN; the descriptive columns are dropped if present.
df_unique = all_items_df.set_index('ASIN').drop(columns=['category', 'name'], errors='ignore')
feats = df_unique.columns.tolist()

# Collapse each duplicated ASIN into one row holding, per column, the first
# non-null value among its rows. For the 1/NaN breadcrumb flags this is 1 as
# soon as any row has the flag; string-valued columns keep their value.
# (np.isnan cannot be used here: it raises TypeError on strings.)
is_dupe = df_unique.index.duplicated(keep=False)
merged = df_unique[is_dupe].groupby(level=0).first()
dupe_asins = merged.index.tolist()
for asin in dupe_asins:
    print(f"processing {asin}")

if dupe_asins:
    # Merged rows go first, last-processed ASIN on top, followed by the
    # untouched rows in their original order.
    df_unique = pd.concat([merged.iloc[::-1], df_unique[~is_dupe]])
df_unique.shape

## Label encode all categorical columns
* Remove columns pertaining to dimensions
* Clean strings to make sure the labels are as accurate as possible
* Lemmatize?
* Use a label encoder to convert string values to integers, making them digestible by the recommender system algorithms

In [None]:
# standardize each string column
def clean_str(x):
    """Normalize a label: trim, lower-case, and join words with underscores.

    Leading and trailing whitespace is removed and every run of whitespace
    between words becomes a single ``_``, so ``"  Hello   World "`` and
    ``"hello world"`` both map to ``"hello_world"``.

    Args:
        x: the value to clean.

    Returns:
        The normalized string. Non-string values (for example NaN in a column
        with missing entries) are returned unchanged.
    """
    if not isinstance(x, str):
        return x
    # split() with no argument trims the ends and collapses repeated
    # whitespace, so no stray or doubled underscores are produced.
    return '_'.join(x.split()).lower()

In [None]:
# Names of the text-valued columns, as a flat list of column names.
# 'object' and 'string' are selectors select_dtypes accepts; include='str' is
# rejected before pandas 3.0. Note that 'object' also matches any other
# object-dtype column, not only text.
categorical_cols = df_unique.select_dtypes(include=['object', 'string']).columns.tolist()

In [None]:
# Write next to the other CSVs, using the shared output directory rather than
# a second hard-coded copy of the same path.
df_unique.to_csv(f"{save_to_dir}/itemset_preprocessed.csv")