In [11]:
import os
import json
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm  # Import tqdm
import re
import numpy as np

save_to_dir = "../dataset/utility/users"
if not os.path.exists(save_to_dir):
    os.mkdir(save_to_dir)

def reviews_to_dataframe(json_data):
    all_reviews_df = pd.DataFrame()
    iter = 0
    for product_data in json_data:
        if 'body' not in product_data or 'reviews' not in product_data['body']:
            continue
        reviews_data = product_data['body'].get('reviews', [])
        product_name = product_data['body'].get('name', 'Unknown Product')
        asin = product_data['url'].split('/')[-1]

        print(f"adding review {iter}: {asin}")
        iter += 1
        
        if not reviews_data:
            continue
        
        reviews_df = pd.DataFrame.from_records(reviews_data)

        columns_to_include = [
            "reviewerName",
            'ASIN',
            "reviewerLink", 
            "reviewRating", 
            "reviewDate", 
            "reviewTitle", 
            "reviewText", 
            "reviewVotes", 
            "reviewVerifiedPurchase"
        ]
        reviews_df['ASIN'] = asin
        existing_columns = [col for col in columns_to_include if col in reviews_df.columns]
        
        if not existing_columns:
            continue
        
        reviews_df = reviews_df[existing_columns]
        
        if 'reviewRating' in existing_columns:
            reviews_df['reviewRating'] = reviews_df['reviewRating'].str.extract(r'(\d+\.\d+)').astype(float)
        if 'reviewDate' in existing_columns:
            reviews_df[['Location', 'Date']] = reviews_df['reviewDate'].str.extract(r'Reviewed in the ([\w\s]+) on (.+)$')
        if 'reviewVotes' in existing_columns:
            reviews_df['reviewVotes'] = reviews_df['reviewVotes'].str.extract(r'(\d+)').fillna(0).astype(int)
        if 'reviewerLink' in existing_columns:
            reviews_df['reviewerID'] = reviews_df['reviewerLink'].str.extract(r'.*amzn1.account.([^/]+)')
        
        for col in ['reviewDate', 'reviewerLink']:
            if col in reviews_df.columns:
                reviews_df.drop(columns=[col], inplace=True)
        
        reviews_df['ProductName'] = product_name
        
        
        all_reviews_df = pd.concat([all_reviews_df, reviews_df], ignore_index=True)
    
    return all_reviews_df

In [105]:
base_dir = '../dataset/extracts/amazon'
all_json_data = []

for root, dirs, files in os.walk(base_dir):
    for dir in dirs:
        items_path = os.path.join(root, dir, 'items')
        if os.path.exists(items_path):
            json_files = glob.glob(os.path.join(items_path, '*.json'))
            
            for json_file in tqdm(json_files, desc=f'Loading JSON Files in {dir}'):
                try:
                    with open(json_file, "r") as f:
                        all_json_data.append(json.load(f))
                except json.JSONDecodeError:
                    print(f"Error loading JSON from file {json_file}: file is empty or not a valid JSON.")
                except Exception as e:
                    print(f"Unexpected error loading JSON from file {json_file}: {e}")

all_reviews_df = reviews_to_dataframe(all_json_data)
all_reviews_df.head()
all_reviews_df.to_csv(f"{save_to_dir}/reviews.csv", index=False)

Loading JSON Files in microwave: 100%|██████████| 226/226 [00:00<00:00, 4104.62it/s]
Loading JSON Files in facial toner: 100%|██████████| 254/254 [00:00<00:00, 4327.68it/s]
Loading JSON Files in lamp: 100%|██████████| 233/233 [00:00<00:00, 4180.26it/s]
Loading JSON Files in luggage: 100%|██████████| 247/247 [00:00<00:00, 3921.08it/s]
Loading JSON Files in bedroom: 100%|██████████| 48/48 [00:00<00:00, 5025.12it/s]
Loading JSON Files in feminine wash: 100%|██████████| 246/246 [00:00<00:00, 4690.89it/s]
Loading JSON Files in pc power supply: 100%|██████████| 215/215 [00:00<00:00, 4706.85it/s]
Loading JSON Files in razor: 100%|██████████| 240/240 [00:00<00:00, 4454.17it/s]
Loading JSON Files in tablet: 100%|██████████| 186/186 [00:00<00:00, 5034.79it/s]
Loading JSON Files in fantasy novel: 100%|██████████| 233/233 [00:00<00:00, 4952.48it/s]
Loading JSON Files in air fryer: 100%|██████████| 211/211 [00:00<00:00, 4747.33it/s]
Loading JSON Files in coffee maker: 100%|██████████| 231/231 [00:0

Error loading JSON from file ../dataset/extracts/amazon/stove/items/amazon_B07V7JNTLB.json: file is empty or not a valid JSON.


Loading JSON Files in pillow: 100%|██████████| 250/250 [00:00<00:00, 4758.64it/s]
Loading JSON Files in nonfiction novel: 100%|██████████| 247/247 [00:00<00:00, 5503.08it/s]
Loading JSON Files in playroom: 100%|██████████| 48/48 [00:00<00:00, 6654.54it/s]
Loading JSON Files in utensils: 100%|██████████| 248/248 [00:00<00:00, 4696.70it/s]
Loading JSON Files in car seat: 100%|██████████| 240/240 [00:00<00:00, 5256.35it/s]
Loading JSON Files in water flask: 100%|██████████| 304/304 [00:00<00:00, 4970.56it/s]
Loading JSON Files in historical novel: 100%|██████████| 281/281 [00:00<00:00, 5283.94it/s]
Loading JSON Files in patio: 100%|██████████| 48/48 [00:00<00:00, 6596.33it/s]
Loading JSON Files in cpu cooler: 100%|██████████| 219/219 [00:00<00:00, 4429.17it/s]
Loading JSON Files in men sweater: 100%|██████████| 252/252 [00:00<00:00, 4981.34it/s]
Loading JSON Files in table: 100%|██████████| 285/285 [00:00<00:00, 5120.55it/s]
Loading JSON Files in women jeans: 100%|██████████| 251/251 [00:

KeyboardInterrupt: 

## Form User-Rating Matrix

Two approaches:
* Split userbase into 20K users per file
* Get top N reviewers

In [None]:
save_to_dir = "../dataset/utility/users"

all_reviews_df = pd.read_csv(f"{save_to_dir}/reviews.csv")
columns = pd.unique(all_reviews_df['ASIN']).tolist()
columns.append("reviewerID")
all_reviews_df = all_reviews_df.replace(np.nan, '', regex=True)
all_reviews_df["reviewerID"] =  all_reviews_df["reviewerName"] + '_' + all_reviews_df["reviewerID"]
user_ratings_df = all_reviews_df.drop([
    "reviewTitle", "reviewText", "reviewVotes",
    "reviewVerifiedPurchase", "Location", "Date",
    "ProductName"], axis=1, inplace=False)
user_ratings_df.drop(["reviewerName"], axis=1, inplace=True)
user_ratings_df = user_ratings_df.groupby(["reviewerID"])

In [None]:
import csv
rows = []
iter = 0

for index, data in user_ratings_df:
    print(f"iter: {iter} | {index[0]}")
    data = data.to_dict()
    row = {}
    row["reviewerID"] = index[0]
    for i in data["ASIN"]:
        asin = data["ASIN"][i]
        rating = data["reviewRating"][i]
        row[asin] = rating
    rows.append(row)
    iter += 1
    if iter % 10000 == 0:
        fname = f"{save_to_dir}/utility_{iter}.csv"
        with open(fname, 'w') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)
        rows = []
fname = f"{save_to_dir}/utility_{iter}.csv"
with open(fname, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)

# Get top N reviewers

In [70]:
save_to_dir = "../dataset/utility/users"
all_reviews_df = pd.read_csv(f"{save_to_dir}/reviews.csv")
all_reviews_df = all_reviews_df.replace(np.nan, '', regex=True)
all_reviews_df["reviewerID"] =  all_reviews_df["reviewerName"].astype(str) + '_' + all_reviews_df["reviewerID"].astype(str)
user_ratings_df = all_reviews_df.drop([
    "reviewTitle", "reviewText", "reviewVotes",
    "reviewVerifiedPurchase", "Location", "Date",
    "ProductName"], axis=1, inplace=False)
user_ratings_df.drop(["reviewerName"], axis=1, inplace=True)
groupby_df = user_ratings_df.groupby('reviewerID')
freq = groupby_df['reviewerID'].value_counts()
groupby_df_freq = pd.merge(user_ratings_df, freq, on='reviewerID', how='left')
groupby_df_freq = groupby_df_freq.sort_values(['count'], ascending=False)

mask = groupby_df_freq["count"] >= 10
groupby_df_freq = groupby_df_freq.loc[mask]

In [71]:
groupby_df_freq

Unnamed: 0,ASIN,reviewRating,reviewerID,count
74626,B09XM96H87,5.0,Amazon Customer_,3554
295270,B08NWS6CH3,5.0,Amazon Customer_,3554
159270,B0CN3RHMF9,5.0,Amazon Customer_,3554
159236,B09FR31CPF,5.0,Amazon Customer_,3554
159212,B0B1RLT5L2,5.0,Amazon Customer_,3554
...,...,...,...,...
274765,B084LHNR57,5.0,SD_,10
72061,B0C4GZN99V,5.0,XennialLifeXennialLife_AGXDPEOXPMPIHIWPVXTBYFB...,10
90170,B092Q348ZC,4.0,Cynthia_,10
203282,B0748G2F3D,5.0,live love laugh_AGEJVATBEAWO46BMRZTXDTIANJFQ,10


In [72]:
topn_reviewers = pd.unique(groupby_df_freq["reviewerID"])
user_ratings_df.set_index("reviewerID", inplace=True)
user_ratings_df.loc[topn_reviewers]
user_ratings_grouped_df = user_ratings_df.loc[topn_reviewers].groupby('reviewerID')

In [74]:
# identify and remove generic reviewerID
generic_reviewerIDs = user_ratings_df.loc[topn_reviewers].groupby('reviewerID').count().sort_values('ASIN', ascending=False)[:8].index.tolist()
topn_reviewers = [r for r in topn_reviewers if r not in generic_reviewerIDs]
user_ratings_grouped_df = user_ratings_df.loc[topn_reviewers].groupby('reviewerID')

In [76]:
import csv
rows = []
iter = 0
columns = pd.unique(all_reviews_df['ASIN']).tolist()
columns.append("reviewerID")

for index, data in user_ratings_grouped_df:
    print(f"iter: {iter} | {index}")
    data = data.to_dict()
    row = {}
    row["reviewerID"] = index
    for i in data["ASIN"]:
        asin = data["ASIN"][i]
        rating = data["reviewRating"][i]
        row[asin] = rating
    rows.append(row)
    iter += 1
fname = f"{save_to_dir}/utility_topn.csv"
with open(fname, 'w') as f:
    writer = csv.DictWriter(f, fieldnames=columns)
    writer.writeheader()
    writer.writerows(rows)

iter: 0 | ***Toy Collector***_AG5NFKDKQNEYV76GKH7BMXNTHKSQ
iter: 1 | A._
iter: 2 | AJ_
iter: 3 | A_
iter: 4 | Aaron_
iter: 5 | Adam Nelson_AHIL2JHWADD6Z5U277LX5WLGG2VA
iter: 6 | Adam_
iter: 7 | Adrian_
iter: 8 | Adriana_
iter: 9 | Al_
iter: 10 | Alan_
iter: 11 | Alberto_
iter: 12 | Ale_
iter: 13 | Alejandra_
iter: 14 | Alejandro_
iter: 15 | Alessandro_
iter: 16 | AlexAlex_
iter: 17 | Alex_
iter: 18 | Alexandra_
iter: 19 | Alfredo_
iter: 20 | Ali_
iter: 21 | Alicia_
iter: 22 | Amanda_
iter: 23 | Amazon-Kunde_
iter: 24 | Amazonlover_AG62UD7Q67DBARLYYTGSWHASW33Q
iter: 25 | Amazonカスタマー_
iter: 26 | Amber_
iter: 27 | Amy_
iter: 28 | Ana_
iter: 29 | Andre_
iter: 30 | Andrea_
iter: 31 | Andrew_
iter: 32 | Andy_
iter: 33 | Angel_
iter: 34 | Angela_
iter: 35 | Angie_
iter: 36 | Ann_
iter: 37 | Anna_
iter: 38 | Anne_
iter: 39 | Annie_
iter: 40 | Anon_
iter: 41 | Anonymous _
iter: 42 | Anonymous_
iter: 43 | Antonio_
iter: 44 | Armando_
iter: 45 | Arturo_
iter: 46 | AseAware_AEYZWWDY354SEDQR36DCXOH

In [77]:
df = pd.read_csv(f"{save_to_dir}/utility_topn.csv")

In [78]:
has_non_null_vals_cols = [col for col in df.columns if df[col].any() & ~df[col].eq(np.nan).all()]
df.loc[:, has_non_null_vals_cols]

Unnamed: 0,B087CDBKCH,B0BZXNSW5K,B0BX59CFN1,1685795714,1542034299,B0BHFBQ76G,B0C4BHDZGM,0735221103,B008LQXR9Q,B0BXQS3JKP,...,B0BCDR9M33,B09TWVPXS5,B07MMD4DDJ,B07Q11QQCM,B086MHTK5C,B086MHSH2Z,B086ML4XSB,B08V1T4JC1,B0759FGJ3Q,reviewerID
0,,,,,,,,,,,...,,,,,,,,,,***Toy Collector***_AG5NFKDKQNEYV76GKH7BMXNTHKSQ
1,,,,,,,,,,,...,,,,,,,,,,A._
2,,,,,,,,,,,...,,,,,,,,,,AJ_
3,,,,,,,,,,,...,,,,,,,,,,A_
4,,,,,,,,,,,...,,,,,,,,,,Aaron_
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477,,,,,,,,,,,...,,,,,,,,,,david_
478,,,,,,,,,,,...,,,,,,,,,,live love laugh_AGEJVATBEAWO46BMRZTXDTIANJFQ
479,,,,,,,,,,,...,,,,,,,,,,mike_
480,,,,,,,,,,,...,,,,,,,,,,susan shelton_


In [79]:
df.shape

(482, 33667)