In [2]:
# Read meta_data_link.json

import json
import sys
import requests

with open('/teamspace/studios/this_studio/amd_benchmark/meta_data_link.json') as f:
    data = json.load(f)


In [6]:
# Read from data json and download files
for row in data['categories']:
    category = row['category']
    meta_download_link = row['meta_download_link']
    # Download using meta_download_link
    r = requests.get(meta_download_link, allow_redirects=True)
    open(f'./amd_benchmark/amazon_review_data/{category}.gz', 'wb').write(r.content)
    print(f'{category}.gz downloaded')


Clothing_Shoes_and_Jewelry.gz downloaded
Digital_Music.gz downloaded
Electronics.gz downloaded
Gift_Cards.gz downloaded
Grocery_and_Gourmet_Food.gz downloaded
Handmade_Products.gz downloaded
Health_and_Household.gz downloaded
Health_and_Personal_Care.gz downloaded
Home_and_Kitchen.gz downloaded
Industrial_and_Scientific.gz downloaded
Kindle_Store.gz downloaded
Magazine_Subscriptions.gz downloaded
Movies_and_TV.gz downloaded
Musical_Instruments.gz downloaded
Office_Products.gz downloaded
Patio_Lawn_and_Garden.gz downloaded
Pet_Supplies.gz downloaded
Software.gz downloaded
Sports_and_Outdoors.gz downloaded
Subscription_Boxes.gz downloaded
Tools_and_Home_Improvement.gz downloaded
Toys_and_Games.gz downloaded
Video_Games.gz downloaded
Unknown.gz downloaded


In [4]:
# Unzip all .gz files
import gzip
import shutil
for category in data['categories']:
    with gzip.open(f'./amd_benchmark/amazon_review_data/{category["category"]}.gz', 'rb') as f_in:
        with open(f'./amd_benchmark/amazon_review_data/{category["category"]}.json', 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print(f'{category["category"]} unzipped')

All_Beauty unzipped
Amazon_Fashion unzipped
Appliances unzipped
Arts_Crafts_and_Sewing unzipped
Automotive unzipped
Baby_Products unzipped
Beauty_and_Personal_Care unzipped
Books unzipped
CDs_and_Vinyl unzipped
Cell_Phones_and_Accessories unzipped
Clothing_Shoes_and_Jewelry unzipped
Digital_Music unzipped
Electronics unzipped
Gift_Cards unzipped
Grocery_and_Gourmet_Food unzipped
Handmade_Products unzipped
Health_and_Household unzipped
Health_and_Personal_Care unzipped
Home_and_Kitchen unzipped
Industrial_and_Scientific unzipped
Kindle_Store unzipped
Magazine_Subscriptions unzipped
Movies_and_TV unzipped
Musical_Instruments unzipped
Office_Products unzipped
Patio_Lawn_and_Garden unzipped
Pet_Supplies unzipped
Software unzipped
Sports_and_Outdoors unzipped
Subscription_Boxes unzipped
Tools_and_Home_Improvement unzipped
Toys_and_Games unzipped
Video_Games unzipped
Unknown unzipped


In [1]:
# Organize json files and sample a category classifcation dataset
import json
import os
import random

data_dir = './amd_benchmark/amazon_review_data'
data_files = os.listdir(data_dir)
data_files = [f for f in data_files if f.endswith('.json')]
SAMPLE_SIZE_PER_CATEGORY = 10
# Read each json data files, sampple 1000 from each category
for fi in data_files:
    counter = 0
    with open(f'{data_dir}/{fi}', 'r') as f:
        for line in f:
            data = json.loads(line)
            category = data['main_category']
            if category and category.lower() !='null':
                with open(f'./amd_benchmark/amazon_review_data/amazon_review_mini_classification_data.json', 'a') as f2:
                    # Dump oen sample per line
                    json.dump(data, f2)
                    f2.write("\n")
                    counter +=1
            if counter == SAMPLE_SIZE_PER_CATEGORY:
                break
        print(f'{category} sampled')
    




Gift Cards sampled
All Beauty sampled
Digital Music sampled
Buy a Kindle sampled
Appstore for Android sampled
Amazon Home sampled
Grocery sampled
Books sampled
Industrial & Scientific sampled
Toys & Games sampled
Sports & Outdoors sampled
SUBSCRIPTION BOXES sampled
None sampled
Magazine Subscriptions sampled
AMAZON FASHION sampled
All Beauty sampled
Tools & Home Improvement sampled
Handmade sampled
Video Games sampled
Digital Music sampled
Arts, Crafts & Sewing sampled
Health & Personal Care sampled
Musical Instruments sampled
Industrial & Scientific sampled
Movies & TV sampled
AMAZON FASHION sampled
Baby sampled
All Electronics sampled
Industrial & Scientific sampled
AMAZON FASHION sampled
Amazon Home sampled
Pet Supplies sampled
Computers sampled
Cell Phones & Accessories sampled


In [11]:
# Read amazon_review_mini_classification_data.json
with open('./amd_benchmark/amazon_review_data/amazon_review_mini_classification_data.json', 'r') as f:
    data = f.readlines()
    print(f'Number of samples: {len(data)}')

Number of samples: 330


In [5]:
from openai import OpenAI

from dotenv import load_dotenv
import os
import json

load_dotenv()
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [13]:
# Find all unique categories from data
categories = []
for line in data:
    categories.append(json.loads(line)['main_category'])
categories = list(set(categories))
print(categories)

['All Beauty', 'SUBSCRIPTION BOXES', 'Toys & Games', 'Pet Supplies', 'Office Products', 'Automotive', 'Amazon Home', 'Magazine Subscriptions', 'Sports & Outdoors', 'Tools & Home Improvement', 'Arts, Crafts & Sewing', 'Books', 'Health & Personal Care', 'Industrial & Scientific', 'Cell Phones & Accessories', 'Handmade', 'Appstore for Android', 'Computers', 'Baby', 'Buy a Kindle', 'Appliances', 'Gift Cards', 'Digital Music', 'Grocery', 'Prime Video', 'All Electronics', 'AMAZON FASHION', 'Video Games', 'Musical Instruments', 'Movies & TV']


In [14]:

# Prompt the model to do classification
system_prompt = f"""
You are tasked with classifying amazon product title into main categories. You are going to apply your common 
sense and knowledge to classify the product title into one of the categories.
{categories}

- Some products contains subtitle. Consider the subtitle for classification.
- Give your reasoning for the classification.
"""

# Short prompt (document to classify)
document = """
Please classify the this product title into one of the categories: {categories}
title: {title}
subtitle: {subtitle}

Output in json format with your reasoning. Example output
{{
    "classification": "Books",
    "reasoning": "The title looks like a book title, so I classified it as a book."
}}
"""


def amazon_title_classification(title, subtitle):
    response = client.chat.completions.create(
        model="gpt-4o-mini-2024-07-18",  # or 'gpt-3.5-turbo'
        messages=[
            {"role": "system", "content": system_prompt.format(categories=categories)},
            {"role": "user", "content": document.format(categories=categories, title=title, subtitle=subtitle)},
        ],
        temperature=0.0,
        response_format={ "type": "json_object" }
        )

    classification = response.choices[0].message.content

    # print(classification)
    # Process response
    try:
        classification = json.loads(classification)
        res = classification['classification'].lower()
        # if res not in lv1_labels:
        #     raise Exception("Invalid classification")
        return classification
        # TODO: Token count
    except:
        # Retry upto 3 times
        raise Exception("Output is not json format or classification fail")

In [23]:
# Evaluate the model
from time import sleep

y_true = []
y_pred = []
for line in data:
    row = json.loads(line)
    y_true.append(row['main_category'])
    title = row['title']
    subtitle = row.get('subtitle', 'NO_SUBTITLE')
    print(f'Title: {title}')
    print(f'Subtitle: {subtitle}')
    print(f'Category: {row["main_category"]}')
    try:
        classification = amazon_title_classification(title, subtitle)
        y_pred.append(classification['classification'])
        print(f'Predicted Category: {classification["classification"]}')
        print(f'Reasoning: {classification["reasoning"]}')
        # Write result to file
        with open('./amd_benchmark/amazon_review_mini_classification_result.json', 'a') as f:
            json.dump({
                "title": title,
                "subtitle": subtitle,
                "true_category": row["main_category"],
                "predicted_category": classification["classification"],
                "reasoning": classification["reasoning"]
            }, f)
            f.write("\n")
    except Exception as e:
        print(f'Error: {e}')
    print('------------------------------------')
    print('\n\n')
    sleep(1)


Title: Amazon.com Gift Card in Gift Tag (Various Designs)
Subtitle: NO_SUBTITLE
Category: Gift Cards
Predicted Category: Gift Cards
Reasoning: The product title explicitly mentions 'Gift Card', which directly categorizes it under Gift Cards. The mention of 'Various Designs' indicates that it is a type of gift card, further confirming this classification.
------------------------------------



Title: $25 Mastercard Gift Card (plus $3.95 Purchase Fee)
Subtitle: NO_SUBTITLE
Category: Gift Cards
Predicted Category: Gift Cards
Reasoning: The product title clearly indicates that it is a Mastercard Gift Card, which falls under the category of Gift Cards. The mention of a purchase fee further supports this classification.
------------------------------------



Title: Tractor Supply Company Gift Card
Subtitle: NO_SUBTITLE
Category: Gift Cards
Predicted Category: Gift Cards
Reasoning: The title explicitly mentions 'Gift Card', which directly falls under the category of Gift Cards. There is no 

In [3]:
# Read from amazon_review_mini_classification_result.json
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
with open('./amd_benchmark/amazon_review_mini_classification_result.json', 'r') as f:
    data = f.readlines()
    print(f'Number of samples: {len(data)}')

Number of samples: 82


In [6]:
y_true = []
y_pred = []
for line in data:
    row = json.loads(line)
    y_true.append(row['true_category'])
    y_pred.append(row['predicted_category'])

print(f'Accuracy: {accuracy_score(y_true, y_pred)}')

Accuracy: 0.524390243902439


In [7]:
classification_report(y_true, y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'                          precision    recall  f1-score   support\n\n              All Beauty       1.00      0.45      0.62        11\n         All Electronics       0.50      0.50      0.50         2\n             Amazon Home       0.92      0.92      0.92        13\n    Appstore for Android       0.00      0.00      0.00        10\n   Arts, Crafts & Sewing       0.00      0.00      0.00         0\n              Automotive       0.00      0.00      0.00         0\n                    Baby       0.00      0.00      0.00         0\n                   Books       0.00      0.00      0.00         0\n            Buy a Kindle       0.00      0.00      0.00        10\n           Digital Music       1.00      0.30      0.46        10\n              Gift Cards       1.00      1.00      1.00        11\n                 Grocery       0.89      1.00      0.94         8\n                Handmade       0.00      0.00      0.00         1\n  Health & Personal Care       0.00      0.00      0.00    

In [8]:
print('                          precision    recall  f1-score   support\n\n              All Beauty       1.00      0.45      0.62        11\n         All Electronics       0.50      0.50      0.50         2\n             Amazon Home       0.92      0.92      0.92        13\n    Appstore for Android       0.00      0.00      0.00        10\n   Arts, Crafts & Sewing       0.00      0.00      0.00         0\n              Automotive       0.00      0.00      0.00         0\n                    Baby       0.00      0.00      0.00         0\n                   Books       0.00      0.00      0.00         0\n            Buy a Kindle       0.00      0.00      0.00        10\n           Digital Music       1.00      0.30      0.46        10\n              Gift Cards       1.00      1.00      1.00        11\n                 Grocery       0.89      1.00      0.94         8\n                Handmade       0.00      0.00      0.00         1\n  Health & Personal Care       0.00      0.00      0.00         0\n Industrial & Scientific       0.00      0.00      0.00         1\n  Magazine Subscriptions       0.00      0.00      0.00         1\n                   Music       0.00      0.00      0.00         0\n     Musical Instruments       0.00      0.00      0.00         1\n         Office Products       1.00      1.00      1.00         1\n            Pet Supplies       0.00      0.00      0.00         0\n      SUBSCRIPTION BOXES       0.00      0.00      0.00         0\n       Sports & Outdoors       1.00      1.00      1.00         1\nTools & Home Improvement       0.00      0.00      0.00         0\n            Toys & Games       0.00      0.00      0.00         0\n             Video Games       0.50      1.00      0.67         1\n\n                accuracy                           0.52        82\n               macro avg       0.31      0.29      0.28        82\n            weighted avg       0.67      0.52      0.56        82\n')

                          precision    recall  f1-score   support

              All Beauty       1.00      0.45      0.62        11
         All Electronics       0.50      0.50      0.50         2
             Amazon Home       0.92      0.92      0.92        13
    Appstore for Android       0.00      0.00      0.00        10
   Arts, Crafts & Sewing       0.00      0.00      0.00         0
              Automotive       0.00      0.00      0.00         0
                    Baby       0.00      0.00      0.00         0
                   Books       0.00      0.00      0.00         0
            Buy a Kindle       0.00      0.00      0.00        10
           Digital Music       1.00      0.30      0.46        10
              Gift Cards       1.00      1.00      1.00        11
                 Grocery       0.89      1.00      0.94         8
                Handmade       0.00      0.00      0.00         1
  Health & Personal Care       0.00      0.00      0.00         0
 Industri

In [12]:
from collections import Counter
Counter(y_true)

Counter({'Amazon Home': 13,
         'Gift Cards': 11,
         'All Beauty': 11,
         'Digital Music': 10,
         'Buy a Kindle': 10,
         'Appstore for Android': 10,
         'Grocery': 8,
         'All Electronics': 2,
         'Industrial & Scientific': 1,
         'Sports & Outdoors': 1,
         'Magazine Subscriptions': 1,
         'Handmade': 1,
         'Video Games': 1,
         'Office Products': 1,
         'Musical Instruments': 1})