# Data Processing

In [2]:
import os
import random
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import pickle

In [3]:
# Load variables from a local .env file into os.environ. override=True lets
# values from the file replace variables that are already set in the process.
load_dotenv(override=True)

True

In [4]:
# Read the Hugging Face token from the environment (raises KeyError if
# HF_TOKEN is not set) and log this session in to the Hub.
hf_token = os.environ['HF_TOKEN']
login(hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
from loaders import ItemLoader
from items import Item

In [6]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load a single category (Home_and_Kitchen) so one prompt can be inspected below.
# NOTE: `items` is rebuilt from scratch later when the full category list is loaded.
items = ItemLoader("Home_and_Kitchen").load()

In [None]:
# Show the prompt text of the second loaded item.
print(items[1].prompt)

How much does this cost to the nearest dollar?

Foaming Soap Dispenser Thick Ceramic Foam Hand Soap Dispenser for Bathroom or Kitchen Sink, Liquid Pump Bottles for Hand soap, Body Wash, 2 Pack Black
Saving money You can DIY foam soap which will save you hundreds of dollars ( create foam just need to dilute the regular soap with water in a ratio of 1 3 ). One pump foams the right amount of soap for a thorough hand wash. High Quality made of thickened ceramic material, which is sturdy, durable and wear-resistant. The base has a nice heavy weight so it won’t slide or tip when used. Simple use with 3.15 inch Wide opening and large pump button, you can refill and press very easily, even so easy for kids to use. Moreover, smoothly surface make it easy to clean

Price is $25.00


# More Data

Include all the electrical, electronic, office and related items, but not clothes / beauty / books.

In [None]:
# Category names to load; each entry is passed to ItemLoader in the next cell.
dataset_names = [
    "Automotive",
    "Electronics",
    "Office_Products",
    "Tools_and_Home_Improvement",
    "Cell_Phones_and_Accessories",
    "Toys_and_Games",
    "Appliances",
    "Musical_Instruments",
]

In [None]:
# Load every category in turn and pool the results into one flat list.
# This is slow: the recorded run took several minutes per large category.
items = []
for category_name in dataset_names:
    items += ItemLoader(category_name).load()

Loading dataset Automotive


meta_Automotive.jsonl:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

100%|██████████| 2004/2004 [10:10<00:00,  3.28it/s]


Completed Automotive with 911,688 datapoints in 17.5 mins
Loading dataset Electronics


meta_Electronics.jsonl:   0%|          | 0.00/5.25G [00:00<?, ?B/s]

Generating full split:   0%|          | 0/1610012 [00:00<?, ? examples/s]

100%|██████████| 1611/1611 [06:35<00:00,  4.07it/s]


Completed Electronics with 443,473 datapoints in 13.4 mins
Loading dataset Office_Products


meta_Office_Products.jsonl:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

100%|██████████| 711/711 [02:12<00:00,  5.36it/s]


Completed Office_Products with 240,394 datapoints in 5.1 mins
Loading dataset Tools_and_Home_Improvement


meta_Tools_and_Home_Improvement.jsonl:   0%|          | 0.00/4.85G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

100%|██████████| 1474/1474 [06:25<00:00,  3.83it/s]


Completed Tools_and_Home_Improvement with 541,051 datapoints in 12.7 mins
Loading dataset Cell_Phones_and_Accessories


meta_Cell_Phones_and_Accessories.jsonl:   0%|          | 0.00/4.02G [00:00<?, ?B/s]

Generating full split:   0%|          | 0/1288490 [00:00<?, ? examples/s]

100%|██████████| 1289/1289 [04:09<00:00,  5.16it/s]


Completed Cell_Phones_and_Accessories with 238,869 datapoints in 9.4 mins
Loading dataset Toys_and_Games


meta_Toys_and_Games.jsonl:   0%|          | 0.00/2.64G [00:00<?, ?B/s]

Generating full split:   0%|          | 0/890874 [00:00<?, ? examples/s]

100%|██████████| 891/891 [03:30<00:00,  4.23it/s]


Completed Toys_and_Games with 340,479 datapoints in 7.1 mins
Loading dataset Appliances


100%|██████████| 95/95 [00:19<00:00,  4.88it/s]


Completed Appliances with 28,625 datapoints in 0.4 mins
Loading dataset Musical_Instruments


meta_Musical_Instruments.jsonl:   0%|          | 0.00/632M [00:00<?, ?B/s]

Generating full split:   0%|          | 0/213593 [00:00<?, ? examples/s]

100%|██████████| 214/214 [00:38<00:00,  5.49it/s]


Completed Musical_Instruments with 66,829 datapoints in 1.5 mins


In [16]:
# Report how many items were loaded across all categories.
print(f"A grand total of {len(items):,} items")

A grand total of 2,811,408 items


# Balancing Prices and Categories


In [None]:
# Group items by price rounded to the nearest whole dollar:
# slots[dollars] is the list of items whose rounded price equals `dollars`.
slots = defaultdict(list)
for product in items:
    dollars = round(product.price)
    slots[dollars].append(product)

In [None]:
# Build a sample that is balanced across price points and categories.
#   * Dollar slots at or above UNCAPPED_FROM_PRICE are kept whole.
#   * Cheaper slots are capped at MAX_PER_SLOT items. When a slot exceeds the
#     cap, items are drawn without replacement, with non-Automotive items
#     weighted OTHER_WEIGHT : AUTOMOTIVE_WEIGHT against Automotive ones
#     (Automotive is by far the largest category loaded above).
MIN_PRICE, MAX_PRICE = 1, 999
UNCAPPED_FROM_PRICE = 240
MAX_PER_SLOT = 1200
AUTOMOTIVE_WEIGHT, OTHER_WEIGHT = 1, 5

# Seed both generators so the draw is reproducible from a fresh kernel.
np.random.seed(42)
random.seed(42)

sample = []
for price in range(MIN_PRICE, MAX_PRICE + 1):
    # .get() avoids inserting an empty list into the defaultdict for prices
    # that have no items at all.
    slot = slots.get(price, [])
    if price >= UNCAPPED_FROM_PRICE or len(slot) <= MAX_PER_SLOT:
        sample.extend(slot)
        continue
    weights = np.array([
        AUTOMOTIVE_WEIGHT if item.category == 'Automotive' else OTHER_WEIGHT
        for item in slot
    ])
    weights = weights / np.sum(weights)  # normalise to probabilities for np.random.choice
    selected_indices = np.random.choice(len(slot), size=MAX_PER_SLOT, replace=False, p=weights)
    sample.extend(slot[index] for index in selected_indices)

print(f"There are {len(sample):,} items in the sample")

There are 408,635 items in the sample


# Final Check

In [26]:
def report(item):
    """Print an item's prompt, then its last ten token ids and their decoded text.

    Uses the tokenizer attached to the Item class, which makes it easy to see
    how the trailing price portion of the prompt is tokenized.
    """
    tokenizer = Item.tokenizer
    text = item.prompt
    tail = tokenizer.encode(text)[-10:]
    print(text)
    print(tail)
    print(tokenizer.batch_decode(tail))

In [27]:
# Spot-check one item from the sample; index 398000 requires the sample to
# contain at least 398,001 items.
report(sample[398000])

How much does this cost to the nearest dollar?

MonoRS Coilovers Lowering Kit Made For Scion FRS Fully Adjustable, Set of 4
MonoRS Coilover damper kit by Godspeed Project are intermediate suspension upgrade setup for daily and Sunday club racing. Lowering your car with improved springs over factory and paired with Mono-tubo shocks with valving that allows 32 levels of rebound adjustment to improve handling without sacrifice comfort. Ride height can easily be adjusted by twisting the lower mount bracket. In order to keep weight gain at the minimum, most of attachments and accessories are CNC machined from billet aluminum. Koyo bearings are used when camber plate top mount is applicable depends on car models. To assure that our customers are getting high quality products, MonoRS coilovers are covered by 12 months limited warranty by the manufacturer from

Price is $765.00
[279, 14290, 505, 271, 7117, 374, 400, 22240, 13, 410]
[' the', ' manufacturer', ' from', '\n\n', 'Price', ' is', ' $', '765', '.', '00']


With the Llama tokenizer, every number from 1 to 999 maps to a single token, much as we saw with GPT-4o. This is not true of Qwen2, Gemma and Phi-3, which all split numbers into individual digit tokens.

# Split Data

* Training data size: 400,000 
* Testing data size: 2,000


In [28]:
# Shuffle a copy rather than `sample` itself so this cell is idempotent.
# Shuffling in place would give a different split each time the cell is
# re-run, because the input order would already have been permuted by the
# previous run. With the same seed and input order, the permutation is the
# same one an in-place shuffle would produce.
random.seed(42)
shuffled = list(sample)
random.shuffle(shuffled)

# First 400,000 items for training, the next 2,000 for testing; the slices are disjoint.
train = shuffled[:400_000]
test = shuffled[400_000:402_000]
print(f"Divided into a training set of {len(train):,} items and test set of {len(test):,} items")

Divided into a training set of 400,000 items and test set of 2,000 items


In [29]:
# First training example: the prompt text ends with the actual price.
print(train[0].prompt)

How much does this cost to the nearest dollar?

Delphi FG0166 Fuel Pump Module
Delphi brings 80 years of OE Heritage into each Delphi pump, ensuring quality and fitment for each Delphi part. Part is validated, tested and matched to the right vehicle application Delphi brings 80 years of OE Heritage into each Delphi assembly, ensuring quality and fitment for each Delphi part Always be sure to check and clean fuel tank to avoid unnecessary returns Rigorous OE-testing ensures the pump can withstand extreme temperatures Brand Delphi, Fit Type Vehicle Specific Fit, Dimensions LxWxH 19.7 x 7.7 x 5.1 inches, Weight 2.2 Pounds, Auto Part Position Unknown, Operation Mode Mechanical, Manufacturer Delphi, Model FUEL PUMP, Dimensions 19.7

Price is $227.00


In [30]:
# First test example via test_prompt(): the printed text stops after "Price is $".
print(test[0].test_prompt())

How much does this cost to the nearest dollar?

OEM AC Compressor w/A/C Repair Kit For Ford F150 F-150 V8 & Lincoln Mark LT 2007 2008 - BuyAutoParts NEW
As one of the world's largest automotive parts suppliers, our parts are trusted every day by mechanics and vehicle owners worldwide. This A/C Compressor and Components Kit is manufactured and tested to the strictest OE standards for unparalleled performance. Built for trouble-free ownership and 100% visually inspected and quality tested, this A/C Compressor and Components Kit is backed by our 100% satisfaction guarantee. Guaranteed Exact Fit for easy installation 100% BRAND NEW, premium ISO/TS 16949 quality - tested to meet or exceed OEM specifications Engineered for superior durability, backed by industry-leading unlimited-mileage warranty Included in this K

Price is $


# Finally - upload your brand new dataset

Convert the items into prompt/price columns and upload them to the Hugging Face Hub as a private dataset.

In [32]:
# Training split: prompt text from `.prompt`, with the numeric price kept separately.
train_prompts = [example.prompt for example in train]
train_prices = [example.price for example in train]

# Test split: prompts come from test_prompt() instead of `.prompt`.
test_prompts = [example.test_prompt() for example in test]
test_prices = [example.price for example in test]

In [None]:
# Column layout shared by both splits: "text" holds the prompt, "price" the target.
train_columns = {"text": train_prompts, "price": train_prices}
test_columns = {"text": test_prompts, "price": test_prices}

train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)

# Bundle both splits under their conventional names.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

In [None]:
# The target repo on the Hugging Face Hub is "<HF_USER>/amazon-pricer-data";
# change HF_USER to your own account name before running.
# private=True keeps the uploaded dataset private.
HF_USER = "dzhen"
DATASET_NAME = f"{HF_USER}/amazon-pricer-data"
dataset.push_to_hub(DATASET_NAME, private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/400 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/dzhen/amazon-pricer-data/commit/a9b2ed44d3973786724f74f14ef171511c57bae8', commit_message='Upload dataset', commit_description='', oid='a9b2ed44d3973786724f74f14ef171511c57bae8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/dzhen/amazon-pricer-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='dzhen/amazon-pricer-data'), pr_revision=None, pr_num=None)

In [None]:
# Also keep local pickled copies of both splits (the Item objects themselves,
# not just the prompt/price columns) in the working directory.
for filename, split in (("train.pkl", train), ("test.pkl", test)):
    with open(filename, "wb") as file:
        pickle.dump(split, file)