In [37]:
# imports

import os
from dotenv import load_dotenv
from huggingface_hub import login
from datasets import load_dataset, Dataset, DatasetDict
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Environment setup: read variables from a local .env file; override=True
# lets values from that file replace ones already set in the process.
load_dotenv(override=True)

# Hugging Face access token taken from the environment
# (None when HF_TOKEN is not defined).
hf_token = os.environ.get('HF_TOKEN')

# Authenticate against the Hugging Face Hub and ask for the token to be
# added to the git credential store as well.
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Download the "full" split of the Hub dataset named below.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to run locally -- confirm the repository is trusted.
DATASET_REPO = "cogsci13/Amazon-Reviews-2023-Books-Meta"

rawdata = load_dataset(
    DATASET_REPO,
    split="full",
    trust_remote_code=True,
)

# Data Exploration

In [None]:
# Total number of records in the raw dataset (shown as the cell output)
len(rawdata)

In [None]:
# Print the first record to see which fields a datapoint carries
print(rawdata[0])

In [None]:
# Investigate a particular datapoint: take the record at index 1 and leave it
# as the cell's last expression so the notebook displays it.
datapoint = rawdata[1]
datapoint

In [None]:
# Investigate: print selected fields of the record at index 5, one per line
datapoint = rawdata[5]
for field in ("title", "description", "features", "details", "price"):
    print(datapoint[field])

In [None]:
# How many have prices?
#
# A record counts as priced when its "price" field converts to a float
# that is greater than zero.

prices = 0
for datapoint in rawdata:
    try:
        price = float(datapoint["price"])
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None or another
        # object float() cannot convert. Either way the record has no price.
        continue
    if price > 0:
        prices += 1

# Guard the percentage against an empty dataset.
total = len(rawdata)
share = prices / total * 100 if total else 0.0
print(f"There are {prices:,} with prices which is {share:,.1f}%")

In [None]:
# Filter only records with prices

def has_positive_price(dp):
    """Return True when dp's "price" converts to a float greater than zero.

    Uses float() itself as the parser so the test agrees with the other
    cells that call float() on the price. A missing key, a None value or
    unparseable text all count as "no price" instead of raising.
    """
    try:
        return float(dp.get("price")) > 0
    except (TypeError, ValueError):
        return False


# Lazy, single-pass generator: records are only read from rawdata as they are
# requested. Once consumed it is exhausted, so re-run this cell before
# iterating over `filtered` a second time.
filtered = (dp for dp in rawdata if has_positive_price(dp))

In [None]:
# Convert a sample of filtered data to in-memory dataset
SAMPLE_SIZE = 100_000  # maximum number of priced records to keep

# Take records from the front of `filtered` until the cap is reached or the
# iterable runs out, whichever comes first.
sampled_data = []
for dp in filtered:
    if len(sampled_data) >= SAMPLE_SIZE:
        break
    sampled_data.append(dp)

dataset = Dataset.from_list(sampled_data)
print(f"Loaded {len(dataset):,} examples with prices.")

In [None]:
# For every record in the sample with a positive price, collect the price
# and the character length of its combined text fields.
prices = []
lengths = []

for datapoint in dataset:
    # Only the price conversion is guarded; records whose price cannot be
    # converted (bad text, None) are skipped.
    try:
        price = float(datapoint["price"])
    except (TypeError, ValueError):
        continue

    if price > 0:
        # A missing or None title contributes nothing instead of crashing
        # the concatenation; the other fields are stringified as before.
        contents = (
            str(datapoint.get("title") or "") +
            str(datapoint.get("description", "")) +
            str(datapoint.get("features", "")) +
            str(datapoint.get("details", ""))
        )
        # Append to both lists together so they stay the same length.
        prices.append(price)
        lengths.append(len(contents))


In [None]:
# Plot lengths: histogram of combined text length per record, in
# 100-character bins from 0 up to 6000.
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title(f"Lengths: Avg {sum(lengths)/len(lengths):,.0f} and highest {max(lengths):,}\n")
ax.set_xlabel('Length (chars)')
ax.set_ylabel('Count')
ax.hist(lengths, rwidth=0.7, color="lightblue", bins=range(0, 6000, 100))
plt.show()

In [None]:
# Plot prices: histogram of record prices in bins of width 10
# from 0 up to 1000.
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title(f"Prices: Avg {sum(prices)/len(prices):,.2f} and highest {max(prices):,}\n")
ax.set_xlabel('Price ($)')
ax.set_ylabel('Count')
ax.hist(prices, rwidth=0.7, color="orange", bins=range(0, 1000, 10))
plt.show()

In [None]:
# So what is this book??
# Print the title of every sampled record priced above the threshold.
PRICE_THRESHOLD = 3400  # only records priced strictly above this are listed

for datapoint in dataset:
    # Guard only the conversion; records whose price cannot be converted
    # (bad text, None) are skipped.
    try:
        price = float(datapoint["price"])
    except (TypeError, ValueError):
        continue
    if price > PRICE_THRESHOLD:
        print(datapoint['title'])

# Data Transforms