### Notebook to understand Datasets

In [1]:
# imports
import os
import pickle
import numpy as np
import pandas as pd
import glob
import json
from rich import print as rprint
from rich.table import Table
from rich.console import Console
import gzip

### Global Functions

In [2]:
def display_pickle_summary(data, title="Pickle File Contents"):
    """
    Load and summarize the contents of a pickle file using rich.

    :param data: .pkl data
    :param title: Optional title for the printed table.
    """
    table = Table(title=title)
    
    table.add_column("Key/Type", style="cyan", no_wrap=True)
    table.add_column("Description", style="magenta")

    total_size = 0
    if isinstance(data, dict):
        for key, value in data.items():
            desc = f"{type(value).__name__}, len={len(value)}" if hasattr(value, '__len__') else type(value).__name__
            table.add_row(str(key), desc)
            if key in ["train", "test", "val"]:
                total_size += len(value)
            else:
                total_size = "N/A"
    else:
        table.add_row(type(data).__name__, f"{data}" if isinstance(data, (int, float, str)) else str(type(data)))

    table.add_row("Total Size", str(total_size))
    console = Console()
    console.print(table)
    
    if "train" in data or "test" in data or "val" in data:
        rprint("Train Sample:")
        rprint(data['train'][0])
        rprint("Val Sample:")
        rprint(data['val'][0])
        rprint("Test Sample:")
        print(data['test'][0])
        
        
def df_stats(df: pd.DataFrame, title="DataFrame Stats"):
    table = Table(title=title)
    rprint(f"DataFrame shape: {df.shape}")
    table.add_column("Column", style="cyan", no_wrap=True)
    table.add_column("Non-Null Count", style="yellow")
    table.add_column("Unique Count", style="magenta")
    table.add_column("Null/NA Count", style="red")
    table.add_column("Data Type", style="green")

    for col in df.columns:
        try:
            non_null_count = df[col].notna().sum()
        except:
            non_null_count = "Error"
        try:
            unique_count = df[col].nunique(dropna=True)
        except:
            unique_count = "Error"
        try:
            null_count = df[col].isna().sum()
        except:
            null_count = "Error"
        try:
            dtype = str(df[col].dtype)
        except:
            dtype = "Error"
        table.add_row(col, str(non_null_count), str(unique_count), str(null_count), dtype)

    Console().print(table)

### Amazon Dataset (2014)

"Small" subsets for experimentation
If you're using this data for a class project (or similar) please consider using one of these smaller datasets below before requesting the larger files. To obtain the larger files you will need to contact me to obtain access.

K-cores (i.e., dense subsets): These data have been reduced to extract the k-core, such that each of the remaining users and items have k reviews each.

where:

- `reviewerID` - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- `asin` - ID of the product, e.g. 0000013714
- `reviewerName` - name of the reviewer
- `helpful` - helpfulness rating of the review, e.g. 2/3
- `reviewText` - text of the review
- `overall` - rating of the product
- `summary` - summary of the review
- `unixReviewTime` - time of the review (unix time)
- `reviewTime` - time of the review (raw)

**Metadata includes descriptions, price, sales-rank, brand info, and co-purchasing links:**

metadata (3.1gb) - metadata for 9.4 million products

where

- `asin` - ID of the product, e.g. 0000031852
- `title` - name of the product
- `price` - price in US dollars (at time of crawl)
- `imUrl` - url of the product image
- `related` - related products (also bought, also viewed, bought together, buy after viewing)
- `salesRank` - sales rank information
- `brand` - brand name
- `categories` - list of categories the product belongs to

In [3]:
# global vars
DATASET_DIR = "../dataset/amazon/raw"
DATASET_SPLIT = "beauty"

In [4]:
# total files
files = glob.glob(f"{DATASET_DIR}/{DATASET_SPLIT}/*.pkl")
print(f"Found {len(files)} files in `{DATASET_SPLIT}`")
files

Found 5 files in `beauty`


['../dataset/amazon/raw/beauty/review_splits.pkl',
 '../dataset/amazon/raw/beauty/exp_splits.pkl',
 '../dataset/amazon/raw/beauty/rating_splits_augmented.pkl',
 '../dataset/amazon/raw/beauty/zeroshot_exp_splits.pkl',
 '../dataset/amazon/raw/beauty/user_id2name.pkl']

### Raw Dataset

#### Review Splits

In [6]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/review_splits.pkl", "rb") as f:
    review_splits = pickle.load(f)
    
display_pickle_summary(review_splits, title="Review Splits Summary")

{'reviewerID': 'A2QKXW3LDQ66P5', 'asin': 'B005X2F7KI', 'reviewerName': 'stephanie', 'helpful': [5, 6], 'reviewText': 'Absolutely great product.  I bought this for my fourteen year old niece for Christmas and of course I had to try it out, then I tried another one, and another one and another one.  So much fun!  I even contemplated keeping a few for myself!', 'overall': 5.0, 'summary': 'Perfect!', 'unixReviewTime': 1352937600, 'reviewTime': '11 15, 2012', 'explanation': 'Absolutely great product', 'feature': 'product'}


#### Exp Splits

In [7]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/exp_splits.pkl", "rb") as f:
    exp_splits = pickle.load(f)
    
display_pickle_summary(exp_splits, title="Exp Splits Summary")

{'reviewerID': 'A2QKXW3LDQ66P5', 'asin': 'B005X2F7KI', 'reviewerName': 'stephanie', 'helpful': [5, 6], 'reviewText': 'Absolutely great product.  I bought this for my fourteen year old niece for Christmas and of course I had to try it out, then I tried another one, and another one and another one.  So much fun!  I even contemplated keeping a few for myself!', 'overall': 5.0, 'summary': 'Perfect!', 'unixReviewTime': 1352937600, 'reviewTime': '11 15, 2012', 'explanation': 'Absolutely great product', 'feature': 'product'}


#### Ratings Split Augmented

In [None]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/rating_splits_augmented.pkl", "rb") as f:
    rating_splits_augmented = pickle.load(f)

display_pickle_summary(rating_splits_augmented, title="Rating Splits Augmented Summary")

{'reviewerID': 'A2QKXW3LDQ66P5', 'asin': 'B005X2F7KI', 'reviewerName': 'stephanie', 'helpful': [5, 6], 'reviewText': 'Absolutely great product.  I bought this for my fourteen year old niece for Christmas and of course I had to try it out, then I tried another one, and another one and another one.  So much fun!  I even contemplated keeping a few for myself!', 'overall': 5.0, 'summary': 'Perfect!', 'unixReviewTime': 1352937600, 'reviewTime': '11 15, 2012', 'explanation': 'Absolutely great product', 'feature': 'product'}


#### Zero-Shot Exp Splits

In [9]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/zeroshot_exp_splits.pkl", "rb") as f:
    zeroshot_exp_splits = pickle.load(f)

display_pickle_summary(zeroshot_exp_splits, title="Zero-Shot Exp Splits Summary")

#### user_id2name

In [10]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/user_id2name.pkl", "rb") as f:
    user_id2name = pickle.load(f)

len(user_id2name)

22363

In [39]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/datamaps.json", "rb") as f:
    datamaps = json.load(f)

display_pickle_summary(datamaps, title="Data Maps Summary")

### Metadata

In [58]:
DATASET_SPLIT = "sports"

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF(f"{DATASET_DIR}/{DATASET_SPLIT}/meta.json.gz")
df.shape

(532197, 9)

In [60]:
rprint(df.shape)
df.head()

Unnamed: 0,asin,title,price,imUrl,related,brand,categories,salesRank,description
0,32069,Adult Ballet Tutu Cheetah Pink,7.89,http://ecx.images-amazon.com/images/I/51EzU6qu...,"{'also_bought': ['0000032050', 'B00D0DJAEG', '...",BubuBibi,"[[Sports & Outdoors, Other Sports, Dance, Clot...",,
1,31909,Girls Ballet Tutu Neon Pink,7.0,http://ecx.images-amazon.com/images/I/41xBoP0F...,"{'also_bought': ['B002BZX8Z6', 'B00JHONN1S', '...",Unknown,"[[Sports & Outdoors, Other Sports, Dance]]",{'Toys & Games': 201847},High quality 3 layer ballet tutu. 12 inches in...
2,32034,Adult Ballet Tutu Yellow,7.87,http://ecx.images-amazon.com/images/I/21GNUNIa...,"{'also_bought': ['B00D2JSRFQ', '0000032042', '...",BubuBibi,"[[Sports & Outdoors, Other Sports, Dance, Clot...",,
3,31852,Girls Ballet Tutu Zebra Hot Pink,3.17,http://ecx.images-amazon.com/images/I/51fAmVkT...,"{'also_bought': ['B00JHONN1S', 'B002BZX8Z6', '...",Coxlures,"[[Sports & Outdoors, Other Sports, Dance]]",{'Toys & Games': 211836},TUtu
4,32050,Adult Ballet Tutu Purple,12.85,http://ecx.images-amazon.com/images/I/41TxNYG8...,"{'also_bought': ['B00D2JSRFQ', 'B00D2JTMS2', '...",BubuBibi,"[[Sports & Outdoors, Other Sports, Dance, Clot...",,


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 532197 entries, 0 to 532196
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   asin         532197 non-null  object 
 1   title        529901 non-null  object 
 2   price        287792 non-null  float64
 3   imUrl        531613 non-null  object 
 4   related      388118 non-null  object 
 5   brand        152487 non-null  object 
 6   categories   532197 non-null  object 
 7   salesRank    486349 non-null  object 
 8   description  402371 non-null  object 
dtypes: float64(1), object(8)
memory usage: 40.6+ MB


In [62]:
df_stats(df, title="Metadata Stats")

In [70]:
with open(f"{DATASET_DIR}/{DATASET_SPLIT}/datamaps.json", "rb") as f:
    datamaps = json.load(f)

def _remap_ids(x):
    return x - 1

# Compute item features
asin2id = pd.DataFrame(
    [
        {"asin": k, "id": _remap_ids(int(v))}
        for k, v in datamaps["item2id"].items()
    ]
)
asin2id.shape

(18357, 2)

In [64]:
item_data = df.merge(asin2id, on="asin").sort_values(by="id").fillna({"brand": "Unknown"})
item_data.shape

(18357, 10)

In [65]:
df_stats(item_data, title="Item Data Stats")

In [66]:
item_data.head(5)

Unnamed: 0,asin,title,price,imUrl,related,brand,categories,salesRank,description,id
0,1881509818,Ghost Inc Glock Armorers Tool 3/32 Punch,9.99,http://ecx.images-amazon.com/images/I/21iMxsyD...,"{'also_bought': ['B000U3YWEM', 'B000U401J6', '...",Ghost,"[[Sports & Outdoors, Hunting & Fishing, Huntin...",{'Sports &amp; Outdoors': 172909},Ghost Armorer Tool (1). The GAT is made with a...,0
11982,B0048KGFHU,Tipton Polymer Gun Cleaning Picks,6.34,http://ecx.images-amazon.com/images/I/31K7ShuO...,"{'also_bought': ['B006T6Y56E', 'B00162OKDY', '...",Tipton,"[[Sports & Outdoors, Hunting & Fishing, Huntin...",{'Sports &amp; Outdoors': 501},Here's a clever idea that belongs in every cle...,1
15852,B0081JJVUC,TekMat 11-Inch X 17-Inch Handgun Cleaning Mat ...,10.67,http://ecx.images-amazon.com/images/I/51gB5DgF...,"{'also_bought': ['B0014VX2M2', 'B0036N474S', '...",TekMat,"[[Sports & Outdoors, Hunting & Fishing, Huntin...",{'Sports &amp; Outdoors': 9061},Your gun is a proud possession as well as an i...,2
3320,B000N8OIE8,Lee Precision Powder Measure Kit,11.46,http://ecx.images-amazon.com/images/I/41T-weQ9...,"{'also_bought': ['B000NOUEUO', 'B0013RD6OQ', '...",Lee,"[[Sports & Outdoors, Hunting & Fishing, Huntin...",{'Sports &amp; Outdoors': 6842},Lee Powder Measure Dipper Kit includes 15 grad...,3
13372,B004Y27DVY,Glock Magazine Speed Loader for 9mm / .40 / .3...,7.88,http://ecx.images-amazon.com/images/I/41dpYrCC...,"{'also_bought': ['B0014VX2M2', 'B000U3YWEM', '...",Glock,"[[Sports & Outdoors, Hunting & Fishing, Huntin...",{'Sports &amp; Outdoors': 5930},Stop busting your fingers loading Glock Magazi...,4


In [None]:
# item_data.to_csv(f"{DATASET_DIR}/{DATASET_SPLIT}/item_data.csv", index=False)

In [68]:
item_data.columns

Index(['asin', 'title', 'price', 'imUrl', 'related', 'brand', 'categories',
       'salesRank', 'description', 'id'],
      dtype='object')

In [69]:
item_data["imUrl"].iloc[0]

'http://ecx.images-amazon.com/images/I/21iMxsyDBRL._SX300_.jpg'

In [56]:
url = "http://ecx.images-amazon.com/images/I/413BcULsveL._SX300_.jpg"
os.path.basename(url)

'413BcULsveL._SX300_.jpg'

In [55]:
url.split('?')[0]

'http://ecx.images-amazon.com/images/I/413BcULsveL._SX300_.jpg'

In [3]:
item_data = pd.read_csv(f"{DATASET_DIR}/beauty/item_data.csv")
df_stats(item_data)

NameError: name 'DATASET_DIR' is not defined

In [11]:
import glob
len(glob.glob(f"{DATASET_DIR}/sports/product_images/*"))

18287

### Processed Data Pipelines

In [5]:
import os
import torch
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from data.processed import RecDataset
# %load_ext autoreload
# %autoreload 2

In [6]:
# global vars
DATASET_DIR = "../dataset/amazon/raw"
DATASET_SPLIT = "beauty"

In [7]:
from torch.utils.data import Dataset
from data.amazon import AmazonReviews
from data.ml1m import RawMovieLens1M
from data.ml32m import RawMovieLens32M
from data.schemas import SeqBatch
import clip
import json
from PIL import Image

In [None]:
DATASET_NAME_TO_RAW_DATASET = {
    RecDataset.AMAZON: AmazonReviews,
    RecDataset.ML_1M: RawMovieLens1M,
    RecDataset.ML_32M: RawMovieLens32M,
}


DATASET_NAME_TO_MAX_SEQ_LEN = {
    RecDataset.AMAZON: 20,
    RecDataset.ML_1M: 200,
    RecDataset.ML_32M: 200,
}

class ItemData(Dataset):
    def __init__(
        self,
        root: str,
        *args,
        force_process: bool = False,
        dataset: RecDataset = RecDataset.ML_1M,
        train_test_split: str = "all",
        encode_images: bool = False,
        **kwargs
    ) -> None:

        self.encode_images = encode_images
        self.root = root
        raw_dataset_class = DATASET_NAME_TO_RAW_DATASET[dataset]
        max_seq_len = DATASET_NAME_TO_MAX_SEQ_LEN[dataset]
        raw_data = raw_dataset_class(root=self.root, *args, **kwargs)
        self.raw_data = raw_data
        processed_data_path = raw_data.processed_paths[0]
        if not os.path.exists(processed_data_path) or force_process:
            raw_data.process(max_seq_len=max_seq_len)

        if train_test_split == "train":
            filt = raw_data.data["item"]["is_train"]
        elif train_test_split == "eval":
            filt = ~raw_data.data["item"]["is_train"]
        elif train_test_split == "all":
            filt = torch.ones_like(raw_data.data["item"]["x"][:, 0], dtype=bool)

        self.item_data, self.item_text, self.item_brand_id = (
            raw_data.data["item"]["x"][filt],
            raw_data.data["item"]["text"][filt],
            raw_data.data["item"]["brand_id"][filt],
        )

        if self.encode_images:
            self.dataset_split = kwargs.get("split")
            with open(os.path.join(self.root, "raw", self.dataset_split, "datamaps.json"), "r") as f:
                self.data_maps = json.load(f)
            self.clip_model, self.preprocess = clip.load("ViT-L/14", device="cpu")

    def __len__(self):
        return self.item_data.shape[0]

    def __getitem__(self, idx):
        item_ids = (
            torch.tensor(idx).unsqueeze(0) if not isinstance(idx, torch.Tensor) else idx
        )
        x = self.item_data[idx, :768]
        print("x", x.shape)
        x_brand_id = torch.Tensor(self.item_brand_id[idx])
        
        # if image encoding enabled and filenames are present
        if self.encode_images:
            img_filename = self.data_maps["id2item"][str(item_ids.item() + 1)] + ".jpg"
            img_path = os.path.join(self.root, "raw", self.dataset_split, "product_images", img_filename)
            try:
                image = Image.open(img_path).convert("RGB")
                image_input = self.preprocess(image).to(self.clip_model.visual.conv1.weight.device)  # (3, 224, 224)
                with torch.no_grad():
                    image_feat = self.clip_model.encode_image(image_input.unsqueeze(0))  # (1, 768 or 1024)
                    print("image_feat", image_feat.shape)
                    image_feat = image_feat / image_feat.norm(dim=1, keepdim=True)
                # Combine with existing features if needed
                print("x", x.shape)
                x = torch.cat([x, image_feat.squeeze(0)], dim=-1)
                print("x2", x.shape)
            except Exception as e:
                print(e)
                x = torch.cat([x, torch.zeros(image_feat.shape[-1])], dim=-1)

        item = SeqBatch(
            user_ids=-1 * torch.ones_like(item_ids.squeeze(0)),
            ids=item_ids,
            ids_fut=-1 * torch.ones_like(item_ids.squeeze(0)),
            x=x,
            x_brand_id=x_brand_id,
            x_fut=-1 * torch.ones_like(item_ids.squeeze(0)),
            x_fut_brand_id=-1 * torch.ones_like(item_ids.squeeze(0)),
            seq_mask=torch.ones_like(item_ids, dtype=bool),
        )
                
        return item



ds_split = "beauty"
dataset = ItemData(
    "../dataset/amazon", dataset=RecDataset.AMAZON, split=ds_split, force_process=False,
    encode_images=True
)

item_data = pd.read_csv(f"{DATASET_DIR}/{ds_split}/item_data.csv")
datamaps = json.load(open(f"{DATASET_DIR}/{ds_split}/datamaps.json", "r"))
# df_stats(item_data)



In [5]:
item_data = pd.read_csv(f"{DATASET_DIR}/{DATASET_SPLIT}/item_data.csv")
df_stats(item_data)

In [6]:
item_data

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand,id
0,B004756YJA,OPI Burlesque Colors,"OPI Nail Lacquer, Simmer and Shimmer, 0.5-Flui...",http://ecx.images-amazon.com/images/I/411jo-OU...,{'Beauty': 46572},"[['Beauty', 'Makeup', 'Nails', 'Nail Polish']]",12.00,"{'also_bought': ['B0045M2T12', 'B004KFNHLA', '...",OPI,0
1,B004ZT0SSG,Red Shatter Nail Polish\nFull Size :15ML,OPI Red Shatter Crackle Nail Polish E55 New,http://ecx.images-amazon.com/images/I/41X8hWnt...,{'Beauty': 74739},"[['Beauty', 'Makeup', 'Nails', 'Nail Polish']]",3.04,"{'also_bought': ['B004Y6G910', 'B005GSWUY0', '...",OPI,1
2,B0020YLEYK,It is 3 effects function beblesh balm. By Aden...,SKIN79 The Prestige Beblesh Balm BB Cream Diam...,http://ecx.images-amazon.com/images/I/31lrzUjx...,{'Beauty': 24042},"[['Beauty', 'Skin Care', 'Face', 'Creams & Moi...",14.96,"{'also_bought': ['B006RWW7VU', 'B002HPBF32', '...",Unknown,2
3,7806397051,An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA,3
4,B002WLWX82,Paraffin bath for pain relief and removing dry...,Dr. Scholl's Quick Heat Paraffin Spa Bath,http://ecx.images-amazon.com/images/I/41jOWVYU...,{'Beauty': 344},"[['Beauty', 'Skin Care', 'Hands & Nails', 'Par...",47.95,"{'also_bought': ['B000BLS0NM', 'B0006Q00IK', '...",Dr. Scholl&#39;s,4
...,...,...,...,...,...,...,...,...,...,...
12096,B00GYN9A08,One In Eight Ingredients In Personal Care Prod...,Best INDIAN HEALING CLAY -&quot;SODIUM&quot; B...,http://ecx.images-amazon.com/images/I/51JW3-Ie...,{'Beauty': 6825},"[['Beauty', 'Skin Care', 'Face', 'Treatments &...",25.99,"{'also_bought': ['B00JVXVBQE', 'B00HRGBSYW', '...",Unknown,12096
12097,B00IBMV2ME,The Best BOTANICAL HYALURONIC ACID (5.0%) Gel ...,Best Botanical Hyaluronic Acid Anti Aging Faci...,http://ecx.images-amazon.com/images/I/4171BmUV...,{'Beauty': 116649},"[['Beauty', 'Skin Care', 'Face', 'Oils & Serum...",24.50,"{'also_bought': ['B00IC8JBIE', 'B00IC9AG5A', '...",Unknown,12097
12098,B00IC9AG5A,Announcing a Dermatologist Grade Skin Treatmen...,Anti Aging All In One Facial Treatment (Replac...,http://ecx.images-amazon.com/images/I/314b-jZn...,{'Beauty': 84262},"[['Beauty', 'Skin Care', 'Eyes', 'Combinations']]",26.50,"{'also_bought': ['B00IC8JBIE', 'B00IC7L3JK', '...",Unknown,12098
12099,B00IKKORVU,Announcing The Ultimate Vitamin C Anti Aging S...,Best Vitamin C Anti Aging 6 Item System &amp; ...,http://ecx.images-amazon.com/images/I/51yIcFHj...,{'Beauty': 87595},"[['Beauty', 'Skin Care', 'Sets & Kits']]",125.00,"{'also_viewed': ['B00IC8JBIE', 'B00GYJWL7G', '...",Unknown,12099


In [22]:
item_data.mean()

  item_data.mean()


price                17.005026
id                 6050.000000
description_len     478.998387
title_len            67.564908
dtype: float64

In [6]:
dataset[0].x.shape

image_feat torch.Size([1, 768])
x torch.Size([768])
x2 torch.Size([1536])


torch.Size([1536])

In [46]:
dataset.data_maps["id2item"][str(1)]

'B004756YJA'

In [43]:
int(dataset[0].ids)

0

In [20]:
datamaps["id2item"]["1"]

'B004756YJA'

In [38]:
item_data.head(5)

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand,id
0,B004756YJA,OPI Burlesque Colors,"OPI Nail Lacquer, Simmer and Shimmer, 0.5-Flui...",http://ecx.images-amazon.com/images/I/411jo-OU...,{'Beauty': 46572},"[['Beauty', 'Makeup', 'Nails', 'Nail Polish']]",12.0,"{'also_bought': ['B0045M2T12', 'B004KFNHLA', '...",OPI,0
1,B004ZT0SSG,Red Shatter Nail Polish\nFull Size :15ML,OPI Red Shatter Crackle Nail Polish E55 New,http://ecx.images-amazon.com/images/I/41X8hWnt...,{'Beauty': 74739},"[['Beauty', 'Makeup', 'Nails', 'Nail Polish']]",3.04,"{'also_bought': ['B004Y6G910', 'B005GSWUY0', '...",OPI,1
2,B0020YLEYK,It is 3 effects function beblesh balm. By Aden...,SKIN79 The Prestige Beblesh Balm BB Cream Diam...,http://ecx.images-amazon.com/images/I/31lrzUjx...,{'Beauty': 24042},"[['Beauty', 'Skin Care', 'Face', 'Creams & Moi...",14.96,"{'also_bought': ['B006RWW7VU', 'B002HPBF32', '...",Unknown,2
3,7806397051,An extensive range of 15 multiple vibrant long...,WAWO 15 Color Professionl Makeup Eyeshadow Cam...,http://ecx.images-amazon.com/images/I/41Rn18Oe...,{'Beauty': 10486},"[['Beauty', 'Makeup', 'Face', 'Concealers & Ne...",5.04,"{'also_bought': ['B00KR26VFE', 'B00E7LQHZ0', '...",COKA,3
4,B002WLWX82,Paraffin bath for pain relief and removing dry...,Dr. Scholl's Quick Heat Paraffin Spa Bath,http://ecx.images-amazon.com/images/I/41jOWVYU...,{'Beauty': 344},"[['Beauty', 'Skin Care', 'Hands & Nails', 'Par...",47.95,"{'also_bought': ['B000BLS0NM', 'B0006Q00IK', '...",Dr. Scholl&#39;s,4


In [97]:
dataset[5].ids

tensor([5])

In [11]:
import glob

category = "toys"
# category = "sports"


files = glob.glob(f"/home/scur2745/RecSys/dataset/amazon/2023/raw/{category}/product_images/*.jpg")
len(files)

162012