### Notebook to understand Datasets

In [73]:
# imports
import os
import pickle
import numpy as np
import pandas as pd
import glob
import json
from rich import print as rprint
from rich.table import Table
from rich.console import Console
import gzip


### Raw Dataset

**"Small" subsets for experimentation** (text quoted from the dataset's download page; "me" refers to the dataset maintainer, not the author of this notebook)

If you're using this data for a class project (or similar), please consider using one of the smaller datasets below before requesting the larger files. To obtain the larger files you will need to contact me for access.

K-cores (i.e., dense subsets): These data have been reduced to extract the k-core, so that every remaining user and every remaining item has at least k reviews.

Each review record has the following fields:

- `reviewerID` - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- `asin` - ID of the product, e.g. 0000013714
- `reviewerName` - name of the reviewer
- `helpful` - helpfulness rating of the review, e.g. 2/3
- `reviewText` - text of the review
- `overall` - rating of the product
- `summary` - summary of the review
- `unixReviewTime` - time of the review (unix time)
- `reviewTime` - time of the review (raw)

**Metadata includes descriptions, price, sales-rank, brand info, and co-purchasing links:**

Metadata (3.1 GB): metadata for 9.4 million products.

Each metadata record has the following fields:

- `asin` - ID of the product, e.g. 0000031852
- `title` - name of the product
- `price` - price in US dollars (at time of crawl)
- `imUrl` - url of the product image
- `related` - related products (also bought, also viewed, bought together, buy after viewing)
- `salesRank` - sales rank information
- `brand` - brand name
- `categories` - list of categories the product belongs to

In [16]:
# Global configuration shared by the raw-data cells below.
# Directory holding the raw Amazon files (a relative path, so it is resolved
# against the notebook's working directory).
DATASET_DIR = "../dataset/amazon/raw"
# Sub-directory of DATASET_DIR to inspect; also printed in the file-count message.
DATASET_SPLIT = "beauty"

In [17]:
# List every pickle file stored for the selected split and report how many were found.
pickle_pattern = f"{DATASET_DIR}/{DATASET_SPLIT}/*.pkl"
files = glob.glob(pickle_pattern)
print(f"Found {len(files)} files in `{DATASET_SPLIT}`")
files

Found 5 files in `beauty`


['../dataset/amazon/raw/beauty/review_splits.pkl',
 '../dataset/amazon/raw/beauty/exp_splits.pkl',
 '../dataset/amazon/raw/beauty/rating_splits_augmented.pkl',
 '../dataset/amazon/raw/beauty/zeroshot_exp_splits.pkl',
 '../dataset/amazon/raw/beauty/user_id2name.pkl']

In [68]:
def display_pickle_summary(data, title="Pickle File Contents"):
    """
    Summarize an already-loaded object (e.g. an unpickled dict) using rich.

    For a dict, one table row is added per key showing the value's type and,
    when the value has a length, ``len=...``. For any other object a single row
    with its type (and its value for int/float/str) is added. A final
    "Total Size" row holds the summed length of the ``train``/``val``/``test``
    entries, or ``"N/A"`` when the dict contains any other key. After the
    table, the first element of each split that is present is printed.

    :param data: Loaded data to summarize (this function does not read files).
    :param title: Optional title for the printed table.
    """
    split_keys = ("train", "val", "test")

    table = Table(title=title)
    table.add_column("Key/Type", style="cyan", no_wrap=True)
    table.add_column("Description", style="magenta")

    total_size = 0
    has_non_split_key = False
    if isinstance(data, dict):
        for key, value in data.items():
            if hasattr(value, '__len__'):
                desc = f"{type(value).__name__}, len={len(value)}"
            else:
                desc = type(value).__name__
            table.add_row(str(key), desc)
            if key in split_keys:
                total_size += len(value)
            else:
                # Tracked with a flag rather than overwriting total_size with a
                # string: a later `+= len(value)` on "N/A" raised TypeError.
                has_non_split_key = True
    else:
        table.add_row(type(data).__name__, f"{data}" if isinstance(data, (int, float, str)) else str(type(data)))

    table.add_row("Total Size", "N/A" if has_non_split_key else str(total_size))
    console = Console()
    console.print(table)

    # Samples are only defined for dict inputs; a membership test on other
    # objects (e.g. an int) would raise.
    if not isinstance(data, dict):
        return

    # Print a sample only for the splits that actually exist, so a dict holding
    # e.g. just "train" no longer fails with KeyError on "val"/"test".
    for split_key, label in (("train", "Train"), ("val", "Val"), ("test", "Test")):
        if split_key not in data:
            continue
        split = data[split_key]
        rprint(f"{label} Sample:")
        # Guard against empty splits, which have no first element to show.
        rprint(split[0] if len(split) > 0 else "<empty>")

#### Review Splits

In [57]:
# Unpickle the review splits for the selected split directory and summarize them.
review_splits_path = f"{DATASET_DIR}/{DATASET_SPLIT}/review_splits.pkl"
with open(review_splits_path, "rb") as review_splits_file:
    review_splits = pickle.load(review_splits_file)

display_pickle_summary(review_splits, title="Review Splits Summary")

#### Exp Splits

In [58]:
# Unpickle the exp splits for the selected split directory and summarize them.
exp_splits_path = f"{DATASET_DIR}/{DATASET_SPLIT}/exp_splits.pkl"
with open(exp_splits_path, "rb") as exp_splits_file:
    exp_splits = pickle.load(exp_splits_file)

display_pickle_summary(exp_splits, title="Exp Splits Summary")

#### Rating Splits Augmented

In [59]:
# Unpickle the augmented rating splits for the selected split directory and summarize them.
rating_splits_augmented_path = f"{DATASET_DIR}/{DATASET_SPLIT}/rating_splits_augmented.pkl"
with open(rating_splits_augmented_path, "rb") as rating_splits_augmented_file:
    rating_splits_augmented = pickle.load(rating_splits_augmented_file)

display_pickle_summary(rating_splits_augmented, title="Rating Splits Augmented Summary")

#### Zero-Shot Exp Splits

In [60]:
# Unpickle the zero-shot exp splits for the selected split directory and summarize them.
zeroshot_exp_splits_path = f"{DATASET_DIR}/{DATASET_SPLIT}/zeroshot_exp_splits.pkl"
with open(zeroshot_exp_splits_path, "rb") as zeroshot_exp_splits_file:
    zeroshot_exp_splits = pickle.load(zeroshot_exp_splits_file)

display_pickle_summary(zeroshot_exp_splits, title="Zero-Shot Exp Splits Summary")

#### user_id2name

In [None]:
# Unpickle the user_id2name object and show how many entries it contains.
user_id2name_path = f"{DATASET_DIR}/{DATASET_SPLIT}/user_id2name.pkl"
with open(user_id2name_path, "rb") as user_id2name_file:
    user_id2name = pickle.load(user_id2name_file)

len(user_id2name)

22363

In [69]:
# Read datamaps.json (through a binary handle, which json.load accepts) and
# summarize it with the same helper used for the pickle files.
datamaps_path = f"{DATASET_DIR}/{DATASET_SPLIT}/datamaps.json"
with open(datamaps_path, "rb") as datamaps_file:
    datamaps = json.load(datamaps_file)

display_pickle_summary(datamaps, title="Data Maps Summary")

#### Metadata

In [None]:
def parse(path):
  """
  Lazily yield one evaluated record per line of a gzip-compressed file.

  The file is opened in binary mode and each raw line is passed to ``eval``,
  so every line must be a valid Python expression (e.g. a dict literal).
  The file handle is closed once iteration finishes or the generator is closed.

  :param path: Path to the ``.gz`` file to read.
  :return: Generator over the evaluated lines, in file order.
  """
  # NOTE(security): eval() executes whatever expression each line contains.
  # Only use this on files from a trusted source; if the lines are plain
  # literals, ast.literal_eval on the decoded line would be a safer choice.
  with gzip.open(path, 'rb') as g:
    for l in g:
      yield eval(l)

def getDF(path):
  """
  Collect every record yielded by ``parse(path)`` into a pandas DataFrame.

  Records are keyed by their 0-based position in the file, and those positions
  become the DataFrame index (``orient='index'``).

  :param path: Path forwarded unchanged to ``parse``.
  :return: DataFrame with one row per record.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Parse the gzipped metadata file of the selected split into a DataFrame
# and show its shape as the cell output.
meta_path = f"{DATASET_DIR}/{DATASET_SPLIT}/meta.json.gz"
df = getDF(meta_path)
df.shape

In [None]:
# Print the DataFrame shape via rich, then render the first rows as the cell output.
rprint(df.shape)
df.head()

Unnamed: 0,asin,description,title,imUrl,salesRank,categories,price,related,brand
0,205616461,"As we age, our once youthful, healthy skin suc...",Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,http://ecx.images-amazon.com/images/I/41DecrGO...,{'Health & Personal Care': 461765},"[[Beauty, Skin Care, Face, Creams & Moisturize...",,,
1,558925278,Mineral Powder Brush--Apply powder or mineral ...,Eco Friendly Ecotools Quality Natural Bamboo C...,http://ecx.images-amazon.com/images/I/51L%2BzY...,{'Beauty': 402875},"[[Beauty, Tools & Accessories, Makeup Brushes ...",,,
2,733001998,"From the Greek island of Chios, this Mastiha b...",Mastiha Body Lotion,http://ecx.images-amazon.com/images/I/311WK5y1...,{'Beauty': 540255},"[[Beauty, Skin Care, Body, Moisturizers, Lotio...",,,
3,737104473,Limited edition Hello Kitty Lipstick featuring...,Hello Kitty Lustre Lipstick (See sellers comme...,http://ecx.images-amazon.com/images/I/31u6Hrzk...,{'Beauty': 931125},"[[Beauty, Makeup, Lips, Lipstick]]",,,
4,762451459,"The mermaid is an elusive (okay, mythical) cre...",Stephanie Johnson Mermaid Round Snap Mirror,http://ecx.images-amazon.com/images/I/41y2%2BF...,,"[[Beauty, Tools & Accessories, Mirrors, Makeup...",19.98,,


In [78]:
# Count the missing (null) values in each metadata column.
df.isnull().sum()

asin                0
description     24707
title             444
imUrl              88
salesRank        5188
categories          0
price           69274
related         51350
brand          131038
dtype: int64

In [82]:
# Show the image URL stored for the first row, to see what an `imUrl` value looks like.
df["imUrl"].iloc[0]

'http://ecx.images-amazon.com/images/I/41DecrGODDL._SY300_.jpg'

### Processed Data Pipelines

In [None]:
import torch
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from data.processed import ItemData, RecDataset
%load_ext autoreload
%autoreload 2

In [2]:
# Make the project root (the parent of the notebook's working directory)
# importable so that `data.processed` resolves; skip if it is already on the path.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
from data.processed import ItemData, RecDataset, SeqData

# Build the dataset root from the project root instead of a user-specific
# absolute path, so the cell runs on any checkout of the project.
# NOTE(review): assumes the data lives in <project_root>/dataset/amazon,
# matching the relative DATASET_DIR used by the raw-data cells — confirm.
amazon_root = os.path.join(project_root, "dataset", "amazon")

# NOTE(review): is_train=False is passed even though the variable is called
# train_dataset; the name is kept because later cells reference it.
train_dataset = SeqData(
    root=amazon_root,
    dataset=RecDataset.AMAZON,
    is_train=False,
    subsample=False,
    split="beauty",
)
len(train_dataset)

  [torch.tensor(l[-max_seq_len:]) for l in self.sequence_data["itemId"]],


3
3
3


22363

In [None]:
from data.processed import ItemData, RecDataset

# Build the dataset root from the project root (the parent of the notebook's
# working directory) instead of a user-specific absolute path.
# NOTE(review): assumes the data lives in <project_root>/dataset/amazon — confirm.
amazon_root = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), '..')), "dataset", "amazon"
)

# Load the item-level dataset without forcing re-processing.
# NOTE(review): train_test_split="test" is requested although the variable is
# named train_dataset — confirm which portion is intended.
train_dataset = ItemData(
    root=amazon_root,
    dataset=RecDataset.AMAZON,
    force_process=False,
    train_test_split="test",
    split="beauty",
)

In [26]:
# Display the first element of the dataset loaded above to inspect its structure.
train_dataset[0]

SeqBatch(user_ids=tensor(-1), ids=tensor([0]), ids_fut=tensor(-1), x=tensor([-2.0531e-03,  5.5936e-02,  4.6736e-02, -3.1006e-02, -9.1658e-03,
        -7.9645e-02, -7.8050e-02,  8.5028e-02,  2.2035e-02, -1.8748e-02,
         1.9890e-02,  2.4373e-02, -6.7752e-02, -1.0936e-01, -9.2943e-03,
         2.1252e-02,  4.3363e-02, -1.9461e-02,  2.9900e-03, -8.7175e-05,
        -1.3706e-02, -2.0147e-02, -8.3040e-03,  2.3467e-02, -1.4278e-02,
        -5.8923e-02,  6.6702e-02,  2.9903e-02, -4.1056e-02, -4.8130e-02,
         3.1035e-02, -4.9202e-02,  1.6379e-02, -2.2526e-02, -2.2642e-03,
         2.9824e-02, -6.0305e-04, -5.4097e-02,  2.5860e-02, -5.5954e-02,
        -2.6394e-02,  4.2049e-02,  6.1356e-02, -6.3023e-02,  6.1110e-02,
         1.1420e-02,  3.6712e-05, -1.9804e-02,  1.6040e-02, -1.0195e-02,
        -5.1779e-02, -2.3248e-02,  4.3240e-03, -1.6076e-02,  3.2320e-02,
        -2.6335e-02, -2.4666e-02, -9.6030e-04, -9.6135e-03,  2.1064e-02,
        -2.3914e-03,  2.8232e-02, -2.6620e-02,  5.4424e

In [None]:
# All three datasets in this cell share the same root, written relative to the
# current working directory.
# NOTE(review): this differs from the roots used in the other cells — confirm
# the working directory this cell is meant to run from.
script_root = "dataset/amazon"

# Item-level data, with processing forced.
dataset = ItemData(
    script_root,
    dataset=RecDataset.AMAZON,
    split="beauty",
    force_process=True,
)
dataset[0]

# Sequence data for training (subsampled).
train_dataset = SeqData(
    root=script_root,
    dataset=RecDataset.AMAZON,
    is_train=True,
    subsample=True,
    split="beauty",
)
print("train_dataset", train_dataset[0])

# Sequence data for evaluation (no subsampling, brand ids requested).
eval_dataset = SeqData(
    root=script_root,
    dataset=RecDataset.AMAZON,
    is_train=False,
    subsample=False,
    split="beauty",
    get_brand_id=True,
)
print("eval_dataset", eval_dataset[0])

In [4]:
# Locate the processed-data directory from the project root (the parent of the
# notebook's working directory) instead of a user-specific absolute path.
# NOTE(review): assumes the files live in <project_root>/dataset/amazon/processed — confirm.
processed_dir = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), '..')), "dataset", "amazon", "processed"
)
pre_filter = torch.load(os.path.join(processed_dir, "pre_filter.pt"))
pre_filter

'False'

In [5]:
# Locate the processed-data directory from the project root (the parent of the
# notebook's working directory) instead of a user-specific absolute path.
# NOTE(review): assumes the files live in <project_root>/dataset/amazon/processed — confirm.
processed_dir = os.path.join(
    os.path.abspath(os.path.join(os.getcwd(), '..')), "dataset", "amazon", "processed"
)
pre_transform = torch.load(os.path.join(processed_dir, "pre_transform.pt"))
pre_transform

'None'