In [1]:
# imports
import os
import pickle
import numpy as np
import pandas as pd
import glob
import json
from rich import print as rprint
from rich.table import Table
from rich.console import Console
import gzip
import datasets
from datasets import load_dataset

datasets.logging.set_verbosity_error()

### Global Functions

In [2]:
def display_pickle_summary(data, title="Pickle File Contents"):
    """
    Summarize the contents of an already-loaded pickle object using rich.

    For a dict, one table row is printed per key with the value's type name and,
    when the value defines ``__len__``, its length. The final "Total Size" row
    holds the summed length of the "train", "val" and "test" entries, or "N/A"
    when none of them could be counted. Any other object gets a single row with
    its type name and, for int/float/str, its value. For dicts, the first
    element of every non-empty split is printed after the table.

    :param data: Object read from a .pkl file (typically a dict of splits).
    :param title: Optional title for the printed table.
    """
    # (key, label) pairs in the order the samples are printed.
    split_labels = (
        ("train", "Train Sample:"),
        ("val", "Val Sample:"),
        ("test", "Test Sample:"),
    )
    split_keys = [key for key, _ in split_labels]

    table = Table(title=title)
    table.add_column("Key/Type", style="cyan", no_wrap=True)
    table.add_column("Description", style="magenta")

    total_size = 0
    counted_split = False
    if isinstance(data, dict):
        for key, value in data.items():
            has_len = hasattr(value, '__len__')
            desc = f"{type(value).__name__}, len={len(value)}" if has_len else type(value).__name__
            table.add_row(str(key), desc)
            # Only sized split entries contribute; other keys must not clobber the
            # running total (the old code turned it into a string and then crashed).
            if key in split_keys and has_len:
                total_size += len(value)
                counted_split = True
    else:
        table.add_row(type(data).__name__, f"{data}" if isinstance(data, (int, float, str)) else str(type(data)))

    table.add_row("Total Size", str(total_size) if counted_split else "N/A")
    Console().print(table)

    # Samples only make sense for dict-like split containers.
    if not isinstance(data, dict):
        return

    for key, label in split_labels:
        if key not in data:
            continue
        try:
            sample = data[key][0]
        except (IndexError, KeyError, TypeError):
            # Empty or non-indexable split: nothing to show.
            continue
        rprint(label)
        rprint(sample)
        
        
def df_stats(df: pd.DataFrame, title="DataFrame Stats"):
    """
    Print the shape of ``df`` and a rich table with per-column statistics.

    Each row shows the column label, non-null count, unique count (nulls
    excluded), null count and dtype. A statistic that cannot be computed for a
    column (for example ``nunique`` on unhashable values such as lists) is
    shown as "Error" instead of aborting the whole summary.

    :param df: DataFrame to summarize.
    :param title: Optional title for the printed table.
    """
    def _safe(compute):
        # Best-effort evaluation of one statistic. Catch Exception rather than
        # using a bare except so KeyboardInterrupt/SystemExit still propagate.
        try:
            return str(compute())
        except Exception:
            return "Error"

    rprint(f"DataFrame shape: {df.shape}")

    table = Table(title=title)
    table.add_column("Column", style="cyan", no_wrap=True)
    table.add_column("Non-Null Count", style="yellow")
    table.add_column("Unique Count", style="magenta")
    table.add_column("Null/NA Count", style="red")
    table.add_column("Data Type", style="green")

    for col in df.columns:
        table.add_row(
            str(col),  # column labels may be ints/tuples; the table needs text
            _safe(lambda: df[col].notna().sum()),
            _safe(lambda: df[col].nunique(dropna=True)),
            _safe(lambda: df[col].isna().sum()),
            _safe(lambda: df[col].dtype),
        )

    Console().print(table)

### Loading Data from HuggingFace

#### Review Samples

In [4]:
# Load the "raw_review_All_Beauty" configuration of the Amazon Reviews 2023 hub dataset.
# The next cell reads its "full" split.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 
                       "raw_review_All_Beauty", 
                       trust_remote_code=True)

In [6]:
dataset["full"][0]

{'rating': 5.0,
 'title': 'Such a lovely scent but not overpowering.',
 'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!",
 'images': [],
 'asin': 'B00YQ6X8EO',
 'parent_asin': 'B00YQ6X8EO',
 'user_id': 'AGKHLEW2SOWHNMFQIJGBECAF7INQ',
 'timestamp': 1588687728923,
 'helpful_vote': 0,
 'verified_purchase': True}

#### Metadata

In [8]:
# Load the "raw_meta_All_Beauty" configuration; split="full" returns that split
# directly, so the next cell can index records with dataset[0].
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 
                       "raw_meta_All_Beauty", 
                       split="full", 
                       trust_remote_code=True)

meta_All_Beauty.jsonl:   0%|          | 0.00/213M [00:00<?, ?B/s]

Generating full split:   0%|          | 0/112590 [00:00<?, ? examples/s]

In [9]:
dataset[0]

{'main_category': 'All Beauty',
 'title': 'Howard LC0008 Leather Conditioner, 8-Ounce (4-Pack)',
 'average_rating': 4.8,
 'rating_number': 10,
 'features': [],
 'description': [],
 'price': 'None',
 'images': {'hi_res': [None,
   'https://m.media-amazon.com/images/I/71i77AuI9xL._SL1500_.jpg'],
  'large': ['https://m.media-amazon.com/images/I/41qfjSfqNyL.jpg',
   'https://m.media-amazon.com/images/I/41w2yznfuZL.jpg'],
  'thumb': ['https://m.media-amazon.com/images/I/41qfjSfqNyL._SS40_.jpg',
   'https://m.media-amazon.com/images/I/41w2yznfuZL._SS40_.jpg'],
  'variant': ['MAIN', 'PT01']},
 'videos': {'title': [], 'url': [], 'user_id': []},
 'store': 'Howard Products',
 'categories': [],
 'details': '{"Package Dimensions": "7.1 x 5.5 x 3 inches; 2.38 Pounds", "UPC": "617390882781"}',
 'parent_asin': 'B01CUPMQZE',
 'bought_together': None,
 'subtitle': None,
 'author': None}

#### Pure IDs Files (Before Splitting)

In [10]:
# Load the "5core_rating_only_All_Beauty" configuration (ids, rating and timestamp
# only, all stored as strings per the sample shown below).
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 
                       "5core_rating_only_All_Beauty", 
                       trust_remote_code=True)

All_Beauty.csv:   0%|          | 0.00/147k [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

In [11]:
dataset['full'][0:5]

{'user_id': ['AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA'],
 'parent_asin': ['B07J3GH1W1',
  'B07W397QG4',
  'B07KG1TWP5',
  'B08JTNQFZY',
  'B07SLFWZKN'],
 'rating': ['5.0', '5.0', '5.0', '5.0', '3.0'],
 'timestamp': ['1547589356557',
  '1593352422858',
  '1596473351088',
  '1617904219785',
  '1619737501209']}

#### Pure IDs Files (After Splitting)

In [12]:
# Load the "5core_timestamp_All_Beauty" configuration, which ships separate
# train / valid / test splits (see the download log below).
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 
                       "5core_timestamp_All_Beauty", 
                       trust_remote_code=True)

All_Beauty.train.csv:   0%|          | 0.00/130k [00:00<?, ?B/s]

All_Beauty.valid.csv:   0%|          | 0.00/16.1k [00:00<?, ?B/s]

All_Beauty.test.csv:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [13]:
dataset['train'][:5]

{'user_id': ['AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA'],
 'parent_asin': ['B07J3GH1W1',
  'B07W397QG4',
  'B07KG1TWP5',
  'B08JTNQFZY',
  'B07SLFWZKN'],
 'rating': ['5.0', '5.0', '5.0', '5.0', '3.0'],
 'timestamp': ['1547589356557',
  '1593352422858',
  '1596473351088',
  '1617904219785',
  '1619737501209']}

#### With Additional User Interaction History

In [14]:
# Load the "5core_timestamp_w_his_All_Beauty" configuration: same splits as above
# plus a space-separated "history" column of earlier parent_asin values.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 
                       "5core_timestamp_w_his_All_Beauty", 
                       trust_remote_code=True)

All_Beauty.train.csv:   0%|          | 0.00/291k [00:00<?, ?B/s]

All_Beauty.valid.csv:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

All_Beauty.test.csv:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [17]:
dataset['train'][:5]

{'user_id': ['AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA',
  'AFSKPY37N3C43SOI5IEXEK5JSIYA'],
 'parent_asin': ['B07J3GH1W1',
  'B07W397QG4',
  'B07KG1TWP5',
  'B08JTNQFZY',
  'B07SLFWZKN'],
 'rating': ['5.0', '5.0', '5.0', '5.0', '3.0'],
 'timestamp': ['1547589356557',
  '1593352422858',
  '1596473351088',
  '1617904219785',
  '1619737501209'],
 'history': ['',
  'B07J3GH1W1',
  'B07J3GH1W1 B07W397QG4',
  'B07J3GH1W1 B07W397QG4 B07KG1TWP5',
  'B07J3GH1W1 B07W397QG4 B07KG1TWP5 B08JTNQFZY']}

### Saving 5-Core

In [3]:
from tqdm import tqdm
from pprint import pprint
from huggingface_hub import hf_hub_download

In [None]:
# Fetch the Toys_and_Games 5-core rating-only CSV from the hub dataset repo.
rating_file = hf_hub_download(
            repo_id='McAuley-Lab/Amazon-Reviews-2023',
            filename='benchmark/5core/rating_only/Toys_and_Games.csv',
            repo_type='dataset'
            )

# Fetch the raw Toys_and_Games review JSON Lines file; the call returns the
# local cache path (shown in the cell output).
data_file = hf_hub_download(
            repo_id='McAuley-Lab/Amazon-Reviews-2023',
            filename='raw/review_categories/Toys_and_Games.jsonl',
            repo_type='dataset'
            )

Toys_and_Games.csv:   0%|          | 0.00/224M [00:00<?, ?B/s]

'/home/scur2745/.cache/huggingface/hub/datasets--McAuley-Lab--Amazon-Reviews-2023/snapshots/2b6d039ed471f2ba5fd2acb718bf33b0a7e5598e/raw/review_categories/Toys_and_Games.jsonl'

In [5]:
def parse_2023(path):
    """
    Lazily yield one parsed JSON object per non-blank line of a JSON Lines file.

    :param path: Path to an uncompressed, UTF-8 encoded ``.jsonl`` file.
    :return: Generator of decoded JSON values, in file order.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as fp:
        for line in tqdm(fp):
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            yield json.loads(record)
    
def load_2023(path):
    """
    Read a whole JSON Lines file into a list of parsed objects.

    :param path: Path to an uncompressed, UTF-8 encoded ``.jsonl`` file.
    :return: List with one decoded JSON value per non-blank line, in file order.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as fp:
        stripped_lines = (raw.strip() for raw in tqdm(fp))
        # Blank lines (e.g. a trailing empty line) are skipped instead of crashing.
        return [json.loads(text) for text in stripped_lines if text]

In [25]:
def filter_amazon_dataset(category, metadata=False):
    """
    Restrict a raw Amazon Reviews 2023 category to its 5-core subset.

    Loads the "full" split of the raw reviews (or raw item metadata) for
    ``category``, downloads the matching 5-core rating-only CSV, and keeps only
    the rows whose ids occur in that CSV.

    :param category: Category name used in the hub config / file names,
        e.g. "All_Beauty".
    :param metadata: When True, filter item metadata by ``parent_asin`` only;
        otherwise filter reviews by both ``user_id`` and ``parent_asin``.
    :return: Filtered pandas DataFrame (original index preserved).
    """
    # The progress message names what is actually being loaded.
    rprint("Reading Metadata..." if metadata else "Reading Reviews...")
    config_name = f"raw_meta_{category}" if metadata else f"raw_review_{category}"
    raw_dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023",
                               config_name,
                               trust_remote_code=True)
    raw_df = raw_dataset['full'].to_pandas()

    rprint("Reading Ratings...")
    rating_file = hf_hub_download(
        repo_id='McAuley-Lab/Amazon-Reviews-2023',
        filename=f'benchmark/5core/rating_only/{category}.csv',
        repo_type='dataset'
    )
    # Only the id columns are needed for filtering; skip rating/timestamp.
    rating_df = pd.read_csv(rating_file, usecols=['user_id', 'parent_asin'])

    # Ids that survive the 5-core filtering.
    valid_users = set(rating_df['user_id'].unique())
    valid_items = set(rating_df['parent_asin'].unique())

    rprint("Filtering Data...")
    keep = raw_df['parent_asin'].isin(valid_items)
    if not metadata:
        # Reviews must match on both the user and the item.
        keep = raw_df['user_id'].isin(valid_users) & keep
    filtered_data = raw_df[keep]

    rprint("Filtered Dataset Shape:", filtered_data.shape)

    return filtered_data

In [28]:
# Category to process; the alternatives are kept commented out for quick switching.
category = "All_Beauty"
# category = "Sports_and_Outdoors"
# category = "Toys_and_Games"

# Filter the raw reviews (metadata=False) down to the 5-core users/items, then
# print per-column statistics of the result.
filtered_data = filter_amazon_dataset(category, metadata=False)
df_stats(filtered_data)

In [None]:
# Target directory for the filtered dump.
# NOTE(review): absolute, user-specific path that points at "toys" while the cell
# above filters "All_Beauty" — confirm before re-enabling the export below.
DATASET_DIR = "/home/scur2745/RecSys/dataset/amazon/2023/toys"
# Disabled export of the filtered frame as gzip-compressed JSON Lines.
# NOTE(review): `filtered_reviews` is not defined in the visible cells; the cell
# above produces `filtered_data`.
# filtered_reviews.to_json(f'{DATASET_DIR}/reviews.json.gz', 
#                          orient='records', 
#                          lines=True,
#                          compression='gzip')

In [50]:
import pandas as pd

# NOTE(review): absolute, user-specific path — adjust to the local dataset location.
file_path = "/home/scur2745/RecSys/dataset/amazon/2023/raw/toys/meta.json.gz"
# Read the gzip-compressed JSON Lines metadata file into a DataFrame (one record per line).
meta_df = pd.read_json(file_path, lines=True, compression="gzip")

In [62]:
str(meta_df["categories"][0])

"['Toys & Games', 'Remote & App Controlled Vehicles & Parts', 'Remote & App Controlled Vehicles', 'Trucks']"

### Pre-Process

In [None]:
import gzip
import json
import numpy as np
import os
import os.path as osp
import pandas as pd
import polars as pl
import torch
import sys
from collections import defaultdict
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from data.preprocessing import PreprocessingMixin
from torch_geometric.data import download_google_url
from torch_geometric.data import extract_zip
from torch_geometric.data import HeteroData
from torch_geometric.data import InMemoryDataset
from torch_geometric.io import fs
from typing import Callable
from typing import List
from typing import Optional, Dict, Union
import logging

# fetch logger
logger = logging.getLogger("recsys_logger")

In [None]:
def parse_2023(path):
    """
    Lazily yield one parsed JSON object per non-blank line of a gzipped JSON Lines file.

    :param path: Path to a gzip-compressed, UTF-8 encoded JSON Lines file.
    :return: Generator of decoded JSON values, in file order.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in tqdm(f, desc=f"Parsing {path}"):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            yield json.loads(line)

class AmazonReviews2023(InMemoryDataset, PreprocessingMixin):
    """
    In-memory dataset built from Amazon Reviews 2023 files of one split.

    ``process`` reads ``sequential_data.txt``, ``datamaps.json`` and
    ``meta.json.gz`` from ``<raw_dir>/<split>/`` and saves a single
    ``HeteroData`` object holding per-user item histories, per-item text
    embeddings and brand ids, plus a JSON file with the brand-id mapping.
    """

    def __init__(
        self,
        root: str,
        split: str,  # 'beauty', 'sports', 'toys'
        transform: Optional[Callable] = None,
        pre_transform: Optional[Callable] = None,
        force_reload: bool = False,
        category="brand",
    ) -> None:
        """
        Args:
            root: Dataset root directory handed to ``InMemoryDataset``.
            split: Name of the sub-directory of ``raw_dir`` to read; also used
                in the processed file names.
            transform: Forwarded to ``InMemoryDataset``.
            pre_transform: Forwarded to ``InMemoryDataset``.
            force_reload: Forwarded to ``InMemoryDataset``.
            category: Item column whose unique values define the brand mapping.
        """
        self.split = split
        self.brand_mapping = {}  # Dictionary to store brand_id -> brand_name mapping
        self.category = category
        super(AmazonReviews2023, self).__init__(
            root, transform, pre_transform, force_reload
        )
        self.load(self.processed_paths[0], data_cls=HeteroData)
        if not self.brand_mapping:
            # process() did not run in this session (processed file was reused),
            # so restore the mapping it saved earlier; otherwise every lookup
            # in get_brand_name() would return "Unknown".
            self.brand_mapping = self._read_brand_mapping()

    @property
    def raw_file_names(self) -> List[str]:
        return [self.split]

    @property
    def processed_file_names(self) -> str:
        return f"data_{self.split}.pt"

    def _remap_ids(self, x):
        # Shift ids down by one (presumably 1-based ids in the raw files -> 0-based).
        return x - 1

    def _brand_mapping_path(self) -> str:
        """Location of the JSON file holding the brand-id mapping for this split."""
        return os.path.join(self.processed_dir, f"brand_mapping_{self.split}.json")

    def _read_brand_mapping(self) -> Dict[int, str]:
        """Load the saved brand mapping, or return an empty dict if none exists."""
        mapping_path = self._brand_mapping_path()
        if not os.path.exists(mapping_path):
            return {}
        with open(mapping_path, "r") as f:
            # JSON object keys are strings; convert them back to int ids.
            return {int(brand_id): name for brand_id, name in json.load(f).items()}

    @staticmethod
    def _brand_from_details(details):
        """Extract the "Brand" entry from an item's ``details`` field.

        ``details`` is expected to be a JSON-encoded object string. A Python
        literal string is accepted as a fallback (parsed safely, never
        evaluated as code). Anything unparsable yields "Unknown".
        """
        if isinstance(details, str):
            try:
                details = json.loads(details)
            except ValueError:
                import ast

                try:
                    details = ast.literal_eval(details)
                except (ValueError, SyntaxError):
                    return "Unknown"
        if isinstance(details, dict):
            return details.get("Brand", "Unknown")
        return "Unknown"

    @staticmethod
    def _describe_item(row) -> str:
        """Build the text that is embedded for one item row."""
        categories = (
            str(row["categories"]) if row["categories"] else f'[{row["main_category"]}]'
        )
        return (
            f'Title: {row["title"]!s}; '
            f'Brand: {row["brand"]!s}; '
            f"Categories: {categories}; "
            f'Rating: {row["average_rating"]!s}; '
            f'Price: {row["price"]!s}; '
        )

    def get_brand_name(self, brand_id: int) -> str:
        """
        Returns the brand name for a given brand ID.

        Args:
            brand_id: The ID of the brand to look up

        Returns:
            The brand name as a string, or "Unknown" if the brand ID is not found
        """
        return self.brand_mapping.get(brand_id, "Unknown")

    def get_brand_mapping(self) -> Dict[int, str]:
        """
        Returns the complete brand ID to brand name mapping.

        Returns:
            Dictionary mapping brand IDs to brand names
        """
        return self.brand_mapping

    def train_test_split(self, max_seq_len=20):
        """
        Build train / eval / test sequences from ``sequential_data.txt``.

        Every non-blank line holds whitespace-separated integers: a user id
        followed by that user's item ids, which are shifted with ``_remap_ids``.

        Args:
            max_seq_len: Length eval/test histories are cut and padded (with -1) to.

        Returns:
            Dict mapping "train", "eval" and "test" to polars DataFrames with
            columns ``itemId``, ``itemId_fut`` and ``userId``.
        """
        splits = ["train", "eval", "test"]
        sequences = {sp: defaultdict(list) for sp in splits}
        user_ids = []
        sequence_path = os.path.join(self.raw_dir, self.split, "sequential_data.txt")
        with open(sequence_path, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Skip blank lines instead of failing on a missing user id.
                    continue
                user_id, *raw_items = map(int, fields)
                user_ids.append(user_id)
                items = [self._remap_ids(item_id) for item_id in raw_items]

                # Train keeps the whole prefix without padding, which allows
                # flexible training-time subsampling.
                # Worked example for a 22-item sequence and max_seq_len=20:
                sequences["train"]["itemId"].append(items[:-2])  # items 1..20
                sequences["train"]["itemId_fut"].append(items[-2])  # item 21

                eval_items = items[-(max_seq_len + 2) : -2]  # items 1..20
                eval_padding = [-1] * (max_seq_len - len(eval_items))
                sequences["eval"]["itemId"].append(eval_items + eval_padding)
                sequences["eval"]["itemId_fut"].append(items[-2])  # item 21

                test_items = items[-(max_seq_len + 1) : -1]  # items 2..21
                test_padding = [-1] * (max_seq_len - len(test_items))
                sequences["test"]["itemId"].append(test_items + test_padding)
                sequences["test"]["itemId_fut"].append(items[-1])  # item 22

        for sp in splits:
            sequences[sp]["userId"] = user_ids
            sequences[sp] = pl.from_dict(sequences[sp])
        return sequences

    def process(self, max_seq_len=20) -> None:
        """
        Build the ``HeteroData`` object for this split and write it to disk.

        Also writes ``brand_mapping_<split>.json`` next to the processed file.

        Args:
            max_seq_len: Forwarded to ``train_test_split``.
        """
        data = HeteroData()

        with open(os.path.join(self.raw_dir, self.split, "datamaps.json"), "r") as f:
            data_maps = json.load(f)

        # Construct user sequences
        sequences = self.train_test_split(max_seq_len=max_seq_len)
        data["user", "rated", "item"].history = {
            k: self._df_to_tensor_dict(v, ["itemId"]) for k, v in sequences.items()
        }

        # Compute item features
        asin2id = pd.DataFrame(
            [
                {"asin": asin, "id": self._remap_ids(int(raw_id))}
                for asin, raw_id in data_maps["item2id"].items()
            ]
        )
        meta_path = os.path.join(self.raw_dir, self.split, "meta.json.gz")
        meta_df = pd.DataFrame(list(parse_2023(path=meta_path))).rename(
            columns={"parent_asin": "asin"}
        )
        # Parse the details field safely instead of eval()-ing file content.
        meta_df["brand"] = meta_df["details"].apply(self._brand_from_details)

        # NOTE(review): merge() defaults to an inner join, so items without a
        # metadata record are dropped and row positions may then no longer match
        # item ids — confirm every mapped item has metadata.
        item_data = (
            meta_df.merge(asin2id, on="asin")
            .sort_values(by="id")
            .fillna({"brand": "Unknown"})
        )

        # Create brand mapping
        unique_brands = item_data[self.category].unique()
        self.brand_mapping = {i: brand for i, brand in enumerate(unique_brands)}

        # Create reverse mapping for lookup
        brand_to_id = {brand: i for i, brand in self.brand_mapping.items()}

        # Add brand_id to item_data, read from the same column the mapping was
        # built from so a non-default `category` stays consistent.
        item_data["brand_id"] = item_data[self.category].map(
            lambda value: brand_to_id.get(value, -1)
        )

        sentences = item_data.apply(self._describe_item, axis=1)

        item_emb = self._encode_text_feature(sentences)
        data["item"].x = item_emb
        data["item"].text = np.array(sentences)
        # Store brand_id instead of brand name
        data["item"].brand_id = np.array(item_data["brand_id"])

        # Save the brand mapping to the data object as well
        data["brand_mapping"] = self.brand_mapping

        # Seeded random mask: an item is flagged is_train when its draw exceeds 0.05.
        gen = torch.Generator()
        gen.manual_seed(42)
        data["item"].is_train = torch.rand(item_emb.shape[0], generator=gen) > 0.05

        self.save([data], self.processed_paths[0])

        # Save brand mapping to a separate file for easy access
        with open(self._brand_mapping_path(), "w") as f:
            json.dump(self.brand_mapping, f)


# Instantiate the dataset for the "beauty" split; the constructor loads the
# processed file for that split from under ../dataset/amazon/2023.
amazon = AmazonReviews2023(root="../dataset/amazon/2023", split="beauty")