# 01. Data Preparation
This notebook handles:
- Loading Amazon Reviews datasets
- Building dataset pairs
- Preprocessing and merging data
- Initial EDA

## 1. Import each dataset separately, because the data is large

In [4]:
import sys
import os
# Add the project root (the parent of the notebook's working directory) to the
# Python path so that the `src` package can be imported.
# `__file__` is not defined inside a notebook, so the root is derived from the
# current working directory rather than from a file location.
project_root = os.path.dirname(os.getcwd())
if project_root not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(project_root)

import pandas as pd
import numpy as np
from datasets import load_dataset
from src.data_utils import build_pairs, load_review_and_meta, preprocess_df

In [5]:
# Read the dataset names (one per line, blank lines skipped), then group them
# into per-category meta/review pairs.
with open("../datasets.txt", "r") as names_file:
    names = [stripped for stripped in map(str.strip, names_file) if stripped]

pairs = build_pairs(names)

# Preview only the first 10 categories to keep the cell output short
for cat, pair in list(pairs.items())[:10]:
    print(cat, "=>", pair)

All_Beauty => {'meta': 'raw_meta_All_Beauty', 'review': 'raw_review_All_Beauty'}
Toys_and_Games => {'meta': 'raw_meta_Toys_and_Games', 'review': 'raw_review_Toys_and_Games'}
Cell_Phones_and_Accessories => {'meta': 'raw_meta_Cell_Phones_and_Accessories', 'review': 'raw_review_Cell_Phones_and_Accessories'}
Industrial_and_Scientific => {'meta': 'raw_meta_Industrial_and_Scientific', 'review': 'raw_review_Industrial_and_Scientific'}
Gift_Cards => {'meta': 'raw_meta_Gift_Cards', 'review': 'raw_review_Gift_Cards'}
Musical_Instruments => {'meta': 'raw_meta_Musical_Instruments', 'review': 'raw_review_Musical_Instruments'}
Electronics => {'meta': 'raw_meta_Electronics', 'review': 'raw_review_Electronics'}
Handmade_Products => {'meta': 'raw_meta_Handmade_Products', 'review': 'raw_review_Handmade_Products'}
Arts_Crafts_and_Sewing => {'meta': 'raw_meta_Arts_Crafts_and_Sewing', 'review': 'raw_review_Arts_Crafts_and_Sewing'}
Baby_Products => {'meta': 'raw_meta_Baby_Products', 'review': 'raw_review_Ba

## Step 1: Import Dataset Names
We start by reading the available dataset names and organizing them by category. This helps us select the right review and metadata files for our analysis.




# 01. Data Preparation & Loading
This notebook covers the initial steps for working with the Amazon Reviews 2023 dataset, including loading, cleaning, and merging review and metadata tables.

**Key steps:**
- Load review and metadata files for the selected category
- Clean and filter the data
- Merge reviews with product metadata
- Sample the data for efficient processing
- Document all steps for reproducibility and team understanding

## Dataset Loading and Overview
Load the selected category's review and metadata files, print basic statistics, and count unique users/items/categories.

In [7]:
from datasets import load_dataset
import pandas as pd

category = "CDs_and_Vinyl"
review_name = pairs[category]["review"]
meta_name = pairs[category]["meta"]
data_dir = "../data"  # parquet snapshots of the loaded tables are written here

# Load the review and metadata configs for the selected category.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally -- keep it only for repositories you trust.
print(f"Loading {category} reviews from {review_name} and metadata from {meta_name}...")
dataset_review = load_dataset("McAuley-Lab/Amazon-Reviews-2023", review_name, split="full", trust_remote_code=True)
dataset_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", meta_name, split="full", trust_remote_code=True)
print(dataset_review)
print(dataset_meta)
print("Number of feature in dataset_review:", len(dataset_review.features))
print("Number of feature in dataset_meta:", len(dataset_meta.features))

# Convert to pandas once; the frames are reused for the counts, the parquet
# snapshots and the preview below.
df_review = dataset_review.to_pandas()
df_meta = dataset_meta.to_pandas()

# Count unique users, items and categories on the DataFrames instead of
# materialising whole columns as Python lists. dropna=False keeps a missing
# value counted as one distinct entry, as the earlier set()-based counts did.
n_users = df_review["user_id"].nunique(dropna=False)
n_items = df_review["asin"].nunique(dropna=False)
n_reviews = len(dataset_review)
n_categories = df_meta["main_category"].nunique(dropna=False)

# Name the snapshots after the selected category so that changing `category`
# cannot overwrite another category's files; create the folder if it is missing.
os.makedirs(data_dir, exist_ok=True)
df_review.to_parquet(os.path.join(data_dir, f"{category}_reviews.parquet"), index=False)
df_meta.to_parquet(os.path.join(data_dir, f"{category}_meta.parquet"), index=False)

print(f"#Categories: {n_categories:,}")
print(f"#Users: {n_users:,}")
print(f"#Items: {n_items:,}")
print(f"#Reviews: {n_reviews:,}")
df_review.head(10)

Loading CDs_and_Vinyl reviews from raw_review_CDs_and_Vinyl and metadata from raw_meta_CDs_and_Vinyl...
Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 4827273
})
Dataset({
    features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
    num_rows: 701959
})
Number of feature in dataset_review: 10
Number of feature in dataset_meta: 16
Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 4827273
})
Dataset({
    features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtit

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,Five Stars,LOVE IT!,[],B002MW50JA,B002MW50JA,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1452650777000,0,True
1,5.0,Five Stars,LOVE!!,[],B008XNPN0S,B008XNPN0S,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1452650764000,0,True
2,3.0,Three Stars,Sad there is not the versions with the real/or...,[],B00IKM5N02,B00IKM5N02,AGKASBHYZPGTEPO6LWZPVJWB2BVA,1452649885000,0,True
3,3.0,Disappointed,I have listen to The Broadway 1958 Flower Drum...,[],B00006JKCM,B00006JKCM,AEVWAM3YWN5URJVJIZZ6XPD2MKIA,1164036864000,3,True
4,5.0,Wonderful melding,Simply great album. One of the best. Marvelous...,[],B00013YRQY,B00013YRQY,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1582090199946,0,False
5,5.0,Magnificent movie & music score!,The sound is incredibly beautiful. If you like...,[],B07Z76Y18X,B07Z76Y18X,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1576100171173,5,True
6,5.0,Five Stars,beautiful music!,[],B00004NKAK,B00004NKAK,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1524768111415,0,True
7,5.0,Great,Excellent album one of my favorites,[],B0000062P5,B0000062P5,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1504898965457,0,True
8,5.0,YES...YES!!!!,Excellent for guitar lovers! His other CDS are...,[],B00EC6VQDS,B00EC6VQDS,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1486090923000,0,True
9,4.0,Good album,Such a talented lady. We lost her to soon. She...,[],B002HMHR7S,B002HMHR7S,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,1402778050000,0,True
