## 1. import each dataset, cause it's large

In [1]:
from collections import defaultdict

def build_pairs(names):
    pairs = defaultdict(dict)
    for name in names:
        if name.startswith("raw_review_"):
            cat = name.replace("raw_review_", "")
            pairs[cat]["review"] = name
        elif name.startswith("raw_meta_"):
            cat = name.replace("raw_meta_", "")
            pairs[cat]["meta"] = name
    return pairs

with open("datasets.txt", "r") as f:
    names = [line.strip() for line in f if line.strip()]

pairs = build_pairs(names)

for i, (cat, pair) in enumerate(pairs.items()):
    if i == 10: break
    print(cat, "=>", pair)


All_Beauty => {'meta': 'raw_meta_All_Beauty', 'review': 'raw_review_All_Beauty'}
Toys_and_Games => {'meta': 'raw_meta_Toys_and_Games', 'review': 'raw_review_Toys_and_Games'}
Cell_Phones_and_Accessories => {'meta': 'raw_meta_Cell_Phones_and_Accessories', 'review': 'raw_review_Cell_Phones_and_Accessories'}
Industrial_and_Scientific => {'meta': 'raw_meta_Industrial_and_Scientific', 'review': 'raw_review_Industrial_and_Scientific'}
Gift_Cards => {'meta': 'raw_meta_Gift_Cards', 'review': 'raw_review_Gift_Cards'}
Musical_Instruments => {'meta': 'raw_meta_Musical_Instruments', 'review': 'raw_review_Musical_Instruments'}
Electronics => {'meta': 'raw_meta_Electronics', 'review': 'raw_review_Electronics'}
Handmade_Products => {'meta': 'raw_meta_Handmade_Products', 'review': 'raw_review_Handmade_Products'}
Arts_Crafts_and_Sewing => {'meta': 'raw_meta_Arts_Crafts_and_Sewing', 'review': 'raw_review_Arts_Crafts_and_Sewing'}
Baby_Products => {'meta': 'raw_meta_Baby_Products', 'review': 'raw_review_Ba

In [2]:
from datasets import load_dataset

category = "CDs_and_Vinyl"
review_name = pairs[category]["review"]
meta_name = pairs[category]["meta"] 
# Load CDs_and_Vinyl reviews
dataset_review = load_dataset("McAuley-Lab/Amazon-Reviews-2023", review_name, split="full")
dataset_meta = load_dataset("McAuley-Lab/Amazon-Reviews-2023", meta_name, split="full")
print(dataset_review)
print(dataset_meta)
print("Number of feature in dataset_review:", len(dataset_review.features))
print("Number of feature in dataset_meta:", len(dataset_meta.features))

# Count unique users and items
n_users = len(set(dataset_review["user_id"]))
n_items = len(set(dataset_review["asin"]))
n_reviews = len(dataset_review)

n_categories = len(set(dataset_meta["main_category"]))

print(f"#Categories: {n_categories:,}")
print(f"#Users: {n_users:,}")
print(f"#Items: {n_items:,}")
print(f"#Reviews: {n_reviews:,}")


Dataset({
    features: ['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'],
    num_rows: 4827273
})
Dataset({
    features: ['main_category', 'title', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images', 'videos', 'store', 'categories', 'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
    num_rows: 701959
})
Number of feature in dataset_review: 10
Number of feature in dataset_meta: 16
#Categories: 30
#Users: 1,754,118
#Items: 701,706
#Reviews: 4,827,273


## 2. EDA


In [3]:
import pandas as pd
df_review = dataset_review.to_pandas()
df_meta = dataset_meta.to_pandas()
print(df_review.head())

   rating              title  \
0     5.0         Five Stars   
1     5.0         Five Stars   
2     3.0        Three Stars   
3     3.0       Disappointed   
4     5.0  Wonderful melding   

                                                text images        asin  \
0                                           LOVE IT!     []  B002MW50JA   
1                                             LOVE!!     []  B008XNPN0S   
2  Sad there is not the versions with the real/or...     []  B00IKM5N02   
3  I have listen to The Broadway 1958 Flower Drum...     []  B00006JKCM   
4  Simply great album. One of the best. Marvelous...     []  B00013YRQY   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B002MW50JA  AGKASBHYZPGTEPO6LWZPVJWB2BVA  1452650777000             0   
1  B008XNPN0S  AGKASBHYZPGTEPO6LWZPVJWB2BVA  1452650764000             0   
2  B00IKM5N02  AGKASBHYZPGTEPO6LWZPVJWB2BVA  1452649885000             0   
3  B00006JKCM  AEVWAM3YWN5URJVJIZZ6XPD2MKIA  1164036

In [4]:
df_meta = df_meta[['parent_asin','main_category','title','average_rating','rating_number','features','description','price','images','categories','store']]
df_review =df_review[['asin','parent_asin','user_id','rating','title','text','timestamp','helpful_vote','verified_purchase','images']]
df_meta = df_meta.dropna(subset = ['parent_asin']).drop_duplicates(subset = ['parent_asin'])
df = df_review.merge(df_meta, on= 'parent_asin', how='left')

In [5]:
df.count()

asin                 4827273
parent_asin          4827273
user_id              4827273
rating               4827273
title_x              4827273
text                 4827273
timestamp            4827273
helpful_vote         4827273
verified_purchase    4827273
images_x             4827273
main_category        4823633
title_y              4827273
average_rating       4827273
rating_number        4827273
features             4827273
description          4827273
price                4827273
images_y             4827273
categories           4827273
store                4825928
dtype: int64

In [6]:
import io, time, math, requests
from PIL import Image
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import CLIPModel, CLIPProcessor
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device).eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")



Using device: cuda


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
df.iloc[0]['images_y']

{'hi_res': array(['https://m.media-amazon.com/images/I/71GC0YGmG+L._SL1200_.jpg',
        'https://m.media-amazon.com/images/I/714J4wZ6qyL._SL1085_.jpg'],
       dtype=object),
 'large': array(['https://m.media-amazon.com/images/I/618LXtnXwXL.jpg',
        'https://m.media-amazon.com/images/I/51nKlRbfklL.jpg'],
       dtype=object),
 'thumb': array(['https://m.media-amazon.com/images/I/618LXtnXwXL._SS40_.jpg',
        'https://m.media-amazon.com/images/I/51nKlRbfklL._SS40_.jpg'],
       dtype=object),
 'variant': array(['MAIN', 'BACK'], dtype=object)}

In [8]:

import numpy as np

def pick_best_image_from_images_field(images_field):
    """
    Returns a single best URL given one row's images field.
    Prefers hi_res MAIN -> hi_res any -> large MAIN -> large any -> thumb MAIN -> thumb any.
    Returns None if nothing usable.
    """
    if images_field is None:
        return None

    # Some splits may store images as a plain list of URLs (reviews_x). Handle that too.
    if isinstance(images_field, (list, tuple)):
        return images_field[0] if images_field else None

    # Expect dict with arrays
    if isinstance(images_field, dict):
        # Normalize to python lists
        def to_list(x):
            if x is None: return []
            if isinstance(x, np.ndarray): return x.tolist()
            if isinstance(x, (list, tuple)): return list(x)
            return []

        hi_res   = to_list(images_field.get('hi_res'))
        large    = to_list(images_field.get('large'))
        thumb    = to_list(images_field.get('thumb'))
        variant  = to_list(images_field.get('variant'))

        # If no variant provided, just prefer hi_res->large->thumb by first URL
        if not variant:
            for arr in (hi_res, large, thumb):
                if arr: return arr[0]
            return None

        # Build aligned records
        L = max(len(hi_res), len(large), len(thumb), len(variant))
        recs = []
        for i in range(L):
            v = variant[i] if i < len(variant) else None
            recs.append({
                'variant': v,
                'hi_res': hi_res[i] if i < len(hi_res) else None,
                'large':  large[i]  if i < len(large)  else None,
                'thumb':  thumb[i]  if i < len(thumb)  else None,
            })

        # Priority helpers
        def first_where(key, cond=lambda r: True):
            for r in recs:
                if cond(r) and r.get(key): return r[key]
            return None

        # Try in order of preference
        return ( first_where('hi_res',  lambda r: r['variant'] == 'MAIN')
              or first_where('hi_res')
              or first_where('large',   lambda r: r['variant'] == 'MAIN')
              or first_where('large')
              or first_where('thumb',   lambda r: r['variant'] == 'MAIN')
              or first_where('thumb') )

    # Fallback: unknown type
    return None

## 
df['img_url'] = df['images_y'].apply(pick_best_image_from_images_field)


In [9]:
import io
from PIL import Image
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/119.0 Safari/537.36"
})
def fetch_image(url, timeout=6, max_kb=4096):
    if not isinstance(url, str) or not url:
        return None
    try:
        r = SESSION.get(url, timeout=timeout, stream=True)
        r.raise_for_status()
        content = r.content
        if len(content) > max_kb * 1024:
            return None
        return Image.open(io.BytesIO(content)).convert("RGB")
    except Exception:
        # one light retry
        try:
            r = SESSION.get(url, timeout=timeout, stream=True)
            r.raise_for_status()
            return Image.open(io.BytesIO(r.content)).convert("RGB")
        except Exception:
            return None
        
@torch.no_grad()
def embed_pil_batch(pil_list, batch_size=16):
    # Keep only actual images; remember mapping to restore order
    idx_keep, batch_imgs = [], []
    for i, im in enumerate(pil_list):
        if im is not None:
            idx_keep.append(i)
            batch_imgs.append(im)

    embs = [None] * len(pil_list)
    for i in range(0, len(batch_imgs), batch_size):
        batch = batch_imgs[i:i+batch_size]
        inputs = processor(images=batch, return_tensors="pt").to(device)
        feats = model.get_image_features(**inputs)
        feats = torch.nn.functional.normalize(feats, p=2, dim=1)  # L2 normalize
        feats = feats.cpu().numpy()
        # write back into correct slots
        for j, vec in enumerate(feats):
            embs[idx_keep[i+j]] = vec
    return embs


In [10]:
import os, io, requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import CLIPModel, CLIPProcessor

# ===== assumes you already created: model, processor, device, SESSION, fetch_image(), embed_one_pil() =====
# If you don't have SESSION yet:
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119 Safari/537.36"})

def fetch_image_bytes(url, timeout=2.5, max_kb=4096):
    try:
        r = SESSION.get(url, timeout=timeout, stream=True)
        r.raise_for_status()
        b = r.content
        # cap ~4 MB (change if needed)
        if len(b) > max_kb * 1024:
            return None
        return b
    except Exception:
        return None

def to_pil(b):
    if b is None:
        return None
    try:
        return Image.open(io.BytesIO(b)).convert("RGB")
    except Exception:
        return None

@torch.no_grad()
def embed_pil_batch(pil_list, batch_size=64 if torch.cuda.is_available() else 16):
    # keep positions of valid PILs
    keep_idx, images = [], []
    for i, im in enumerate(pil_list):
        if im is not None:
            keep_idx.append(i)
            images.append(im)

    embs = [None] * len(pil_list)
    for i in range(0, len(images), batch_size):
        batch = images[i:i+batch_size]
        inputs = processor(images=batch, return_tensors="pt").to(device)
        feats = model.get_image_features(**inputs)
        feats = torch.nn.functional.normalize(feats, p=2, dim=1).cpu().numpy()
        for j, vec in enumerate(feats):
            embs[keep_idx[i+j]] = vec
    return embs

# -----------------------------
# 1) Build the worklist (rows that have a usable URL)
mask = df["img_url"].notna()
work = df.loc[mask, ["parent_asin", "img_url"]].drop_duplicates("parent_asin").copy()

# SAMPLE SIZE — adjust as you like
SAMPLE = 5000
if len(work) > SAMPLE:
    work = work.sample(SAMPLE, random_state=42).reset_index(drop=True)

print("Images to process:", len(work))

# -----------------------------
# 2) Parallel download (fast I/O)
N_WORKERS = 48  # try 32–64; lower if your network throttles

bytes_list = [None] * len(work)
with ThreadPoolExecutor(max_workers=N_WORKERS) as ex:
    futs = {ex.submit(fetch_image_bytes, url): i for i, url in enumerate(work["img_url"])}
    for fut in tqdm(as_completed(futs), total=len(futs), desc="Downloading"):
        i = futs[fut]
        bytes_list[i] = fut.result()

work["pil"] = [to_pil(b) for b in bytes_list]
ok_rate = work["pil"].notna().mean()
print(f"Download OK rate: {ok_rate:.1%}")

# -----------------------------
# 3) Batched CLIP embedding (with periodic checkpoints)
EMBATCH = 2000   # rows per embedding chunk (not CLIP batch)
vecs = [None] * len(work)
for s in range(0, len(work), EMBATCH):
    sub = work.iloc[s:s+EMBATCH]
    embs = embed_pil_batch(sub["pil"].tolist())
    for j, v in enumerate(embs):
        vecs[s + j] = v

work["clip_img_emb"] = vecs
work = work.drop(columns=["pil"])

# Keep only successful embeddings, one per parent_asin
emb_df = work.loc[work["clip_img_emb"].notna(), ["parent_asin", "clip_img_emb"]].drop_duplicates("parent_asin")

# Save (parquet with list-of-floats)
emb_out = emb_df.copy()
emb_out["clip_img_emb"] = emb_out["clip_img_emb"].apply(lambda v: v.tolist())
emb_out.to_parquet("clip_img_emb_parent.parquet", index=False)
print("Saved embeddings:", len(emb_out))

# (Optional) also save NPZ for pure-numpy training pipelines
np.savez_compressed(
    "clip_img_emb_parent.npz",
    parent_asin=emb_df["parent_asin"].values,
    clip_img_emb=np.stack(emb_df["clip_img_emb"].values)
)

# -----------------------------
# 4) Merge back to your main df for modeling
# ensure same dtype for key
df["parent_asin"] = df["parent_asin"].astype(str)
emb_out["parent_asin"] = emb_out["parent_asin"].astype(str)

df = df.merge(emb_out, on="parent_asin", how="left")
df["has_img_emb"] = df["clip_img_emb"].notna().astype(int)
print("Final coverage in df (has_img_emb):", df["has_img_emb"].mean())


Images to process: 5000


Downloading: 100%|██████████| 5000/5000 [01:56<00:00, 42.80it/s]


Download OK rate: 100.0%
Saved embeddings: 5000
Final coverage in df (has_img_emb): 0.006677890394846117


In [14]:
df["clip_img_emb"] = df["clip_img_emb"].apply(lambda v: v if isinstance(v, np.ndarray) else np.zeros(512, dtype=np.float32))

In [17]:
df_with_img_emb = df[df["has_img_emb"] == 1]
df_with_img_emb = df_with_img_emb.reset_index(drop=True)    

Unnamed: 0,asin,parent_asin,user_id,rating,title_x,text,timestamp,helpful_vote,verified_purchase,images_x,...,rating_number,features,description,price,images_y,categories,store,img_url,clip_img_emb,has_img_emb
0,B0018LMKIK,B0018LMKIK,AHZ6XMOLEWA67S3TX7IWEXXGWSOA,5.0,Five Stars,Good value,1480353818000,0,True,[],...,3515,[],[An excellent compilation from one of the fine...,,{'hi_res': ['https://m.media-amazon.com/images...,"[CDs & Vinyl, Rock, Country Rock]",Creedence Clearwater Revival Format: Audio CD,https://m.media-amazon.com/images/I/71is3JW6ma...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
1,B0041W566Q,B0041W566Q,AHZ6XMOLEWA67S3TX7IWEXXGWSOA,5.0,Wonderful collection!,"I grew up in the 50's and 60's, so this music ...",1478566720000,0,True,[],...,124,[],[The music of the Baby Boomers are the hits of...,3.39,{'hi_res': ['https://m.media-amazon.com/images...,"[CDs & Vinyl, Pop, Oldies]",Various (Original Artist re-recording) (Artis...,https://m.media-amazon.com/images/I/71OPaVlY1T...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
2,B000066RLV,B000066RLV,AFAIJYOUO3NAWLBDIKTQSC3DASWA,5.0,"Bright, Breezy, Easy Listening",Maria Muldaur never disappoints regardless of ...,1199453906000,1,True,[],...,17,[],"[Product Description, Animal Crackers in My So...",3.99,{'hi_res': ['https://m.media-amazon.com/images...,"[CDs & Vinyl, Children's Music, Sing-A-Longs]",Maria Muldaur Music for Little People Form...,https://m.media-amazon.com/images/I/617qs4dHKg...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
3,B0006SSNPA,B0006SSNPA,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,5.0,"Smoky, jazzy music to transport you to a 1920s...","Paris Combo, headed by the enchanting vocalist...",1137347872000,4,False,[],...,32,[],"[Amazon.com, With a sound made up of equal par...",16.0,"{'hi_res': [None], 'large': ['https://m.media-...","[CDs & Vinyl, International Music, Europe, Con...",Paris Combo Format: Audio CD,https://m.media-amazon.com/images/I/6188HWE72Y...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
4,B0000E6XHK,B0000E6XHK,AFW2PDT3AMT4X3PYQG7FJZH5FXFA,5.0,TzN's back and better than ever!!,"Tiziano Ferro's debut ""Rojo Relativo"" was a fa...",1074480228000,18,False,[],...,30,[],[],23.93,"{'hi_res': [None], 'large': ['https://m.media-...","[CDs & Vinyl, International Music, Europe, Con...",Tiziano Ferro Format: Audio CD,https://m.media-amazon.com/images/I/51xtXfoO1b...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1
