# Merge All_Beauty + Meta, Sample Unlabeled Data

In [None]:
from pathlib import Path
import re
import pandas as pd

# Resolve the project root: when the working directory is the "notebooks"
# folder, step up one level; otherwise use the working directory as-is.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

# Sampling parameters: number of rows to draw and the seed used for the draw.
SAMPLE_SIZE = 13_000
RANDOM_STATE = 42

# Input files (JSON Lines) and the CSV written for labeling.
DATA_DIR = PROJECT_ROOT.joinpath("data")
REVIEWS_PATH = DATA_DIR.joinpath("All_Beauty.jsonl")
META_PATH = DATA_DIR.joinpath("meta_All_Beauty.jsonl")
OUTPUT_PATH = DATA_DIR.joinpath("unlabeled_data.csv")

# Destination of the manually labeled file (not read by the cells shown here).
LABELED_PATH = PROJECT_ROOT.joinpath("labeled_data", "labeled_data.csv")


In [None]:
# Fail fast with a clear message if either raw input file is absent.
if not REVIEWS_PATH.exists():
    raise FileNotFoundError(f"Missing reviews file: {REVIEWS_PATH}")
if not META_PATH.exists():
    raise FileNotFoundError(f"Missing meta file: {META_PATH}")

# Both inputs are JSON Lines files: one JSON record per line.
print("Loading reviews...")
reviews = pd.read_json(REVIEWS_PATH, lines=True)
print("Loading meta...")
meta = pd.read_json(META_PATH, lines=True)

# Rename the meta "title" column to "product_title" before merging.
# rename() ignores labels that are not present, so no existence check is needed.
meta = meta.rename(columns={"title": "product_title"})

# Meta columns we would like to carry over; only those actually present are kept.
wanted_meta_cols = (
    "parent_asin",
    "product_title",
    "store",
    "average_rating",
    "rating_number",
    "main_category",
    "price",
)
keep_meta_cols = [col for col in wanted_meta_cols if col in meta.columns]

# One meta row per parent_asin so the left join cannot multiply review rows.
meta = meta[keep_meta_cols].drop_duplicates(subset=["parent_asin"])

# Left join: every review is kept, with meta columns filled in where a match exists.
merged = pd.merge(reviews, meta, on="parent_asin", how="left")
print("Merged rows:", len(merged))


In [None]:
# Rows missing any of these fields are dropped before further processing.
required_columns = ["user_id", "parent_asin", "text"]
merged = merged.dropna(subset=required_columns).copy()

# Coerce review text to str, then discard rows whose text is only whitespace.
merged["text"] = merged["text"].astype(str)
has_text = merged["text"].str.strip() != ""
merged = merged[has_text]

# Matches every run of characters that is not a lowercase ASCII letter,
# a digit, or whitespace.
cleaner = re.compile(r"[^a-z0-9\s]+")


def clean_text(text: str) -> str:
    """Return a normalized copy of *text*.

    The text is lowercased, each run of characters other than a-z, 0-9 and
    whitespace is replaced by a single space, runs of whitespace are
    collapsed to one space, and leading/trailing whitespace is removed.
    """
    lowered = text.lower()
    spaced = cleaner.sub(" ", lowered)
    # split() with no argument breaks on whitespace runs and discards empty
    # leading/trailing pieces, so re-joining collapses and trims in one step.
    return " ".join(spaced.split())

# Add the normalized text column, one value per review.
merged["clean_text"] = merged["text"].map(clean_text)

# Drop reviews whose cleaned text is empty, then renumber rows from 0.
nonempty_clean = merged["clean_text"].str.strip() != ""
merged = merged[nonempty_clean].reset_index(drop=True)
print("Eligible rows after cleaning:", len(merged))


In [None]:
# Never request more rows than are available: DataFrame.sample raises
# ValueError when n exceeds the number of rows and replace=False (the default).
sample_n = min(SAMPLE_SIZE, len(merged))
if sample_n < SAMPLE_SIZE:
    print(
        f"Only {len(merged)} eligible rows available; "
        f"sampling all of them instead of {SAMPLE_SIZE}"
    )

# Seeded draw so re-running the notebook on the same data yields the same sample.
sample = merged.sample(n=sample_n, random_state=RANDOM_STATE).reset_index(drop=True)

# Keep only the columns needed for labeling and add an empty label column
# to be filled in by hand.
sample = sample[["user_id", "parent_asin", "text", "clean_text"]].copy()
sample["label_intent"] = ""

# Make sure the output directory exists before writing the CSV.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
sample.to_csv(OUTPUT_PATH, index=False)
print(f"Wrote {len(sample)} rows to {OUTPUT_PATH}")


## Manual labeling
Open `data/unlabeled_data.csv`, label the `label_intent` column,
and save the result as `labeled_data/labeled_data.csv`.
