In [33]:
from pathlib import Path
import pandas as pd
import re, json

# Input and output directories, given relative to the current working directory.
RAW, OUT = Path("data/raw"), Path("data/processed")

# Create the output directory (and any missing parents); a no-op if it exists.
OUT.mkdir(parents=True, exist_ok=True)

# Show the absolute locations both paths resolve to.
(RAW.resolve(), OUT.resolve())

(PosixPath('/Users/maksimpahomov/Data Science/P&G/src/data/raw'),
 PosixPath('/Users/maksimpahomov/Data Science/P&G/src/data/processed'))

In [34]:
def clean_text(x: str) -> str:
    """Normalize whitespace in ``x``.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is removed.

    Args:
        x: The text to clean. Non-string values are accepted: ``None`` and
            float NaN are treated as missing and become ``""``; anything else
            is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(x, str):
        # NaN is the only float that is not equal to itself. Without this
        # check a missing value would be stringified to the literal "nan".
        is_missing = x is None or (isinstance(x, float) and x != x)
        x = "" if is_missing else str(x)
    return re.sub(r"\s+", " ", x).strip()

def chunk_text(text: str, max_chars: int = 800):
    """Split ``text`` into chunks of whole words, each at most ``max_chars`` long.

    The text is split on whitespace and words are greedily packed into chunks,
    joined by single spaces. Words are never broken: a single word longer than
    ``max_chars`` is emitted as its own (over-long) chunk.

    Args:
        text: The text to split.
        max_chars: Maximum length of a chunk, counting the joining spaces.

    Returns:
        A list of chunk strings in original word order; empty if ``text``
        contains no words.
    """
    chunks, cur, cur_len = [], [], 0
    for w in text.split():
        # Length of the joined chunk if w were appended. The separating space
        # is only needed when the chunk already holds a word, so a chunk of
        # exactly max_chars characters is allowed.
        new_len = cur_len + len(w) + (1 if cur else 0)
        if cur and new_len > max_chars:
            chunks.append(" ".join(cur))
            cur, cur_len = [w], len(w)
        else:
            cur.append(w)
            cur_len = new_len
    if cur:
        chunks.append(" ".join(cur))
    return chunks

In [35]:
# Load the product descriptions table.
descriptions_path = RAW / "product_descriptions.csv"
desc_df = pd.read_csv(descriptions_path)

# Lookup from product_id to brand; if a product_id repeats, the last row wins.
brand_map = {
    pid: brand_name
    for pid, brand_name in zip(desc_df["product_id"], desc_df["brand"])
}
desc_df

Unnamed: 0,product_id,brand,product_name,description
0,1,Head & Shoulders,Classic Clean Dandruff Shampoo Twin Pack,Confidence looks good on you with Head & Shoul...
1,2,Dove,Damage Therapy Shampoo Daily Moisture for Dry ...,Feel free and confident with beautifully hydra...
2,3,CeraVe,Gentle Hydrating Shampoo,Introducing the CeraVe Gentle Hydrating Shampo...
3,4,Pantene,Shampoo & Conditioner Set,LUXURY HAIR REPAIR WITHOUT THE LUXURY PRICE. P...


In [36]:
# Load the customer reviews table.
reviews_path = RAW / "reviews.csv"
reviews_df = pd.read_csv(reviews_path)
reviews_df

Unnamed: 0,product_id,rating,title,text,url
0,1,5,"Great Value, Trusted Quality – My Go-To for a ...",I've been using the Head & Shoulders Shampoo a...,https://www.amazon.com/gp/customer-reviews/R1N...
1,1,5,Great for my locs,"Great shampoo, no itch. Cooling and tingly sen...",https://www.amazon.com/gp/customer-reviews/R1U...
2,1,5,Shampoo- Great Deal!,"Works exactly as expected-reliable, clean scen...",https://www.amazon.com/gp/customer-reviews/R3K...
3,1,5,Best shampoo ever,About 2 years ago I started having an itchy sc...,https://www.amazon.com/gp/customer-reviews/RDG...
4,1,5,Excellent product.,No more itchy scalp.Not harsh.Nice price point...,https://www.amazon.com/gp/customer-reviews/RCP...
5,1,4,"Great for dry hair, but not for oily scalp",This shampoo works really well if you have dry...,https://www.amazon.com/gp/customer-reviews/R1P...
6,2,5,GREAT product!!,Only washed my hair with it once. But Holy Mol...,https://www.amazon.com/gp/customer-reviews/R60...
7,2,5,Pleasantly Surprised,I bought both the Dove 'daily moisture' shampo...,https://www.amazon.com/gp/customer-reviews/R2E...
8,2,5,Reasonable price,Love the shampoo & how soft my hair is after w...,https://www.amazon.com/gp/customer-reviews/R2W...
9,2,5,Effective and affordable,This is a great hair care line! Dove has formu...,https://www.amazon.com/gp/customer-reviews/R33...


In [37]:
records = []

# Product descriptions: one record per chunk of cleaned description text.
for _, desc_row in desc_df.iterrows():
    cleaned_desc = clean_text(desc_row["description"])
    records.extend(
        {
            "id": f"{desc_row['product_id']}_desc_{piece_no}",
            "product_id": desc_row["product_id"],
            "brand": desc_row["brand"],
            "source": "description",
            "content": piece,
        }
        for piece_no, piece in enumerate(chunk_text(cleaned_desc, max_chars=2000))
    )

# Reviews: one record per chunk of cleaned review text. The brand is looked up
# by product_id ("?" when the product is unknown), and the id embeds the
# review's DataFrame index so chunks of different reviews never collide.
for review_idx, review_row in reviews_df.iterrows():
    review_pieces = chunk_text(clean_text(review_row["text"]), max_chars=900)
    review_brand = brand_map.get(review_row["product_id"], "?")
    for piece_no, piece in enumerate(review_pieces):
        # A missing rating column or a null rating is stored as None.
        if "rating" in review_row and pd.notna(review_row["rating"]):
            review_rating = int(review_row["rating"])
        else:
            review_rating = None
        records.append({
            "id": f"{review_row['product_id']}_rev_{review_idx}_{piece_no}",
            "product_id": review_row["product_id"],
            "brand": review_brand,
            "source": "review",
            "rating": review_rating,
            "content": piece,
        })

len(records)

37

In [38]:
# Display the full list of record dicts built in the previous cell.
records

[{'id': '1_desc_0',
  'product_id': 1,
  'brand': 'Head & Shoulders',
  'source': 'description',
  'content': "Confidence looks good on you with Head & Shoulders Classic Clean Anti-Dandruff Shampoo. The rich-lathering formula delivers proven protection from flakes, itch, oil and dryness* and is brought to you by America’s #1 dandruff shampoo brand.† This formula is gentle enough on hair for daily use—even for color-treated hair—and is the perfect first step in your daily haircare routine, so you can start every day with your head held high. *flakes and itch associated with dandruff †based on volume sales Regular use of Head & Shoulders anti dandruff products nourish your scalp three surface layers deep to prevent dryness, flakes and itch, associated with dandruff. The Head & Shoulders collection includes clarifying shampoos, volumizing conditioners, and 2 in 1 combos, so you can introduce dandruff treatment to every step of your hair care routine. With a pH-balanced formula brought to 

In [39]:
# Write the corpus as JSON Lines: one JSON object per record, one per line.
out_file = OUT / "corpus.jsonl"

# ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
serialized = (json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)
with out_file.open("w", encoding="utf-8") as f:
    f.writelines(serialized)

print("Saved:", out_file, "with", len(records), "chunks")

Saved: data/processed/corpus.jsonl with 37 chunks


In [40]:
# Spot-check: print the id, source and first 200 characters of up to five
# randomly chosen records.
import random

sample_size = min(5, len(records))
for rec in random.sample(records, sample_size):
    print(rec["id"], rec["source"])
    print(rec["content"][:200])
    print("-" * 60)

3_rev_15_0 review
I might have loved this shampoo more a few years ago, when I rarely shampooed my hair, cause so many shampoos irritated my scalp. It is very gentle, and doesn't foam much (which indicates there aren't
------------------------------------------------------------
2_rev_6_0 review
Only washed my hair with it once. But Holy Moly!! What a great job it did with my hair. It was dry and brittle beforehand. One wash and it feels like it was freshly washed and cut. No more brittle end
------------------------------------------------------------
2_rev_7_0 review
I bought both the Dove 'daily moisture' shampoo and the Dove 'daily moisturizer' conditioner. I chose this specific formula because I have thick, wavy hair that gets quite dry - and this line (accordi
------------------------------------------------------------
2_rev_9_0 review
This is a great hair care line! Dove has formulas that clean and nourish hair at a drugstore price. To me, this performs better than the salon fo