In [12]:
import faiss

In [13]:
import json, numpy as np, os
from pathlib import Path
from dotenv import load_dotenv
from openai import OpenAI

# Pipeline configuration: the corpus to read, the directory that receives the index
# artefacts, and the embedding model requested from the API.
EMBED_MODEL = "text-embedding-3-small"
INP = Path("data/processed/corpus.jsonl")
IDX_DIR = Path("index")
IDX_DIR.mkdir(parents=True, exist_ok=True)  # no-op when the directory already exists

# Load variables from a local .env file into the environment before the key is read.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [14]:
# Parse the JSONL corpus: one JSON object per physical line.
# Iterating the file handle splits only on real newlines. str.splitlines() would also
# break on U+0085 / U+2028 / U+2029, which may appear unescaped inside JSON strings
# written with ensure_ascii=False and would cut a record in half.
# Blank lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
with INP.open(encoding="utf-8") as corpus_file:
    docs = [json.loads(line) for line in corpus_file if line.strip()]
len(docs), docs[0]

(37,
 {'id': '1_desc_0',
  'product_id': 1,
  'brand': 'Head & Shoulders',
  'source': 'description',
  'content': "Confidence looks good on you with Head & Shoulders Classic Clean Anti-Dandruff Shampoo. The rich-lathering formula delivers proven protection from flakes, itch, oil and dryness* and is brought to you by America’s #1 dandruff shampoo brand.† This formula is gentle enough on hair for daily use—even for color-treated hair—and is the perfect first step in your daily haircare routine, so you can start every day with your head held high. *flakes and itch associated with dandruff †based on volume sales Regular use of Head & Shoulders anti dandruff products nourish your scalp three surface layers deep to prevent dryness, flakes and itch, associated with dandruff. The Head & Shoulders collection includes clarifying shampoos, volumizing conditioners, and 2 in 1 combos, so you can introduce dandruff treatment to every step of your hair care routine. With a pH-balanced formula brough

In [15]:
def embed_texts(texts, model=EMBED_MODEL, batch=128):
    """Embed a collection of strings with the embeddings API, in batches.

    Args:
        texts: Iterable of strings to embed. It is materialised into a list, so
            generators are accepted as well as sequences.
        model: Name of the embedding model to request.
        batch: Maximum number of texts sent per API request; must be positive.

    Returns:
        A float32 array with one row per input text, in input order. An empty
        input yields an empty 2-D array of shape (0, 0) so that callers can
        still inspect ``.shape[1]`` without an IndexError.

    Raises:
        ValueError: If ``batch`` is not a positive integer.
    """
    if batch <= 0:
        # A negative step would make range() empty and silently return nothing.
        raise ValueError(f"batch must be a positive integer, got {batch!r}")

    texts = list(texts)
    if not texts:
        # No request is made, so the embedding width is unknown; keep the result 2-D.
        return np.empty((0, 0), dtype="float32")

    vectors = []
    for start in range(0, len(texts), batch):
        batch_texts = texts[start:start + batch]
        resp = client.embeddings.create(model=model, input=batch_texts)
        # Each returned item carries the position of its input; order by it so rows
        # stay aligned with `texts` regardless of the order of the response list.
        ordered = sorted(resp.data, key=lambda item: item.index)
        vectors.extend(item.embedding for item in ordered)
    return np.array(vectors, dtype="float32")

In [16]:
# Embed the "content" field of every chunk and show the resulting matrix shape.
texts = list(map(lambda doc: doc["content"], docs))
vecs = embed_texts(texts)
vecs.shape

(37, 1536)

In [17]:
# Normalise the rows of `vecs` in place: on unit-length vectors the inner product
# computed by IndexFlatIP equals cosine similarity.
faiss.normalize_L2(vecs)

dim = vecs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(vecs)

# Persist the index, plus the chunk metadata in the same order as the index rows.
faiss.write_index(index, str(IDX_DIR / "faiss.index"))

meta_file = IDX_DIR / "meta.jsonl"
meta_payload = "\n".join([json.dumps(d, ensure_ascii=False) for d in docs])
meta_file.write_text(meta_payload, encoding="utf-8")

print(f"Indexed {len(docs)} chunks. Index size:", index.ntotal)

Indexed 37 chunks. Index size: 37


In [18]:
def embed_query(q):
    """Embed a single query string and return it as a unit-length float32 vector.

    The query is embedded with the same model as the corpus (EMBED_MODEL) and is
    divided by its L2 norm.
    """
    response = client.embeddings.create(model=EMBED_MODEL, input=[q])
    vec = np.asarray(response.data[0].embedding, dtype="float32")
    norm = np.linalg.norm(vec)
    return vec / norm

def search(query, k=3):
    """Return up to ``k`` chunks most similar to ``query``.

    Args:
        query: Query text; it is embedded with ``embed_query``.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(doc, score)`` tuples in the order FAISS returns them, where
        ``doc`` is the entry of ``docs`` at the returned index position and
        ``score`` is the inner-product score as a Python float. The list is empty
        when ``k`` is not positive or the index holds no vectors, and shorter than
        ``k`` when the index holds fewer than ``k`` vectors.
    """
    if k <= 0 or index.ntotal == 0:
        # Nothing can be returned; skip the embedding request entirely.
        return []

    v = embed_query(query)
    # FAISS pads missing results with id -1 when asked for more neighbours than it
    # holds; docs[-1] would then silently return the last document. Clamp k and
    # drop any negative ids as a second line of defence.
    k = min(k, index.ntotal)
    D, I = index.search(np.array([v]), k)
    return [
        (docs[int(doc_pos)], float(score))
        for score, doc_pos in zip(D[0], I[0])
        if doc_pos >= 0
    ]

# Smoke-test retrieval with a sample question and print a short preview of each hit.
hits = search("Which shampoo is best for dandruff?", k=3)
for doc, score in hits:
    print("Score:", score, "| Source:", doc["source"], "| Brand:", doc.get("brand", "?"))
    print(doc["content"][:200] + "...")
    print("-" * 60)

Score: 0.6620045900344849 | Source: description | Brand: Head & Shoulders
Confidence looks good on you with Head & Shoulders Classic Clean Anti-Dandruff Shampoo. The rich-lathering formula delivers proven protection from flakes, itch, oil and dryness* and is brought to you ...
------------------------------------------------------------
Score: 0.5882282257080078 | Source: review | Brand: CeraVe
Wow! Saved my scalp and hair! I have 1/2a/2b hair—fine, medium density, low porosity, and protein sensitive, with a very sensitive scalp. I used to use Head & Shoulders (I tried everything they offer)...
------------------------------------------------------------
Score: 0.5673204660415649 | Source: review | Brand: CeraVe
I kept getting sebum plugs or something that caused sand-like particles to grow on my scalp and show up when you run your nails across the roots of your hair/scalp (ie- scratch your itchy scalp). I’ve...
------------------------------------------------------------
