In [1]:
# ingest_products.py

import os
import json
from datasets import load_dataset
from items import Item
from sentence_transformers import SentenceTransformer
import chromadb

# 1) Load a small slice of the product dataset.
#    Adjust the dataset name / split as appropriate for your course materials.
#    NOTE(review): the output saved with this cell shows this call failing with
#    "RuntimeError: Dataset scripts are no longer supported" -- point it at a
#    dataset that ships plain data files before relying on the steps below.
ds = load_dataset("amazon_us_reviews", split="train[:500]")

# 2) Turn each raw record into an Item (filters to MIN/MAX tokens & builds
#    .prompt); only records the Item flags with `.include` are kept.
items = []
for rec in ds:
    # Adapt the field names if your dataset differs.
    data = {
        "title": rec.get("title", ""),
        # `or ""` also covers a field that is present but null, which the
        # `.get(key, default)` form would pass straight to .splitlines().
        "description": (rec.get("review_body") or "").splitlines(),
        "features": rec.get("feature", []),
        "details": rec.get("review_title", "")
    }
    # A missing or null price becomes 0.0 instead of raising in float().
    price = float(rec.get("price") or 0)  # or your field
    itm = Item(data, price)
    if itm.include:
        items.append(itm)

print(f"Will ingest {len(items)} items into vector store.")

# 3) Build the parallel lists Chroma expects: documents, metadata, and ids.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
docs = [itm.prompt for itm in items]
# NOTE(review): assumes every Item exposes `.category` -- confirm in items.py.
metadatas = [{"price": itm.price, "category": itm.category} for itm in items]
# Chroma requires one unique id per record. The "item-" prefix keeps these ids
# apart from the bare numeric ids ("0", "1", ...) that the memory.json
# ingestion cell writes into this same "products" collection.
ids = [f"item-{i}" for i in range(len(docs))]

# 4) Write to ChromaDB.
client = chromadb.PersistentClient(path="products_vectorstore")
collection = client.get_or_create_collection("products")
if docs:
    embeddings = model.encode(docs).tolist()
    # upsert (rather than add) so re-running the cell refreshes the stored
    # records instead of skipping or rejecting ids that already exist.
    collection.upsert(
        ids=ids,
        documents=docs,
        embeddings=embeddings,
        metadatas=metadatas,
    )
else:
    # Chroma rejects empty batches, so there is nothing to send.
    embeddings = []
    print("No items passed the Item filter; nothing was written.")

print("Ingestion complete. Current count:",
      collection.count())  # verify non-zero count





README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


amazon_us_reviews.py: 0.00B [00:00, ?B/s]

RuntimeError: Dataset scripts are no longer supported, but found amazon_us_reviews.py

In [7]:
import json
from sentence_transformers import SentenceTransformer
import chromadb

# 1) Load the deal records from memory.json and pull out the fields to store.
#    The encoding is explicit so the file is read the same way on every
#    platform instead of with the locale default.
with open("memory.json", encoding="utf-8") as f:
    opps = json.load(f)
docs = [opp["deal"]["product_description"] for opp in opps]
metadatas = [{"price": opp["deal"]["price"]} for opp in opps]
# Ids are the record's position in memory.json, as strings.
ids = [str(i) for i in range(len(docs))]

# 2) Embed the descriptions (skipped when there is nothing to embed).
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embs = model.encode(docs).tolist() if docs else []

# 3) Persist into ChromaDB.
client = chromadb.PersistentClient(path="products_vectorstore")
collection = client.get_or_create_collection("products")

if docs:
    # upsert (rather than add): the ids are positional, so re-running this
    # cell after memory.json changes must overwrite what is stored under the
    # same id instead of leaving the previous document in place.
    collection.upsert(
        ids=ids,
        documents=docs,
        embeddings=embs,
        metadatas=metadatas,
    )

print("Ingested", len(ids), "items into products_vectorstore.")


Ingested 2 items into products_vectorstore.


In [2]:
from chromadb import PersistentClient
# Sanity check: reopen the persisted store and look at a handful of the
# documents it holds.
client = PersistentClient(path="products_vectorstore")
col = client.get_or_create_collection("products")

# Request only the document text, capped at five records.
res = col.get(limit=5, include=["documents"])
print("Loaded docs:", res["documents"])
# Expected: the first few documents that were ingested into "products".
# NOTE(review): the saved output is `[]`; this cell ran as In [2], before the
# memory.json ingestion cell (In [7]) -- re-run it after ingesting.

Loaded docs: []
