In [None]:
import json
from pathlib import Path
from langchain_core.documents.base import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Rebuild the Chroma vector store at VECTOR_DIR from the JSONL corpus at CORPUS.
#
# Each non-blank corpus line must be a JSON object with a "text" field; the
# optional "doc_id", "url", "captured_at" and "source_type" fields are carried
# over as document metadata.
VECTOR_DIR = "data/vectorstore"
CORPUS = Path("data/corpus.jsonl")
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
METADATA_KEYS = ("doc_id", "url", "captured_at", "source_type")

# Parse the corpus BEFORE touching the store: a missing file or a malformed
# record then aborts the run while the previously persisted data is still intact.
docs = []
with CORPUS.open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        try:
            item = json.loads(line)
            text = item["text"]
        except (json.JSONDecodeError, KeyError, TypeError) as err:
            # Point at the offending line; JSONDecodeError is itself a ValueError.
            raise ValueError(
                f"{CORPUS}:{line_no}: invalid corpus record ({err!r})"
            ) from err
        # Omit fields the record does not provide instead of storing None.
        # NOTE(review): some chromadb releases reject None metadata values --
        # confirm against the pinned version.
        metadata = {
            key: item[key] for key in METADATA_KEYS if item.get(key) is not None
        }
        docs.append(Document(page_content=text, metadata=metadata))

# Load the embedding model once and share it between the clean-up handle and
# the store that receives the new documents.
emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Clear existing data in the store
if Path(VECTOR_DIR).exists():
    Chroma(persist_directory=VECTOR_DIR, embedding_function=emb).delete_collection()
    print(f"Cleared existing data from {VECTOR_DIR}")

# Re-open after the delete so `store` is bound to a fresh collection.
store = Chroma(persist_directory=VECTOR_DIR, embedding_function=emb)

if docs:
    store.add_documents(docs)
    print(f"Ingested {len(docs)} chunks into {VECTOR_DIR}")
else:
    print("No documents found to ingest.")
# Reads the count through the wrapper's private `_collection` attribute.
print(f"Total documents in store: {store._collection.count()}")