In [None]:
# pip install chromadb sentence-transformers
import chromadb

In [None]:
import uuid

from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

In [None]:
# Initialize embedding model + DB
# Sentence-Transformers model used to embed every indexed text chunk.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# PersistentClient takes the on-disk storage directory via `path`. The legacy
# `Settings(chroma_db_impl="duckdb+parquet", persist_directory=...)` form is
# rejected by Chroma releases that ship PersistentClient, so it is not used.
chroma_client = chromadb.PersistentClient(path="chromadb_store")

# Reuse the collection when it already exists so re-running this cell does not
# fail or wipe previously indexed data.
collection = chroma_client.get_or_create_collection(
    name="multimodal_index",
    metadata={"hnsw:space": "cosine"},  # cosine similarity for retrieval
)

def normalize_and_index(ingested_audio_outputs: list[dict]):
    """
    Convert ingested ASR segments to a unified schema, embed them, and store
    them in the vector DB.

    Args:
        ingested_audio_outputs: Segments to index. Each segment is read via
            ``seg["text"]`` (the string that gets embedded and stored as the
            document) and ``seg["metadata"]`` (stored alongside it).

    Returns:
        The list of freshly generated UUID strings, one per segment, in input
        order. An empty input yields an empty list and nothing is written.

    Raises:
        KeyError: If a segment lacks the ``"text"`` or ``"metadata"`` key.
    """
    segments = list(ingested_audio_outputs)

    # Nothing to embed or store: skip the model and the DB entirely, since
    # Chroma refuses an add() call with empty id/embedding lists.
    if not segments:
        print("Indexed 0 audio chunks")
        return []

    documents = [seg["text"] for seg in segments]
    metadatas = [seg["metadata"] for seg in segments]
    ids = [str(uuid.uuid4()) for _ in segments]

    # Encode all texts in one batched call instead of one model call per
    # segment; the result holds one embedding row per input text.
    embeddings = embed_model.encode(documents).tolist()

    # Store in Chroma
    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=documents,
        metadatas=metadatas,
    )

    print(f"Indexed {len(ids)} audio chunks")
    return ids

if __name__ == "__main__":
    # Imported here so the ingestion script is only required when this file
    # is executed directly.
    from ingest_audio_script import ingest_audio

    # Ingest everything under "audio/" and index the resulting segments.
    normalize_and_index(ingest_audio("audio/"))
