In [1]:
import pandas as pd
import os
import sys

In [2]:
# Resolve the repository root so that `src.*` imports and `data/` paths work.
# The notebook presumably starts one directory below the root, which is why
# the fallback is the parent directory. Because this cell also changes the
# working directory, blindly taking the parent would climb one more level
# every time the cell is re-run; so when the current directory already
# contains the `src` package, it is treated as the root and left alone.
_start_dir = os.getcwd()
if os.path.isdir(os.path.join(_start_dir, "src")):
    project_root = _start_dir
else:
    project_root = os.path.abspath(os.path.join(_start_dir, ".."))
os.chdir(project_root)

# Make the project's packages importable; the membership test keeps re-runs
# from stacking duplicate entries on sys.path.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [3]:
from src.chunking import chunk_texts
from src.embedder import load_embedding_model, get_embeddings
from src.vector_store import create_faiss_index, save_faiss_index, save_metadata

In [4]:
# Step 1: Load cleaned data
# Assemble the CSV location piece by piece under the project root, then read
# the whole file into `df`.
data_dir = os.path.join(project_root, "data")
filtered_path = os.path.join(data_dir, "filtered_complaints.csv")
df = pd.read_csv(filtered_path)

In [5]:
# Step 2: Create chunk-level documents + metadata
# `documents` and `metadata` are parallel lists: metadata[i] describes
# documents[i]. Both are reset here so re-running the cell does not append
# to a previous run's results.
documents = []
metadata = []

# Whether the ID column exists is a property of the frame, not of a row, so
# decide once instead of testing on every iteration.
if "Complaint ID" in df.columns:
    complaint_ids = df["Complaint ID"]
else:
    complaint_ids = [None] * len(df)

# Iterate the needed columns directly rather than via df.iterrows(), which
# builds a full Series for every row.
for complaint_id, product, narrative in zip(
    complaint_ids, df["Product"], df["cleaned_narrative"]
):
    # An empty narrative comes back from read_csv as NaN; there is no text to
    # chunk or embed, so skip it rather than hand a float to chunk_texts.
    if pd.isna(narrative):
        continue
    chunks = chunk_texts([narrative])
    for chunk in chunks:
        documents.append(chunk)
        metadata.append({
            "complaint_id": complaint_id,
            "product": product,
            "chunk": chunk
        })

In [7]:
# Step 3: Embed
# Load the embedding model through the project helper, then encode every
# chunk in `documents` (built in the previous step). The saved output of this
# cell shows a 799-step "Batches" progress bar and 25564 indexed chunks.
model = load_embedding_model()
embeddings = get_embeddings(model, documents)

# Step 4: Index & Save
# Build a FAISS index from the embeddings, then persist the index and the
# per-chunk metadata. Both save helpers are called without a path argument,
# so the output locations are whatever src.vector_store defaults to.
# NOTE(review): presumably metadata[i] must line up with vector i of the
# index at query time — confirm against the retrieval code.
index = create_faiss_index(embeddings)
save_faiss_index(index)
save_metadata(metadata)

print(f"✅ Indexed {len(documents)} chunks.")

Batches:   0%|          | 0/799 [00:00<?, ?it/s]

✅ Indexed 25564 chunks.
