# Embedding and Indexing

## 1. Imports

In [None]:
import os
import sys
from pathlib import Path
# Ensure the project root (the parent of the current working directory) is the
# first entry on sys.path so `src` imports work in the notebook.
# Any existing entry for the same path is dropped first, so re-running this
# cell does not accumulate duplicate entries.
_project_root = str(Path.cwd().parent.resolve())
sys.path[:] = [entry for entry in sys.path if entry != _project_root]
sys.path.insert(0, _project_root)


In [None]:
from src.vector_store import FaissVectorStore
from src.embedding import EmbeddingModel
from src.chunking import TextChunker
from src.sampling import stratified_sample
import pandas as pd

## 2. Load Cleaned Dataset

In [None]:
# Resolve the processed dataset path relative to the project root, taken here
# to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent.resolve()
CSV_PATH = PROJECT_ROOT / "data" / "processed" / "filtered_complaints.csv"
print('Reading dataset from:', CSV_PATH)
df = pd.read_csv(CSV_PATH)

# Normalise the column names used by the rest of the notebook. Columns that
# are absent from the CSV are ignored by `rename`.
df = df.rename(columns={
    "Consumer complaint narrative": "narrative",
    "Product": "product",
    "Complaint ID": "complaint_id"
})

# A notebook cell only renders its final expression, so the intermediate
# previews are printed explicitly instead of being silently discarded.
print(df.head())
print(df["product"].value_counts())
df["product"].unique()

## 3. Stratified Sampling (10–15k)

In [None]:
# Draw a 12,000-row sample from `df` using "product" as the label column.
sampled_df = stratified_sample(df=df, label_col="product", sample_size=12_000)

# Relative frequency of each product within the sample.
sampled_df["product"].value_counts(normalize=True)

## 4. Chunk Narratives

In [None]:
# NOTE(review): the units of chunk_size/overlap are defined by TextChunker,
# which is not visible here - confirm that overlap=1 is intended.
chunker = TextChunker(chunk_size=500, overlap=1)

# Chunk every sampled narrative and flatten the per-row results into one list.
# Each chunk is created with its complaint id and product as metadata.
all_chunks = [
    piece
    for _, record in sampled_df.iterrows()
    for piece in chunker.chunk(
        text=record["narrative"],
        metadata={
            "complaint_id": record["complaint_id"],
            "product": record["product"],
        },
    )
]

len(all_chunks)

## 5. Prepare Texts + Metadata

In [None]:
# Split the chunks into parallel text/metadata lists, skipping every chunk
# whose text is not a string or is blank once whitespace is stripped.
texts = []
metadatas = []
for chunk in all_chunks:
    chunk_text, chunk_meta = chunk['text'], chunk['metadata']
    if isinstance(chunk_text, str) and chunk_text.strip():
        texts.append(chunk_text)
        metadatas.append(chunk_meta)

# Report how many chunks were discarded, if any.
n_dropped = len(all_chunks) - len(texts)
if n_dropped > 0:
    print(f'Filtered out {n_dropped} empty/whitespace-only chunks before embedding')

## 6. Generate Embeddings

In [None]:
# Build the project's embedding wrapper with its default arguments.
embedder = EmbeddingModel()

# Embed all retained chunk texts in a single call.
# NOTE(review): the next cell reads shape[0] and shape[1], so the result is
# presumably a 2-D array of (number of texts, embedding dimension) - confirm.
embeddings = embedder.embed_texts(texts)

embeddings.shape

## 7. Build & Persist FAISS Vector Store

In [None]:
# Diagnostics: ensure embeddings and metadatas align before indexing
import numpy as _np
emb_shape = getattr(embeddings, 'shape', None)
n_meta = len(metadatas) if metadatas is not None else None
print('embeddings.shape =', emb_shape)
print('num metadatas =', n_meta)

# The index dimension is read from shape[1] below, so fail early with a clear
# message when the embeddings are not a 2-D array-like (e.g. empty or 1-D).
if emb_shape is None or len(emb_shape) != 2:
    raise ValueError(f'Expected 2-D embeddings (n_chunks, dim), got shape={emb_shape}')

# Every embedding row needs exactly one metadata entry.
n_emb = int(emb_shape[0])
if n_emb != n_meta:
    raise ValueError(f'Embeddings and metadata length mismatch: embeddings={n_emb}, metadatas={n_meta}')

# Proceed to build the vector store
vector_store = FaissVectorStore(embedding_dim=embeddings.shape[1])
vector_store.add(embeddings, metadatas)

# Make sure the target's parent directory exists, in case `save` does not
# create it itself.
# NOTE(review): this path is relative to the current working directory, while
# the dataset is read relative to PROJECT_ROOT - confirm that is intended.
Path("vector_store/faiss_index").parent.mkdir(parents=True, exist_ok=True)
vector_store.save("vector_store/faiss_index")
print("Vector store saved successfully.")