In [7]:
from pathlib import Path
import sys
import importlib

# Ensure project root (parent of notebooks/) is on sys.path so `src` can be imported.
# Re-running this cell must not keep stacking duplicate entries onto sys.path, so any
# existing entry for the root is dropped before it is (re)inserted at the front.
_project_root = str(Path.cwd().parent.resolve())
if _project_root in sys.path:
    sys.path.remove(_project_root)
sys.path.insert(0, _project_root)

import pandas as pd
import numpy as np

# Reload config to pick up recent edits during the same kernel session
import src.config as _config_module
importlib.reload(_config_module)
from src.config import *

from src.sampling import stratified_sample
from src.chunking import chunk_text
from src.embeddings import load_embedding_model
from src.vector_store import create_faiss_index
from src.utils import validate_dataframe, stack_embeddings

# Input and output locations, relative to the notebook's working directory
cleaned_csv_path = "../data/processed/cleaned_complaints.csv"
sampled_csv_path = "../data/processed/sampled_complaints.csv"

# Read the processed complaints
df = pd.read_csv(cleaned_csv_path)

# When the processed CSV has no column named ID_COLUMN, use the row index as the ID
if ID_COLUMN not in df:
    df[ID_COLUMN] = df.index

print(df.columns)

# Columns the rest of this cell reads from the frame
required_columns = [TEXT_COLUMN, ID_COLUMN, STRATIFY_COLUMN]
validate_dataframe(df, required_columns)

# Draw the stratified sample and write it next to the cleaned data
sampled_df = stratified_sample(df, SAMPLE_SIZE, STRATIFY_COLUMN, RANDOM_SEED)
sampled_df.to_csv(sampled_csv_path, index=False)

# Load model
model = load_embedding_model(EMBEDDING_MODEL_NAME)

# Parallel lists: all_embeddings[i] is the vector for the chunk described by all_metadata[i]
all_embeddings = []
all_metadata = []

# Only three fields are needed per complaint, so iterate those columns directly
# instead of DataFrame.iterrows(), which constructs a full Series for every row.
complaint_fields = zip(
    sampled_df[TEXT_COLUMN],
    sampled_df[ID_COLUMN],
    sampled_df[STRATIFY_COLUMN],
)

for narrative, complaint_id, product in complaint_fields:
    # NOTE(review): chunks are encoded one call at a time. If `model.encode` accepts a
    # list of strings (presumably a SentenceTransformer — confirm), batching the chunks
    # would be considerably faster.
    for chunk in chunk_text(narrative, CHUNK_SIZE, CHUNK_OVERLAP):
        all_embeddings.append(model.encode(chunk))
        all_metadata.append({
            "complaint_id": complaint_id,
            "product": product,
            "text": chunk
        })

# Fail early with a clear message rather than handing an empty list to the
# stacking / index-building helpers.
if not all_embeddings:
    raise ValueError(
        "No text chunks were produced from the sampled complaints; nothing to index."
    )

embedding_matrix = stack_embeddings(all_embeddings)

# Create FAISS index (the helper is given VECTOR_DB_PATH; presumably it persists the
# index and metadata there — verify in src.vector_store)
index = create_faiss_index(
    embedding_matrix,
    all_metadata,
    VECTOR_DB_PATH
)

print("✨ Vector store created and saved successfully.")


Index(['Product', 'clean_narrative', 'word_count', 'complaint_id'], dtype='object')
✨ Vector store created and saved successfully.


In [None]:
# Show the column labels of `df`, the frame loaded from the cleaned complaints CSV in the cell above.
df.columns




Index(['Product', 'clean_narrative', 'word_count', 'complaint_id'], dtype='object')