Load the Cleaned Dataset

In [1]:
import pandas as pd

df = pd.read_csv("../data/processed/cleaned_complaints.csv")

df.head()


Unnamed: 0,Product,clean_narrative,word_count
0,Credit card,a xxxx xxxx card was opened under my name by a...,91
1,Credit card,dear cfpb i have a secured credit card with ci...,156
2,Credit card,i have a citi rewards cards the credit balance...,233
3,Credit card,b i am writing to dispute the following charge...,454
4,Credit card,although the account had been deemed closed i ...,170


Create a Stratified Sample (10,000–15,000)

In [2]:
SAMPLE_SIZE = 12000

# calculate proportion per product
product_counts = df["Product"].value_counts(normalize=True)

# determine sample size per product
sample_sizes = (product_counts * SAMPLE_SIZE).astype(int)

# stratified sampling
df_sampled = (
    df.groupby("Product", group_keys=False)
    .apply(lambda x: x.sample(n=sample_sizes[x.name], random_state=42))
)

df_sampled["Product"].value_counts(), df_sampled.shape


  .apply(lambda x: x.sample(n=sample_sizes[x.name], random_state=42))


(Product
 Credit card        11781
 Money transfers      218
 Name: count, dtype: int64,
 (11999, 3))

Text Chunking Strategy

In [3]:
%pip install -U langchain langchain-text-splitters


from langchain_text_splitters import RecursiveCharacterTextSplitter



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ", ""]
)


Apply Chunking

In [11]:
from tqdm.notebook import tqdm

chunks = []
metadatas = []

# Use itertuples for speed and show progress; handle missing/empty texts
for row in tqdm(df_sampled.itertuples(index=True), total=len(df_sampled)):
    text = getattr(row, "clean_narrative", "")
    if not text or not str(text).strip():
        continue

    split_texts = text_splitter.split_text(str(text))

    for i, chunk in enumerate(split_texts):
        chunk = chunk.strip()
        if not chunk:
            continue

        chunks.append(chunk)
        metadatas.append({
            "product": getattr(row, "Product"),
            "complaint_index": row.Index,
            "chunk_index": i,
            "total_chunks": len(split_texts)
        })

len(chunks)

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

Generate Embeddings

In [6]:
# Ensure required packages are available in the kernel
%pip install -U sentence-transformers

# (Optional) install a compatible torch build if needed
# %pip install -U torch

from sentence_transformers import SentenceTransformer

# Use the short model name (works with SentenceTransformer) and do a quick smoke test
try:
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    test_emb = embedding_model.encode(["test sentence"], show_progress_bar=False)
    print("Model loaded and encoding OK. Embedding dim:", len(test_emb[0]))
except Exception as e:
    print("Error loading/using SentenceTransformer:", e)
    raise

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Model loaded and encoding OK. Embedding dim: 384


In [7]:
embeddings = embedding_model.encode(
    chunks,
    show_progress_bar=True,
    batch_size=64
)


Batches: 100%|██████████| 527/527 [10:09<00:00,  1.16s/it]


Create the Vector Store (FAISS)

In [13]:
# Install FAISS for CPU (use conda if you prefer a supported build on Windows)
%pip install -q faiss-cpu

# Import FAISS and numpy
try:
    import faiss
    import numpy as np
    print("FAISS imported successfully (version:", faiss.__version__, ")")
except Exception as e:
    print("Failed to import faiss:", e)
    raise

Note: you may need to restart the kernel to use updated packages.
FAISS imported successfully (version: 1.13.2 )



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

index.add(np.array(embeddings))

index.ntotal


33692

Persist the Vector Store

In [18]:
import os

os.makedirs("vector_store", exist_ok=True)

faiss.write_index(index, "vector_store/complaints_faiss.index")

# save metadata separately
metadata_df = pd.DataFrame(metadatas)
metadata_df.to_pickle("vector_store/complaints_metadata.pkl")
