# Text Chunking, Embedding, and Vector Store Indexing

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import sys
# Put the parent of the current working directory on sys.path (once) so the
# project-local `src` package imported below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.rag import (retrieve_similar_complaints, load_faiss_index, load_metadata, load_embedding_model, 
                    initialize_faiss_index, save_faiss_index, embed_chunks, prepare_chunks_and_metadata)
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


## Load Chunked Data

In [2]:
# Read the chunked complaints CSV into a DataFrame; the path is relative to
# the notebook's working directory.
file_path = '../data/chunked_complaints.csv'
df = pd.read_csv(file_path)

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,...,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative,cleaned_narrative_length,narrative_chunks
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,...,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...,91,['a xxxx xxxx card was opened under my name by...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,...,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...,156,['dear cfpb i have a secured credit card with ...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,...,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...,231,['i have a citi rewards cards the credit balan...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,...,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...,454,['bi am writing to dispute the following charg...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,...,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...,170,['although the account had been deemed closed ...


## Embedding Model Choice

For this project, I used the `all-MiniLM-L6-v2` model from the SentenceTransformers library. This model is lightweight, fast, and provides high-quality sentence embeddings suitable for semantic search and retrieval tasks. It is widely used in industry and research for its balance of performance and efficiency.

In [None]:
# Prepare chunks and metadata from the loaded DataFrame.
# NOTE(review): presumably `all_chunks` is a flat sequence of chunk texts and
# `metadata` holds one entry per chunk in the same order — confirm in src.rag.
all_chunks, metadata = prepare_chunks_and_metadata(df)

In [None]:
# Load the embedding model by name via the project helper
# (`all-MiniLM-L6-v2`, the SentenceTransformers model chosen above).
embedding_model = load_embedding_model('all-MiniLM-L6-v2')

In [None]:
# Embed every chunk and hold the result as a single NumPy array
embeddings = np.array(embed_chunks(all_chunks, embedding_model))

# Report the array's dimensions, then display the first two vectors as a sample
print('Embeddings shape:', embeddings.shape)
embeddings[:2]

NameError: name 'embed_chunks' is not defined

## Vector Store Indexing

In [None]:
# Create FAISS index from the `embeddings` array via the project helper
index = initialize_faiss_index(embeddings)

In [None]:
# Destination files for the FAISS index and the chunk metadata, relative to
# the notebook's working directory.
index_path = '../vector_store/complaint_chunks.index'
metadata_path = '../vector_store/complaint_chunks_metadata.pkl'

# Persist the index together with its metadata via the project helper.
save_faiss_index(index, metadata, index_path, metadata_path)

In [None]:
# Save index and metadata directly with faiss/pickle.
# Neither `faiss` nor `pickle` is imported in the notebook's import cell, so
# bring them into scope here; without this the cell stops with a NameError.
import pickle

import faiss

# Write to '../vector_store', the location the retrieval cell loads the index
# and metadata from, rather than a second 'vector_store' folder created
# relative to the notebook's own working directory.
store_dir = '../vector_store'
os.makedirs(store_dir, exist_ok=True)
faiss.write_index(index, f'{store_dir}/complaint_chunks.index')
with open(f'{store_dir}/complaint_chunks_metadata.pkl', 'wb') as f:
    # Metadata is pickled as-is; only load this file back from a trusted source.
    pickle.dump(metadata, f)

print(f"Stored {len(all_chunks)} chunk vectors and metadata in '{store_dir}/' directory.")

Stored 1609126 chunk vectors and metadata in 'vector_store/' directory.


In [None]:
# Ad-hoc sanity check: embed one question and look up its nearest chunks.

# 1. Define a query string
query_string = "Why are people unhappy with saving accounts?"

# 2. Generate an embedding for the query string.
#    The index is searched with a 2D (n_queries, dim) array; cast to float32,
#    the dtype FAISS operates on.
query_vector = embedding_model.encode([query_string])[0]
query_embedding = np.array([query_vector], dtype=np.float32)

# 3. Use the FAISS index to perform a similarity search
k = 5  # Number of nearest neighbors to retrieve
distances, indices = index.search(query_embedding, k)

# 4. Map the retrieved indices back to the chunk texts.
#    `all_chunks` is the sequence that was passed to `embed_chunks`, so index
#    positions line up with it. FAISS fills unused result slots with -1 when
#    fewer than k neighbours exist; drop those so they cannot silently resolve
#    to the last chunk through negative indexing.
hits = [
    (float(dist), int(pos))
    for dist, pos in zip(distances[0], indices[0])
    if pos != -1
]
similar_complaints = [all_chunks[pos] for _, pos in hits]

# 5. Print the query string and the retrieved items with their distances
print(f"Query: '{query_string}'\n")
print("Most similar complaints:")
for rank, ((dist, _), text) in enumerate(zip(hits, similar_complaints), start=1):
    print(f"  Complaint {rank} (Distance: {dist:.4f}):")
    print(f"  {text}\n")

Query: 'Why are people unhappy with saving accounts?'

Most similar complaints:
  Complaint 1 (Distance: 0.8811):
  was not advised of 360 saving account lack or information potential to increase saving

  Complaint 2 (Distance: 0.8894):
  saving account always going to xxxx  dollars

  Complaint 3 (Distance: 0.8951):
  i never agreed to open a savings account and magically i had one

  Complaint 4 (Distance: 0.9012):
  like many people i dont really use my savings account its a nice perk i guess a rainy day fund that allows me to mentally set some money aside ive had checking and savings accounts with bank of america for almost 15 years now for almost all of that time no

  Complaint 5 (Distance: 0.9196):
  capital one mislead its consumers about their 360 savings account kept them in the dark about savings account which offer greater benefits in return interest rates while keeping the 360 savings accounts interest returns deflated compared to the market



In [None]:
# Reload the persisted artefacts and run an end-to-end retrieval example.
import ast

# Load data and models
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
index = load_faiss_index('../vector_store/complaint_chunks.index')
metadata_list = load_metadata('../vector_store/complaint_chunks_metadata.pkl')

# Load all_chunks (flattened list of all text chunks, in the same order as
# embeddings/metadata). Read the same chunked CSV the index was built from:
# it is the file that carries the `narrative_chunks` column, and using it
# keeps list positions aligned with the vectors in the index.
df = pd.read_csv('../data/chunked_complaints.csv')
all_chunks = []
unparsed_rows = 0
if 'narrative_chunks' in df.columns:
    # Iterate the single column instead of df.iterrows(): same row order,
    # without building a Series for every row.
    for raw_chunks in df['narrative_chunks']:
        if isinstance(raw_chunks, str):
            # A CSV round-trip stores each list as its repr; parse it back.
            try:
                raw_chunks = ast.literal_eval(raw_chunks)
            except (ValueError, SyntaxError):
                unparsed_rows += 1
                continue
        # Anything that is not a list (e.g. a missing value) has no chunks.
        if isinstance(raw_chunks, list):
            all_chunks.extend(raw_chunks)
else:
    print("Warning: column 'narrative_chunks' not found; all_chunks is empty.")

if unparsed_rows:
    # A dropped row shifts every later chunk relative to the index, so make
    # the problem visible instead of ignoring it.
    print(f"Warning: skipped {unparsed_rows} rows whose 'narrative_chunks' could not be parsed; "
          "chunk positions may no longer match the index.")

# Example usage
question = "Why are people unhappy with saving accounts?"
results = retrieve_similar_complaints(question, embedding_model, index, metadata_list, all_chunks, k=5)
for i, (chunk, meta, dist) in enumerate(results, 1):
    print(f"Result {i} (Distance: {dist:.4f})")
    print(f"Complaint ID: {meta['complaint_id']}, Product: {meta['product']}")
    print(f"Text: {chunk}\n")