# Text Chunking, Embedding, and Vector Store Indexing

## Import Libaries

In [11]:
import pandas as pd
import numpy as np
import faiss
import os
import pickle
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

## Load Filtered data

In [2]:
file_path = '../data/filtered_complaints.csv'
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...


## Text Chunking

In [4]:
df['cleaned_narrative_length'] = df['cleaned_narrative'].fillna('').apply(lambda x: len(str(x).split()))

In [5]:
print(df['cleaned_narrative_length'].describe())

min_narrative_length = df['cleaned_narrative_length'].min()
max_narrative_length = df['cleaned_narrative_length'].max()

print(f"Minimum narrative length: {min_narrative_length}")
print(f"Maximum narrative length: {max_narrative_length}")

count    272371.000000
mean        195.221356
std         214.504954
min           1.000000
25%          87.000000
50%         126.000000
75%         242.000000
max        6469.000000
Name: cleaned_narrative_length, dtype: float64
Minimum narrative length: 1
Maximum narrative length: 6469


In [6]:
# Example function to experiment with chunk_size and chunk_overlap
def experiment_text_splitter(text, chunk_sizes, chunk_overlaps):
    results = {}
    for size in chunk_sizes:
        for overlap in chunk_overlaps:
            splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=overlap)
            chunks = splitter.split_text(text)
            results[(size, overlap)] = len(chunks)
            print(f"chunk_size={size}, chunk_overlap={overlap} => {len(chunks)} chunks")
    return results

# Use a sample narrative for testing
sample_narrative = df['cleaned_narrative'].dropna().iloc[0]
chunk_sizes = [256, 512, 1024]
chunk_overlaps = [0, 50, 100]
experiment_text_splitter(sample_narrative, chunk_sizes, chunk_overlaps)

chunk_size=256, chunk_overlap=0 => 2 chunks
chunk_size=256, chunk_overlap=50 => 3 chunks
chunk_size=256, chunk_overlap=100 => 3 chunks
chunk_size=512, chunk_overlap=0 => 1 chunks
chunk_size=512, chunk_overlap=50 => 1 chunks
chunk_size=512, chunk_overlap=100 => 1 chunks
chunk_size=1024, chunk_overlap=0 => 1 chunks
chunk_size=1024, chunk_overlap=50 => 1 chunks
chunk_size=1024, chunk_overlap=100 => 1 chunks


{(256, 0): 2,
 (256, 50): 3,
 (256, 100): 3,
 (512, 0): 1,
 (512, 50): 1,
 (512, 100): 1,
 (1024, 0): 1,
 (1024, 50): 1,
 (1024, 100): 1}

- 75% are shorter than 256 → perfect match for chunk_size=256
- Only ~25% will be split into multiple chunks — where overlap helps
- Therefore the best balance is chunk_size=256, chunk_overlap=50 => 3 chunks

In [7]:
# Final splitter for production use
final_splitter = RecursiveCharacterTextSplitter(chunk_size=256, chunk_overlap=50)
df['narrative_chunks'] = df['cleaned_narrative'].fillna('').apply(lambda x: final_splitter.split_text(x))
df[['cleaned_narrative', 'narrative_chunks']].head()

Unnamed: 0,cleaned_narrative,narrative_chunks
0,a xxxx xxxx card was opened under my name by a...,[a xxxx xxxx card was opened under my name by ...
1,dear cfpb i have a secured credit card with ci...,[dear cfpb i have a secured credit card with c...
2,i have a citi rewards cards the credit balance...,[i have a citi rewards cards the credit balanc...
3,bi am writing to dispute the following charges...,[bi am writing to dispute the following charge...
4,although the account had been deemed closed i ...,[although the account had been deemed closed i...


## Embedding Model Choice

For this project, I used the `all-MiniLM-L6-v2` model from the SentenceTransformers library. This model is lightweight, fast, and provides high-quality sentence embeddings suitable for semantic search and retrieval tasks. It is widely used in industry and research for its balance of performance and efficiency.

In [10]:
# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Example: Embed the first chunk of each narrative
sample_chunks = df['narrative_chunks'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else "")
embeddings = embedding_model.encode(sample_chunks.tolist())

# Store embeddings as a numpy array (for demonstration)
embeddings = np.array(embeddings)
print('Embeddings shape:', embeddings.shape)
embeddings[:2]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embeddings shape: (272371, 384)


array([[-2.87894364e-02,  6.25683069e-02, -3.16462480e-02,
        -1.23118423e-02, -1.35409571e-02, -4.06970084e-02,
         1.37513146e-01, -4.91844676e-02,  5.14071509e-02,
        -8.16076621e-02,  3.44399922e-02, -2.37378106e-02,
         6.27596304e-02, -1.51942130e-02, -1.35831255e-02,
        -4.79593780e-03, -9.63343401e-03,  4.22335267e-02,
        -2.99106073e-02,  6.44735098e-02,  3.44999577e-03,
         1.50824655e-02, -1.02461442e-01, -1.24572718e-03,
        -2.40202919e-02,  3.24372202e-02, -3.56704257e-02,
         8.13993718e-03, -4.28259792e-03, -2.79151779e-02,
         8.60326514e-02,  4.76762764e-02,  4.65976819e-02,
        -9.57847945e-03,  6.24813549e-02, -9.82983187e-02,
        -4.33812290e-02, -3.82946548e-03, -2.38289349e-02,
        -5.64124100e-02,  9.84484181e-02, -8.80536959e-02,
         2.28847284e-02,  5.39500974e-02, -1.59075316e-02,
         9.25016180e-02,  1.66382138e-02,  4.98943105e-02,
        -2.31792536e-02,  5.19540943e-02, -2.07538810e-0

## Vector Store Indexing

In [13]:
# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [14]:
# Save index and metadata
os.makedirs('vector_store', exist_ok=True)
faiss.write_index(index, 'vector_store/complaint_chunks.index')
with open('vector_store/complaint_chunks_metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print(f"Stored {len(all_chunks)} chunk vectors and metadata in 'vector_store/' directory.")

Stored 1609126 chunk vectors and metadata in 'vector_store/' directory.


In [18]:
# 1. Define a query string
query_string = "Why are people unhappy with saving accounts?"

# 2. Generate an embedding for the query string
query_embedding = embedding_model.encode([query_string])[0] # Get the single embedding vector

# Ensure the query embedding is a 2D NumPy array
query_embedding = np.array([query_embedding])

# 3. Use the loaded FAISS index to perform a similarity search
k = 5  # Number of nearest neighbors to retrieve
distances, indices = index.search(query_embedding, k)

# 5. Map the retrieved indices back to the original data
similar_complaints = [sample_chunks.iloc[i] for i in indices[0]]

# 6. Print the query string and the retrieved similar items along with their corresponding distances
print(f"Query: '{query_string}'\n")
print("Most similar complaints:")
for i in range(k):
    print(f"  Complaint {i+1} (Distance: {distances[0][i]:.4f}):")
    print(f"  {similar_complaints[i]}\n")

Query: 'Why are people unhappy with saving accounts?'

Most similar complaints:
  Complaint 1 (Distance: 0.8811):
  was not advised of 360 saving account lack or information potential to increase saving

  Complaint 2 (Distance: 0.8894):
  saving account always going to xxxx  dollars

  Complaint 3 (Distance: 0.8951):
  i never agreed to open a savings account and magically i had one

  Complaint 4 (Distance: 0.9012):
  like many people i dont really use my savings account its a nice perk i guess a rainy day fund that allows me to mentally set some money aside ive had checking and savings accounts with bank of america for almost 15 years now for almost all of that time no

  Complaint 5 (Distance: 0.9196):
  capital one mislead its consumers about their 360 savings account kept them in the dark about savings account which offer greater benefits in return interest rates while keeping the 360 savings accounts interest returns deflated compared to the market

