# Text Chunking, Embedding, and Vector Store Indexing

In [None]:
import os
import sys

# Resolve the parent directory to an absolute path and put it on sys.path (once),
# so packages that live there can be imported from this notebook
# (presumably the `src` package imported in the next cell — confirm the repo layout).
system_path = os.path.abspath('../')
if system_path not in sys.path:
    sys.path.append(system_path)
# IPython autoreload, mode 2: re-import edited modules before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import src.chunk_and_embed as chunk 

In [3]:
# Load the filtered complaints.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; columns such as 'ZIP code' mix digits and masked
# values ('XXXXX'), which chunked inference can turn into a mixed-type column.
# This matches how the same file is read later in this notebook.
data = pd.read_csv('../data/filtered_complaints.csv', low_memory=False)
data

Unnamed: 0.1,Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,Consumer complaint narrative clean
0,12237,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,a xxxx xxxx card was opened under my name by a...
1,12532,2025-06-13,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I made the mistake of using my wellsfargo debi...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,ID,83815,,Consent provided,Web,2025-06-13,Closed with explanation,Yes,,14061897,i made the mistake of using my wellsfargo debi...
2,13280,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,"dear cfpb, i have a secured credit card with c..."
3,13506,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,i have a citi rewards cards. the credit balanc...
4,13622,2025-06-11,Vehicle loan or lease,Loan,Repossession,Deficiency balance after repossession,Was never notified of repossession. Once repos...,Company has responded to the consumer and the ...,CREDIT ACCEPTANCE CORPORATION,TX,75070,,Consent provided,Web,2025-06-11,Closed with explanation,Yes,,14019199,was never notified of repossession. once repos...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558460,9609687,2022-08-19,"Payday loan, title loan, or personal loan",Installment loan,Problem when making payments,,Omni financial loan. Paid for by allotment,Company believes the complaint provided an opp...,Siggi LLC,VA,20136,Servicemember,Consent provided,Web,2022-08-19,Closed with explanation,Yes,,5896172,omni financial loan. paid for by allotment
558461,9609704,2022-08-19,"Money transfer, virtual currency, or money ser...",International money transfer,Other transaction problem,,"Hello, I'm an XXXX of XXXX XXXX company that ...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",,XXXXX,,Consent provided,Web,2022-08-19,Closed with explanation,Yes,,5896060,i'm an xxxx of xxxx xxxx company that xxxx xxx...
558462,9609735,2022-11-23,Checking or savings account,Savings account,Managing an account,Deposits and withdrawals,"I already have a case XXXX, but it should not ...",Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,OK,73160,,Consent provided,Web,2022-11-23,Closed with explanation,Yes,,6238646,"i already have a case xxxx, but it should not ..."
558463,9609743,2022-11-23,"Payday loan, title loan, or personal loan",Title loan,Vehicle was repossessed or sold the vehicle,,On XX/XX/XXXX my final payment was supposed to...,Company has responded to the consumer and the ...,"Westlake Services, LLC",CA,90059,Older American,Consent provided,Web,2022-11-23,Closed with explanation,Yes,,6238123,on xx xx xxxx my final payment was supposed to...


In [None]:
from src.chunk_and_embed import langchain_chunk_narratives

# Apply LangChain chunking to the DataFrame
# NOTE(review): langchain_chunk_narratives is imported from src.chunk_and_embed and
# is not visible here; the preview below relies on its result having a 'chunks'
# column next to 'Consumer complaint narrative clean'.
data_with_chunks = langchain_chunk_narratives(data)

# Show a sample of the chunked data
data_with_chunks[['Consumer complaint narrative clean', 'chunks']].head()



Unnamed: 0,Consumer complaint narrative clean,chunks
0,a xxxx xxxx card was opened under my name by a...,[a xxxx xxxx card was opened under my name by ...
1,i made the mistake of using my wellsfargo debi...,[i made the mistake of using my wellsfargo deb...
2,"dear cfpb, i have a secured credit card with c...","[dear cfpb, i have a secured credit card with ..."
3,i have a citi rewards cards. the credit balanc...,[i have a citi rewards cards. the credit balan...
4,was never notified of repossession. once repos...,[was never notified of repossession. once repo...


In [10]:
# Persist only the columns needed downstream.
# index=False keeps the positional index out of the file; writing it is what
# produces stray 'Unnamed: 0' columns when the CSV is read back.
# NOTE(review): 'chunks' appears to hold Python lists; CSV stores them as their
# string repr, so a reader must parse them back — confirm this is intended.
data_with_chunks[['Complaint ID','Product', 'Consumer complaint narrative clean','chunks']].to_csv('../data/chunk_data.csv', index=False)

In [5]:
# Reload the filtered complaints into `df` for the missing-value check below.
# low_memory=False: infer each column's dtype from the whole file (avoids
# mixed-type columns from chunked parsing), consistent with the later read of
# this same file in this notebook.
df = pd.read_csv('../data/filtered_complaints.csv', low_memory=False)

In [6]:
# Importing sidetable registers the `.stb` accessor on DataFrames used below.
import sidetable
# Per-column missing-value summary: missing count, total rows, percent missing.
df.stb.missing()

Unnamed: 0,missing,total,percent
Consumer disputed?,515780,558465,92.356728
Tags,462490,558465,82.8145
Company public response,345031,558465,61.782028
Sub-issue,154872,558465,27.731729
Sub-product,20619,558465,3.692085
State,5650,558465,1.011702
Consumer complaint narrative clean,42,558465,0.007521
Unnamed: 0,0,558465,0.0
Product,0,558465,0.0
Date received,0,558465,0.0


In [8]:
# Keep only the complaints that actually have a cleaned narrative to chunk.
has_clean_narrative = df['Consumer complaint narrative clean'].notna()
df = df[has_clean_narrative]


In [9]:
# Re-run the missing-value summary to confirm that rows without a cleaned
# narrative are gone after the drop above.
df.stb.missing()

Unnamed: 0,missing,total,percent
Consumer disputed?,515738,558423,92.356153
Tags,462449,558423,82.813387
Company public response,345010,558423,61.782914
Sub-issue,154864,558423,27.732382
Sub-product,20619,558423,3.692362
State,5648,558423,1.01142
Unnamed: 0,0,558423,0.0
Date received,0,558423,0.0
Product,0,558423,0.0
Consumer complaint narrative,0,558423,0.0


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import pandas as pd

# Input: `df`, the filtered complaints loaded in an earlier cell with rows lacking
# a cleaned narrative already dropped (it is not reloaded here).

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    length_function=len
)

# Split documents
# Iterate over just the three columns that are needed instead of df.iterrows():
# iterrows materialises a full Series for every row, which is needlessly slow
# for a frame with several hundred thousand rows.
docs = []
row_fields = zip(
    df['Consumer complaint narrative clean'],
    df['Product'],
    df['Complaint ID'],
)
for narrative, product, complaint_id in row_fields:
    # Each chunk of a narrative carries the same product / complaint_id metadata.
    chunks = text_splitter.create_documents(
        [narrative],
        metadatas=[{
            'product': product,
            'complaint_id': complaint_id
        }]
    )
    docs.extend(chunks)

# Initialize embeddings
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Create and save vector store
# from_documents embeds every chunk in `docs` with `embedding_model` and builds the index.
vector_store = FAISS.from_documents(docs, embedding_model)
vector_store.save_local("../vector_store/creditrust_faiss_index")

  embedding_model = HuggingFaceEmbeddings(


In [2]:

import pandas as pd
import sidetable

In [3]:
# low_memory=False: parse the file in one pass so each column gets a single
# inferred dtype rather than chunk-by-chunk (possibly mixed) inference.
datas = pd.read_csv('../data/filtered_complaints.csv', low_memory=False)

In [7]:
# Per-column missing-value summary via the sidetable `.stb` accessor.
datas.stb.missing()

Unnamed: 0,missing,total,percent
Consumer disputed?,515738,558423,92.356153
Tags,462449,558423,82.813387
Company public response,345010,558423,61.782914
Sub-issue,154864,558423,27.732382
Sub-product,20619,558423,3.692362
State,5648,558423,1.01142
Unnamed: 0,0,558423,0.0
Date received,0,558423,0.0
Product,0,558423,0.0
Consumer complaint narrative,0,558423,0.0


In [6]:
# Discard complaints whose cleaned narrative is missing.
narrative_present = datas['Consumer complaint narrative clean'].notna()
datas = datas[narrative_present]

In [8]:
# Persist the cleaned frame as Parquet, without the DataFrame index; this file
# is read back in the "Step 1" cell below.
datas.to_parquet('../data/filtered_data.parquet', index=False)

In [1]:
import faiss
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import pickle
import os


In [3]:
print("========== Step 1: Load and preprocess ==========")
# Read the Parquet file written by the earlier to_parquet cell; this rebinds `df`.
df = pd.read_parquet('../data/filtered_data.parquet')



In [4]:

print("========== Step 2: Chunk text ==========")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=350, chunk_overlap=40)
documents = []

# Walk only the three columns that are needed rather than df.iterrows():
# iterrows builds a full Series per row, which adds avoidable overhead on a
# frame with several hundred thousand rows. total= is passed because zip() has
# no length for tqdm to read.
row_fields = zip(
    df['Consumer complaint narrative clean'],
    df['Product'],
    df['Complaint ID'],
)
for narrative, product, complaint_id in tqdm(row_fields, total=len(df)):
    # Each chunk of a narrative carries the same product / complaint_id metadata.
    chunks = text_splitter.create_documents(
        [narrative],
        metadatas=[{
            'product': product,
            'complaint_id': complaint_id
        }]
    )
    documents.extend(chunks)

# Parallel lists: texts[i] and metadatas[i] both describe documents[i].
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]




100%|██████████| 558423/558423 [17:37<00:00, 527.92it/s] 


In [5]:
# Preview a few chunks only; displaying the whole list would render every
# Document produced from the full dataset into the notebook output.
documents[:3]

[Document(metadata={'product': 'Credit card', 'complaint_id': 14069121}, page_content='a xxxx xxxx card was opened under my name by a fraudster. i received a notice from xxxx that an account was just opened under my name. i reached out to xxxx xxxx to state that this activity was unauthorized and not me. xxxx xxxx confirmed this was fraudulent and immediately closed the card. however, they have failed to remove this from the three'),
 Document(metadata={'product': 'Credit card', 'complaint_id': 14069121}, page_content='failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by xxxx xxxx that was done by a fraudster.'),
 Document(metadata={'product': 'Checking or savings account', 'complaint_id': 14061897}, page_content="i made the mistake of using my wellsfargo debit card to depsit funds into xxxxxxxx atm machine outside their branch. i went into the branch and was told they couldn't help and had to phone the

In [6]:
# Preview a few chunk strings only; the full list is far too large to display.
texts[:3]

['a xxxx xxxx card was opened under my name by a fraudster. i received a notice from xxxx that an account was just opened under my name. i reached out to xxxx xxxx to state that this activity was unauthorized and not me. xxxx xxxx confirmed this was fraudulent and immediately closed the card. however, they have failed to remove this from the three',
 'failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by xxxx xxxx that was done by a fraudster.',
 "i made the mistake of using my wellsfargo debit card to depsit funds into xxxxxxxx atm machine outside their branch. i went into the branch and was told they couldn't help and had to phone the customer service for help. i did this and was told i was helped gave all the info for the time terminal id aact s, xxxx was able to find the transaction and",
 'was able to find the transaction and give me this info, he said the dispute would take a few days. i waited a f

In [11]:
# Work on a small prefix of the chunks while prototyping the embedding/index steps.
# A single constant keeps the three parallel slices the same length.
SAMPLE_SIZE = 200
sample_documents = documents[:SAMPLE_SIZE]
sample_texts = texts[:SAMPLE_SIZE]
sample_metadatas = metadatas[:SAMPLE_SIZE]
len(sample_documents)

200

In [21]:
print("========== Step 3: Embed in batch ==========")
# Sentence-transformer model used to turn each chunk into a vector.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode only the sampled chunks, 64 at a time, and ask for a NumPy array back.
embeddings = model.encode(
    sample_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=64,
)



Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
# Sanity check on the embedding matrix: (number of encoded chunks, vector size).
print("Shape of embeddings:", embeddings.shape)

Shape of embeddings: (200, 384)


In [22]:
print("========== Step 4: Build native FAISS index ==========")
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)  # L2 = Euclidean distance, good default

# Optional: Use GPU if available
# res = faiss.StandardGpuResources()
# index = faiss.index_cpu_to_gpu(res, 0, index)

# Vectors are added as float32; row i of the index corresponds to sample_texts[i].
index.add(embeddings.astype("float32"))
print(f"FAISS index contains {index.ntotal} vectors.")

print("========== Step 5: Save FAISS index and metadata ==========")
output_dir = "../vector_store/creditrust_faiss_native"
os.makedirs(output_dir, exist_ok=True)

print("Save index")
faiss.write_index(index, os.path.join(output_dir, "index.faiss"))

print("Save metadata (for lookup after search)")
# Persist exactly the entries that were embedded. The index was built from
# `sample_texts`, so pickling the full `texts` / `metadatas` lists would store
# far more rows than the index has vectors and leave the two artefacts
# inconsistent in size.
assert index.ntotal == len(sample_texts) == len(sample_metadatas), (
    "index and lookup metadata must have the same number of entries"
)
with open(os.path.join(output_dir, "metadata.pkl"), "wb") as f:
    pickle.dump({'texts': sample_texts, 'metadatas': sample_metadatas}, f)

print(" Complete Saved FAISS index and metadata.")

FAISS index contains 200 vectors.
Save index
Save metadata (for lookup after search)
 Complete Saved FAISS index and metadata.
