Task 2: Text Chunking, Embedding, and Vector Store Indexing

In [1]:
# Import necessary libraries
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss #pip install faiss-cpu
import numpy as np
import warnings


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cleaned dataset
df = pd.read_csv('../data/filtered_complaints.csv')

In [3]:
df

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,cleaned_narrative
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,Servicemember,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,a xxxx xxxx card was opened under my name by a...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,dear cfpb i have a secured credit card with ci...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,i have a citi rewards cards the credit balance...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,bi am writing to dispute the following charges...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,Older American,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,although the account had been deemed closed i ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82159,2017-02-01,Credit card,,APR or interest rate,,My husband and I attended a XXXX XXXX XXXX tim...,Company has responded to the consumer and the ...,BARCLAYS BANK DELAWARE,NJ,08610,,Consent provided,Web,2017-02-01,Closed with monetary relief,Yes,No,2323561,my husband and i attended a xxxx xxxx xxxx tim...
82160,2017-01-15,Credit card,,Unsolicited issuance of credit card,,I am an infrequent user of PayPal. I only use ...,,"Paypal Holdings, Inc",IL,60126,,Consent provided,Web,2017-01-18,Closed with explanation,Yes,No,2290909,i am an infrequent user of paypal i only use t...
82161,2017-03-22,Credit card,,Other,,I have been doing business with Barclay Bank f...,Company has responded to the consumer and the ...,BARCLAYS BANK DELAWARE,CT,06460,,Consent provided,Web,2017-03-22,Closed with monetary relief,Yes,No,2400431,i have been doing business with barclay bank f...
82162,2015-10-16,Credit card,,Credit determination,,"Defamation of Character, False Credit Reportin...",,JPMORGAN CHASE & CO.,MN,55987,,Consent provided,Web,2015-10-16,Closed with explanation,Yes,Yes,1610424,defamation of character false credit reporting...


In [4]:
# Define text chunking parameters
chunk_size = 1000  # Max tokens per chunk
chunk_overlap = 0  # Overlapping tokens between chunks

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [5]:
# Prepare to store chunks and metadata
chunks = []
metadata = []

# Chunk the narratives and store in a list
for index, row in df.iterrows():
    narrative_chunks = text_splitter.split_text(row['cleaned_narrative'])
    for chunk in narrative_chunks:
        chunks.append(chunk)
        metadata.append({
            'original_id': row['Complaint ID'],  # Assuming 'Complaint ID' is a column in your dataset
            'product': row['Product']
        })

In [6]:
# Load the embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Generate embeddings for each chunk
embeddings = model.encode(chunks, show_progress_bar=True)
# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance metric
index.add(np.array(embeddings, dtype=np.float32))
# Save the vector store and metadata
faiss.write_index(index, 'vector_store/faiss_index.index')

Batches: 100%|██████████| 4194/4194 [50:25<00:00,  1.39it/s]  


In [7]:

# Save metadata to a CSV for reference
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv('vector_store/metadata.csv', index=False)
# Generate embeddings for each chunk
embeddings = model.encode(chunks, show_progress_bar=True)
# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance metric
index.add(np.array(embeddings, dtype=np.float32))
# Save the vector store and metadata
faiss.write_index(index, 'vector_store/faiss_index.index')
# Save metadata to a CSV for reference
metadata_df = pd.DataFrame(metadata)
metadata_df.to_csv('vector_store/metadata.csv', index=False)
# Display the first few rows of the metadata DataFrame
metadata_df.head()

Batches: 100%|██████████| 4194/4194 [43:20<00:00,  1.61it/s]  


Unnamed: 0,original_id,product
0,14069121,Credit card
1,14047085,Credit card
2,14040217,Credit card
3,14040217,Credit card
4,13968411,Credit card
