In [5]:
# import necessary libraries
import pandas as pd
import numpy as np
import os

from sentence_transformers import SentenceTransformer
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Load the dataset
df = pd.read_csv('../data/filtered_complaints.csv')

In [10]:
df.head()

Unnamed: 0,clean_narrative,Complaint ID,Product,Issue,Date received
0,a card was opened under my name by a fraudster...,14069121,credit card,Getting a credit card,2025-06-13
1,i made the mistake of using my wellsfargo debi...,14061897,checking or savings account,Managing an account,2025-06-13
2,dear cfpb i have a secured credit card with ci...,14047085,credit card,"Other features, terms, or problems",2025-06-12
3,i have a citi rewards cards the credit balance...,14040217,credit card,Incorrect information on your report,2025-06-12
4,i am writing to dispute the following charges ...,13968411,credit card,Problem with a purchase shown on your statement,2025-06-09


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 454472 entries, 0 to 454471
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   clean_narrative  454466 non-null  object
 1   Complaint ID     454472 non-null  int64 
 2   Product          454472 non-null  object
 3   Issue            454472 non-null  object
 4   Date received    454472 non-null  object
dtypes: int64(1), object(4)
memory usage: 17.3+ MB


We have 6 null values from clean_narrative column

In [None]:
df = df[df['clean_narrative'].notnull()]
df.head()

Unnamed: 0,clean_narrative,Complaint ID,Product,Issue,Date received
0,a card was opened under my name by a fraudster...,14069121,credit card,Getting a credit card,2025-06-13
1,i made the mistake of using my wellsfargo debi...,14061897,checking or savings account,Managing an account,2025-06-13
2,dear cfpb i have a secured credit card with ci...,14047085,credit card,"Other features, terms, or problems",2025-06-12
3,i have a citi rewards cards the credit balance...,14040217,credit card,Incorrect information on your report,2025-06-12
4,i am writing to dispute the following charges ...,13968411,credit card,Problem with a purchase shown on your statement,2025-06-09


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 454466 entries, 0 to 454471
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   clean_narrative  454466 non-null  object
 1   Complaint ID     454466 non-null  int64 
 2   Product          454466 non-null  object
 3   Issue            454466 non-null  object
 4   Date received    454466 non-null  object
dtypes: int64(1), object(4)
memory usage: 20.8+ MB


In [12]:
# Character count
df['char_count'] = df['clean_narrative'].apply(len)

# Word count
df['word_count'] = df['clean_narrative'].apply(lambda x: len(x.split()))

TypeError: object of type 'float' has no len()

In [None]:
df.head()

Unnamed: 0,clean_narrative,Complaint ID,Product,Issue,Date received,char_count,word_count
0,a card was opened under my name by a fraudster...,14069121,credit card,Getting a credit card,2025-06-13,436,82
1,i made the mistake of using my wellsfargo debi...,14061897,checking or savings account,Managing an account,2025-06-13,522,105
2,dear cfpb i have a secured credit card with ci...,14047085,credit card,"Other features, terms, or problems",2025-06-12,780,155
3,i have a citi rewards cards the credit balance...,14040217,credit card,Incorrect information on your report,2025-06-12,1109,221
4,i am writing to dispute the following charges ...,13968411,credit card,Problem with a purchase shown on your statement,2025-06-09,2727,451


In [None]:
print("Character Length Summary:")
print(df['char_count'].describe())

print("\nWord Count Summary:")
print(df['word_count'].describe())

Character Length Summary:
count    454466.000000
mean       1047.658034
std        1153.491054
min           4.000000
25%         430.000000
50%         714.000000
75%        1277.000000
max       30992.000000
Name: char_count, dtype: float64

Word Count Summary:
count    454466.000000
mean        191.783777
std         210.696631
min           1.000000
25%          77.000000
50%         128.000000
75%         238.000000
max        6236.000000
Name: word_count, dtype: float64


In [None]:
# Filter out rows with very short narratives
df = df[df['word_count'] >= 30]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 425365 entries, 0 to 454471
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   clean_narrative  425365 non-null  object
 1   Complaint ID     425365 non-null  int64 
 2   Product          425365 non-null  object
 3   Issue            425365 non-null  object
 4   Date received    425365 non-null  object
 5   char_count       425365 non-null  int64 
 6   word_count       425365 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 26.0+ MB


In [None]:
#  Text Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,         # max characters per chunk
    chunk_overlap=50        # overlap between chunks for context continuity
)

In [None]:
df['clean_narrative'].iloc[0]

'a card was opened under my name by a fraudster i received a notice from that an account was just opened under my name i reached out to to state that this activity was unauthorized and not me confirmed this was fraudulent and immediately closed the card however they have failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by that was done by a fraudster'

In [None]:
# Example: chunk a single narrative
chunks = text_splitter.split_text('a card was opened under my name by a fraudster i received a notice from that an account was just opened under my name i reached out to to state that this activity was unauthorized and not me confirmed this was fraudulent and immediately closed the card however they have failed to remove this from the three credit agencies and this fraud is now impacting my credit score based on a hard credit pull done by that was done by a fraudster')
print(f"Chunks count: {len(chunks)}")

Chunks count: 1


In [None]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS index (384 dim for all-MiniLM-L6-v2)
embedding_dim = 384
index = faiss.IndexFlatL2(embedding_dim)

# Metadata store (list of dicts)
metadata = []

# Folder to save vector store
vector_store_dir = "vector_store"
os.makedirs(vector_store_dir, exist_ok=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Example: process entire DataFrame
for _, row in df.iterrows():
    narrative = row['clean_narrative']
    complaint_id = row['Complaint ID']
    product = row['Product']

    # Chunk text
    chunks = text_splitter.split_text(narrative)

    # Embed each chunk
    embeddings = model.encode(chunks)

    # Add vectors to FAISS index
    index.add(np.array(embeddings).astype('float32'))

    # Save metadata per chunk
    for chunk_text in chunks:
        metadata.append({
            "complaint_id": complaint_id,
            "product": product,
            "chunk_text": chunk_text
        })

In [None]:
import time

start = time.time()
# Example: process entire DataFrame
for i, row in df.head(10000).iterrows():
    narrative = row['clean_narrative']
    complaint_id = row['Complaint ID']
    product = row['Product']

    # Chunk text
    chunks = text_splitter.split_text(narrative)

    # Embed each chunk
    embeddings = model.encode(chunks)

    # Add vectors to FAISS index
    index.add(np.array(embeddings).astype('float32'))

    # Save metadata per chunk
    for chunk_text in chunks:
        metadata.append({
            "complaint_id": complaint_id,
            "product": product,
            "chunk_text": chunk_text
        })
    if i % 5000 == 0:
        print(f"Processed {i} rows in {time.time() - start:.2f} seconds")

Processed 0 rows in 0.02 seconds
Processed 5000 rows in 45.13 seconds
Processed 10000 rows in 91.69 seconds


In [None]:
all_chunks = []
chunk_metadata = []

for _, row in df.iterrows():
    narrative = row['clean_narrative']
    complaint_id = row['Complaint ID']
    product = row['Product']

    chunks = text_splitter.split_text(narrative)

    all_chunks.extend(chunks)
    chunk_metadata.extend([{
        "complaint_id": complaint_id,
        "product": product,
        "chunk_text": chunk
    } for chunk in chunks])

# Embed all at once (or in batches)
batch_embeddings = model.encode(all_chunks, batch_size=32, show_progress_bar=True)

# Add all to FAISS
index.add(np.array(batch_embeddings).astype('float32'))
metadata = chunk_metadata

In [None]:
# Save FAISS index
faiss.write_index(index, os.path.join(vector_store_dir, "faiss_index.bin"))

# Save metadata (e.g. as JSON or pickle)
import pickle
with open(os.path.join(vector_store_dir, "metadata.pkl"), "wb") as f:
    pickle.dump(metadata, f)

In [None]:
!zip -r vector_store.zip vector_store

  adding: vector_store/ (stored 0%)
  adding: vector_store/faiss_index.bin (deflated 18%)
  adding: vector_store/metadata.pkl (deflated 76%)


In [None]:
from google.colab import files
files.download("vector_store.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load FAISS index
index = faiss.read_index(os.path.join(vector_store_dir, "faiss_index.bin"))

# Load metadata
with open(os.path.join(vector_store_dir, "metadata.pkl"), "rb") as f:
    metadata = pickle.load(f)

# Example query embedding
query = "Why are customers unhappy with BNPL?"
query_emb = model.encode([query]).astype('float32')

# Search top 5 nearest neighbors
D, I = index.search(query_emb, 5)

for i in I[0]:
    print(metadata[i]["complaint_id"], metadata[i]["product"])
    print(metadata[i]["chunk_text"])
    print("---")


11801814 checking or savings account
complain about it they have no explanation its obvious they dont care about their customers
---
11994279 money transfer, virtual currency, or money service
they do not show any concern for customer satisfaction and no drive or urgency to resolve issues for their customers it comes across as quite arrogant
---
11994279 money transfer, virtual currency, or money service
they do not show any concern for customer satisfaction and no drive or urgency to resolve issues for their customers it comes across as quite arrogant
---
6477501 money transfer, virtual currency, or money service
of their customers to potentially big financial losses
---
11421332 payday loan, title loan, personal loan, or advance loan
this company out of business they are bad for the consumer
---
