# import

In [3]:
import sys
sys.path.insert(0, '..')
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import hashlib

In [4]:
# Paths
FILTERED_DATA_PATH = Path('../data/filtered_complaints.csv')
VECTOR_STORE_PATH = Path('../vector_store')
FAISS_INDEX_PATH = VECTOR_STORE_PATH / 'faiss_index.bin'
METADATA_PATH = VECTOR_STORE_PATH / 'metadata.pkl'

# Chunking parameters
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

# Embedding model
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-MiniLM-L3-v2'
EMBEDDING_DIM = 384

# For demo, limit rows (set to None for full dataset)
DEMO_LIMIT = 1000

In [16]:
df = pd.read_csv(r"C:\Users\user\Desktop\Project\complaint-chatbot\data\processed\complaints.csv")
print(f"Total complaints: {len(df):,}")

if DEMO_LIMIT:
    df = df.head(DEMO_LIMIT)
    print(f"Using demo subset: {len(df):,} complaints")

df.head()

  df = pd.read_csv(r"C:\Users\user\Desktop\Project\complaint-chatbot\data\processed\complaints.csv")


Total complaints: 9,609,797
Using demo subset: 1,000 complaints


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,Experian Information Solutions Inc.,FL,32092,,,Web,2025-06-20,In progress,Yes,,14195687
1,2025-06-20,Debt collection,Telecommunications debt,Attempts to collect debt not owed,Debt is not yours,,Company can't verify or dispute the facts in t...,"Eastern Account Systems of Connecticut, Inc.",FL,342XX,,,Web,2025-06-20,Closed with explanation,Yes,,14195688
2,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",AZ,85225,,,Web,2025-06-20,In progress,Yes,,14195689
3,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,,,Experian Information Solutions Inc.,AZ,85225,,,Web,2025-06-20,In progress,Yes,,14195690
4,2025-06-20,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Account status incorrect,,,Experian Information Solutions Inc.,IL,60628,,,Web,2025-06-20,In progress,Yes,,14195692


In [20]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Example: chunk a single narrative
sample_narrative = df['Product'].iloc[0]
sample_chunks = splitter.split_text(sample_narrative)

print(f"Original length: {len(sample_narrative)} chars")
print(f"Number of chunks: {len(sample_chunks)}")
print(f"\nFirst chunk ({len(sample_chunks[0])} chars):")
print(sample_chunks[0][:300] + "...")

Original length: 51 chars
Number of chunks: 1

First chunk (51 chars):
Credit reporting or other personal consumer reports...


In [40]:
# Chunk all complaints
chunks = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Chunking"):
    narrative = row['Product']
    if pd.isna(narrative) or not narrative.strip():
        continue
    
    text_chunks = splitter.split_text(narrative)
    
    for i, chunk_text in enumerate(text_chunks):
        chunk_id = hashlib.md5(f"{row['Complaint ID']}_{i}".encode()).hexdigest()
        
        chunks.append({
            'id': chunk_id,
            'text': chunk_text,
            'metadata': {
                'Complaint ID': str(row['Complaint ID']),
                'product': row['Product'],
                'Sub-product': row['Sub-product'] if pd.notna(row['Sub-product']) else '',
                'Issue': row['Issue'] if pd.notna(row['Issue']) else '',
                'Company': row['Company'] if pd.notna(row['Company']) else '',
                'chunk_index': i,
                'total_chunks': len(text_chunks)
            }
        })

print(f"\nCreated {len(chunks):,} chunks from {len(df):,} complaints")
print(f"Average chunks per complaint: {len(chunks)/len(df):.2f}")











[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









Chunking: 100%|██████████| 1000/1000 [00:00<00:00, 1470.61it/s]


Created 1,000 chunks from 1,000 complaints
Average chunks per complaint: 1.00





In [42]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
model = SentenceTransformer(EMBEDDING_MODEL)
print(f"Model: {EMBEDDING_MODEL}")
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model: sentence-transformers/paraphrase-MiniLM-L3-v2
Embedding dimension: 384


In [46]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
EMBEDDING_DIM = 384

model = SentenceTransformer(EMBEDDING_MODEL)

texts = [c["text"] for c in chunks]

embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True
)

print(f"Embeddings shape: {embeddings.shape}")












[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A

Embeddings shape: (1000, 384)


In [47]:
# Create FAISS index (L2 distance)
index = faiss.IndexFlatL2(EMBEDDING_DIM)

# Add embeddings
embeddings_float32 = embeddings.astype('float32')
index.add(embeddings_float32)

print(f"FAISS index built with {index.ntotal:,} vectors")

FAISS index built with 1,000 vectors


In [48]:
# Prepare metadata for storage
metadata_list = []
for c in chunks:
    metadata_list.append({
        'id': c['id'],
        'text': c['text'],
        **c['metadata']
    })

print(f"Metadata entries: {len(metadata_list):,}")

Metadata entries: 1,000


In [50]:
# Test semantic search
test_query = "billing dispute credit card"
query_embedding = model.encode([test_query], convert_to_numpy=True).astype('float32')

k = 5
distances, indices = index.search(query_embedding, k)

print(f"Query: '{test_query}'")
print(f"\nTop {k} results:")
for i, (dist, idx) in enumerate(zip(distances[0], indices[0]), 1):
    meta = metadata_list[idx]
    print(f"\n{i}. [Distance: {dist:.4f}] product: {meta['product']}")
    print(f"   Issue: {meta['Issue']}")
    print(f"   Text: {meta['text'][:150]}...")

Query: 'billing dispute credit card'

Top 5 results:

1. [Distance: 22.9198] product: Credit card
   Issue: Problem with a company's investigation into an existing problem
   Text: Credit card...

2. [Distance: 22.9198] product: Credit card
   Issue: Problem with a company's investigation into an existing problem
   Text: Credit card...

3. [Distance: 22.9198] product: Credit card
   Issue: Problem with a company's investigation into an existing problem
   Text: Credit card...

4. [Distance: 22.9198] product: Credit card
   Issue: Problem with a company's investigation into an existing problem
   Text: Credit card...

5. [Distance: 22.9198] product: Credit card
   Issue: Incorrect information on your report
   Text: Credit card...


In [None]:
# Uncomment to save demo index
# VECTOR_STORE_PATH.mkdir(parents=True, exist_ok=True)
# faiss.write_index(index, str(FAISS_INDEX_PATH))
# with open(METADATA_PATH, 'wb') as f:
#     pickle.dump(metadata_list, f)
# print("Index saved!")