# Imports

In [2]:
import sys
sys.path.insert(0, '..')
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
import faiss
import pickle
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
import hashlib

ModuleNotFoundError: No module named 'langchain_text_splitters'

In [None]:
# --- Filesystem locations (relative to the notebook's directory) ---
FILTERED_DATA_PATH = Path('../data/filtered_complaints.csv')
VECTOR_STORE_PATH = Path('../vector_store')
# Both artefacts live side by side inside the vector-store directory.
FAISS_INDEX_PATH = VECTOR_STORE_PATH.joinpath('faiss_index.bin')
METADATA_PATH = VECTOR_STORE_PATH.joinpath('metadata.pkl')

# --- Chunking parameters (passed to the text splitter) ---
CHUNK_SIZE, CHUNK_OVERLAP = 500, 100

# --- Embedding model and the vector size used to build the FAISS index ---
EMBEDDING_MODEL = 'sentence-transformers/paraphrase-MiniLM-L3-v2'
EMBEDDING_DIM = 384

# --- Demo mode: number of leading rows to keep (None = full dataset) ---
DEMO_LIMIT = 1000

In [None]:
# Read the complaints CSV configured in FILTERED_DATA_PATH.
df = pd.read_csv(FILTERED_DATA_PATH)
print(f"Total complaints: {len(df):,}")

# A truthy DEMO_LIMIT keeps only the leading rows; None (or 0) keeps everything.
if DEMO_LIMIT:
    df = df.iloc[:DEMO_LIMIT]
    print(f"Using demo subset: {len(df):,} complaints")

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# Splitter shared by every narrative. The separator list runs from the
# coarsest boundary (blank line) down to the finest (single character).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Example: chunk a single narrative
PREVIEW_CHARS = 300

# Use the first narrative that actually contains text, so a missing or blank
# first row does not crash the preview.
narratives = df['narrative'].dropna().astype(str)
narratives = narratives[narratives.str.strip() != '']

if narratives.empty:
    sample_narrative, sample_chunks = '', []
    print("No non-empty narrative available to preview.")
else:
    sample_narrative = narratives.iloc[0]
    sample_chunks = splitter.split_text(sample_narrative)

    print(f"Original length: {len(sample_narrative)} chars")
    print(f"Number of chunks: {len(sample_chunks)}")
    if sample_chunks:
        first_chunk = sample_chunks[0]
        # Only show an ellipsis when the preview really was truncated.
        suffix = "..." if len(first_chunk) > PREVIEW_CHARS else ""
        print(f"\nFirst chunk ({len(first_chunk)} chars):")
        print(first_chunk[:PREVIEW_CHARS] + suffix)

In [None]:
# Chunk all complaints
def _or_blank(value):
    """Return ``value`` unchanged, or '' when pandas considers it missing."""
    return value if pd.notna(value) else ''


chunks = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Chunking"):
    narrative = row['narrative']
    # Skip rows without usable text. The isinstance check covers NaN/None as
    # well as any other non-string value, which would have no .strip().
    if not isinstance(narrative, str) or not narrative.strip():
        continue

    complaint_id = str(row['complaint_id'])
    text_chunks = splitter.split_text(narrative)
    total_chunks = len(text_chunks)

    # Fields shared by every chunk of this complaint; built once per row.
    # Every optional field is blank-filled the same way, including 'product'.
    shared_metadata = {
        'complaint_id': complaint_id,
        'product': _or_blank(row['product']),
        'product_original': _or_blank(row['product_original']),
        'issue': _or_blank(row['issue']),
        'company': _or_blank(row['company']),
    }

    for i, chunk_text in enumerate(text_chunks):
        # Deterministic ID derived from the complaint ID and chunk position.
        chunk_id = hashlib.md5(f"{complaint_id}_{i}".encode()).hexdigest()

        chunks.append({
            'id': chunk_id,
            'text': chunk_text,
            'metadata': {
                **shared_metadata,
                'chunk_index': i,
                'total_chunks': total_chunks,
            },
        })

print(f"\nCreated {len(chunks):,} chunks from {len(df):,} complaints")
# Guard the average against an empty frame (would otherwise divide by zero).
if len(df):
    print(f"Average chunks per complaint: {len(chunks)/len(df):.2f}")

In [None]:
# Load the embedding model named by EMBEDDING_MODEL.
model = SentenceTransformer(EMBEDDING_MODEL)
model_dim = model.get_sentence_embedding_dimension()
print(f"Model: {EMBEDDING_MODEL}")
print(f"Embedding dimension: {model_dim}")

# Fail fast if the configured dimension does not match the loaded model,
# rather than discovering the mismatch after all chunks have been encoded.
if model_dim is not None and model_dim != EMBEDDING_DIM:
    raise ValueError(
        f"EMBEDDING_DIM is {EMBEDDING_DIM} but {EMBEDDING_MODEL} "
        f"produces {model_dim}-dimensional embeddings; update the config."
    )
Model: sentence-transformers/paraphrase-MiniLM-L3-v2
Embedding dimension: 384
# Encode every chunk's text in one call; row i of the result corresponds to
# chunks[i] because the text list preserves the order of `chunks`.
texts = [chunk['text'] for chunk in chunks]
embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    show_progress_bar=True,
)

print(f"\nEmbeddings shape: {embeddings.shape}")

In [None]:
# Create FAISS index (L2 distance)
# Hand FAISS a C-contiguous float32 matrix; ascontiguousarray converts dtype
# and layout in one step and is a no-op when the array already qualifies.
embeddings_float32 = np.ascontiguousarray(embeddings, dtype='float32')

# Validate the shape up front so a problem is reported in terms of this
# notebook's config rather than as an internal FAISS assertion.
if embeddings_float32.ndim != 2:
    raise ValueError(
        f"Expected a 2-D embedding matrix, got shape {embeddings_float32.shape} "
        "(were any chunks produced?)"
    )
if embeddings_float32.shape[1] != EMBEDDING_DIM:
    raise ValueError(
        f"Embeddings have dimension {embeddings_float32.shape[1]} "
        f"but EMBEDDING_DIM is {EMBEDDING_DIM}"
    )

index = faiss.IndexFlatL2(EMBEDDING_DIM)

# Add embeddings
index.add(embeddings_float32)

print(f"FAISS index built with {index.ntotal:,} vectors")

In [None]:
# Prepare metadata for storage
# Flatten each chunk record: the nested 'metadata' fields are lifted to the
# top level alongside 'id' and 'text'. Order follows `chunks`.
metadata_list = [
    {'id': chunk['id'], 'text': chunk['text'], **chunk['metadata']}
    for chunk in chunks
]

print(f"Metadata entries: {len(metadata_list):,}")

In [None]:
# Test semantic search
test_query = "billing dispute credit card"
query_embedding = model.encode([test_query], convert_to_numpy=True).astype('float32')

k = 5
distances, indices = index.search(query_embedding, k)

# FAISS fills unused result slots with index -1 when the index holds fewer
# than k vectors. Drop those slots: metadata_list[-1] would otherwise silently
# return the last chunk and present it as a hit.
hits = [
    (dist, int(idx))
    for dist, idx in zip(distances[0], indices[0])
    if idx >= 0
]

print(f"Query: '{test_query}'")
print(f"\nTop {k} results:")
for i, (dist, idx) in enumerate(hits, 1):
    meta = metadata_list[idx]
    print(f"\n{i}. [Distance: {dist:.4f}] Product: {meta['product']}")
    print(f"   Issue: {meta['issue']}")
    print(f"   Text: {meta['text'][:150]}...")

In [None]:
# Set SAVE_INDEX = True to persist the demo index and its metadata to disk.
# Off by default so that "Run All" never overwrites an existing vector store.
SAVE_INDEX = False

if SAVE_INDEX:
    VECTOR_STORE_PATH.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(FAISS_INDEX_PATH))
    # NOTE: unpickling can execute arbitrary code; only load metadata.pkl
    # from a source you trust.
    with open(METADATA_PATH, 'wb') as f:
        pickle.dump(metadata_list, f)
    print("Index saved!")