In [2]:
import pandas as pd
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.documents import Document

# ==========================================
# 1. Load Processed Data
# ==========================================
INPUT_PATH = '../data/processed/filtered_complaints.csv'
VECTOR_DB_PATH = '../vector_store'

print("Loading filtered dataset...")
df = pd.read_csv(INPUT_PATH)
print(f"Total records available: {len(df)}")

# ==========================================
# 2. Stratified Sampling (Crucial Step)
# ==========================================
# We want a balanced dataset so the AI knows about all products equally.
# We will take 3,000 complaints from each of the 4 categories = 12,000 total.

SAMPLES_PER_CATEGORY = 3000
sampled_df = df.groupby('Unified_Product', group_keys=False).apply(
    lambda x: x.sample(min(len(x), SAMPLES_PER_CATEGORY), random_state=42)
)

print(f"\nStratified Sample Size: {len(sampled_df)}")
print(sampled_df['Unified_Product'].value_counts())

# ==========================================
# 3. Document Preparation & Chunking
# ==========================================
# We convert dataframe rows into LangChain "Document" objects.

print("\nPreparing documents...")

documents = []

for index, row in sampled_df.iterrows():
    # We include metadata so the AI knows which product/company the text belongs to
    metadata = {
        "complaint_id": row['Complaint ID'],
        "product": row['Unified_Product'],
        "company": row['Company'],
        "issue": row['Issue'],
        "date": row['Date received']
    }
    
    # Create the document
    doc = Document(
        page_content=row['cleaned_narrative'],
        metadata=metadata
    )
    documents.append(doc)

# Chunking: Break long complaints into smaller pieces (500 chars)
# This ensures the AI reads focused segments rather than getting lost in long rants.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50, # Slight overlap to maintain context between chunks
    separators=["\n\n", "\n", ".", " ", ""]
)

print(f"Splitting {len(documents)} documents into chunks...")
chunked_docs = text_splitter.split_documents(documents)
print(f"Total chunks created: {len(chunked_docs)}")

# ==========================================
# 4. Embedding & Indexing (The Slow Part)
# ==========================================
print("\nInitializing Embedding Model (all-MiniLM-L6-v2)...")
# This downloads a small, fast model optimized for semantic search
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

print(f"Creating Chroma Vector Store at {VECTOR_DB_PATH}...")
print("This may take 5-10 minutes depending on your CPU. Please wait...")

# Create and persist the database
# vector_store = Chroma.from_documents(
#     documents=chunked_docs,
#     embedding=embedding_model,
#     persist_directory=VECTOR_DB_PATH
# )

# Optimization: Process in batches to avoid memory issues on Windows
batch_size = 500
total_chunks = len(chunked_docs)

# Initialize empty DB
vector_store = Chroma(
    collection_name="complaints_rag",
    embedding_function=embedding_model,
    persist_directory=VECTOR_DB_PATH
)

for i in range(0, total_chunks, batch_size):
    batch = chunked_docs[i:i + batch_size]
    vector_store.add_documents(batch)
    print(f"Processed batch {i}/{total_chunks}...")

print("\n✅ Success! Vector Store created and saved.")
print(f"You can now find the database files in: {os.path.abspath(VECTOR_DB_PATH)}")

Loading filtered dataset...
Total records available: 476443


  sampled_df = df.groupby('Unified_Product', group_keys=False).apply(



Stratified Sample Size: 12000
Unified_Product
Credit Card        3000
Money Transfers    3000
Personal Loan      3000
Savings Account    3000
Name: count, dtype: int64

Preparing documents...
Splitting 12000 documents into chunks...
Total chunks created: 34897

Initializing Embedding Model (all-MiniLM-L6-v2)...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating Chroma Vector Store at ../vector_store...
This may take 5-10 minutes depending on your CPU. Please wait...
Processed batch 0/34897...
Processed batch 500/34897...
Processed batch 1000/34897...
Processed batch 1500/34897...
Processed batch 2000/34897...
Processed batch 2500/34897...
Processed batch 3000/34897...
Processed batch 3500/34897...
Processed batch 4000/34897...
Processed batch 4500/34897...
Processed batch 5000/34897...
Processed batch 5500/34897...
Processed batch 6000/34897...
Processed batch 6500/34897...
Processed batch 7000/34897...
Processed batch 7500/34897...
Processed batch 8000/34897...
Processed batch 8500/34897...
Processed batch 9000/34897...
Processed batch 9500/34897...
Processed batch 10000/34897...
Processed batch 10500/34897...
Processed batch 11000/34897...
Processed batch 11500/34897...
Processed batch 12000/34897...
Processed batch 12500/34897...
Processed batch 13000/34897...
Processed batch 13500/34897...
Processed batch 14000/34897...
Processed