# Task 2: Text Chunking, Embedding, and Vector Store Indexing

## Objective
Convert the cleaned text narratives into a format suitable for efficient semantic search, ensuring balanced (equal-sized) representation across 5 product categories.

## Final Workflow
1. **Load Raw Data**: Re-load to capture the full range of product categories.
2. **Target Products**: Filter for the 5 most common/relevant categories:
    - Credit card or prepaid card
    - Checking or savings account
    - Payday/Personal loans
    - Money transfers
    - Debt collection
3. **Stratified Sampling**: Up to 3,000 complaints per product (about 15,000 in total; fewer if a category has fewer than 3,000 narratives).
4. **Text Chunking**: Recursive character splitting (chunk size 500 characters, overlap 50 characters).
5. **Embeddings**: `all-MiniLM-L6-v2`.
6. **Indexing**: FAISS FlatL2.

In [None]:
import pandas as pd
import numpy as np
import re
import os
import pickle
import faiss
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# File paths. Both are relative, so they resolve against the current working
# directory of the process running this notebook.
# Raw complaints CSV that the loading cell reads with pandas.
raw_data_path = "../data/raw/complaints.csv"
# Output directory that receives the FAISS index and the pickled chunk metadata.
vector_store_dir = "../vector_store/"
# Create the output directory up front; exist_ok=True makes re-running this
# cell harmless when the directory is already there.
os.makedirs(vector_store_dir, exist_ok=True)

def clean_text(text):
    """Normalize a complaint narrative before chunking and embedding.

    The text is lower-cased, every character other than ASCII letters,
    digits, whitespace and the punctuation ``. , ! ?`` is removed, and runs
    of whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: The raw narrative. Anything that is not a ``str`` (for example
            ``None`` or a float NaN) is treated as missing.

    Returns:
        The normalized string, or ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Removed characters are deleted outright, not replaced by a space.
        kept = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', lowered)
        return re.sub(r'\s+', ' ', kept).strip()
    return ""

## 1. Load and Filter Data
We load the raw data, keep only the 5 target categories, and drop complaints that have no narrative. Balanced representation is then enforced by the sampling step below.

In [None]:
# Raw `Product` labels (keys) mapped to the short category names (values)
# that the rest of the notebook uses.
product_map = {
    'Credit card or prepaid card': 'Credit card',
    'Checking or savings account': 'Savings account',
    'Payday loan, title loan, personal loan, or advance loan': 'Personal loan',
    'Money transfer, virtual currency, or money service': 'Money transfers',
    'Debt collection': 'Debt collection'
}

print("Loading and filtering data...")

# Read only the three columns this notebook needs.
cols = ['Complaint ID', 'Product', 'Consumer complaint narrative']
df = pd.read_csv(raw_data_path, usecols=cols)

# Keep a row only if its product is one of the mapped labels AND it carries a
# narrative; both conditions are applied in a single boolean mask.
in_scope = df['Product'].isin(list(product_map))
has_narrative = df['Consumer complaint narrative'].notna()
df = df.loc[in_scope & has_narrative].copy()

# Collapse the verbose raw labels to the short category names.
df['Product'] = df['Product'].map(product_map)

print(f"Available complaints with narratives: {len(df)}")

## 2. Stratified Sampling
Goal: 15,000 complaints (3,000 per product).

In [None]:
# Overall budget, split evenly across the target categories. The divisor is
# derived from the category mapping rather than hard-coded, so the quota stays
# correct if a category is added to or removed from `product_map`.
target_sample_size = 15000
samples_per_product = target_sample_size // len(set(product_map.values()))

# Draw the per-product samples by iterating over the groups explicitly.
# `groupby(...).apply(...)` on a frame that still contains the grouping column
# is deprecated in pandas 2.2 and newer versions drop that column from the
# frame handed to the function, which would remove 'Product' from the result.
# Iterating the groups always yields the full rows, and with the same
# `random_state` each group draws exactly the rows the `apply` version drew.
per_product_samples = [
    # A category with fewer rows than the quota contributes all of its rows.
    group.sample(n=min(len(group), samples_per_product), random_state=42)
    for _, group in df.groupby('Product')
]

# `pd.concat` refuses an empty list, so fall back to an empty frame with the
# same columns when there is nothing to sample from.
if per_product_samples:
    df_sampled = pd.concat(per_product_samples)
else:
    df_sampled = df.iloc[0:0].copy()

print(f"Sampled {len(df_sampled)} complaints.")
print(df_sampled['Product'].value_counts())

## 3. Pre-processing and Chunking

In [None]:
# Normalize every sampled narrative; the raw text stays in its own column so
# it can be stored alongside each chunk.
df_sampled['cleaned_narrative'] = df_sampled['Consumer complaint narrative'].apply(clean_text)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# `chunks` and `metadata` are parallel lists: metadata[i] describes chunks[i].
chunks = []
metadata = []

# Walk the four needed columns in lockstep instead of materializing a Series
# per row.
records = zip(
    df_sampled['Complaint ID'],
    df_sampled['Product'],
    df_sampled['Consumer complaint narrative'],
    df_sampled['cleaned_narrative'],
)
for complaint_id, product, narrative, cleaned in records:
    pieces = text_splitter.split_text(cleaned)
    chunks.extend(pieces)
    # One independent metadata dict per chunk, each carrying the full
    # original narrative of the complaint the chunk came from.
    metadata.extend(
        {
            'complaint_id': complaint_id,
            'product': product,
            'original_text': narrative,
        }
        for _ in pieces
    )

print(f"Generated {len(chunks)} chunks.")

## 4. Embedding Generation

In [None]:
# Load the sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Embed all chunks in a single call, showing a progress bar.
# NOTE(review): the index/metadata pairing built later assumes the returned
# rows are in the same order as `chunks` -- confirm against the
# sentence-transformers documentation.
embeddings = model.encode(chunks, show_progress_bar=True)
# Report the shape of the result as a quick sanity check.
print(f"Embeddings shape: {embeddings.shape}")

## 5. Persistence

In [None]:
# Size the index from the embedding width (second axis of the matrix).
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

# Hand the vectors to FAISS as a float32 array.
vectors = np.array(embeddings).astype('float32')
index.add(vectors)

# The index and the metadata are written side by side in the vector store
# directory; position i in the index corresponds to metadata[i].
index_path = os.path.join(vector_store_dir, "complaints.index")
metadata_path = os.path.join(vector_store_dir, "metadata.pkl")

faiss.write_index(index, index_path)
with open(metadata_path, "wb") as metadata_file:
    pickle.dump(metadata, metadata_file)

print("Vector store persisted successfully.")