In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# KIAM8

## Task 2: Text Chunking, Embedding, and Vector Store Indexing

Objective: convert the cleaned text narratives into a format suitable for efficient semantic search.

### Subtasks

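#### Create a stratified sample of 10,000-15,000 complaints from your cleaned dataset

- Ensure proportional representation across all five product categories.
- Document your sampling strategy in your report.

> Note: the filtered dataset loaded below contains four product categories (Credit Card, Savings Account, Money Transfers, Personal Loan), so the sample is stratified across those four.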


In [1]:
# import essential libraries 
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Load the cleaned complaints
df_clean = pd.read_csv('../data/processed/filtered_complaints.csv')

# Target size of the stratified sample (the task asks for 10,000-15,000)
sample_size = 12000

# Proportional allocation per product category (largest-remainder method).
# Rounding every quota independently can make the targets add up to
# sample_size plus or minus a few rows. Flooring each quota and then handing
# the leftover rows to the categories with the largest fractional parts
# always sums to exactly sample_size.
category_counts = df_clean['product_category'].value_counts()

# value_counts() ignores missing labels, so take the share over labelled rows
# only; those are the only rows the per-category sampling can draw from.
exact_quota = category_counts / category_counts.sum() * sample_size

sample_distribution = np.floor(exact_quota).astype(int)
leftover = sample_size - int(sample_distribution.sum())
if leftover > 0:
    top_up = (exact_quota - sample_distribution).nlargest(leftover).index
    sample_distribution.loc[top_up] += 1

print("Target sample distribution:")
print(sample_distribution)

Target sample distribution:
product_category
Credit Card        5021
Savings Account    3712
Money Transfers    2576
Personal Loan       691
Name: count, dtype: int64


In [3]:
# Draw each category's quota; the fixed seed keeps the draw reproducible
sampled_dfs = []
for category, n_samples in sample_distribution.items():
    category_df = df_clean.loc[df_clean['product_category'] == category]
    # A category with fewer rows than its quota is kept whole, in file order
    sampled = (
        category_df.sample(n=n_samples, random_state=42)
        if len(category_df) >= n_samples
        else category_df
    )
    sampled_dfs.append(sampled)

# Stack the per-category draws into a single frame with a fresh index
df_sample = pd.concat(sampled_dfs, ignore_index=True)

print(f"\nActual sample size: {len(df_sample)}")
print("\nActual distribution:", df_sample['product_category'].value_counts(), sep="\n")

# Write the sample to disk
df_sample.to_csv('../data/processed/sample_complaints.csv', index=False)


Actual sample size: 12000

Actual distribution:
product_category
Credit Card        5021
Savings Account    3712
Money Transfers    2576
Personal Loan       691
Name: count, dtype: int64


#### Long narratives are often ineffective when embedded as a single vector. Implement a text chunking strategy.

- Use a library like LangChain's RecursiveCharacterTextSplitter or write your own function.
- Experiment with chunk_size and chunk_overlap to find a good balance. Justify your final choice in your report.


In [4]:
# Make the helper scripts in ../src (relative to the working directory)
# importable from this notebook

import os
import sys

cwd = os.getcwd()
scripts_path = os.path.join(cwd,'..','src')
scripts_abs_path = os.path.abspath(scripts_path)

# Test membership with the normalised absolute path, the same value that is
# appended below. The raw scripts_path still contains '..', so it never matches
# an entry in sys.path and every re-run of the cell added a duplicate.
if scripts_abs_path not in sys.path and os.path.isdir(scripts_abs_path):
    sys.path.append(scripts_abs_path)
    print ('Scripts Path Succcessfully Added to the Environment')
else:
    print('Invalid Scripts Path or Scripts Path Already Added to the Environemnt')

Scripts Path Succcessfully Added to the Environment


In [5]:
# Split every sampled complaint into chunks with the project's chunker

from text_transformations import ComplaintChunker

# Chunking parameters chosen for this run
chunker = ComplaintChunker(chunk_overlap=50, chunk_size=500)
df_chunks = chunker.chunk_dataset(df_sample)

total_chunks = len(df_chunks)
print(f"Total chunks created: {total_chunks}")
print(f"Average chunks per complaint: {total_chunks / len(df_sample):.2f}")

# Write the chunk table to disk
df_chunks.to_csv('../data/processed/complaint_chunks.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm


Total chunks created: 36227
Average chunks per complaint: 3.02


In [6]:
# Embed every chunk with the project's EmbeddingGenerator helper

from text_embedding import EmbeddingGenerator

print("Generating embeddings...")
embed_gen = EmbeddingGenerator()
chunk_texts = df_chunks['text'].tolist()
embeddings = embed_gen.generate_embeddings(chunk_texts)

print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embed_gen.embedding_dim}")

# Store one vector per chunk row as a plain Python list
df_chunks['embedding'] = embeddings.tolist()

# Write chunks together with their vectors to disk
df_chunks.to_parquet('../data/processed/chunks_with_embeddings.parquet')

Generating embeddings...
Using CPU for embeddings


Batches: 100%|██████████| 1133/1133 [03:51<00:00,  4.90it/s]


Embeddings shape: (36227, 384)
Embedding dimension: 384


In [None]:
# Import the vector-store helpers, forcing a fresh copy of the module so that
# edits made to text_vectorization since its first import are picked up.

import importlib
import text_vectorization

# Re-execute the module; names imported earlier with `from ... import` would
# otherwise keep pointing at the pre-reload class objects.
importlib.reload(text_vectorization)

# Bind the freshly defined classes in the notebook namespace.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# cells below use FAISSVectorStore / ChromaVectorStore; run it before them.
from text_vectorization import FAISSVectorStore, ChromaVectorStore


In [8]:
# Using both FAISS & Chroma DB to create the vector store

# 1. Using FAISS

from pathlib import Path

# vector_store_dir stays a plain string for backward compatibility. The
# directory is created up front so save() can write there on a fresh checkout.
vector_store_dir = '../vector_store/FAISS/'
faiss_dir = Path(vector_store_dir)
faiss_dir.mkdir(parents=True, exist_ok=True)

print("Creating FAISS index...")

# Rebuild the (n_chunks, dim) matrix from the per-row embedding lists and take
# the dimensionality from the data instead of hard-coding 384.
embedding_matrix = np.array(df_chunks['embedding'].tolist())
vector_store = FAISSVectorStore(embedding_dim=embedding_matrix.shape[1])

# Every column except the vector itself travels as per-chunk metadata
metadata_list = df_chunks.drop(columns='embedding').to_dict('records')

# Add vectors and their metadata to the store
vector_store.add_embeddings(
    embeddings=embedding_matrix,
    metadata=metadata_list
)

# Save index and metadata; joining with Path does not rely on a trailing slash
vector_store.save(
    index_path=str(faiss_dir / 'faiss_index.bin'),
    metadata_path=str(faiss_dir / 'metadata.pkl')
)

print(f"Vector store created with {len(metadata_list)} chunks")

Creating FAISS index...
Vector store created with 36227 chunks


In [20]:
# 2. Using Chroma DB

# Build the second store with the helper's default settings.
# NOTE(review): where (or whether) the collection is persisted is decided
# inside ChromaVectorStore; confirm against text_vectorization.
chroma_store = ChromaVectorStore()
# Hand over the whole chunk table; per the recorded output the helper reports
# progress in batches ("Added batch i/37" for this run).
# NOTE(review): presumably it uses the 'embedding' column added earlier rather
# than re-embedding the text; confirm.
chroma_store.add_documents(df_chunks)
print("ChromaDB vector store created successfully")

Added batch 1/37
Added batch 2/37
Added batch 3/37
Added batch 4/37
Added batch 5/37
Added batch 6/37
Added batch 7/37
Added batch 8/37
Added batch 9/37
Added batch 10/37
Added batch 11/37
Added batch 12/37
Added batch 13/37
Added batch 14/37
Added batch 15/37
Added batch 16/37
Added batch 17/37
Added batch 18/37
Added batch 19/37
Added batch 20/37
Added batch 21/37
Added batch 22/37
Added batch 23/37
Added batch 24/37
Added batch 25/37
Added batch 26/37
Added batch 27/37
Added batch 28/37
Added batch 29/37
Added batch 30/37
Added batch 31/37
Added batch 32/37
Added batch 33/37
Added batch 34/37
Added batch 35/37
Added batch 36/37
Added batch 37/37
ChromaDB vector store created successfully
