In [28]:
# Enable IPython's autoreload extension so edits to imported project modules are
# picked up without restarting the kernel.
# NOTE: re-running this cell in a live kernel prints "already loaded"; as that
# message says, %reload_ext autoreload is the re-runnable form.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# KIAM8

## Task 2: Text Chunking, Embedding, and Vector Store Indexing

Objective: convert the cleaned text narratives into a format suitable for efficient semantic search.

### Sub-tasks

#### ●	Create a stratified sample of 10,000-15,000 complaints from your cleaned dataset:
        ○	Ensure proportional representation across all five product categories.
        ○	Document your sampling strategy in your report.


In [29]:
# import essential libraries 
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [30]:
# Load the cleaned complaints table
df_clean = pd.read_csv('../data/processed/filtered_complaints.csv')

# Target size of the stratified sample (the task asks for 10,000-15,000)
sample_size = 12000

# Proportional allocation: each category's share of the counted rows, scaled
# to sample_size. value_counts() leaves out rows whose category is missing, so
# the shares are taken over category_counts.sum() rather than len(df_clean);
# otherwise they would add up to less than 1.
category_counts = df_clean['product_category'].value_counts()
exact_quota = category_counts / category_counts.sum() * sample_size

# Largest-remainder rounding. Rounding every quota independently can leave the
# total a few rows above or below sample_size, so floor each quota first and
# then give the leftover rows to the categories with the largest fractional
# parts. The result keeps the index order of category_counts.
sample_distribution = np.floor(exact_quota).astype(int)
leftover = sample_size - int(sample_distribution.sum())
if leftover > 0:
    top_up = (exact_quota - sample_distribution).nlargest(leftover).index
    sample_distribution.loc[top_up] += 1

print("Target sample distribution:")
print(sample_distribution)

Target sample distribution:
product_category
Credit Card        5021
Savings Account    3712
Money Transfers    2576
Personal Loan       691
Name: count, dtype: int64


In [31]:
# Draw every category's quota independently; the fixed seed keeps the sample
# reproducible across runs.
sampled_dfs = []
for category, n_samples in sample_distribution.items():
    category_df = df_clean.loc[df_clean['product_category'] == category]
    # A category with fewer rows than its quota is kept whole, unsampled
    if len(category_df) < n_samples:
        sampled_dfs.append(category_df)
    else:
        sampled_dfs.append(category_df.sample(n=n_samples, random_state=42))

# Stack the per-category draws into a single frame with a fresh 0..n-1 index
df_sample = pd.concat(sampled_dfs, ignore_index=True)

# Report what was actually drawn so it can be compared with the target
actual_distribution = df_sample['product_category'].value_counts()
print(f"\nActual sample size: {len(df_sample)}")
print("\nActual distribution:")
print(actual_distribution)

# Persist the sample (without the index) for the following steps
df_sample.to_csv('../data/processed/sample_complaints.csv', index=False)


Actual sample size: 12000

Actual distribution:
product_category
Credit Card        5021
Savings Account    3712
Money Transfers    2576
Personal Loan       691
Name: count, dtype: int64


#### ●	Long narratives are often ineffective when embedded as a single vector. Implement a text chunking strategy.
        ○	Use a library like LangChain's RecursiveCharacterTextSplitter or write your own function.
        ○	Experiment with chunk_size and chunk_overlap to find a good balance. Justify your final choice in your report.


In [32]:
# Add the project's src directory to sys.path so the helper scripts can be imported

import os
import sys

# Resolve <current working directory>/../src to a normalised absolute path.
cwd = os.getcwd()
scripts_path = os.path.join(cwd,'..','src')
scripts_abs_path = os.path.abspath(scripts_path)

# Test membership with the absolute path, i.e. the exact value that is
# appended. Checking the un-normalised `scripts_path` never matches what is in
# sys.path, so every re-run of the cell would append a duplicate entry.
if scripts_abs_path not in sys.path and os.path.isdir(scripts_abs_path):
    sys.path.append(scripts_abs_path)
    print ('Scripts Path Succcessfully Added to the Environment')
else:
    # Reached when the directory does not exist OR the path is already present
    print('Invalid Scripts Path or Scripts Path Already Added to the Environemnt')

Scripts Path Succcessfully Added to the Environment


In [33]:
# Import Helper Scripts

from text_transformations import ComplaintChunker

# Split the sampled complaints into chunks with the project's ComplaintChunker.
# chunk_size=500 / chunk_overlap=50 are the settings used for this run; their
# unit is defined by ComplaintChunker (presumably characters — confirm in the
# text_transformations module).
chunker = ComplaintChunker(chunk_size=500, chunk_overlap=50)
df_chunks = chunker.chunk_dataset(df_sample)

# Report the total number of chunks and the mean number of chunks per complaint
print(f"Total chunks created: {len(df_chunks)}")
print(f"Average chunks per complaint: {len(df_chunks) / len(df_sample):.2f}")

# Persist the chunk table (without the index) as CSV
df_chunks.to_csv('../data/processed/complaint_chunks.csv', index=False)

INFO:text_transformations:Chunking complete. Created 36098 chunks from 12000 rows.


Total chunks created: 36098
Average chunks per complaint: 3.01


In [34]:
# Import Helper Scripts

from text_embedding import EmbeddingGenerator

# Embed the 'text' column of every chunk with the project's EmbeddingGenerator
# (constructed with its defaults). This is the slow step of the notebook: the
# recorded run took about 7 minutes on CPU.
print("Generating embeddings...")
embed_gen = EmbeddingGenerator()
embeddings = embed_gen.generate_embeddings(df_chunks['text'].tolist())

# Sanity output: array shape and the dimension reported by the generator
print(f"Embeddings shape: {embeddings.shape}")
print(f"Embedding dimension: {embed_gen.embedding_dim}")

# Attach one embedding (as a plain Python list) to each chunk row.
# NOTE: this adds a column to df_chunks in place, so from here on df_chunks
# carries an 'embedding' column that later cells see as well.
df_chunks['embedding'] = embeddings.tolist()

# Persist chunks together with their embeddings as Parquet
df_chunks.to_parquet('../data/processed/chunks_with_embeddings.parquet')

INFO:text_embedding:Loading model: sentence-transformers/all-MiniLM-L6-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


Generating embeddings...


INFO:text_embedding:Model loaded successfully on cpu
Batches: 100%|██████████| 1129/1129 [07:08<00:00,  2.64it/s]


Embeddings shape: (36098, 384)
Embedding dimension: 384


In [35]:
# Import the vector-store helpers, forcing a fresh load of the module first.
# NOTE(review): the notebook already enables `%autoreload 2` in its first cell,
# so this manual reload looks redundant — confirm before removing it.

import importlib
import text_vectorization

# Force the module to refresh from disk
importlib.reload(text_vectorization)

# Re-import the classes so the names below refer to the reloaded definitions
from text_vectorization import FAISSVectorStore, ChromaVectorStore


In [36]:
# Using both FAISS & Chroma DB to create the vector store

# 1. Using FAISS

from pathlib import Path

# `vector_store_dir` stays the original string (with its trailing slash) so any
# other cell that concatenates onto it keeps working; `vector_store_path` is
# the Path view used here to build file paths.
vector_store_dir = '../vector_store/FAISS/'
vector_store_path = Path(vector_store_dir)

# Make sure the output directory exists before anything is saved into it
vector_store_path.mkdir(parents=True, exist_ok=True)

# Stack the per-row embedding lists into a single 2-D array (rows = chunks)
embedding_matrix = np.array(df_chunks['embedding'].tolist())
if embedding_matrix.ndim != 2:
    # Empty frame or rows of unequal length: there is no well-defined dimension
    raise ValueError(
        "Expected a non-empty 'embedding' column with equal-length vectors, "
        f"got an array of shape {embedding_matrix.shape}"
    )

print("Creating FAISS index...")
# Take the dimension from the data being indexed instead of hard-coding 384,
# so switching the embedding model cannot leave the index out of sync.
vector_store = FAISSVectorStore(embedding_dim=embedding_matrix.shape[1])

# Prepare metadata: every chunk column except the vector itself, one dict per row
metadata_list = df_chunks.drop('embedding', axis=1).to_dict('records')

# Add vectors and their metadata to the store (same row order for both)
vector_store.add_embeddings(
    embeddings=embedding_matrix,
    metadata=metadata_list
)

# Save the index and the metadata side by side in the vector store directory
vector_store.save(
    index_path=str(vector_store_path / 'faiss_index.bin'),
    metadata_path=str(vector_store_path / 'metadata.pkl')
)

print(f"Vector store created with {len(metadata_list)} chunks")

Creating FAISS index...


INFO:text_vectorization:Added 36098 vectors. Total store size: 36098
INFO:text_vectorization:Successfully persisted FAISS index and metadata.


Vector store created with 36098 chunks


In [37]:
# 2. Using Chroma DB
# Build a second vector store from the same chunks with the project's
# ChromaVectorStore, constructed with its defaults (the recorded log shows a
# collection named "complaints" and ingestion in batches).
# NOTE(review): df_chunks still carries the 'embedding' column at this point;
# whether add_documents reuses those vectors or re-embeds the text is not
# visible here — confirm in the text_vectorization module.

chroma_store = ChromaVectorStore()
chroma_store.add_documents(df_chunks)
print("ChromaDB vector store created successfully")

INFO:text_vectorization:Successfully initialized ChromaDB collection: complaints
INFO:text_vectorization:Starting ingestion of 36098 documents.
INFO:text_vectorization:Successfully added batch 1
INFO:text_vectorization:Successfully added batch 2
INFO:text_vectorization:Successfully added batch 3
INFO:text_vectorization:Successfully added batch 4
INFO:text_vectorization:Successfully added batch 5
INFO:text_vectorization:Successfully added batch 6
INFO:text_vectorization:Successfully added batch 7
INFO:text_vectorization:Successfully added batch 8
INFO:text_vectorization:Successfully added batch 9
INFO:text_vectorization:Successfully added batch 10
INFO:text_vectorization:Successfully added batch 11
INFO:text_vectorization:Successfully added batch 12
INFO:text_vectorization:Successfully added batch 13
INFO:text_vectorization:Successfully added batch 14
INFO:text_vectorization:Successfully added batch 15
INFO:text_vectorization:Successfully added batch 16
INFO:text_vectorization:Successfu

ChromaDB vector store created successfully
