# Embedding and Vector Store Indexing

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import sys
# Put the parent directory of the notebook on sys.path (once) so that the
# project-local `src` package imported right after this can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
from src.rag import (retrieve_similar_complaints, load_faiss_index, load_metadata, load_embedding_model,
                    initialize_faiss_index, save_faiss_index, embed_chunks, prepare_chunks_and_metadata)
from sentence_transformers import SentenceTransformer
import faiss

## Load Chunked Data

In [None]:
# Location of the pre-chunked complaints CSV (the path looks like a
# Colab-mounted Google Drive — adjust when running elsewhere).
file_path = '/content/drive/MyDrive/data/chunked_complaints.csv'
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index; consider
# `index_col=0` here if that column is not wanted downstream.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows to confirm the expected columns
# (e.g. `cleaned_narrative`, `narrative_chunks`) were loaded.
df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,...,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,cleaned_narrative,cleaned_narrative_length,narrative_chunks
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,...,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,a xxxx xxxx card was opened under my name by a...,91,['a xxxx xxxx card was opened under my name by...
1,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,...,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,dear cfpb i have a secured credit card with ci...,156,['dear cfpb i have a secured credit card with ...
2,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,...,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,i have a citi rewards cards the credit balance...,231,['i have a citi rewards cards the credit balan...
3,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,...,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,bi am writing to dispute the following charges...,454,['bi am writing to dispute the following charg...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem when making payments,Problem during payment process,"Although the account had been deemed closed, I...",Company believes it acted appropriately as aut...,Atlanticus Services Corporation,NY,11212,...,Web,2025-06-09,Closed with monetary relief,Yes,,13965746,170,although the account had been deemed closed i ...,170,['although the account had been deemed closed ...


## Embedding Model Choice

For this project, I used the `all-MiniLM-L6-v2` model from the SentenceTransformers library. This model is lightweight, fast, and provides high-quality sentence embeddings suitable for semantic search and retrieval tasks. It is widely used in industry and research for its balance of performance and efficiency.

In [None]:
# Prepare chunks and metadata.
# `prepare_chunks_and_metadata` is a project helper from src.rag; presumably it
# flattens the `narrative_chunks` column into one list of chunk texts plus an
# aligned list of per-chunk metadata — TODO confirm against the helper.
all_chunks, metadata = prepare_chunks_and_metadata(df)

In [None]:
# Load the embedding model by name through the src.rag helper.
# `all-MiniLM-L6-v2` is the SentenceTransformers model chosen for this project.
embedding_model = load_embedding_model('all-MiniLM-L6-v2')

In [None]:
# Generate embeddings for all chunks using the model loaded above.
# NOTE(review): batching/device handling is decided inside `embed_chunks`
# (src.rag); with a corpus this large it is likely the slowest step — confirm.
embeddings = embed_chunks(all_chunks, embedding_model)

In [None]:
# Sanity-check the embedding matrix: one row per chunk, one column per
# embedding dimension.
# NOTE(review): `.shape` is used directly, so `embed_chunks` is assumed to
# already return a NumPy array (no explicit conversion is done here).
print('Embeddings shape:', embeddings.shape)

Embeddings shape: (1609126, 384)


## Vector Store Indexing

In [None]:
# Create FAISS index from the embedding matrix.
# NOTE(review): the index type and distance metric are chosen inside
# `initialize_faiss_index` (src.rag) — confirm there before changing the
# retrieval logic.
index = initialize_faiss_index(embeddings)

In [None]:
# Output locations for the FAISS index and its accompanying metadata,
# relative to the notebook's working directory.
index_path = './vector_store/complaint_chunks.index'
metadata_path = './vector_store/complaint_chunks_metadata.pkl'

# Make sure the parent directory of *each* output file exists before saving,
# so the two paths can later be pointed at different folders safely.
# `os` is already imported in the first cell, so it is not re-imported here.
for output_path in (index_path, metadata_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Persist the index and the chunk metadata through the src.rag helper.
save_faiss_index(index, metadata, index_path, metadata_path)