# Embedding and Vector Store Indexing

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import sys
# Put the parent directory (resolved against the current working directory)
# on sys.path so that the `src` package imported below can be found.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    # Only add it once, so re-running the cell does not pile up duplicates.
    sys.path.append(module_path)
from src.rag import (retrieve_similar_complaints, load_faiss_index, load_metadata, load_embedding_model,
                    initialize_faiss_index, save_faiss_index, embed_chunks, prepare_chunks_and_metadata)
from sentence_transformers import SentenceTransformer
import faiss

## Load Chunked Data

In [None]:
# Location of the pre-chunked complaints CSV.
# NOTE(review): this is an absolute path that looks like a mounted Google Drive
# folder; it must be adjusted when the notebook runs in another environment.
file_path = '/content/drive/MyDrive/data/chunked_complaints.csv'
# Read the whole CSV into a DataFrame; `df` is consumed by the cells below.
df = pd.read_csv(file_path)

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected.
df.head()

## Embedding Model Choice

For this project, I used the `all-MiniLM-L6-v2` model from the SentenceTransformers library. This model is lightweight, fast, and provides high-quality sentence embeddings suitable for semantic search and retrieval tasks. It is widely used in industry and research for its balance of performance and efficiency.

In [None]:
# Prepare chunks and metadata from the loaded DataFrame.
# The helper returns a pair: `all_chunks` is passed to the embedding step and
# `metadata` is saved alongside the index later on.
# NOTE(review): presumably one metadata record per chunk, in the same order —
# confirm against src.rag.prepare_chunks_and_metadata.
all_chunks, metadata = prepare_chunks_and_metadata(df)

In [None]:
# Load the embedding model by name; 'all-MiniLM-L6-v2' is the model chosen in
# the "Embedding Model Choice" section of this notebook.
embedding_model = load_embedding_model('all-MiniLM-L6-v2')

In [None]:
# Generate embeddings for all chunks with the model loaded above.
# NOTE(review): likely the slowest step of the notebook when the chunk set is
# large — confirm whether embed_chunks batches internally.
embeddings = embed_chunks(all_chunks, embedding_model)

## Vector Store Indexing

In [None]:
# Create the FAISS index from the computed embeddings.
# NOTE(review): the index type and whether the vectors are added inside the
# helper are decided in src.rag.initialize_faiss_index — verify there.
index = initialize_faiss_index(embeddings)

In [None]:
import os

# Output locations, relative to the notebook's working directory: one file for
# the FAISS index and one (.pkl) for the accompanying metadata.
index_path = './vector_store/complaint_chunks.index'
metadata_path = './vector_store/complaint_chunks_metadata.pkl'

# Make sure the target folder is present before anything is written;
# exist_ok=True keeps re-runs of this cell from failing.
vector_store_dir = os.path.dirname(index_path)
os.makedirs(vector_store_dir, exist_ok=True)

# Hand the index and metadata to the project helper together with both paths.
save_faiss_index(index, metadata, index_path, metadata_path)