In [1]:
# Import libraries and modules
import sys
sys.path.append('../src')
                                    
import pandas as pd
from text_chunker import TextChunker
from text_embedder import TextEmbedder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the filtered & cleaned complaints dataset
filtered_df = pd.read_csv("../data/filtered_complaints.csv")
# Configure the chunker (chunk_size=300, chunk_overlap=50; the units are whatever
# TextChunker defines -- see ../src/text_chunker.py)
chunker = TextChunker(chunk_size=300, chunk_overlap=50)
# Split the dataset into chunks
chunked_df = chunker.chunk_dataframe(filtered_df)
# Preview three random rows. A fixed random_state keeps the preview identical
# across "Restart Kernel -> Run All" instead of changing on every execution.
chunked_df.sample(3, random_state=42)

Unnamed: 0,original_index,product,chunk
1137447,252502,"Payday loan, title loan, or personal loan",because they said i was behind a payment the n...
1001777,225546,Checking or savings account,items must be posted step contact us to close ...
830922,190654,Credit card,rate charge xxxx xxxx however it said that i o...


In [None]:
# Create the embedder for the named model
embedder = TextEmbedder(model_name="paraphrase-MiniLM-L3-v2")
# Pull the "chunk" column out as a plain Python list, preserving DataFrame row order
chunks = chunked_df["chunk"].to_list()
# Compute embeddings for every chunk, passing batch_size=512 to the embedder
embeddings = embedder.embed_texts(chunks, batch_size=512)
# Sanity check: report the shape of the result
print("Embeddings shape:", embeddings.shape)

In [None]:
import os
import faiss
import numpy as np
import json

In [None]:
# Make sure the output directory exists (no error if it is already there)
os.makedirs("vector_store", exist_ok=True)

# FAISS works on C-contiguous float32 matrices. np.ascontiguousarray converts only
# when needed, so an array that is already contiguous float32 is not copied again
# (astype() would always allocate a second full-size copy).
embeddings = np.ascontiguousarray(embeddings, dtype="float32")
if embeddings.ndim != 2:
    # Fail with a clear message instead of an opaque IndexError on shape[1]
    raise ValueError(
        f"Expected a 2-D (n_vectors, dim) embedding matrix, got shape {embeddings.shape}"
    )
embedding_dim = embeddings.shape[1]

# Initialize FAISS index: exact (brute-force) search with L2 distance.
# For cosine similarity, L2-normalise the vectors and use IndexFlatIP instead.
index = faiss.IndexFlatL2(embedding_dim)

# Add embeddings to index; vector i keeps the position of row i of `embeddings`
index.add(embeddings)
print(f"Number of vectors in the index: {index.ntotal}")

# Save the FAISS index to disk
faiss.write_index(index, "vector_store/faiss_index.bin")

# Save metadata for each chunk: source identifiers, product category and chunk text.
# Record i of the list describes vector i of the FAISS index, so the row order of
# chunked_df must be preserved.
# "original_index" is included because the chunked_df preview shows that column but
# no "complaint_id" column; without it every record would carry a null ID and could
# not be traced back to its source row. Adjust the names to your actual DataFrame.
METADATA_COLUMNS = ["complaint_id", "original_index", "product", "chunk"]

# reindex() creates an all-missing column for any name chunked_df lacks (the same
# result as the former row.get(..., None)), without a Python-level iterrows() loop
# over every chunk. Casting to object lets missing values be replaced by None.
metadata_df = chunked_df.reindex(columns=METADATA_COLUMNS).astype(object)
# None serialises as JSON null; NaN would be written as the non-standard token NaN.
metadata_df = metadata_df.where(metadata_df.notna(), None)
metadata_list = metadata_df.to_dict(orient="records")

# A metadata list that is out of step with the index would silently attach the
# wrong text to search hits, so stop before writing it.
if len(metadata_list) != index.ntotal:
    raise ValueError(
        f"Metadata has {len(metadata_list)} records but the FAISS index holds "
        f"{index.ntotal} vectors; they must be aligned one-to-one."
    )

# Save metadata as JSON file aligned with the vectors in the FAISS index
with open("vector_store/metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, ensure_ascii=False, indent=2)

print("FAISS index and metadata saved to 'vector_store/' directory.")