In [None]:
# Import standard libraries
import numpy as np
import pandas as pd
import os
import pickle
import faiss

# Import LangChain's text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [None]:
# Location of the cleaned complaints CSV, relative to this notebook
clean_data_path = "../data/filtered_complaints.csv"

# Read the cleaned complaints into a DataFrame
df = pd.read_csv(clean_data_path)

# Report the frame's dimensions, then display the first rows of the
# product and narrative columns as the cell output
print("Cleaned dataset shape:", df.shape)
preview_columns = ["Product", "Cleaned_Narrative"]
df[preview_columns].head()

In [None]:
# Chunk long text using LangChain

# Chunking strategy: pieces of at most 300 characters that overlap by 50
# characters so context carries across chunk boundaries. Length is measured
# with len (characters); swap in a token counter to size chunks by tokens.
# Separators are tried in the listed order of preference.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ".", " "],
)

# all_chunks[i] and metadata[i] describe the same chunk; the two lists must
# stay aligned because later cells look both up by the same position.
all_chunks = []
metadata = []
skipped_rows = 0

# Iterate the needed columns directly instead of building a Series per row
# with iterrows(); idx is still the DataFrame index label of the source row.
rows = zip(df.index, df["Product"], df["Complaint ID"], df["Cleaned_Narrative"])
for idx, product, complaint_id, narrative in rows:
    # Blank CSV cells are read back by pandas as NaN (a float), which the
    # splitter cannot process. Skip such rows instead of crashing mid-loop.
    if not isinstance(narrative, str):
        skipped_rows += 1
        continue

    chunks = text_splitter.split_text(narrative)
    all_chunks.extend(chunks)
    # Build a fresh dict per chunk. `[record] * n` would store n references
    # to one dict, so editing one chunk's metadata would change all of them.
    metadata.extend(
        {
            "product": product,
            "complaint_id": complaint_id,
            "original_index": idx,
        }
        for _ in chunks
    )

# Check result
print(f"Total chunks created: {len(all_chunks)}")
if skipped_rows:
    print(f"Rows skipped (missing narrative): {skipped_rows}")
if all_chunks:
    # Guarded so an empty result reports a count of 0 instead of raising IndexError
    print(f"Sample chunk:\n{all_chunks[0]}")


In [None]:
# Create embeddings
# Name of the sentence-transformers checkpoint used to embed every chunk
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)

# Encode all chunks in a single call, showing a progress bar while it runs
embeddings = model.encode(all_chunks, show_progress_bar=True)

# Report how many vectors came back and the length of the first one
print("Total embeddings:", len(embeddings))
first_vector = embeddings[0]
print("Embedding vector size:", len(first_vector))


In [None]:
# Store embeddings in a FAISS vector index
# Copy the embeddings into a NumPy array and cast it to float32
raw_matrix = np.array(embeddings)
embedding_matrix = raw_matrix.astype('float32')

# Build a flat L2 (Euclidean distance) index sized to the vector width,
# then add every embedding row to it
vector_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(vector_dim)
index.add(embedding_matrix)

# Persist the index and the per-chunk metadata side by side
os.makedirs("vector_store", exist_ok=True)

index_path = "vector_store/index.faiss"
metadata_path = "vector_store/index.pkl"

faiss.write_index(index, index_path)

with open(metadata_path, "wb") as metadata_file:
    pickle.dump(metadata, metadata_file)

print(" Vector index and metadata saved to vector_store/")


In [None]:
# Load and test the index
# Read the index back from the same path the save step wrote it to
# (vector_store/index.faiss); the previous filename was never created.
index = faiss.read_index("vector_store/index.faiss")

# Load the per-chunk metadata written next to the index (vector_store/index.pkl).
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("vector_store/index.pkl", "rb") as f:
    metadata = pickle.load(f)

# Example: find the top 3 chunks most similar to a query
query = "Why are customers upset about Buy Now Pay Later?"
query_vector = model.encode([query]).astype("float32")

# Search: D holds the distances, I the positions of the matching vectors
top_k = 3
D, I = index.search(query_vector, top_k)

print("\nTop matching complaint chunks:\n")
for idx in I[0]:
    # FAISS pads the result with -1 when the index holds fewer than top_k
    # vectors; without this guard metadata[-1] would print the last record.
    if idx < 0:
        continue
    print(f"Product: {metadata[idx]['product']}")
    print(f"Complaint ID: {metadata[idx]['complaint_id']}")
    # Chunk text is not persisted with the index, so all_chunks must still be
    # in memory from the chunking step for this lookup to work.
    print(all_chunks[idx])
    print("-" * 80)
