# Load the Cleaned Dataset

In [2]:
import pandas as pd

# Read the cleaned complaints file, keep only the narrative and product
# columns (add 'complaint_id' to the selection if the file carries one),
# and discard rows whose narrative is missing.
df = (
    pd.read_csv('../data/filtered_complaints.csv')
    .loc[:, ['cleaned_narrative', 'Product']]
    .dropna(subset=['cleaned_narrative'])
)


# Chunk the Narratives

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure chunk size & overlap. Separators are tried in the listed order,
# from paragraph breaks down to the empty string as the last resort.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
)

# Split every narrative into chunks and keep one record per chunk.
# Iterating the index and the two columns directly avoids the per-row Series
# that DataFrame.iterrows() builds, which is slow on a frame this large.
# `source_id` is the DataFrame index label of the originating complaint
# (swap in a 'complaint_id' column here if one is available).
docs = []
for i, narrative, product in zip(df.index, df['cleaned_narrative'], df['Product']):
    for chunk in text_splitter.split_text(narrative):
        docs.append({
            "text": chunk,
            "Product": product,
            "source_id": i
        })

print(f"Generated {len(docs)} text chunks.")


Generated 499900 text chunks.


# Generate Embeddings

In [5]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Pull the chunk strings out of the chunk records, preserving their order so
# that row k of the embeddings corresponds to docs[k].
texts = list(entry['text'] for entry in docs)

# Encode every chunk in a single call, showing a progress bar while it runs.
embeddings = model.encode(sentences=texts, show_progress_bar=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/15622 [00:00<?, ?it/s]

# Index with FAISS

In [7]:
import faiss
import numpy as np
import os
import pickle

# FAISS expects a C-contiguous float32 matrix of shape (n_vectors, dim).
# ascontiguousarray enforces that layout and, unlike np.array, does not copy
# when the embeddings already satisfy it.
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
if vectors.ndim != 2 or vectors.shape[0] == 0:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {vectors.shape}"
    )
# Row k of the index must describe docs[k]; refuse to write a store in which
# the vectors and the metadata would be misaligned.
if vectors.shape[0] != len(docs):
    raise ValueError(
        f"Embedding count ({vectors.shape[0]}) does not match chunk count ({len(docs)})"
    )

# Build an exact (brute-force) L2 index over all chunk vectors.
embedding_dim = vectors.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(vectors)

# Metadata list: position k holds the text/product/source of vector k.
metadata = [{k: doc[k] for k in ('text', 'Product', 'source_id')} for doc in docs]

# Create the output directory (relative to the current working directory).
os.makedirs('vector_store', exist_ok=True)

# Save FAISS index
faiss.write_index(index, 'vector_store/faiss_index.index')

# Save metadata. NOTE: unpickling can execute arbitrary code, so only load
# this file back from a location you trust.
with open('vector_store/metadata.pkl', 'wb') as f:
    pickle.dump(metadata, f)

print("Vector store saved to 'vector_store/'")


Vector store saved to 'vector_store/'
