# Cell 2: Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import os
import shutil

# 1. Load Processed Data

In [2]:
# Read the filtered complaints CSV into a DataFrame.
complaints_csv_path = "../data/processed/filtered_complaints.csv"
df = pd.read_csv(complaints_csv_path)

# 2. Stratified Sampling

## Draw a stratified sample of 10,000 complaints to keep this demo fast

In [3]:
# Upper bound on the number of complaints carried into the later cells.
SAMPLE_SIZE = 10000

if len(df) <= SAMPLE_SIZE:
    # Small enough already: use the full frame as-is (no copy is made).
    df_sample = df
else:
    # Stratify on Product_Category so every category stays represented in
    # roughly its original proportion; the fixed seed makes the draw repeatable.
    df_sample, _ = train_test_split(
        df,
        stratify=df['Product_Category'],
        train_size=SAMPLE_SIZE,
        random_state=42,
    )

print(f"Sample size: {len(df_sample)}")

Sample size: 10000


# 3. Chunking

## Split long narratives into smaller chunks (up to 500 characters, 50-character overlap) for embedding and retrieval

In [4]:
# Splitter configuration: chunks of at most 500 characters (length is measured
# with len) that overlap by 50 characters; separators are listed from
# paragraph breaks down to single characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Parallel lists: documents[k] is a chunk of text, metadatas[k] describes it.
documents = []
metadatas = []

print("Chunking documents...")
for idx, row in df_sample.iterrows():
    narrative = row['cleaned_narrative']
    # Rows without a narrative contribute no chunks.
    if pd.isna(narrative) or narrative == "":
        continue

    # Use the complaint ID if available, else the row index. Series.get only
    # falls back when the column is absent, so a missing (NaN) value in an
    # existing column has to be handled explicitly -- otherwise the id would
    # be stored as the string "nan".
    complaint_id = row.get('Complaint ID', idx)
    if pd.isna(complaint_id):
        complaint_id = idx

    # Same reasoning for the issue label: a NaN cell becomes 'Unknown' instead
    # of leaking a float NaN into the vector-store metadata.
    issue = row.get('Issue', 'Unknown')
    if pd.isna(issue):
        issue = 'Unknown'

    # Fields shared by every chunk of this complaint; built once per row.
    row_metadata = {
        "complaint_id": str(complaint_id),
        "product_category": row['Product_Category'],
        "issue": issue,
    }

    for i, chunk in enumerate(text_splitter.split_text(narrative)):
        documents.append(chunk)
        # chunk_index is the chunk's position within its source narrative.
        metadatas.append({**row_metadata, "chunk_index": i})

print(f"Total chunks created: {len(documents)}")

Chunking documents...
Total chunks created: 28837


# 4. Embedding & Indexing

## Define the embedding model

In [5]:
# Name of the sentence-transformers model used to embed each chunk.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Embedding wrapper built from that model name.
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Define vector store path

In [6]:
# Relative path (resolved against the current working directory) of the
# directory that holds the persisted vector store.
persist_directory = "../vector_store"

## Clear existing vector store if it exists (to avoid duplicates during testing)

In [7]:
# Start from a clean slate: remove any previously persisted store so that
# re-running the notebook does not add the same chunks a second time.
store_already_exists = os.path.exists(persist_directory)
if store_already_exists:
    shutil.rmtree(persist_directory)

print("Creating Vector Store (this may take a few minutes)...")

Creating Vector Store (this may take a few minutes)...


## Create ChromaDB

In [8]:
# Embed every chunk and write the resulting Chroma collection, together with
# the per-chunk metadata, to persist_directory.
vector_store = Chroma.from_texts(
    documents,
    embeddings,
    metadatas=metadatas,
    persist_directory=persist_directory,
)

print(f"Vector store created and saved to {persist_directory}")

Vector store created and saved to ../vector_store
