In [5]:
import os

import pandas as pd
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings           # Updated import
from langchain_text_splitters import RecursiveCharacterTextSplitter # Updated import



In [6]:
# 1. Load the cleaned data from Task 1
# The path is relative, so it resolves against the directory the
# notebook/script is run from -- adjust it if the kernel starts elsewhere.
df = pd.read_csv(
    '../data/processed/filtered_complaints.csv',
)

In [7]:
# 2. Stratified Sampling (15,000 samples)
# Equal allocation: every product category gets the same quota
# (sample_size // number of categories), capped at the category's own size.
# This balances the categories; it does NOT preserve their original
# proportions. The total falls short of sample_size whenever a category has
# fewer rows than its quota.
sample_size = 15000
num_categories = df['Product'].nunique()
if num_categories == 0:
    # Without this guard the integer division below fails with an opaque
    # ZeroDivisionError on an empty frame or an all-NaN 'Product' column.
    raise ValueError("No 'Product' categories found; cannot draw a stratified sample.")
samples_per_cat = sample_size // num_categories

# Sample each group directly instead of going through groupby().apply().
# apply() operating on the grouping column is deprecated in recent pandas,
# and the later cells need 'Product' to stay in the sampled frame.
# Iterating the groupby visits categories in sorted key order and skips rows
# whose 'Product' is NaN, the same as the default groupby behaviour.
df_sampled = pd.concat(
    [
        group.sample(n=min(len(group), samples_per_cat), random_state=42)
        for _, group in df.groupby('Product')
    ]
)

print(f"Sampling complete. Total rows: {len(df_sampled)}")

Sampling complete. Total rows: 15000


  df_sampled = df.groupby('Product', group_keys=False).apply(


In [8]:
# 3. Setup Chunking Strategy
# Chunks of at most 500 characters with a 50-character overlap between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)

# 4. Prepare Documents and Metadata
# `documents` and `metadatas` are parallel lists: metadatas[i] describes the
# complaint that documents[i] was cut from.
documents = []
metadatas = []

# Walk only the three columns that are needed; this avoids building a Series
# per row the way iterrows() does.
sampled_rows = zip(
    df_sampled['cleaned_narrative'],
    df_sampled['Complaint ID'],
    df_sampled['Product'],
)

for narrative, complaint_id, product in sampled_rows:
    # We use the 'cleaned_narrative' created in Task 1.
    # A missing narrative comes back from the CSV as NaN; str(NaN) is the
    # literal text "nan", which would otherwise be embedded as a real chunk.
    if pd.isna(narrative):
        continue
    content = str(narrative)
    if not content.strip():
        continue

    row_metadata = {
        "complaint_id": str(complaint_id),
        "product": product
    }
    for chunk in text_splitter.split_text(content):
        documents.append(chunk)
        # Fresh dict per chunk so entries never share a mutable object.
        metadatas.append(dict(row_metadata))

In [9]:
# 5. Initialize Embedding Model
# The model is downloaded on the first run (approx 90MB)
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
)

In [11]:
# 6. Create and Save Vector Store
# (`os` is imported in the notebook's top import cell.)
vector_db_path = "../vector_store"

# Ensure the directory exists (no error if it is already there)
os.makedirs(vector_db_path, exist_ok=True)

# Build a deterministic id per chunk: "<complaint_id>-<n>", where n counts the
# chunks seen so far for that complaint. Without explicit ids the store
# assigns fresh random ids on every call, so re-running this cell against an
# existing persist_directory would add every chunk a second time.
# NOTE(review): this relies on Chroma upserting on matching ids -- confirm
# for the installed version. Entries written by earlier runs under random ids
# are not removed; clear the directory once if the store was already built.
chunks_seen = {}
chunk_ids = []
for meta in metadatas:
    complaint_id = meta["complaint_id"]
    position = chunks_seen.get(complaint_id, 0)
    chunks_seen[complaint_id] = position + 1
    chunk_ids.append(f"{complaint_id}-{position}")

vector_db = Chroma.from_texts(
    texts=documents,
    embedding=embeddings,
    metadatas=metadatas,
    ids=chunk_ids,
    persist_directory=vector_db_path
)

print(f"Success! Vector store created with {len(documents)} chunks.")
print(f"Location: {os.path.abspath(vector_db_path)}")

Success! Vector store created with 42904 chunks.
Location: c:\Users\Her\Desktop\Week_7\rag-complaint-chatbot\vector_store


In [12]:
# Test Search
# Smoke-test retrieval: fetch the two nearest chunks for a sample question and
# show which product each came from plus the first 150 characters of its text.
test_query = "problems with unauthorized transactions on my credit card"
results = vector_db.similarity_search(test_query, k=2)

for rank, hit in enumerate(results, start=1):
    snippet = hit.page_content[:150]
    print(f"\nResult {rank}:")
    print(f"Product: {hit.metadata['product']}")
    print(f"Snippet: {snippet}...")


Result 1:
Product: Checking or savings account
Snippet: i called my financial institution immediately after i noticed several unauthorized purchases on my account while on the phone with the fraud departmen...

Result 2:
Product: Credit card
Snippet: i had 2 unauthorized transactions on my card xxxx for xxxx and xxxx for xxxx xxxx the xxxx charge they took care of no problem but the xxxx charge the...
