In [None]:
import os

import faiss
import numpy as np
import pandas as pd  # every DataFrame call in this notebook goes through `pd`
import polars as pl
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

tqdm.pandas()

# Sample data for testing; used only as a fallback when the real dataset file is absent
sample_data = pd.DataFrame({
    'Complaint ID': [1001, 1002, 1003],
    'Product': ['Credit card', 'Personal loan', 'Savings account'],
    'cleaned_narrative': [
        'unauthorized transaction appeared account disputed bank resolved issue timely manner',
        'applied loan high interest rate disclosed upfront felt misled terms',
        'account frozen without notice customer service unresponsive'
    ]
})

# Define paths
cleaned_data_path = '../data/filtered_and_cleaned_complaints.csv'
vector_store_dir = '../vector_store/'
faiss_index_path = os.path.join(vector_store_dir, 'faiss_index.bin')
metadata_path = os.path.join(vector_store_dir, 'metadata.json')

# Load dataset: read the cleaned CSV when it exists, otherwise fall back to sample_data.
# (The previous conditional was inverted: it picked the sample when the file existed
# and called read_csv on a missing file otherwise.)
if os.path.exists(cleaned_data_path):
    df_cleaned = pd.read_csv(cleaned_data_path, encoding='utf-8')
else:
    print(f"Dataset not found at {cleaned_data_path}; falling back to in-memory sample data")
    df_cleaned = sample_data
print(f"Loaded dataset shape: {df_cleaned.shape}")

In [None]:
# Build the splitter that cuts each narrative into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    # Sizes below are measured with len(), i.e. in characters of the narrative string.
    length_function=len,
    chunk_size=500,      # upper bound on a single chunk
    chunk_overlap=100,   # characters shared between neighbouring chunks
    # Per the LangChain docs this records each chunk's start offset in its metadata;
    # nothing later in this notebook reads that field.
    add_start_index=True,
)

# Prepare documents for splitting: one dict per complaint holding the narrative text
# plus the metadata (product, complaint id) that must travel with every chunk.
# The three needed columns are zipped directly instead of using iterrows(), which
# builds a full Series for every row.
documents = [
    {
        "page_content": narrative,
        "metadata": {
            "product": product,
            "complaint_id": complaint_id
        }
    }
    for narrative, product, complaint_id in zip(
        df_cleaned['cleaned_narrative'],
        df_cleaned['Product'],
        df_cleaned['Complaint ID'],
    )
    # isinstance() rejects NaN/None as well as any other non-text cell, so .strip()
    # can no longer raise AttributeError; blank / whitespace-only text is skipped too.
    if isinstance(narrative, str) and narrative.strip()
]

In [None]:
# Feed the splitter two parallel lists: the narrative texts and their metadata dicts.
source_texts = []
source_metadatas = []
for doc in documents:
    source_texts.append(doc["page_content"])
    source_metadatas.append(doc["metadata"])

chunks = text_splitter.create_documents(source_texts, metadatas=source_metadatas)
print(f"Created {len(chunks)} chunks")

# Flatten every chunk into a plain record so the result can live in a DataFrame.
chunk_records = []
for piece in chunks:
    piece_meta = piece.metadata
    chunk_records.append({
        'chunk_content': piece.page_content,
        'product': piece_meta['product'],
        'complaint_id': piece_meta['complaint_id'],
    })
chunks_df = pd.DataFrame(chunk_records)

# Quick sanity output: overall size plus the first few rows.
print(f"Chunks DataFrame shape: {chunks_df.shape}")
print("Sample chunks:")
print(chunks_df.head())

In [None]:
# Visualize chunk length distribution
import matplotlib.pyplot as plt
import seaborn as sns

# Length of every chunk in characters (len() of the chunk text).
chunk_lengths = [len(chunk.page_content) for chunk in chunks]

if chunk_lengths:
    plt.figure(figsize=(8, 5))
    sns.histplot(chunk_lengths, bins=20, kde=True, color='purple')
    plt.title('Distribution of Chunk Lengths')
    plt.xlabel('Chunk Length (characters)')
    plt.ylabel('Frequency')
    plt.show()
    print(f"Chunk length stats: Min={np.min(chunk_lengths)}, Max={np.max(chunk_lengths)}, Mean={np.mean(chunk_lengths):.2f}")
else:
    # np.min / np.max raise ValueError on an empty sequence, so report the situation
    # instead of crashing when the splitter produced nothing.
    print("No chunks were created; skipping chunk length plot and statistics.")

In [None]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Collect the texts to embed; tolerate a column-less frame produced by an empty chunk list.
chunk_texts = chunks_df['chunk_content'].tolist() if 'chunk_content' in chunks_df else []
if not chunk_texts:
    # Fail with a clear message here rather than with an IndexError on shape[1] below.
    raise ValueError("No chunks to embed; check that the dataset contains non-empty narratives.")

# Generate embeddings.
# The index below ranks by raw inner product, which equals cosine similarity only for
# unit-length vectors, so normalisation is requested explicitly instead of relying on
# whatever the chosen model happens to output.
embeddings_np = model.encode(
    chunk_texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
print(f"Embeddings shape: {embeddings_np.shape}")

# Initialize and populate FAISS index (exact inner-product search over all vectors)
dimension = embeddings_np.shape[1]
faiss_index = faiss.IndexFlatIP(dimension)
faiss_index.add(embeddings_np)

# Row i of chunks_df is looked up by FAISS id i, so the two sizes must match.
assert faiss_index.ntotal == len(chunk_texts), "FAISS index size does not match number of chunks"
print(f"FAISS index contains {faiss_index.ntotal} vectors")

In [None]:
# Persist the vector store: the FAISS index plus the per-chunk metadata records.
# Create the target directory first; an already existing directory is not an error.
os.makedirs(vector_store_dir, exist_ok=True)

# Binary index file.
faiss.write_index(faiss_index, faiss_index_path)

# One JSON record per chunk, written in chunks_df row order.
chunks_df.to_json(
    metadata_path,
    orient='records',
    indent=4,
)

print(f"Saved FAISS index to {faiss_index_path} and metadata to {metadata_path}")