## Imports and setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
# Add src to path so modules imported by later cells can be found
# (presumably chunking, embedding and vector_store_builder live there — confirm).
# Guarded so that re-running this cell does not append duplicate entries.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# Set style: seaborn grid theme and a default figure size for all plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## Stratified Sampling

In [12]:
import pandas as pd

def create_stratified_sample(input_file, sample_size, stratify_col='Product', random_state=42):
    """Draw a proportionally stratified random sample from a CSV file.

    Every group of ``stratify_col`` is sampled with the same fraction
    (``sample_size / total rows``), so group proportions in the sample match
    those of the full file up to per-group rounding. The returned row count
    may therefore differ slightly from ``sample_size``.

    Parameters
    ----------
    input_file : str or path-like
        CSV file to read.
    sample_size : int
        Approximate number of rows wanted in the sample.
    stratify_col : str, default 'Product'
        Column whose value distribution is preserved.
    random_state : int, default 42
        Seed passed to each per-group ``DataFrame.sample`` call.

    Returns
    -------
    pandas.DataFrame
        Sampled rows, groups concatenated in sorted key order, with a fresh
        0..n-1 index. Rows whose ``stratify_col`` is NaN are not sampled
        (``groupby`` drops NaN keys by default).

    Raises
    ------
    KeyError
        If ``stratify_col`` is not a column of a non-empty file.
    ValueError
        If the resulting fraction is negative or greater than 1
        (raised by ``DataFrame.sample``).
    """
    # Load the data from the file the caller asked for
    df = pd.read_csv(input_file)

    total_rows = len(df)
    if total_rows == 0:
        # Nothing to sample; also avoids dividing by zero below
        return df.reset_index(drop=True)

    # Same fraction for every group keeps the group proportions intact
    frac = sample_size / total_rows

    # Sample each group explicitly instead of groupby.apply: applying a
    # function that operates on the grouping column is deprecated in recent
    # pandas, and this form always keeps stratify_col in the result.
    sampled_groups = [
        group.sample(frac=frac, random_state=random_state)
        for _, group in df.groupby(stratify_col)
    ]
    if not sampled_groups:
        # Every stratify value was NaN, so there are no groups to concatenate
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_groups).reset_index(drop=True)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load original dataset
original_path = r"C:\Users\user\Desktop\Project\complaint-chatbot\data\processed\filtered_complaints.csv"
original_df = pd.read_csv(original_path)

# No earlier cell assigns sample_df, so on a fresh kernel it would be undefined.
# Keep an existing sample_df; otherwise build it with create_stratified_sample.
if 'sample_df' not in globals():
    # TODO(review): 10,000 is a placeholder — confirm the intended sample size.
    SAMPLE_SIZE = 10_000
    sample_df = create_stratified_sample(
        original_path,
        sample_size=min(SAMPLE_SIZE, len(original_df)),
    )

# Count product occurrences
original_counts = original_df['Product'].value_counts()
sample_counts = sample_df['Product'].value_counts()

# Plot side-by-side
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Original dataset
axes[0].bar(range(len(original_counts)), original_counts.values)
axes[0].set_xticks(range(len(original_counts)))
axes[0].set_xticklabels(original_counts.index, rotation=45, ha='right')
axes[0].set_title('Original Dataset - Product Distribution')
axes[0].set_ylabel('Count')

# Sample dataset
axes[1].bar(range(len(sample_counts)), sample_counts.values, color='orange')
axes[1].set_xticks(range(len(sample_counts)))
axes[1].set_xticklabels(sample_counts.index, rotation=45, ha='right')
axes[1].set_title('Sample Dataset - Product Distribution')
axes[1].set_ylabel('Count')

plt.tight_layout()

# Save figure. savefig does not create missing folders, so make sure the
# output directory exists first.
figure_path = Path(r'../report/images/sampling_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Proportional representation check: the two percentage tables should be
# close to each other if the stratification worked.
print("\nProportional representation check:")

print("\nOriginal proportions (%):")
print((original_df['Product'].value_counts(normalize=True) * 100).round(2))

print("\nSample proportions (%):")
print((sample_df['Product'].value_counts(normalize=True) * 100).round(2))


In [None]:
from chunking import create_text_chunks, analyze_chunks, save_chunks

# Chunking parameters, gathered in one place so they are easy to tune
chunk_settings = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "text_column": "Consumer complaint narrative",
}
chunks_output_path = r"C:\Users\user\Desktop\Project\complaint-chatbot\data\processed\chunck_sample.csv"

# Split the sampled complaints into text chunks
chunks_data = create_text_chunks(df=sample_df, **chunk_settings)

# Run the project's chunk analysis on the result
analyze_chunks(chunks_data)

# Persist the chunks; the embedding cell below reads this same file
save_chunks(chunks_data, chunks_output_path)

In [None]:
# Visualize chunk statistics
chunk_lengths = [len(chunk['text']) for chunk in chunks_data]

# One entry per complaint id; chunks of the same complaint overwrite each
# other, leaving the total_chunks value of the last chunk seen.
chunks_per_complaint = {
    chunk['complaint_id']: chunk['total_chunks'] for chunk in chunks_data
}
chunks_counts = list(chunks_per_complaint.values())

# Summary statistics shown as reference lines on the histograms
mean_length = np.mean(chunk_lengths)
median_length = np.median(chunk_lengths)
mean_count = np.mean(chunks_counts)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
length_ax, count_ax = axes

# Left panel: chunk length distribution
length_ax.hist(chunk_lengths, bins=50, edgecolor='black', alpha=0.7)
length_ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.0f}')
length_ax.axvline(median_length, color='green', linestyle='--', label=f'Median: {median_length:.0f}')
length_ax.set(
    xlabel='Chunk Length (characters)',
    ylabel='Frequency',
    title='Distribution of Chunk Lengths',
)
length_ax.legend()

# Right panel: chunks per complaint, one bin per integer count
count_bins = range(1, max(chunks_counts) + 2)
count_ax.hist(chunks_counts, bins=count_bins, edgecolor='black', alpha=0.7, color='orange')
count_ax.axvline(mean_count, color='red', linestyle='--', label=f'Mean: {mean_count:.2f}')
count_ax.set(
    xlabel='Chunks per Complaint',
    ylabel='Frequency',
    title='Distribution of Chunks per Complaint',
)
count_ax.legend()

plt.tight_layout()
plt.savefig('../report/images/chunking_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
from embedding import generate_embeddings, analyze_embeddings, save_embeddings

# Inputs and outputs of the embedding step
chunks_csv_path = r"C:\Users\user\Desktop\Project\complaint-chatbot\data\processed\chunck_sample.csv"
embeddings_output_path = r"C:\Users\user\Desktop\Project\complaint-chatbot\data\processed\sample_embedding.npy"
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Embed the chunks stored in the CSV written by the chunking cell
embeddings, chunks_df = generate_embeddings(
    chunks_file=chunks_csv_path,
    model_name=embedding_model_name,
    batch_size=32,
)

# Run the project's embedding analysis
analyze_embeddings(embeddings)

# Persist the embeddings together with the chunk table
save_embeddings(embeddings, chunks_df, embeddings_output_path)

In [None]:
# Visualize embedding statistics
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
values_ax, norms_ax = axes

# Left panel: histogram over every individual embedding component,
# with a log-scaled count axis
values_ax.hist(embeddings.flatten(), bins=100, edgecolor='black', alpha=0.7)
values_ax.set(
    xlabel='Embedding Value',
    ylabel='Frequency',
    title='Distribution of Embedding Values',
)
values_ax.set_yscale('log')

# Right panel: L2 norm of each row of the embedding matrix
norms = np.linalg.norm(embeddings, axis=1)
mean_norm = np.mean(norms)
norms_ax.hist(norms, bins=50, edgecolor='black', alpha=0.7, color='green')
norms_ax.axvline(mean_norm, color='red', linestyle='--', label=f'Mean: {mean_norm:.4f}')
norms_ax.set(
    xlabel='L2 Norm',
    ylabel='Frequency',
    title='Distribution of Embedding Norms',
)
norms_ax.legend()

plt.tight_layout()
plt.savefig('../report/images/embedding_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

##  FAISS Vector Store

In [None]:
from vector_store_builder import build_vector_store, test_vector_store

# Build vector store from the artifacts written by the cells above.
# NOTE(review): the earlier cells save the chunks as "chunck_sample.csv" and the
# embeddings as "sample_embedding.npy" in data/processed. This cell used to read
# "sample_chunks.csv" and "sample_embeddings.npy", which no cell in this notebook
# writes, so it either failed or picked up stale files from another run.
# Assumes the notebook runs from a folder next to data/ so these relative paths
# resolve to the same directory as the absolute paths used above — confirm.
# Also confirm that the chunk CSV is the right metadata table for the embeddings.
store = build_vector_store(
    embeddings_file="../data/processed/sample_embedding.npy",
    metadata_file="../data/processed/chunck_sample.csv",
    output_dir="../vector_store/faiss_index",
    index_type="flat"
)

# Test vector store
test_vector_store(store)

##  Interactive Testing

In [None]:
from vector_store_builder import FAISSVectorStore

# Load the saved vector store and its embedding model
store = FAISSVectorStore.load("../vector_store/faiss_index")
store.load_model()

# Custom query: fetch the five closest chunks
query = "I want to dispute fraudulent charges on my account"
results = store.query(query, k=5)

print(f"Query: '{query}'\n")
print("=" * 80)

# Metadata fields to show for each hit, as (label, result key) pairs;
# missing keys are printed as N/A.
display_fields = (
    ("Product", "product_category"),
    ("Issue", "issue"),
    ("Company", "company"),
    ("State", "state"),
)

for rank, hit in enumerate(results, start=1):
    print(f"\nResult {rank}:")
    print(f"  Similarity: {hit['similarity']:.4f}")
    for label, key in display_fields:
        print(f"  {label}: {hit.get(key, 'N/A')}")
    # Show only the first 300 characters of the chunk text
    snippet = hit.get('text', '')[:300]
    print(f"  Text: {snippet}...")

In [None]:
# Test with metadata filtering: restrict hits to a single product category
query = "problems with online banking"
product_filter = {'product_category': 'Checking or savings account'}
results = store.query(query, k=5, filter_metadata=product_filter)

print(f"Query: '{query}'")
print("Filter: Product = 'Checking or savings account'\n")
print("=" * 80)

separator = "-" * 80
for rank, hit in enumerate(results, start=1):
    # Only the first 200 characters of the chunk text are shown
    snippet = hit.get('text', '')[:200]
    print(f"\nResult {rank}:")
    print(f"  Similarity: {hit['similarity']:.4f}")
    print(f"  Product: {hit.get('product_category', 'N/A')}")
    print(f"  Issue: {hit.get('issue', 'N/A')}")
    print(f"  Text: {snippet}...")
    print(separator)