### 1. Import Libraries and Load Cleaned Data

In [None]:
# Task 2: Text Chunking, Embedding, and Vector Store Indexing

import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project's src/ directory importable.
# BASE_DIR is the parent of the current working directory.
BASE_DIR = Path.cwd().parent
_src_dir = str(BASE_DIR / 'src')
# Guard the append so re-running this cell does not stack duplicate entries
# onto sys.path.
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

# Import modules
from sampling import create_stratified_sample, analyze_sample_quality
from chunking import ComplaintChunker, experiment_with_chunking
from embedding import EmbeddingModel
from vector_store import VectorStore

# Step 1: read the cleaned complaints CSV from data/processed into `df`
print("Step 1: Loading cleaned data...")
cleaned_data_path = BASE_DIR.joinpath('data', 'processed', 'filtered_complaints.csv')
df = pd.read_csv(cleaned_data_path)

# Report the row count (thousands-separated) and the column names
print("Loaded {:,} cleaned complaints".format(df.shape[0]))
print("Columns: {}".format(df.columns.tolist()))

### 2. Create Stratified Sample

In [None]:
print("\n" + "=" * 50, "Step 2: Creating stratified sample...", sep="\n")

# Draw a sample of the cleaned complaints, stratified on the
# 'Product_standardized' column, with a fixed seed for repeatability.
sampled_df = create_stratified_sample(
    df=df,
    sample_size=12000,  # Target: 12,000 complaints
    stratify_col='Product_standardized',
    random_state=42,
)

# Compare the sample against the full dataset and print the two headline
# figures from the returned report.
quality_report = analyze_sample_quality(df, sampled_df)
print("\nSample Quality Report:")
print(
    "Mean absolute difference in distribution: {:.2f}%".format(
        quality_report['mean_absolute_difference']
    )
)
print("Coverage: {:.2f}% of original data".format(quality_report['coverage']))

# Persist the sample next to the cleaned data (no index column written)
sample_path = BASE_DIR.joinpath('data', 'processed', 'stratified_sample.csv')
sampled_df.to_csv(sample_path, index=False)
print("\nSample saved to: {}".format(sample_path))

### 3. Experiment with Chunking

In [None]:
print("\n" + "="*50)
print("Step 3: Experimenting with chunking parameters...")

# Run the chunking experiment helper. The result is used below as a DataFrame
# with columns chunk_size, chunk_overlap, avg_chunks_per_complaint,
# preservation_score and avg_chunk_length.
# NOTE(review): sample_size=200 presumably caps how many complaints are chunked
# per configuration — confirm in chunking.py.
experiment_results = experiment_with_chunking(sampled_df, sample_size=200)
print("\nChunking Experiment Results:")
print(experiment_results.to_string())

# Visualize experiment results on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: Chunks per complaint vs chunk size (one line per overlap value)
for overlap in experiment_results['chunk_overlap'].unique():
    subset = experiment_results[experiment_results['chunk_overlap'] == overlap]
    axes[0, 0].plot(subset['chunk_size'], subset['avg_chunks_per_complaint'],
                    marker='o', label=f'Overlap={overlap}')
axes[0, 0].set_xlabel('Chunk Size')
axes[0, 0].set_ylabel('Avg Chunks per Complaint')
axes[0, 0].set_title('Chunk Size vs Number of Chunks')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Plot 2: Preservation score vs overlap (one line per chunk size)
for size in experiment_results['chunk_size'].unique():
    subset = experiment_results[experiment_results['chunk_size'] == size]
    axes[0, 1].plot(subset['chunk_overlap'], subset['preservation_score'],
                    marker='s', label=f'Size={size}')
axes[0, 1].set_xlabel('Chunk Overlap')
axes[0, 1].set_ylabel('Preservation Score')
axes[0, 1].set_title('Information Preservation')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Avg chunk length (colour) and chunks per complaint (marker area)
# over the size/overlap grid
scatter = axes[1, 0].scatter(
    experiment_results['chunk_size'],
    experiment_results['chunk_overlap'],
    c=experiment_results['avg_chunk_length'],
    s=experiment_results['avg_chunks_per_complaint'] * 50,
    alpha=0.6,
    cmap='viridis'
)
axes[1, 0].set_xlabel('Chunk Size')
axes[1, 0].set_ylabel('Chunk Overlap')
axes[1, 0].set_title('Chunk Size & Overlap vs Length (size=chunks/complaint)')
plt.colorbar(scatter, ax=axes[1, 0], label='Avg Chunk Length')

# Plot 4: Choose optimal parameters
# Based on experiments, we choose:
# - chunk_size = 500 (standard for sentence transformers)
# - chunk_overlap = 50 (10% overlap for context preservation)
# These values are fixed here; they are not derived programmatically from
# experiment_results.
chosen_size = 500
chosen_overlap = 50

# Show every tested (size, overlap) combination so the chosen point is seen in
# the context of the explored grid rather than on an otherwise empty panel.
axes[1, 1].scatter(
    experiment_results['chunk_size'],
    experiment_results['chunk_overlap'],
    color='gray',
    alpha=0.6,
    label='Tested'
)
axes[1, 1].axvline(chosen_size, color='red', linestyle='--', alpha=0.5)
axes[1, 1].axhline(chosen_overlap, color='red', linestyle='--', alpha=0.5)
axes[1, 1].scatter(chosen_size, chosen_overlap, color='red', s=200, marker='*', label='Chosen')
axes[1, 1].set_xlabel('Chunk Size')
axes[1, 1].set_ylabel('Chunk Overlap')
axes[1, 1].set_title(f'Chosen Parameters: Size={chosen_size}, Overlap={chosen_overlap}')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
# before writing the figure.
reports_dir = BASE_DIR / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'chunking_experiment.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nChosen chunking parameters:")
print(f"  - Chunk size: {chosen_size} characters")
print(f"  - Chunk overlap: {chosen_overlap} characters")
print("  - Rationale: Standard size for sentence transformers, 10% overlap for context")

### 4. Chunk All Sampled Complaints

In [None]:
from collections import Counter

print("\n" + "="*50)
print("Step 4: Chunking all sampled complaints...")

# Build the chunker with the parameters chosen in Step 3
chunker = ComplaintChunker(
    chunk_size=chosen_size,
    chunk_overlap=chosen_overlap
)

# `chunks` is consumed below as an iterable of dicts that each carry a 'text'
# string and a 'metadata' dict with a 'complaint_id' entry.
chunks = chunker.chunk_dataframe(sampled_df)

# Analyze chunking results
chunking_stats = chunker.analyze_chunking_results(chunks, sampled_df)

print("\nChunking Statistics:")
print(f"  Total chunks: {chunking_stats['total_chunks']:,}")
print(f"  Total complaints: {chunking_stats['total_complaints']:,}")
print(f"  Avg chunks per complaint: {chunking_stats['avg_chunks_per_complaint']:.2f}")
print(f"  Chunk length - Min: {chunking_stats['chunk_length_stats']['min']}")
print(f"  Chunk length - Max: {chunking_stats['chunk_length_stats']['max']}")
print(f"  Chunk length - Mean: {chunking_stats['chunk_length_stats']['mean']:.1f}")
print(f"  Chunk length - Median: {chunking_stats['chunk_length_stats']['median']}")

# Visualize chunk distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Chunk length distribution, with the target chunk size marked for reference
chunk_lengths = [len(chunk['text']) for chunk in chunks]
axes[0].hist(chunk_lengths, bins=50, edgecolor='black', alpha=0.7)
axes[0].axvline(chosen_size, color='red', linestyle='--', label=f'Target: {chosen_size}')
axes[0].set_xlabel('Chunk Length (characters)')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Distribution of Chunk Lengths')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Chunks per complaint distribution: count how many chunks share each
# complaint_id (Counter is a dict subclass, so .values() works as before)
chunks_per_complaint = Counter(chunk['metadata']['complaint_id'] for chunk in chunks)

axes[1].hist(list(chunks_per_complaint.values()), bins=20, edgecolor='black', alpha=0.7)
axes[1].set_xlabel('Chunks per Complaint')
axes[1].set_ylabel('Number of Complaints')
axes[1].set_title('Distribution of Chunks per Complaint')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
# savefig does not create missing directories, so make sure reports/ exists
# before writing the figure.
reports_dir = BASE_DIR / 'reports'
reports_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(reports_dir / 'chunking_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

### 5. Choose and Initialize Embedding Model

In [None]:
print("\n" + "=" * 50, "Step 5: Initializing embedding model...", sep="\n")

# Model identifier handed to the EmbeddingModel wrapper below
model_name = 'all-MiniLM-L6-v2'
print("Model choice: {}".format(model_name))

# Print the selection rationale, one bullet per line
print("Reasons for choosing this model:")
for reason in (
    "  1. Efficient: Small model (80MB) with good performance",
    "  2. Standard: Widely used in RAG applications",
    "  3. Balanced: Good trade-off between speed and accuracy",
    "  4. Dimension: 384 dimensions - efficient for retrieval",
    "  5. Specialized: Trained for semantic similarity tasks",
):
    print(reason)

# Instantiate the wrapper and dump whatever metadata it reports about itself
embedding_model = EmbeddingModel(model_name=model_name)
model_info = embedding_model.get_model_info()

print("\nModel Information:")
for info_key, info_value in model_info.items():
    print("  {}: {}".format(info_key, info_value))

### 6. Generate Embeddings

In [None]:
print("\n" + "=" * 50, "Step 6: Generating embeddings for chunks...", sep="\n")

# Collect the raw text of every chunk, preserving chunk order
chunk_texts = [item['text'] for item in chunks]

# Encode all chunk texts in one call, with the progress display enabled
embeddings = embedding_model.encode(chunk_texts, show_progress=True)

# Report the result's shape and its in-memory size in MiB (2**20 bytes)
print("\nEmbeddings generated:")
print("  Shape: {}".format(embeddings.shape))
print("  Size: {:.2f} MB".format(embeddings.nbytes / 2**20))

### 7. Create and Save Vector Store

In [None]:
print("\n" + "=" * 50, "Step 7: Creating vector store...", sep="\n")

# Backend selection (ChromaDB for development, FAISS for production)
store_type = 'chroma'  # or 'faiss'

# Configure the store under <BASE_DIR>/vector_store/sample
vector_store = VectorStore(
    store_type=store_type,
    persist_directory=BASE_DIR.joinpath('vector_store', 'sample'),
    collection_name='complaint_chunks_sample',
)

# Populate the store from the chunks and their embeddings, 1000 per batch
vector_store.create_from_chunks(chunks, embeddings, batch_size=1000)

# Print every statistic the store reports about itself
stats = vector_store.get_stats()
print("\nVector Store Statistics:")
for stat_name, stat_value in stats.items():
    print("  {}: {}".format(stat_name, stat_value))


### 8. Test Vector Store

In [None]:
print("\n" + "=" * 50, "Step 8: Testing vector store with sample queries...", sep="\n")

# Free-text queries used as a retrieval smoke test
test_queries = [
    "credit card fees too high",
    "problems with money transfer",
    "savings account interest rates",
    "personal loan application denied",
]

print("\nTesting retrieval with sample queries:")
for query in test_queries:
    print("\nQuery: '{}'".format(query))

    # Embed the query, then ask the store for 3 results
    query_embedding = embedding_model.encode_single(query)
    results = vector_store.search(query_embedding, k=3)

    print("  Top 3 results:")
    for rank, hit in enumerate(results, start=1):
        product = hit['metadata']['product_category']
        score = hit['score']
        hit_text = hit['text']
        # Cap the preview at 100 characters, marking truncation with "..."
        if len(hit_text) > 100:
            text_preview = hit_text[:100] + "..."
        else:
            text_preview = hit_text
        print("    {}. [{}] Score: {:.3f}".format(rank, product, score))
        print("       Text: {}".format(text_preview))


### 9. Save Configuration

In [None]:
import json

print("\n" + "="*50)
print("Step 9: Saving configuration...")

# Snapshot of the parameters used in this run.
# NOTE(review): the sampling values and the vector-store directory/collection
# name repeat the literals passed in Steps 2 and 7; keep them in sync if those
# steps change.
# NOTE(review): 'separators' is written as a literal here — presumably it
# mirrors the separators ComplaintChunker uses; confirm against chunking.py.
config = {
    'sampling': {
        'sample_size': 12000,
        'stratify_column': 'Product_standardized',
        'random_state': 42
    },
    'chunking': {
        'chunk_size': chosen_size,
        'chunk_overlap': chosen_overlap,
        'separators': ["\n\n", "\n", ". ", "! ", "? ", ", ", " ", ""]
    },
    'embedding': {
        'model_name': model_name,
        'embedding_dimension': model_info['embedding_dimension'],
        'device': model_info['device']
    },
    'vector_store': {
        'type': store_type,
        'persist_directory': str(BASE_DIR / 'vector_store' / 'sample'),
        'collection_name': 'complaint_chunks_sample',
        'total_chunks': len(chunks)
    }
}

# Save config
config_path = BASE_DIR / 'config' / 'task2_config.json'
config_path.parent.mkdir(parents=True, exist_ok=True)

# Serialize first: if any value is not JSON-serializable the error is raised
# here, before the target file is opened (and truncated), so a failed run
# never leaves a half-written config behind.
config_json = json.dumps(config, indent=2)
with open(config_path, 'w', encoding='utf-8') as f:
    f.write(config_json)

print(f"Configuration saved to: {config_path}")

print("\n" + "="*50)
print("TASK 2 COMPLETED SUCCESSFULLY!")
print("="*50)