In [None]:
# Task 3: Building the RAG Core Logic and Evaluation

import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from datetime import datetime

# Make the project's src/ directory importable.
# BASE_DIR is the parent of the current working directory, so this assumes the
# notebook is started from a direct subdirectory of the project root
# (TODO confirm the expected launch directory).
BASE_DIR = Path.cwd().parent
SRC_DIR = BASE_DIR / 'src'
# Re-running this cell must not keep appending the same entry to sys.path.
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Import our modules
from rag_pipeline import RAGPipeline
from evaluator import RAGEvaluator
from config import RAGConfig

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*';
# earlier releases only ship the un-prefixed name. Use whichever is installed.
STYLE_CANDIDATES = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plot_style = next(
    (name for name in STYLE_CANDIDATES if name in plt.style.available),
    STYLE_CANDIDATES[0],  # none found: let plt.style.use raise, as it did before
)
plt.style.use(plot_style)
sns.set_palette("husl")





### 1. Initialize RAG Pipeline

In [None]:
# Load the RAG configuration, echo its key settings, then build the pipeline.
print("Step 1: Initializing RAG Pipeline...")

# Load configuration (from_yaml is called with no arguments, so the file
# location is whatever RAGConfig resolves by default)
config = RAGConfig.from_yaml()

print("Configuration loaded:")
print(f"  - Vector Store: {config.vector_store_path}")
print(f"  - Collection: {config.collection_name}")
print(f"  - Retriever: top_k={config.retriever.top_k}, threshold={config.retriever.similarity_threshold}")
print(f"  - Generator: {config.generator.model_name}")
print(f"  - Temperature: {config.generator.temperature}")

# Initialize pipeline
rag_pipeline = RAGPipeline(config=config)

# Get pipeline info
pipeline_info = rag_pipeline.get_pipeline_info()
print("\nPipeline Components:")
# default=str keeps the dump from raising TypeError if the info contains values
# json cannot encode natively (e.g. Path objects); JSON-native values are
# printed exactly as before.
print(json.dumps(pipeline_info, indent=2, default=str))

### 2. Test Basic Retrieval

In [None]:
# Exercise the retriever on its own with a handful of sample queries.
print("\n" + "="*50)
print("Step 2: Testing Basic Retrieval...")

test_queries = [
    "What are common credit card fee complaints?",
    "Issues with money transfers",
    "Problems with savings accounts",
    "Personal loan application difficulties"
]

print("\nTesting retrieval with sample queries:")
for query in test_queries:
    print(f"\nQuery: '{query}'")

    # Report the filters suggested by the retriever's query analysis
    analysis = rag_pipeline.retriever.analyze_query(query)
    print(f"  Suggested filters: {analysis['suggested_filters']}")

    # Retrieve with k=3
    results = rag_pipeline.retriever.retrieve(query, k=3)

    # Nothing came back: say so and move on to the next query
    if not results:
        print("  No relevant chunks found")
        continue

    print(f"  Retrieved {len(results)} chunks:")
    for rank, chunk in enumerate(results, start=1):
        product = chunk['metadata'].get('product_category', 'Unknown')
        similarity = chunk['similarity']
        chunk_text = chunk['text']
        # Show at most 100 characters; the ellipsis marks text that was cut
        if len(chunk_text) > 100:
            text_preview = chunk_text[:100] + "..."
        else:
            text_preview = chunk_text
        print(f"    {rank}. [{product}] Similarity: {similarity:.3f}")
        print(f"       {text_preview}")

### 3. Test Full RAG Pipeline

In [None]:
# Run a few demonstration questions through rag_pipeline.query and print a
# short report for each response.
print("\n" + "="*50)
print("Step 3: Testing Full RAG Pipeline...")

# Test questions for demonstration
demo_questions = [
    "What are customers saying about credit card interest rates?",
    "How reliable are money transfer services based on complaints?",
    "What are the main issues with savings accounts?"
]

print("\nTesting complete RAG pipeline:")
for i, question in enumerate(demo_questions, 1):
    print(f"\n{i}. Question: '{question}'")

    try:
        response = rag_pipeline.query(question)

        # Append an ellipsis only when the answer was actually cut, so short
        # answers are not displayed as if they had been truncated.
        answer = response.answer
        answer_preview = answer[:150] + "..." if len(answer) > 150 else answer
        print(f"   Answer: {answer_preview}")
        print(f"   Retrieved chunks: {response.retrieved_chunks}")
        print(f"   Processing time: {response.processing_time:.2f}s")
        print(f"   Model: {response.generation_stats['model']}")
        print(f"   Tokens: {response.generation_stats['total_tokens']}")

        # Show sample source (first one only), truncated the same way
        if response.sources:
            source_text = response.sources[0]['text']
            source_preview = source_text[:100] + "..." if len(source_text) > 100 else source_text
            print(f"   Sample source: {source_preview}")

    except Exception as e:
        # Best-effort demo: report the failure and continue with the next
        # question. The exception type is included because some messages are
        # unreadable on their own (str(KeyError('model')) is just "'model'").
        print(f"   Error: {type(e).__name__}: {e}")

### 4. Comprehensive Evaluation

In [None]:
# Evaluate the pipeline on the evaluator's test questions and print a summary.
print("\n" + "="*50)
print("Step 4: Comprehensive Evaluation...")

# Initialize evaluator
evaluator = RAGEvaluator(rag_pipeline)

# Load test questions
test_questions = evaluator.load_test_questions()
print(f"\nLoaded {len(test_questions)} test questions")

# Display question categories: how many questions fall in each one
questions_frame = pd.DataFrame(test_questions)
categories = questions_frame['category'].value_counts()
print("\nQuestion Categories:")
for category, count in categories.items():
    print(f"  {category}: {count} questions")

# Run batch evaluation (save_results=True asks the evaluator to persist them)
print("\nRunning batch evaluation...")
results_df, summary = evaluator.evaluate_batch(test_questions, save_results=True)

# Display evaluation summary, one metric per line
summary_lines = [
    "\nEvaluation Summary:",
    f"  Average Overall Score: {summary['avg_overall_score']:.2f}/5",
    f"  Average Relevance: {summary['avg_relevance']:.2f}/5",
    f"  Average Accuracy: {summary['avg_accuracy']:.2f}/5",
    f"  Average Completeness: {summary['avg_completeness']:.2f}/5",
    f"  Average Processing Time: {summary['avg_processing_time']:.2f}s",
    f"  Total Chunks Retrieved: {summary['total_chunks_retrieved']}",
]
print(*summary_lines, sep="\n")

### 5. Visualization of Results

In [None]:
print("\n" + "="*50)
print("Step 5: Visualizing Evaluation Results...")

# Create a 2x3 grid; each panel below fills one cell of `axes`
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# 1. Score Distribution: number of questions per overall score, after
#    rounding each score to the nearest whole number
ax_scores = axes[0, 0]
score_counts = results_df['overall_score'].round().value_counts().sort_index()
ax_scores.bar(score_counts.index, score_counts.values, color='skyblue', edgecolor='black')
ax_scores.set(xlabel='Score (1-5)', ylabel='Number of Questions', title='Score Distribution')
# Write each count just above its bar
for score, count in score_counts.items():
    ax_scores.text(score, count + 0.1, str(count), ha='center')

# 2. Performance by Category: mean overall score per category, sorted ascending
ax_category = axes[0, 1]
category_scores = results_df.groupby('category')['overall_score'].mean().sort_values()
category_positions = range(len(category_scores))
ax_category.barh(category_positions, category_scores.values, color='lightgreen', edgecolor='black')
ax_category.set_yticks(category_positions)
ax_category.set_yticklabels(category_scores.index)
ax_category.set(xlabel='Average Score', title='Performance by Category')
# Label each bar with its value, just past the bar's end
for position, mean_score in enumerate(category_scores.values):
    ax_category.text(mean_score + 0.05, position, f'{mean_score:.2f}', va='center')

# 3. Performance by Difficulty: mean overall score per difficulty label.
#    groupby sorts its keys, so the bars follow the sorted label order.
ax_difficulty = axes[0, 2]
difficulty_scores = results_df.groupby('difficulty')['overall_score'].mean()
difficulty_positions = range(len(difficulty_scores))
ax_difficulty.bar(
    difficulty_positions,
    difficulty_scores.values,
    color=['lightcoral', 'gold', 'lightseagreen'],
    edgecolor='black',
)
ax_difficulty.set_xticks(difficulty_positions)
ax_difficulty.set_xticklabels(difficulty_scores.index)
ax_difficulty.set(ylabel='Average Score', title='Performance by Difficulty')
# Label each bar with its value, just above the bar
for position, mean_score in enumerate(difficulty_scores.values):
    ax_difficulty.text(position, mean_score + 0.05, f'{mean_score:.2f}', ha='center')

# 4. Processing Time Distribution, with the mean marked by a dashed red line
ax_time = axes[1, 0]
processing_times = results_df['processing_time']
mean_time = processing_times.mean()
ax_time.hist(processing_times, bins=10, color='lightcoral', edgecolor='black', alpha=0.7)
ax_time.axvline(mean_time, color='red', linestyle='--', label=f'Mean: {mean_time:.2f}s')
ax_time.set(
    xlabel='Processing Time (seconds)',
    ylabel='Frequency',
    title='Processing Time Distribution',
)
ax_time.legend()
ax_time.grid(True, alpha=0.3)

# 5. Chunks Retrieved vs Score: one point per question, coloured by its
#    processing time
ax_retrieval = axes[1, 1]
scatter = ax_retrieval.scatter(
    results_df['retrieved_sources_count'],
    results_df['overall_score'],
    c=results_df['processing_time'],
    s=100,
    alpha=0.6,
    cmap='viridis',
)
ax_retrieval.set(
    xlabel='Chunks Retrieved',
    ylabel='Overall Score',
    title='Retrieval vs Score (color=time)',
)
# Colour scale for the point colours, attached to this panel only
fig.colorbar(scatter, ax=ax_retrieval, label='Processing Time (s)')

# 6. Score Components Comparison: mean of each component score column
ax_components = axes[1, 2]
score_components = ['relevance_score', 'accuracy_score', 'completeness_score']
component_means = [results_df[column].mean() for column in score_components]
component_positions = range(len(score_components))
ax_components.bar(
    component_positions,
    component_means,
    color=['skyblue', 'lightgreen', 'gold'],
    edgecolor='black',
)
ax_components.set_xticks(component_positions)
ax_components.set_xticklabels(['Relevance', 'Accuracy', 'Completeness'])
ax_components.set_ylabel('Average Score')
ax_components.set_title('Score Components Comparison')
# Fixed y-range so the three bars are read against the same 0-5 axis
ax_components.set_ylim([0, 5])
# Label each bar with its value, just above the bar
for position, component_mean in enumerate(component_means):
    ax_components.text(position, component_mean + 0.1, f'{component_mean:.2f}', ha='center')

plt.tight_layout()
# Create the output directory first: savefig does not create missing parent
# directories and would otherwise fail with FileNotFoundError.
visualization_path = BASE_DIR / 'evaluation' / 'results' / 'evaluation_visualizations.png'
visualization_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(visualization_path, dpi=300, bbox_inches='tight')
plt.show()

### 6. Detailed Analysis 

In [None]:
print("\n" + "="*50)
print("Step 6: Detailed Analysis of Results...")

# Find best and worst performing questions: the rows holding the highest and
# lowest overall score (idxmax/idxmin pick the first such row on ties)
best_question = results_df.loc[results_df['overall_score'].idxmax()]
worst_question = results_df.loc[results_df['overall_score'].idxmin()]

# Both rows are reported with the same layout, so print them in one loop
for heading, row in (
    ("\nBest Performing Question:", best_question),
    ("\nWorst Performing Question:", worst_question),
):
    question_text = row['question']
    # Append an ellipsis only when the question was actually cut at 80
    # characters, so short questions are not shown as truncated.
    question_preview = question_text[:80] + "..." if len(question_text) > 80 else question_text
    print(heading)
    print(f"  Question: {question_preview}")
    print(f"  Category: {row['category']}")
    print(f"  Score: {row['overall_score']}/5")
    print(f"  Chunks Retrieved: {row['retrieved_sources_count']}")
    print(f"  Time: {row['processing_time']:.2f}s")

# Analyze correlation between the scores, processing time and retrieval count
correlation_columns = [
    'overall_score',
    'relevance_score',
    'accuracy_score',
    'completeness_score',
    'processing_time',
    'retrieved_sources_count',
]
correlation_matrix = results_df[correlation_columns].corr()

print("\nCorrelation Matrix:")
print(correlation_matrix.round(2))
