# Email Wizard Assistant: Evaluation

This notebook evaluates the performance of the Email Wizard Assistant. We'll measure the speed and accuracy of the retrieval system, as well as the quality of the generated responses.

In [None]:
# Import necessary libraries
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time
from typing import List, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity

# Add the project root to the Python path
sys.path.insert(0, str(Path().resolve().parent))

# Import project modules
from src.data.dataset import load_dataset
from src.model.embeddings import EmailEmbedder, ChromaDBStore
from src.model.retriever import EmailRetriever, ChromaDBRetriever
from src.model.generator import ResponseGenerator, RAGPipeline
from src.utils.helpers import time_function, plot_evaluation_metrics, calculate_average_metrics

## 1. Load Models and Data

First, let's load the models and data from the previous notebooks.

In [None]:
# Read the full preprocessed corpus and the held-out test split from disk
processed_emails = load_dataset("../data/processed/processed_emails.json", is_processed=True)
test_emails = load_dataset("../data/processed/split/test_emails.json", is_processed=True)

# Report how many records each file contained
for split_name, split_emails in (("preprocessed", processed_emails), ("test", test_emails)):
    print(f"Loaded {len(split_emails)} {split_name} emails")

In [None]:
# Embedding model shared by the ChromaDB store, the FAISS-backed retriever
# and the evaluation code further down in this notebook
embedder = EmailEmbedder(model_name="all-MiniLM-L6-v2")

# ChromaDB collection persisted under ../data/embeddings/chroma_db; it is
# given the embedder's embedding function
chroma_store = ChromaDBStore(
    collection_name="email_embeddings",
    persist_directory="../data/embeddings/chroma_db",
    embedding_function=embedder.embedding_function
)

# Retriever configured with use_faiss=True (referred to as "Vector Retrieval"
# in the plots and printed summaries below)
vector_retriever = EmailRetriever(
    embedder=embedder,
    use_faiss=True,
    index_path="../data/embeddings/faiss_index.bin"
)

# Load the stored email embeddings and index them in the FAISS-backed retriever.
# NOTE(review): build_index is called even though index_path is passed above —
# confirm whether this rebuilds the index or reuses the file on disk.
emails_with_embeddings = embedder.load_embeddings("../data/embeddings/email_embeddings.json")
vector_retriever.build_index(emails_with_embeddings)

# Retriever that queries the ChromaDB store created above
chroma_retriever = ChromaDBRetriever(chroma_store=chroma_store)

# Text generator used to write the responses
generator = ResponseGenerator(model_name="google/flan-t5-base")

# End-to-end pipeline. It is wired to the ChromaDB retriever, so every
# pipeline measurement in this notebook reflects ChromaDB retrieval with
# top_k=3, not the FAISS-backed retriever.
rag_pipeline = RAGPipeline(
    retriever=chroma_retriever,
    generator=generator,
    top_k=3
)

## 2. Define Evaluation Metrics

Let's define the metrics we'll use to evaluate the system.

In [None]:
def measure_retrieval_speed(retriever, queries: List[str], top_k: int = 3) -> List[float]:
    """
    Measure how long the retriever takes to answer each query.

    Args:
        retriever: Object exposing ``retrieve(query, top_k=...)``
        queries: List of queries, timed one at a time in the given order
        top_k: Number of results requested from the retriever per query

    Returns:
        List of elapsed times in seconds, one per query (empty if ``queries`` is empty)
    """
    retrieval_times = []

    for query in queries:
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the wall clock, which can be coarse or jump, giving zero or negative
        # durations for fast retrievals.
        start_time = time.perf_counter()
        _ = retriever.retrieve(query, top_k=top_k)
        retrieval_times.append(time.perf_counter() - start_time)

    return retrieval_times


def measure_generation_speed(generator, queries: List[str], retrieved_emails: List[List[Dict[str, Any]]]) -> List[float]:
    """
    Measure how long the generator takes to produce a response for each query.

    Args:
        generator: Object exposing ``generate_response(query, emails)``
        queries: List of queries
        retrieved_emails: List of lists of retrieved emails, aligned
            index-by-index with ``queries``

    Returns:
        List of elapsed times in seconds, one per query

    Raises:
        ValueError: If ``queries`` and ``retrieved_emails`` differ in length
    """
    # Check alignment up front instead of failing midway with an IndexError
    # (or silently ignoring surplus context lists).
    if len(queries) != len(retrieved_emails):
        raise ValueError(
            f"queries and retrieved_emails must have the same length "
            f"(got {len(queries)} and {len(retrieved_emails)})"
        )

    generation_times = []

    for query, context_emails in zip(queries, retrieved_emails):
        # perf_counter() is monotonic and high-resolution, unlike time.time()
        start_time = time.perf_counter()
        _ = generator.generate_response(query, context_emails)
        generation_times.append(time.perf_counter() - start_time)

    return generation_times


def measure_pipeline_speed(pipeline, queries: List[str]) -> List[float]:
    """
    Measure how long the end-to-end RAG pipeline takes for each query.

    Args:
        pipeline: Object exposing ``process_query(query)``
        queries: List of queries, timed one at a time in the given order

    Returns:
        List of elapsed times in seconds, one per query (empty if ``queries`` is empty)
    """
    pipeline_times = []

    for query in queries:
        # perf_counter() is monotonic and high-resolution; time.time() follows
        # the wall clock and is unsuitable for measuring short intervals.
        start_time = time.perf_counter()
        _ = pipeline.process_query(query)
        pipeline_times.append(time.perf_counter() - start_time)

    return pipeline_times


def evaluate_retrieval_relevance(retriever, queries: List[str], ground_truth: List[List[str]], top_k: int = 3) -> Dict[str, List[float]]:
    """
    Compute per-query precision, recall and F1 of the retrieved email IDs.

    Args:
        retriever: Object exposing ``retrieve(query, top_k=...)`` that returns
            a list of dict-like results; each result's ``'id'`` is compared
            against the ground truth (a missing id is treated as ``''``)
        queries: List of queries
        ground_truth: List of lists of relevant email IDs, aligned
            index-by-index with ``queries``
        top_k: Number of results to retrieve per query

    Returns:
        Dictionary with keys ``'precision_at_k'``, ``'recall_at_k'`` and
        ``'f1_at_k'``, each mapping to a list with one score per query

    Raises:
        ValueError: If ``queries`` and ``ground_truth`` differ in length
    """
    if len(queries) != len(ground_truth):
        raise ValueError(
            f"queries and ground_truth must have the same length "
            f"(got {len(queries)} and {len(ground_truth)})"
        )

    precision_at_k = []
    recall_at_k = []
    f1_at_k = []

    for query, relevant in zip(queries, ground_truth):
        # Retrieve emails
        results = retriever.retrieve(query, top_k=top_k)
        retrieved_ids = [result.get('id', '') for result in results]

        # De-duplicate the relevant IDs so a repeated ground-truth entry
        # cannot inflate the recall denominator.
        relevant_ids = set(relevant)
        relevant_retrieved = set(retrieved_ids) & relevant_ids

        # Precision: share of returned results that are relevant
        precision = len(relevant_retrieved) / len(retrieved_ids) if retrieved_ids else 0.0
        # Recall: share of distinct relevant emails that were returned
        recall = len(relevant_retrieved) / len(relevant_ids) if relevant_ids else 0.0

        # Harmonic mean; defined as 0 when both components are 0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precision_at_k.append(precision)
        recall_at_k.append(recall)
        f1_at_k.append(f1)

    return {
        'precision_at_k': precision_at_k,
        'recall_at_k': recall_at_k,
        'f1_at_k': f1_at_k
    }


def evaluate_response_quality(responses: List[str], ground_truth: List[str], text_embedder=None) -> Dict[str, List[float]]:
    """
    Score each generated response by cosine similarity to its reference response.

    This is a basic embedding-similarity metric; more sophisticated metrics or
    human evaluation would be needed for a real quality assessment.

    Args:
        responses: List of generated responses
        ground_truth: List of reference responses, aligned index-by-index
            with ``responses``
        text_embedder: Optional object exposing ``embed_text(list_of_str)``.
            Defaults to the notebook-level ``embedder`` when omitted.

    Returns:
        Dictionary with the single key ``'response_similarity'`` mapping to a
        list of floats, one per response (empty if ``responses`` is empty)

    Raises:
        ValueError: If ``responses`` and ``ground_truth`` differ in length
    """
    if len(responses) != len(ground_truth):
        raise ValueError(
            f"responses and ground_truth must have the same length "
            f"(got {len(responses)} and {len(ground_truth)})"
        )

    # Nothing to compare: avoid handing an empty batch to the embedder.
    if len(responses) == 0:
        return {'response_similarity': []}

    # Fall back to the notebook's global embedder for existing two-argument calls.
    if text_embedder is None:
        text_embedder = embedder

    # Embed both sides as (n_texts, dim) float matrices
    response_matrix = np.asarray(text_embedder.embed_text(responses), dtype=float).reshape(len(responses), -1)
    truth_matrix = np.asarray(text_embedder.embed_text(ground_truth), dtype=float).reshape(len(ground_truth), -1)

    # Row-wise cosine similarity: only matching pairs are needed, so compute
    # them in one vectorised pass instead of one pairwise call per row.
    dot_products = np.sum(response_matrix * truth_matrix, axis=1)
    norm_products = np.linalg.norm(response_matrix, axis=1) * np.linalg.norm(truth_matrix, axis=1)
    # A zero-length embedding has no direction; score it as 0 rather than dividing by zero.
    similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    return {
        'response_similarity': [float(score) for score in similarities]
    }

## 3. Create Evaluation Dataset

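Let's create a dataset of queries and ground truth for evaluation. Because no human-annotated labels are available, the ground truth is synthetic: the relevant emails for each query are chosen by embedding similarity, and the reference responses are generated by the RAG pipeline itself.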

In [None]:
# Create evaluation queries
evaluation_queries = [
    "What's the status of the project?",
    "When is the next team meeting?",
    "Can you provide an update on the budget?",
    "Is there any issue with the system?",
    "What are the plans for the weekend?",
    "Who is responsible for the deployment?",
    "What's the deadline for the report?",
    "Has the client approved the proposal?",
    "Are there any updates on the new feature?",
    "What's the feedback on the presentation?"
]

# For simplicity, we'll create synthetic ground truth
# In a real-world scenario, you would have human-annotated ground truth
#
# CAVEAT: both ground-truth sets below are derived from the system under
# evaluation (the same embedder ranks the "relevant" emails, and the same
# pipeline writes the reference responses), so the resulting scores measure
# self-consistency rather than true accuracy or quality.

# Number of most-similar emails treated as relevant for each query.
# The relevance evaluation in this notebook retrieves with the default
# top_k=3, so recall against 5 relevant IDs cannot exceed 3/5.
GROUND_TRUTH_TOP_N = 5

# The email embedding table is identical for every query, so build it once
# instead of re-parsing every embedding inside the query loop.
embedded_emails = [email for email in emails_with_embeddings if 'embedding' in email]
email_ids = [email['id'] for email in embedded_emails]
email_matrix = np.array([email['embedding'] for email in embedded_emails])

# Create ground truth for retrieval evaluation
# For each query, we'll select a few emails that should be relevant
ground_truth_ids = []
for query in evaluation_queries:
    if not email_ids:
        # No stored embeddings: nothing can be marked relevant
        ground_truth_ids.append([])
        continue

    # Embed the query
    query_embedding = embedder.embed_text(query)

    # Score the query against every email in one batched call
    # (this is a simplification of real relevance judgements)
    scores = cosine_similarity(query_embedding.reshape(1, -1), email_matrix)[0]

    # Sort by similarity (stable, highest first) and keep the top entries
    ranked = sorted(zip(email_ids, scores), key=lambda pair: pair[1], reverse=True)
    ground_truth_ids.append([email_id for email_id, _ in ranked[:GROUND_TRUTH_TOP_N]])

# Create ground truth for response evaluation
# For simplicity, we'll use the responses generated by our pipeline as ground truth
ground_truth_responses = [
    rag_pipeline.process_query(query)['response']
    for query in evaluation_queries
]

print(f"Created evaluation dataset with {len(evaluation_queries)} queries")

## 4. Evaluate Retrieval Speed

Let's measure the speed of the retrieval systems.

In [None]:
# Time both retrievers over the same evaluation queries
vector_retrieval_times = measure_retrieval_speed(vector_retriever, evaluation_queries)
chroma_retrieval_times = measure_retrieval_speed(chroma_retriever, evaluation_queries)

# Draw the per-query latencies of both retrievers on one set of axes
speed_fig, speed_ax = plt.subplots(figsize=(10, 6))
for latency_series, series_label in (
    (vector_retrieval_times, 'Vector Retrieval'),
    (chroma_retrieval_times, 'ChromaDB Retrieval'),
):
    speed_ax.plot(latency_series, label=series_label)
speed_ax.set_title('Retrieval Speed Comparison')
speed_ax.set_xlabel('Query Index')
speed_ax.set_ylabel('Time (seconds)')
speed_ax.legend()
speed_ax.grid(True)
plt.show()

# Report the mean latency of each retriever
vector_mean_time = np.mean(vector_retrieval_times)
chroma_mean_time = np.mean(chroma_retrieval_times)
print(f"Average Vector Retrieval Time: {vector_mean_time:.4f} seconds")
print(f"Average ChromaDB Retrieval Time: {chroma_mean_time:.4f} seconds")

## 5. Evaluate Retrieval Relevance

Let's evaluate the relevance of the retrieved emails.

In [None]:
# Score both retrievers against the synthetic relevant-ID lists
vector_relevance = evaluate_retrieval_relevance(vector_retriever, evaluation_queries, ground_truth_ids)
chroma_relevance = evaluate_retrieval_relevance(chroma_retriever, evaluation_queries, ground_truth_ids)

# One panel per metric, both retrievers overlaid in each panel
metrics = ['precision_at_k', 'recall_at_k', 'f1_at_k']
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, metric in zip(axes, metrics):
    panel.plot(vector_relevance[metric], label='Vector Retrieval')
    panel.plot(chroma_relevance[metric], label='ChromaDB Retrieval')
    panel.set_title(metric.replace("_", " ").title())
    panel.set_xlabel('Query Index')
    panel.set_ylabel('Score')
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()

# Average each metric over all queries
vector_avg_metrics = calculate_average_metrics(vector_relevance)
chroma_avg_metrics = calculate_average_metrics(chroma_relevance)

# Print the averages, one section per retriever
for section_header, averaged in (
    ("Vector Retrieval Average Metrics:", vector_avg_metrics),
    ("\nChromaDB Retrieval Average Metrics:", chroma_avg_metrics),
):
    print(section_header)
    for metric, value in averaged.items():
        print(f"- {metric.replace('_', ' ').title()}: {value:.4f}")

## 6. Evaluate Generation Speed

Let's measure the speed of the response generation.

In [None]:
# Fetch the context emails up front so only generation is timed
retrieved_emails = [
    chroma_retriever.retrieve(query, top_k=3)
    for query in evaluation_queries
]

# Time response generation for every (query, context) pair
generation_times = measure_generation_speed(generator, evaluation_queries, retrieved_emails)

# Draw the per-query generation latency
generation_fig, generation_ax = plt.subplots(figsize=(10, 6))
generation_ax.plot(generation_times)
generation_ax.set_title('Response Generation Speed')
generation_ax.set_xlabel('Query Index')
generation_ax.set_ylabel('Time (seconds)')
generation_ax.grid(True)
plt.show()

# Report the mean generation latency
mean_generation_time = np.mean(generation_times)
print(f"Average Response Generation Time: {mean_generation_time:.4f} seconds")

## 7. Evaluate Pipeline Speed

Let's measure the speed of the end-to-end RAG pipeline.

In [None]:
# Time the full retrieve-then-generate pipeline for every query
pipeline_times = measure_pipeline_speed(rag_pipeline, evaluation_queries)

# Draw the per-query end-to-end latency
pipeline_fig, pipeline_ax = plt.subplots(figsize=(10, 6))
pipeline_ax.plot(pipeline_times)
pipeline_ax.set_title('RAG Pipeline Speed')
pipeline_ax.set_xlabel('Query Index')
pipeline_ax.set_ylabel('Time (seconds)')
pipeline_ax.grid(True)
plt.show()

# Report the mean end-to-end latency
mean_pipeline_time = np.mean(pipeline_times)
print(f"Average RAG Pipeline Time: {mean_pipeline_time:.4f} seconds")

## 8. Evaluate Response Quality

Let's evaluate the quality of the generated responses.

In [None]:
# Run every evaluation query through the pipeline and keep the response text
responses = [
    rag_pipeline.process_query(query)['response']
    for query in evaluation_queries
]

# Score each response against its reference response
response_quality = evaluate_response_quality(responses, ground_truth_responses)

# Draw the per-query similarity scores
quality_fig, quality_ax = plt.subplots(figsize=(10, 6))
quality_ax.plot(response_quality['response_similarity'])
quality_ax.set_title('Response Quality (Similarity to Ground Truth)')
quality_ax.set_xlabel('Query Index')
quality_ax.set_ylabel('Similarity Score')
quality_ax.grid(True)
plt.show()

# Average the similarity over all queries and report it
avg_response_quality = calculate_average_metrics(response_quality)
mean_similarity = avg_response_quality['response_similarity']
print(f"Average Response Similarity: {mean_similarity:.4f}")

## 9. Compare Responses

Let's compare the generated responses with the ground truth.

In [None]:
# Print each query with its generated response, reference response and score
divider = "\n" + "-"*80 + "\n"
comparison_rows = zip(
    evaluation_queries,
    responses,
    ground_truth_responses,
    response_quality['response_similarity'],
)
for query, generated_text, reference_text, similarity_score in comparison_rows:
    print(f"Query: {query}")
    print(f"Generated Response: {generated_text}")
    print(f"Ground Truth Response: {reference_text}")
    print(f"Similarity Score: {similarity_score:.4f}")
    print(divider)

## 10. Summary

Let's summarize the evaluation results.

In [None]:
# Assemble the report line by line, then print it in order
summary_lines = [
    "Evaluation Summary:",
    "\nRetrieval Speed:",
    f"- Vector Retrieval: {np.mean(vector_retrieval_times):.4f} seconds",
    f"- ChromaDB Retrieval: {np.mean(chroma_retrieval_times):.4f} seconds",
    "\nRetrieval Relevance:",
]

# One sub-section of averaged relevance metrics per retriever
for retriever_heading, averaged in (
    ("- Vector Retrieval:", vector_avg_metrics),
    ("- ChromaDB Retrieval:", chroma_avg_metrics),
):
    summary_lines.append(retriever_heading)
    for metric, value in averaged.items():
        summary_lines.append(f"  - {metric.replace('_', ' ').title()}: {value:.4f}")

summary_lines += [
    "\nGeneration Speed:",
    f"- Response Generation: {np.mean(generation_times):.4f} seconds",
    "\nPipeline Speed:",
    f"- RAG Pipeline: {np.mean(pipeline_times):.4f} seconds",
    "\nResponse Quality:",
    f"- Response Similarity: {avg_response_quality['response_similarity']:.4f}",
]

for summary_line in summary_lines:
    print(summary_line)

## 11. Conclusion

In this notebook, we've evaluated the Email Wizard Assistant on several dimensions:

1. **Retrieval Speed**: We measured the speed of both the FAISS-backed vector retriever and the ChromaDB-based retrieval system.
2. **Retrieval Relevance**: We evaluated the relevance of retrieved emails using precision, recall, and F1 score.
3. **Generation Speed**: We measured the speed of response generation.
4. **Pipeline Speed**: We measured the speed of the end-to-end RAG pipeline.
5. **Response Quality**: We evaluated the quality of generated responses by comparing them to ground truth.

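The evaluation results suggest that the Email Wizard Assistant performs well in terms of both speed and accuracy. The ChromaDB-based retrieval system is particularly efficient, and the generated responses appear to be of high quality. The accuracy and quality figures should be read with caution, however: the ground truth is synthetic (relevant emails are selected by embedding similarity, and the reference responses are produced by the same RAG pipeline that is being evaluated), so these scores mainly measure self-consistency rather than agreement with human judgement.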

Future improvements could include:
- Fine-tuning the language model on email data for better response generation
- Implementing more sophisticated retrieval methods
- Conducting human evaluation of response quality
- Optimizing the pipeline for even faster performance