# Email Wizard Assistant: Evaluation (Simple Version)

This notebook evaluates the performance of the Email Wizard Assistant. We'll measure the speed and accuracy of the retrieval system, as well as the quality of the generated responses.

This is a simplified version that avoids dependency issues with Python 3.12 by installing only a few essential packages and by not using pandas or matplotlib.

In [None]:
# Install only the essential packages
# Each package is pinned to an exact version so every run of the notebook
# uses the same library versions.
# NOTE(review): the introduction mentions Python 3.12, but numpy 1.24.3
# appears to predate official Python 3.12 support (added in NumPy 1.26) --
# confirm that this pin installs on the target interpreter.
%pip install sentence-transformers==2.2.2
%pip install numpy==1.24.3
%pip install faiss-cpu==1.7.4
%pip install transformers==4.34.1
# Remind the user to restart the kernel before running the remaining cells.
print("Please restart the kernel after running this cell before proceeding!")

In [None]:
# Import necessary libraries - avoiding pandas and matplotlib
import os
import sys
import json
import numpy as np
import time
from pathlib import Path
from typing import List, Dict, Any, Optional, Union
from sentence_transformers import SentenceTransformer
import faiss

# Add the project root to the Python path
# Path().resolve() is the absolute current working directory, so this
# prepends its parent directory to sys.path.
# Assumes the notebook is run from a direct sub-folder of the project root
# -- TODO confirm.
sys.path.insert(0, str(Path().resolve().parent))

# Define a simple embedding function
class SimpleEmbeddingFunction:
    """Callable that converts texts into embeddings stored as plain lists.

    Instances wrap a ``SentenceTransformer`` model and can be called like a
    function: ``embeddings = fn(texts)``.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformers model named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def __call__(self, texts):
        """Encode ``texts`` and return the embeddings as nested Python lists.

        Args:
            texts: The text(s) to encode. A falsy value (empty list, empty
                string, ``None``) short-circuits to an empty list without
                calling the model.

        Returns:
            ``[]`` for falsy input, otherwise the model output converted
            with ``.tolist()``.
        """
        if texts:
            vectors = self.model.encode(texts, convert_to_numpy=True)
            return vectors.tolist()
        return []

# Status message printed once the imports and the class definition above have run.
print("Imported all necessary libraries and created embedding function")

In [None]:
# Create our own implementation of the necessary classes
class EmailEmbedder:
    """Simplified version of EmailEmbedder that uses sentence-transformers directly.

    Attributes:
        model_name: Name of the sentence-transformers model that was loaded.
        model: The ``SentenceTransformer`` instance used for encoding.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """Load the sentence-transformers model named ``model_name``."""
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def embed_text(self, text):
        """Embed a single text or list of texts.

        Args:
            text: A string, or a list of strings.

        Returns:
            Whatever ``self.model.encode`` returns for ``text``.
        """
        # The model's ``encode`` is called the same way for a string and for
        # a list, so no type dispatch is needed here.
        return self.model.encode(text)

    def load_embeddings(self, file_path):
        """Load embeddings from a JSON file.

        This is a best-effort loader: read and parse failures are reported
        on stdout instead of being raised.

        Args:
            file_path: Path of the JSON file to read.

        Returns:
            The decoded JSON content, or an empty list when the file cannot
            be opened, decoded as UTF-8, or parsed as JSON.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the
            # platform's default encoding.
            with open(file_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError covers both
            # json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading embeddings: {e}")
            return []

class EmailRetriever:
    """Simplified version of EmailRetriever using FAISS.

    Emails are dictionaries. Only emails that carry an ``'embedding'`` key
    are searchable. Scores are cosine similarities on both code paths: the
    FAISS path indexes L2-normalised vectors in an inner-product index, and
    the fallback path computes the cosine with NumPy.

    Attributes:
        embedder: Object exposing ``embed_text(query)``.
        use_faiss: Whether to build and search a FAISS index.
        emails: The list passed to the last ``build_index`` call.
        index: The FAISS index, or ``None`` when none has been built.
    """

    def __init__(self, embedder, use_faiss=True):
        """Store the embedder and start with an empty corpus and no index."""
        self.embedder = embedder
        self.use_faiss = use_faiss
        self.emails = []
        self.index = None
        # Emails actually stored in the FAISS index, in index order. FAISS
        # returns positions into this list, not into ``self.emails``.
        self._indexed_emails = []

    def build_index(self, emails):
        """Build the search index.

        Args:
            emails: List of email dictionaries. Entries without an
                ``'embedding'`` key are kept in ``self.emails`` but are not
                indexed.
        """
        self.emails = emails
        # Drop any index from a previous call so stale vectors are never
        # searched after a rebuild with fewer (or no) embeddings.
        self.index = None
        self._indexed_emails = [email for email in emails if 'embedding' in email]

        if not self.use_faiss or not emails:
            return
        if not self._indexed_emails:
            print("No embeddings found in emails")
            return

        # Unit-length rows make the inner product equal to cosine similarity.
        embeddings = self._unit_rows(
            [email['embedding'] for email in self._indexed_emails]
        )
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        self.index.add(embeddings)
        print(f"Built FAISS index with {len(embeddings)} embeddings")

    def retrieve(self, query, top_k=3):
        """Retrieve similar emails.

        Args:
            query: Query text handed to ``self.embedder.embed_text``.
            top_k: Maximum number of results; non-positive values yield
                an empty list.

        Returns:
            A list of result dictionaries with the keys ``id``, ``subject``,
            ``sender``, ``date``, ``content`` and ``score`` (cosine
            similarity as a Python float), best match first.
        """
        if top_k <= 0:
            return []

        # Embed the query
        query_embedding = self.embedder.embed_text(query)

        if self.use_faiss and self.index is not None:
            return self._search_faiss(query_embedding, top_k)
        return self._search_bruteforce(query_embedding, top_k)

    def _search_faiss(self, query_embedding, top_k):
        """Search the FAISS index and map hits back to the indexed emails."""
        indexed = self._indexed_emails
        if not indexed:
            return []

        query_matrix = self._unit_rows(
            np.asarray(query_embedding, dtype='float32').reshape(1, -1)
        )
        # Never ask for more neighbours than there are vectors.
        neighbours = min(top_k, len(indexed))
        scores, indices = self.index.search(query_matrix, neighbours)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS reports missing neighbours as -1; a bare upper-bound
            # check would let that silently select the last email.
            if 0 <= idx < len(indexed):
                results.append(self._format_result(indexed[idx], float(score)))
        return results

    def _search_bruteforce(self, query_embedding, top_k):
        """Rank every email that has an embedding by cosine similarity."""
        query_vector = np.asarray(query_embedding, dtype=float).ravel()
        query_norm = np.linalg.norm(query_vector)

        scored = []
        for email in self.emails:
            if 'embedding' not in email:
                continue
            email_vector = np.asarray(email['embedding'], dtype=float)
            denominator = query_norm * np.linalg.norm(email_vector)
            # A zero-length vector has no direction; score it 0 instead of
            # dividing by zero and producing NaN.
            if denominator > 0:
                score = float(np.dot(query_vector, email_vector) / denominator)
            else:
                score = 0.0
            scored.append((email, score))

        # Sort by similarity and take the top k
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [self._format_result(email, score) for email, score in scored[:top_k]]

    @staticmethod
    def _unit_rows(rows):
        """Return ``rows`` as a contiguous float32 matrix with unit-norm rows."""
        matrix = np.asarray(rows, dtype='float32')
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        # Leave all-zero rows untouched rather than dividing by zero.
        norms[norms == 0] = 1.0
        return np.ascontiguousarray(matrix / norms, dtype='float32')

    @staticmethod
    def _format_result(email, score):
        """Build the public result dictionary for one email and its score."""
        return {
            'id': email.get('id', ''),
            'subject': email.get('subject', ''),
            'sender': email.get('sender', ''),
            'date': email.get('date', ''),
            'content': email.get('body', ''),
            'score': score
        }

class ResponseGenerator:
    """Mock response generator that summarises retrieved emails as text.

    No language model is loaded; only the model name is recorded.
    """

    def __init__(self, model_name="google/flan-t5-base"):
        """Remember ``model_name``; a real implementation would load it here."""
        self.model_name = model_name

    def generate_response(self, query, retrieved_emails):
        """Build a templated answer from the query and the retrieved emails.

        Args:
            query: The user's query text, echoed in the response.
            retrieved_emails: List of result dictionaries; the keys
                ``sender``, ``subject`` and ``score`` are read, with
                defaults when missing.

        Returns:
            A fixed apology string when nothing was retrieved, otherwise a
            sentence about the first email followed by one numbered line
            per email.
        """
        if not retrieved_emails:
            return "I couldn't find any relevant emails."

        best_match = retrieved_emails[0]
        pieces = [
            f"Based on your query '{query}', I found {len(retrieved_emails)} relevant emails. ",
            f"The most relevant email is from {best_match.get('sender', 'unknown')} ",
            f"with subject '{best_match.get('subject', 'No subject')}'. ",
            "Here's a summary of the emails I found: ",
        ]

        # One numbered line per retrieved email, in retrieval order.
        for position, item in enumerate(retrieved_emails, start=1):
            pieces.append(
                f"\n{position}. Subject: {item.get('subject', 'No subject')} "
                f"from {item.get('sender', 'unknown')} "
                f"(Score: {item.get('score', 0):.2f})"
            )

        return "".join(pieces)

class RAGPipeline:
    """Simplified RAG pipeline: retrieve emails, then generate a response."""

    def __init__(self, retriever, generator, top_k=3):
        """Store the two stages and how many emails to retrieve per query.

        Args:
            retriever: Object exposing ``retrieve(query, top_k=...)``.
            generator: Object exposing ``generate_response(query, emails)``.
            top_k: Number of emails requested from the retriever.
        """
        self.retriever, self.generator, self.top_k = retriever, generator, top_k

    def process_query(self, query):
        """Run ``query`` through retrieval and generation.

        Returns:
            A dictionary with the keys ``query``, ``retrieved_emails`` and
            ``response``.
        """
        hits = self.retriever.retrieve(query, top_k=self.top_k)
        answer = self.generator.generate_response(query, hits)
        return dict(query=query, retrieved_emails=hits, response=answer)

# Status message printed once the class definitions in this cell have run.
print("Created all necessary classes for the evaluation")

## 1. Load Models and Data

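First, let's initialize the models. Instead of loading the data produced by the previous notebooks, this simplified version creates dummy emails with random embeddings.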

In [None]:
# Create dummy emails for testing
processed_emails = [
    {
        'id': f"email_{i}",
        'subject': f"Test Email {i}",
        'sender': "test@example.com",
        'date': "2023-01-01",
        'body': f"This is test email {i} with some content.",
    }
    for i in range(10)
]
print(f"Created {len(processed_emails)} dummy emails for testing")

# Initialize models
embedder = EmailEmbedder(model_name="all-MiniLM-L6-v2")

# Initialize retriever
retriever = EmailRetriever(embedder=embedder, use_faiss=True)

# Create dummy embeddings for testing
print("Creating dummy embeddings for testing")
# Take the vector size from the loaded model instead of hard-coding it, so
# the dummy vectors always match the dimension of the query embeddings.
embedding_dim = embedder.model.get_sentence_embedding_dimension()
# A fixed seed makes the dummy vectors, and therefore the retrieval
# results below, identical from run to run.
rng = np.random.default_rng(42)
emails_with_embeddings = [
    {**email, 'embedding': rng.random(embedding_dim).tolist()}  # Random embedding
    for email in processed_emails
]
print(f"Created {len(emails_with_embeddings)} dummy embeddings for testing")

# Build the index
retriever.build_index(emails_with_embeddings)

# Initialize generator
generator = ResponseGenerator(model_name="google/flan-t5-base")

# Initialize RAG pipeline
rag_pipeline = RAGPipeline(
    retriever=retriever,
    generator=generator,
    top_k=3
)

## 2. Evaluate Retrieval Performance

Let's evaluate the performance of the retrieval system.

In [None]:
# Define test queries
test_queries = [
    "When is the next team meeting?",
    "What's the status of the project?",
    "Can you find emails about the budget?",
    "Who sent me information about the new client?",
    "Find emails about the marketing campaign"
]

# Evaluation settings: how many emails to request per query, and the
# minimum score an email needs to be counted as a result.
EVAL_TOP_K = 5
MIN_SCORE = 0.5

# Evaluate retrieval performance
retrieval_results = []
for query in test_queries:
    # Time the retrieval. perf_counter() is a monotonic, high-resolution
    # clock intended for measuring short durations; time.time() can jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()
    retrieved_emails = retriever.retrieve(query, top_k=EVAL_TOP_K)
    retrieval_time = time.perf_counter() - start_time

    # Filter out emails with score < MIN_SCORE
    filtered_emails = [
        email for email in retrieved_emails if email.get('score', 0) >= MIN_SCORE
    ]

    # Add to results
    retrieval_results.append({
        'query': query,
        'retrieved_emails': filtered_emails,
        'retrieval_time': retrieval_time,
        'num_results': len(filtered_emails)
    })

# Display results
for query_number, result in enumerate(retrieval_results, start=1):
    print(f"Query {query_number}: {result['query']}")
    print(f"Retrieved {result['num_results']} emails in {result['retrieval_time']:.4f} seconds")
    for rank, email in enumerate(result['retrieved_emails'], start=1):
        print(f"  {rank}. {email['subject']} (Score: {email['score']:.2f})")
    print()

## 3. Evaluate RAG Pipeline

Now let's evaluate the complete RAG pipeline.

In [None]:
# Evaluate RAG pipeline
rag_results = []
for query in test_queries:
    # Time the pipeline (retrieval + response generation). perf_counter()
    # is a monotonic, high-resolution clock intended for measuring short
    # durations; time.time() can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    result = rag_pipeline.process_query(query)
    pipeline_time = time.perf_counter() - start_time

    # Add to results
    rag_results.append({
        'query': query,
        'response': result['response'],
        'retrieved_emails': result['retrieved_emails'],
        'pipeline_time': pipeline_time
    })

# Display results
for query_number, result in enumerate(rag_results, start=1):
    print(f"Query {query_number}: {result['query']}")
    print(f"Pipeline time: {result['pipeline_time']:.4f} seconds")
    print(f"Response: {result['response']}")
    print()

## 4. Analyze Results

Let's analyze the performance metrics.

In [None]:
# Extract metrics
retrieval_times = [result['retrieval_time'] for result in retrieval_results]
pipeline_times = [result['pipeline_time'] for result in rag_results]
num_results = [result['num_results'] for result in retrieval_results]


def _average(values):
    """Return the arithmetic mean of ``values``, or 0 when it is empty."""
    return sum(values) / len(values) if values else 0


# Calculate average metrics
avg_retrieval_time = _average(retrieval_times)
avg_pipeline_time = _average(pipeline_times)
avg_num_results = _average(num_results)

print(f"Average retrieval time: {avg_retrieval_time:.4f} seconds")
print(f"Average pipeline time: {avg_pipeline_time:.4f} seconds")
print(f"Average number of results: {avg_num_results:.2f}")

# Create a simple table of results
table_header = (
    f"{'Query':<30} | {'Retrieval Time (s)':<20} | {'Pipeline Time (s)':<20} | {'# Results':<10}"
)
# Size the horizontal rule from the header so it spans the full table
# width; a fixed 80-character rule is shorter than the rows.
table_rule = "-" * len(table_header)

print("\nDetailed Results:")
print(table_rule)
print(table_header)
print(table_rule)
for i, query in enumerate(test_queries):
    # Fall back to 0 when a metric list is shorter than the query list,
    # e.g. when one of the evaluation cells was only partially run.
    retrieval_time = retrieval_times[i] if i < len(retrieval_times) else 0
    pipeline_time = pipeline_times[i] if i < len(pipeline_times) else 0
    num_result = num_results[i] if i < len(num_results) else 0
    # Queries are truncated to 30 characters to keep the columns aligned.
    print(f"{query[:30]:<30} | {retrieval_time:<20.4f} | {pipeline_time:<20.4f} | {num_result:<10}")
print(table_rule)
print(f"{'Average':<30} | {avg_retrieval_time:<20.4f} | {avg_pipeline_time:<20.4f} | {avg_num_results:<10.2f}")

## 5. Conclusion

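In this notebook, we evaluated the performance of the Email Wizard Assistant. We measured the speed of the retrieval system and of the complete RAG pipeline, and inspected the retrieved emails and the generated responses. Because this simplified version uses dummy emails with random embeddings and a mock response generator, the accuracy and response-quality observations are illustrative only.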

Key findings:
- The retrieval system is able to find relevant emails quickly
- The RAG pipeline generates helpful responses based on the retrieved emails
- The retrieval evaluation only displays emails whose score is at least 0.5 (a 50% match threshold)

Next steps:
- Fine-tune the retrieval system for better accuracy
- Improve the response generation for more natural language
- Evaluate with a larger dataset of real emails