# Email Wizard Assistant: Model Implementation

This notebook demonstrates the implementation of the retrieval-augmented generation (RAG) model for the Email Wizard Assistant. We'll embed the preprocessed emails, set up the retrieval system, and implement the response generation.

In [None]:
# Import necessary libraries
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import time

# Add the project root to the Python path
sys.path.insert(0, str(Path().resolve().parent))

# Import project modules
from src.data.dataset import load_dataset
from src.model.embeddings import EmailEmbedder, ChromaDBStore
from src.model.retriever import EmailRetriever, ChromaDBRetriever
from src.model.generator import ResponseGenerator, RAGPipeline
from src.utils.helpers import time_function, save_json, load_json

## 1. Load Preprocessed Emails

First, let's load the preprocessed emails from the previous notebook.

In [None]:
# Read back the emails written by the preprocessing notebook.
processed_emails_path = "../data/processed/processed_emails.json"
processed_emails = load_dataset(processed_emails_path, is_processed=True)

print(f"Loaded {len(processed_emails)} preprocessed emails")

## 2. Embed Emails

Now, let's embed the preprocessed emails using a pre-trained Sentence Transformer model.

In [None]:
# Embedder built on the "all-MiniLM-L6-v2" model
embedder = EmailEmbedder(model_name="all-MiniLM-L6-v2")


@time_function
def embed_emails(emails):
    """Embed ``emails`` with the notebook-level ``embedder``.

    Args:
        emails: The preprocessed emails to embed.

    Returns:
        Whatever ``embedder.embed_emails`` returns for ``emails``.
    """
    embedded = embedder.embed_emails(emails)
    return embedded


emails_with_embeddings = embed_emails(processed_emails)

# Make sure the target directory exists, then write the embeddings to disk
embeddings_path = "../data/embeddings/email_embeddings.json"
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
embedder.save_embeddings(emails_with_embeddings, embeddings_path)

print(f"Embedded and saved {len(emails_with_embeddings)} emails")

Let's examine the embeddings to understand their structure:

In [None]:
# Examine the embeddings of the first email
sample_email = emails_with_embeddings[0]

print(f"Email ID: {sample_email['id']}")
print(f"Embedding shape: {np.array(sample_email['embedding']).shape}")

# Chunk-level embeddings are optional, so only report them when present.
if 'chunk_embeddings' in sample_email:
    chunk_embeddings = sample_email['chunk_embeddings']
    # The guard above only proves that 'chunk_embeddings' exists. Reading
    # 'chunks' unconditionally would raise KeyError for records that store
    # the embeddings without the chunk texts, so fall back to counting the
    # embeddings (assumes one embedding per chunk; TODO confirm).
    if 'chunks' in sample_email:
        chunk_count = len(sample_email['chunks'])
    else:
        chunk_count = len(chunk_embeddings)
    print(f"Number of chunks: {chunk_count}")
    print(f"Chunk embeddings shape: {np.array(chunk_embeddings).shape}")

## 3. Set Up ChromaDB for Vector Storage

Now, let's set up ChromaDB for efficient vector storage and retrieval.

In [None]:
# ChromaDB-backed store that shares the embedder's embedding function
chroma_store = ChromaDBStore(
    collection_name="email_embeddings",
    persist_directory="../data/embeddings/chroma_db",
    embedding_function=embedder.embedding_function
)


@time_function
def add_emails_to_chroma(emails):
    """Add ``emails`` to the notebook-level ``chroma_store``.

    Args:
        emails: The emails to hand to ``chroma_store.add_emails``.
    """
    chroma_store.add_emails(emails)


# Look at the collection first; emails are only added when it is empty
collection_stats = chroma_store.get_collection_stats()
print(f"Collection stats: {collection_stats}")

# NOTE(review): the raw processed_emails are passed here, not
# emails_with_embeddings. Presumably the store embeds them itself through
# embedding_function; confirm against ChromaDBStore.add_emails.
if collection_stats["count"] != 0:
    print(f"ChromaDB already contains {collection_stats['count']} emails")
else:
    add_emails_to_chroma(processed_emails)
    print(f"Added {len(processed_emails)} emails to ChromaDB")

## 4. Implement Similarity Search

Let's implement and test the similarity search functionality using both a FAISS-backed vector retriever and ChromaDB.

In [None]:
# Vector retriever configured with use_faiss=True and an on-disk index path
faiss_index_path = "../data/embeddings/faiss_index.bin"
vector_retriever = EmailRetriever(
    embedder=embedder,
    use_faiss=True,
    index_path=faiss_index_path,
)


@time_function
def build_index(emails):
    """Build the index of the notebook-level ``vector_retriever``.

    Args:
        emails: The emails passed on to ``vector_retriever.build_index``.
    """
    vector_retriever.build_index(emails)


build_index(emails_with_embeddings)

# Second retriever, backed by the ChromaDB store
chroma_retriever = ChromaDBRetriever(chroma_store=chroma_store)

Now, let's test the retrieval with some sample queries:

In [None]:
# Sample questions used to exercise the retrievers and the generator
test_queries = [
    "What's the status of the project?",
    "When is the next team meeting?",
    "Can you provide an update on the budget?",
    "Is there any issue with the system?",
    "What are the plans for the weekend?"
]

# Test vector retrieval
print("Vector Retrieval Results:")
for query in test_queries:
    print(f"\nQuery: {query}")

    # Time only the retrieval call. perf_counter() is a monotonic,
    # high-resolution clock meant for measuring short durations; time.time()
    # is wall-clock time that can jump and may be too coarse to register a
    # retrieval lasting a few milliseconds.
    start_time = time.perf_counter()
    results = vector_retriever.retrieve(query, top_k=3)
    end_time = time.perf_counter()

    print(f"Retrieved {len(results)} emails in {end_time - start_time:.4f} seconds")

    # Show the subject and similarity score of each hit, falling back to an
    # empty subject / 0.0 when a result lacks those fields
    for i, result in enumerate(results):
        metadata = result.get('metadata', {})
        similarity = result.get('similarity_score', 0.0)
        print(f"Result {i+1}: {metadata.get('subject', '')} (Similarity: {similarity:.4f})")

In [None]:
# Test ChromaDB retrieval with the same sample queries
print("ChromaDB Retrieval Results:")
for query in test_queries:
    print(f"\nQuery: {query}")

    # Time only the retrieval call. perf_counter() is a monotonic,
    # high-resolution clock meant for measuring short durations; time.time()
    # is wall-clock time that can jump and may be too coarse to register a
    # retrieval lasting a few milliseconds.
    start_time = time.perf_counter()
    results = chroma_retriever.retrieve(query, top_k=3)
    end_time = time.perf_counter()

    print(f"Retrieved {len(results)} emails in {end_time - start_time:.4f} seconds")

    # Show the subject and similarity score of each hit, falling back to an
    # empty subject / 0.0 when a result lacks those fields
    for i, result in enumerate(results):
        metadata = result.get('metadata', {})
        similarity = result.get('similarity_score', 0.0)
        print(f"Result {i+1}: {metadata.get('subject', '')} (Similarity: {similarity:.4f})")

## 5. Implement Response Generation

Now, let's implement the response generation using a pre-trained language model.

In [None]:
# Initialize the generator with the "google/flan-t5-base" model
generator = ResponseGenerator(model_name="google/flan-t5-base")

# Test response generation
print("Response Generation:")
for query in test_queries[:2]:  # Use only the first two queries to save time
    print(f"\nQuery: {query}")

    # Fetch context emails first; retrieval is deliberately left outside the
    # timed region so the figure below reflects generation only
    retrieved_emails = chroma_retriever.retrieve(query, top_k=3)

    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() is wall-clock time and can jump if the system
    # clock is adjusted while the model is generating.
    start_time = time.perf_counter()
    response = generator.generate_response(query, retrieved_emails)
    end_time = time.perf_counter()

    print(f"Generated response in {end_time - start_time:.4f} seconds")
    print(f"Response: {response}")

## 6. Implement End-to-End RAG Pipeline

Finally, let's implement the end-to-end RAG pipeline that combines retrieval and generation.

In [None]:
# Combine the ChromaDB retriever and the generator into one pipeline that
# retrieves with top_k=3
rag_pipeline = RAGPipeline(
    retriever=chroma_retriever,
    generator=generator,
    top_k=3
)

# Test the RAG pipeline
print("RAG Pipeline:")
for query in test_queries:
    print(f"\nQuery: {query}")

    # Time the whole process_query call. perf_counter() is a monotonic,
    # high-resolution clock meant for measuring durations; time.time() is
    # wall-clock time and can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    result = rag_pipeline.process_query(query)
    end_time = time.perf_counter()

    print(f"Processed query in {end_time - start_time:.4f} seconds")
    # The result is read as a mapping with 'response' and 'retrieved_emails'
    print(f"Response: {result['response']}")
    print(f"Retrieved {len(result['retrieved_emails'])} emails")

## 7. Save the Models

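The cell below does not write anything new; it reports where the ChromaDB collection and FAISS index are stored and which models were used, for later use in the API.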

In [None]:
# Report where each artifact lives; this cell makes no explicit save calls.
artifact_summary = (
    # Location configured on the ChromaDB store
    f"ChromaDB is saved in: {chroma_store.persist_directory}",
    # NOTE(review): assumes the FAISS index was written to index_path when it
    # was built; confirm against EmailRetriever.build_index
    f"FAISS index is saved in: {vector_retriever.index_path}",
    # Only the model names are printed; the transformer weights themselves
    # are cached by the Hugging Face library
    f"Embedding model: {embedder.model_name}",
    f"Generator model: {generator.model_name}",
)
for summary_line in artifact_summary:
    print(summary_line)

## 8. Summary

In this notebook, we've:

1. Loaded the preprocessed emails from the previous notebook
2. Embedded the emails using a pre-trained Sentence Transformer model
3. Set up ChromaDB for efficient vector storage and retrieval
4. Implemented similarity search using both a FAISS-backed vector retriever and ChromaDB
5. Implemented response generation using a pre-trained language model
6. Created an end-to-end RAG pipeline that combines retrieval and generation
7. Reported where the indexes are stored and which models were used, for later use in the API

The RAG pipeline is now ready to be integrated into the API in the next steps.