In [1]:
import chromadb
chroma_client = chromadb.PersistentClient(path="vectordb")

In [2]:
# chroma_client.delete_collection(name="email_data")

In [None]:
collection = chroma_client.create_collection(name="email_data")

In [6]:
import re

def clean_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\xa0', '', text)
    text = re.sub(r'\u200c', '', text)

    return text

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=700,
        chunk_overlap=20
    )
    return text_splitter.split_text(text)

In [8]:
from authentication import create_service

client_secret_file = 'client_secret.json'
API_SERVICE_NAME = 'gmail'
API_VERSION = 'v1'
SCOPES = ['https://mail.google.com/']
service = create_service(client_secret_file, API_SERVICE_NAME, API_VERSION, SCOPES)

def fetch_emails(service, user_id='me', max_results=100):
    try:
        # Fetch the list of messages
        results = service.users().messages().list(userId=user_id, maxResults=max_results).execute()
        messages = results.get('messages', [])
        
        email_data = []
        for message in messages:
            msg = service.users().messages().get(userId=user_id, id=message['id']).execute()
            email_data.append(msg)
        
        return email_data
    except Exception as e:
        print(f'An error occurred: {e}')
        return []

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=913463864805-eh4f3jbgmevfjobftumd50c77opbcq7g.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A50273%2F&scope=https%3A%2F%2Fmail.google.com%2F&state=dJVUWqrd4AuLIC3djC5XWSOGCXAHxy&access_type=offline
gmail v1 service created successfully


In [3]:
import base64

def add_emails_to_collection(collection, emails):
    for email in emails:
        # For example, extract subject, sender, etc.
        subject = next(header['value'] for header in email['payload']['headers'] if header['name'] == 'Subject')
        sender = next(header['value'] for header in email['payload']['headers'] if header['name'] == 'From')
        snippet = email.get('snippet', '')
        mssg_id = email['id']

        # added for email chain context
        # thread_id = next(header['value'] for header in email['payload']['headers'] if header['name'] == 'Thread-Id')
        timestamp = email.get('internalDate', None)
        reply_depth = email.get('reply_depth', 0)  # reply depth to get nth order of reply. default to 0 for initial email


        # Clean the extracted text
        clean_subject = clean_text(subject)
        clean_sender = clean_text(sender)
        clean_snippet = clean_text(snippet)

        # Extract image data if available
        image_data = []
        if 'parts' in email['payload']:
            for part in email['payload']['parts']:
                if part['filename'] and 'image' in part['mimeType']:
                    # Decode the image data
                    img_data = base64.urlsafe_b64decode(part['body']['data'])
                    image_data.append(img_data)

        # Split the snippet into chunks
        snippet_chunks = split_text(clean_snippet)

        # Add each chunk to the collection
        for i, chunk in enumerate(snippet_chunks):
            chunk_id = f"{mssg_id}_{i}"
            collection.add(
                ids=[chunk_id],
                metadatas=[{
                    'subject': clean_subject,
                    'sender': clean_sender,
                    'chunk_index': i,
                    'timestamp': timestamp,  # Store timestamp for ordering
                    # 'thread_id': thread_id,
                    'reply_depth': reply_depth,
                }],
                documents=[chunk]
            )

In [11]:
emails = fetch_emails(service)
print(len(emails))
add_emails_to_collection(collection, emails)

100


In [None]:
thread_context = collection.query(
        query_texts=[email["subject"]],
        filter={"thread_id": email["thread_id"]},
        n_results=10
    )
    sorted_context = sorted(thread_context["documents"], key=lambda x: x['timestamp'])

In [76]:
results = collection.query(
    query_texts=["dropbox features"],
    n_results=4
)
print(results["documents"])

[['what are the new dropbox features', 'hello dropbox has recently introduced some exciting new features to help you better manage your digital content here are a few key updates 1 automated folders create folders that automatically', 'new features quality enhancements and much more', 'get creative cloud all apps one plan endless possibilities bring any idea to life with the creative cloud all apps plan get photoshop illustrator adobe express and the latest generative ai', 'create quickly and easily with templates from adobe express kick off the holiday spirit with a spectacular party invite making holidaythemed party invites is easy with adobe express browse from']]


# **Collection Exists!**

In [6]:
old_collection = chroma_client.get_collection(name="email_data")

In [7]:
results = old_collection.query(
    query_texts=["dropbox features"],
    n_results=5
)
print(results["documents"])

[['what are the new dropbox features', 'hello dropbox has recently introduced some exciting new features to help you better manage your digital content here are a few key updates 1 automated folders create folders that automatically', 'new features quality enhancements and much more', 'get creative cloud all apps one plan endless possibilities bring any idea to life with the creative cloud all apps plan get photoshop illustrator adobe express and the latest generative ai', 'create quickly and easily with templates from adobe express kick off the holiday spirit with a spectacular party invite making holidaythemed party invites is easy with adobe express browse from']]


# **Generate Reply**

In [1]:
from dotenv import dotenv_values

config = dotenv_values(".env")
sec_key = config["HF_TOKEN"]

In [2]:
from langchain_huggingface import HuggingFaceEndpoint

repo_id = "mistralai/Mistral-7B-Instruct-v0.3"
llm = HuggingFaceEndpoint(repo_id=repo_id, max_length=128, temperature=0.7)

                    max_length was transferred to model_kwargs.
                    Please make sure that max_length is what you intended.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
new_email = {
    "Subject": "test mail",
    "From": "Proxylol Account <proxylola193@gmail.com>",
    "To": "\"shogunmasters54@gmail.com\" <shogunmasters54@gmail.com>",
    "Body": "Hello how are you? I was thinking to complete the project this sunday. Confirm if you are available."
}

In [1]:
def draftEmail(email):
    # Perform a query search with the email body
    query_results = old_collection.query(
        query_texts=[email["Body"]],
        n_results=4
    )
    context_chromadb = query_results["documents"]
    print(context_chromadb)
    
    # reply for the mail
    reply_subject = f"Re: {email['Subject']}"

    # Prompt for the email body
    body_prompt = f"Email Body:\n{email['Body']}, Email Subject:\n{email['Subject']}\n\nRelevant Context:\n{context_chromadb}\n\nDraft a reply to this email. Include only the body of the email:"
    reply_draft = llm.invoke(body_prompt)

    # Combine subject and body drafts into the final email format
    final_email = {
        "Subject": reply_subject.strip(),
        "Body": reply_draft.strip(),
        "From": email["To"],
        "To": email["From"]
    }

    return final_email


In [18]:
print(draftEmail(new_email))

[['don39t miss out on your creative boost the creativity conference 1516 oct free online event adobe max two weeks until max  don39t miss out mark your calendars only two weeks to go until the', 'steam 1 game you39ve wished for is on sale the witness 75  849  212 week long deal offer ends 23 sep 1000pm ist you wake up alone on a strange island full of puzzles that will challenge and', 'create stunning designs in minutes for free dial up the diwali spirit with stunning designs this diwali illuminate your creativity create dazzling greetings social contents flyers and more with', 'stock up on everything you need to bring seasonal projects to life get in the spirit early it39s never too early to get a head start on your holiday projects with cozy festive content from adobe']]
{'Subject': 'Re: test mail', 'Body': "Hi there,\n\nYes, I am available this Sunday to complete the project. Let's make it happen!\n\nBest,\n[Your Name]", 'From': '"shogunmasters54@gmail.com" <shogunmasters54@gmail.co

# Image-To-Text

In [18]:
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

def generate_image_captions(img_path):
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

    raw_image = Image.open(img_path).convert('RGB')

    # conditional image captioning
    text = "an image of"
    inputs = processor(raw_image, text, return_tensors="pt")

    out = model.generate(**inputs)
    print("Conditional caption:", processor.decode(out[0], skip_special_tokens=True))

    # unconditional image captioning
    inputs = processor(raw_image, return_tensors="pt")

    out = model.generate(**inputs)
    print("Unconditional caption:", processor.decode(out[0], skip_special_tokens=True))

In [19]:
import requests

# Hugging Face API URL and your personal token
API_URL = "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large"
headers = {"Authorization": f"Bearer YOUR_API_TOKEN_HERE"}

def generate_image_caption_endpoint(img_path):
    """
    Sends an image to the Hugging Face endpoint to generate captions.
    """
    try:
        # Open the image in binary mode
        with open(img_path, "rb") as img_file:
            response = requests.post(API_URL, headers=headers, files={"image": img_file})

        # Handle the response
        if response.status_code == 200:
            return response.json()["generated_text"]  # Extract the caption
        else:
            return f"Error {response.status_code}: {response.text}"

    except FileNotFoundError:
        return "Error: The image file was not found. Please check the file path."
    except Exception as e:
        return f"Error: {e}"

# Example Usage
image_path = "path/to/your/image.jpg"  # Replace with your image file path
caption = generate_image_caption_endpoint(image_path)
print("Generated Caption:", caption)


Generated Caption: Error 400: {"error":["Error in `inputs`: Invalid image: b'--7672d76e43e58c520b..'"]}


In [None]:
# generate_image_captions(new_emails['attachments'])
generate_image_captions('image_attachments/kutta.jpg')

# Audio-To-Text

In [17]:
import requests

# Set your Hugging Face API token and model endpoint
MODEL_ID = 'openai/whisper-large-v3-turbo'
URL = f'https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo'

headers = {
    'Authorization': f'Bearer {sec_key}',
    'Content-Type': 'application/json'
}

# Load your audio file
audio_file_path = 'audio_attachments/who-are-you-talking-to.mp3'
with open(audio_file_path, 'rb') as audio_file:
    audio_data = audio_file.read()

# Send a POST request to the model
response = requests.post(URL, headers=headers, files={'file': audio_data})

# Check for successful response
if response.status_code == 200:
    transcription = response.json()
    print(transcription['text'])
else:
    print(f"Error: {response.status_code}, {response.text}")

 Who are you talking to right now? Do you know how much I make a year? I mean, even if I told you, you wouldn't believe it. Do you know what would happen if I suddenly decided to stop going into work? A business big enough that it could be listed on the NASDAQ goes belly up. Disappears. It ceases to exist without me. No, you clearly don't know who you're talking to, so let me clue you in. I am not in danger. I am the danger. A guy opens his door and gets shot and you think that of me? No. I am the one who knocks.


# Evaluating RAG System

In [2]:
old_collection = chroma_client.get_collection(name="email_data")

In [9]:
emails = fetch_emails(service)
print(len(emails))



In [13]:
# Extract email bodies and store them for evaluation
email_bodies = [email['snippet'] for email in emails]

In [18]:
def retrieve_documents_from_chromadb(query):
    docs = old_collection.query(query_texts=[query], n_results=5)
    return docs['documents']

data_queries = []

# Iterate through each query, retrieve documents and store in data structure
for query in queries:
     retrieved_documents = retrieve_documents_from_chromadb(query)
     data_query = {
         'query': query,
         'retrieved_documents': retrieved_documents
     }
     data_queries.append(data_query)


In [None]:
data_queries[0]

In [26]:
collection = chroma_client.get_collection(name="email_data")

# Retrieve all documents
all_docs = collection.get()

# Print the retrieved documents
print(all_docs["documents"])

['i am having problem in uploading to dropbox can you kindly help me', 'expt 1 to perform basic operations using tensorflow due january 18 2025 1159 pm instructions perform basic operations using tensorflow perform linear algebra operations using tensorflow perform', 'this is a copy of a security alert sent to lakshyahero20gmailcom shogunmasters54gmailcom is the recovery email for this account if you don39t recognise this account remove it a new signin', 'this is a copy of a security alert sent to lakshyahero20gmailcom shogunmasters54gmailcom is the recovery email for this account if you don39t recognise this account remove it account', 'google verification code dear google user we received a request to access your google account lakshyahero20gmailcom through your email address your google verification code is 870484 if you did', 'epic games thank you hi shogun thank you for your purchase invoice id f4002548430 your order information order id bill to f2501011624349856 shogunmasters54gm

# FAISS vectordb

In [5]:
import numpy as np

# Retrieve all embeddings
collection = chroma_client.get_collection(name="email_data")
data = collection.get(include=["embeddings"])  # Fetch stored embeddings
embeddings = np.array(data["embeddings"], dtype="float32")

print(f"Extracted {embeddings.shape[0]} embeddings of dimension {embeddings.shape[1]}")

Extracted 97 embeddings of dimension 384


In [6]:
import faiss
import os

# Define FAISS index storage path
faiss_folder = "faissDB"
os.makedirs(faiss_folder, exist_ok=True)
faiss_index_path = os.path.join(faiss_folder, "faiss_index")

# Create FAISS index (assuming L2 distance and known embedding dimension)
d = embeddings.shape[1]  # Get the dimension of vectors
index = faiss.IndexFlatL2(d)  # Use IndexFlatL2 (L2 distance) or another suitable index type
index.add(embeddings)  # Add extracted embeddings to FAISS

# Save the FAISS index
faiss.write_index(index, faiss_index_path)

print(f"FAISS index saved at {faiss_index_path}")

FAISS index saved at faissDB/faiss_index


In [7]:
# Load the saved FAISS index
index = faiss.read_index(faiss_index_path)

# Example: Search for nearest neighbors of a random query vector
query_vector = np.random.random((1, d)).astype("float32")  # Replace with actual query
D, I = index.search(query_vector, k=5)  # Get top 5 nearest neighbors

print("Nearest neighbor indices:", I)
print("Distances:", D)

Nearest neighbor indices: [[55 91 57 37 82]]
Distances: [[117.83128 118.42472 118.49402 118.56811 118.80153]]


# ChromaDB Evaluation

In [37]:
import chromadb
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load Ground Truth Data
groundtruth_file = "ground_truth.csv"
groundtruth = pd.read_csv(groundtruth_file)

# Convert relevant_docs from string to list
groundtruth['relevant_docs'] = groundtruth['relevant_docs'].apply(lambda x: x.split("||"))

# Connect to ChromaDB
collection = chroma_client.get_collection(name="email_data")

In [39]:
# Define Evaluation Metrics
def recall_at_k(retrieved, relevant, k=10):
    retrieved_at_k = retrieved[:k]
    return len(set(retrieved_at_k) & set(relevant)) / len(relevant) if relevant else 0

def precision_at_k(retrieved, relevant, k=10):
    retrieved_at_k = retrieved[:k]
    return len(set(retrieved_at_k) & set(relevant)) / k

def reciprocal_rank(retrieved, relevant):
    for i, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            return 1 / i
    return 0

# Run Evaluation
recall_scores = []
precision_scores = []
rr_scores = []

for _, row in tqdm(groundtruth.iterrows(), total=len(groundtruth)):
    query = row['query']
    relevant_docs = row['relevant_docs']
    
    # Query ChromaDB
    results = collection.query(query_texts=[query], n_results=10)
    
    # Extract retrieved document texts
    retrieved_docs = results['documents'][0] if 'documents' in results else []

    # Compute Metrics
    recall_scores.append(recall_at_k(retrieved_docs, relevant_docs, k=10))
    precision_scores.append(precision_at_k(retrieved_docs, relevant_docs, k=10))
    rr_scores.append(reciprocal_rank(retrieved_docs, relevant_docs))

100%|██████████| 10/10 [00:00<00:00, 11.34it/s]


In [40]:
# Print Final Metrics
print("\n=== VectorDB Evaluation Metrics ===")
print(f"Recall: {np.mean(recall_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f}")
print(f"Mean Reciprocal Rank (MRR): {np.mean(rr_scores):.4f}")


=== VectorDB Evaluation Metrics ===
Recall: 0.4400
Precision: 0.4400
Mean Reciprocal Rank (MRR): 0.8250


# PineCone Evaluation

In [2]:
import pinecone
import pandas as pd
import numpy as np
from tqdm import tqdm

# Pinecone Configuration
PINECONE_API_KEY = "88c2edde-2d71-480a-aeff-62c7061bd7f2"  # Replace with your API key
# PINECONE_ENVIRONMENT = "your-environment"
INDEX_NAME = "new-index"

# Initialize Pinecone Client
index = pinecone.Pinecone(api_key=PINECONE_API_KEY).Index(INDEX_NAME)

# Load Ground Truth Data
groundtruth_file = "ground_truth.csv"
groundtruth = pd.read_csv(groundtruth_file)

# Convert relevant_docs from string to list
groundtruth['relevant_docs'] = groundtruth['relevant_docs'].apply(lambda x: x.split(" || "))

# Define Evaluation Metrics
def recall_at_k(retrieved, relevant, k=10):
    retrieved_at_k = retrieved[:k]
    return len(set(retrieved_at_k) & set(relevant)) / len(relevant) if relevant else 0

def precision_at_k(retrieved, relevant, k=10):
    retrieved_at_k = retrieved[:k]
    return len(set(retrieved_at_k) & set(relevant)) / k

def reciprocal_rank(retrieved, relevant):
    for i, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            return 1 / i
    return 0

# Function to Query Pinecone
def query_pinecone(query_vector, top_k=10):
    results = index.query(vector=query_vector, top_k=top_k, include_metadata=True)
    return [match['metadata']['text'] for match in results['matches'] if 'metadata' in match and 'text' in match['metadata']]

# Assume you have a function to convert text to embeddings
def get_embedding(text):
    # Replace with your embedding model (OpenAI, SentenceTransformers, etc.)
    return [0.1] * 768  # Example placeholder

# Run Evaluation
recall_scores = []
precision_scores = []
rr_scores = []

for _, row in tqdm(groundtruth.iterrows(), total=len(groundtruth)):
    query = row['query']
    relevant_docs = row['relevant_docs']
    
    # Convert query to embedding
    query_vector = get_embedding(query)
    
    # Query Pinecone
    retrieved_docs = query_pinecone(query_vector, top_k=10)

    # Compute Metrics
    recall_scores.append(recall_at_k(retrieved_docs, relevant_docs, k=10))
    precision_scores.append(precision_at_k(retrieved_docs, relevant_docs, k=10))
    rr_scores.append(reciprocal_rank(retrieved_docs, relevant_docs))

# Print Final Metrics
print("\n=== Pinecone Evaluation Metrics ===")
print(f"Recall: {np.mean(recall_scores):.4f}")
print(f"Precision: {np.mean(precision_scores):.4f}")
print(f"Mean Reciprocal Rank (MRR): {np.mean(rr_scores):.4f}")

100%|██████████| 10/10 [00:03<00:00,  2.92it/s]


=== Pinecone Evaluation Metrics ===
Recall: 0.0000
Precision: 0.0000
Mean Reciprocal Rank (MRR): 0.0000





# Embeddings Comparison

In [2]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load existing email data
old_collection = chroma_client.get_collection(name="email_data")
all_data = old_collection.get()  # Fetch stored data

# Extract existing data
existing_ids = all_data["ids"]
existing_docs = all_data["documents"]
existing_metadata = all_data["metadatas"]

# Choose different Hugging Face embedding models
embedding_models = {
    "all-MiniLM-L6-v2": SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2"),
    "bge-base-en": SentenceTransformer("BAAI/bge-base-en"),
    "sentence-t5-base": SentenceTransformer("sentence-transformers/sentence-t5-base"),
}

In [3]:
# Load Ground Truth Data
groundtruth_file = "ground_truth.csv"
groundtruth = pd.read_csv(groundtruth_file)

# Convert relevant_docs from string to list
groundtruth['relevant_docs'] = groundtruth['relevant_docs'].apply(lambda x: x.split("||"))

In [4]:
# Define Evaluation Metrics
def recall_at_k(retrieved, relevant, k=10):
    return len(set(retrieved[:k]) & set(relevant)) / len(relevant) if relevant else 0

def precision_at_k(retrieved, relevant, k=10):
    return len(set(retrieved[:k]) & set(relevant)) / k

def reciprocal_rank(retrieved, relevant):
    for i, doc in enumerate(retrieved, start=1):
        if doc in relevant:
            return 1 / i
    return 0

def ndcg_at_k(retrieved, relevant, k=10):
    dcg = sum(1 / np.log2(i + 1) for i, doc in enumerate(retrieved[:k], start=1) if doc in relevant)
    idcg = sum(1 / np.log2(i + 1) for i in range(1, min(len(relevant), k) + 1))
    return dcg / idcg if idcg > 0 else 0

# Function to Compute Embeddings
def get_embedding(text, model):
    return model.encode(text).tolist()

# Store Evaluation Results
results_summary = {}

In [5]:
for model_name, model in embedding_models.items():
    print(f"\nProcessing ChromaDB with {model_name} embeddings...")

    # Create new collection
    collection_name = f"email_data_{model_name.replace('-', '_')}"
    chroma_client.delete_collection(collection_name)  # Clear previous data if exists
    new_collection = chroma_client.get_or_create_collection(name=collection_name)

    # Reindexing Emails with New Embeddings
    print(f"Indexing emails into {collection_name}...")
    all_embeddings = []
    
    for idx, doc in tqdm(enumerate(existing_docs), total=len(existing_docs)):
        embedding = get_embedding(doc, model)
        all_embeddings.append(embedding)

        new_collection.add(
            ids=[existing_ids[idx]],
            embeddings=[embedding],
            documents=[doc],
            metadatas=[existing_metadata[idx]],
        )

    # Convert embeddings to NumPy array for analysis
    all_embeddings_np = np.array(all_embeddings)

    # Compute Cosine Similarity Distribution
    similarity_matrix = cosine_similarity(all_embeddings_np)
    avg_cosine_similarity = np.mean(similarity_matrix)

    # Compute Clustering Quality (Silhouette Score)
    num_clusters = min(10, len(all_embeddings_np))  # Set number of clusters (max 10)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10).fit(all_embeddings_np)
    silhouette_avg = silhouette_score(all_embeddings_np, kmeans.labels_) if len(all_embeddings_np) > 1 else 0

    # Run Retrieval Evaluation
    recall_scores, precision_scores, rr_scores, ndcg_scores = [], [], [], []
    query_times = []

    for _, row in tqdm(groundtruth.iterrows(), total=len(groundtruth)):
        query = row['query']
        relevant_docs = row['relevant_docs']

        query_embedding = get_embedding(query, model)

        # Measure Query Latency
        start_time = time.time()
        results = new_collection.query(query_embeddings=[query_embedding], n_results=10)
        query_times.append(time.time() - start_time)

        retrieved_docs = results['documents'][0] if 'documents' in results else []

        # Compute Metrics
        recall_scores.append(recall_at_k(retrieved_docs, relevant_docs, k=10))
        precision_scores.append(precision_at_k(retrieved_docs, relevant_docs, k=10))
        rr_scores.append(reciprocal_rank(retrieved_docs, relevant_docs))
        ndcg_scores.append(ndcg_at_k(retrieved_docs, relevant_docs, k=10))

    # Store Results
    results_summary[model_name] = {
        "Recall@10": np.mean(recall_scores),
        "Precision@10": np.mean(precision_scores),
        "MRR": np.mean(rr_scores),
        "NDCG@10": np.mean(ndcg_scores),
        "Avg Query Latency (ms)": np.mean(query_times) * 1000,
        "Avg Cosine Similarity": avg_cosine_similarity,
        "Silhouette Score": silhouette_avg,
    }


Processing ChromaDB with all-MiniLM-L6-v2 embeddings...
Indexing emails into email_data_all_MiniLM_L6_v2...


100%|██████████| 97/97 [00:03<00:00, 25.96it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 10/10 [00:00<00:00, 20.96it/s]



Processing ChromaDB with bge-base-en embeddings...
Indexing emails into email_data_bge_base_en...


100%|██████████| 97/97 [00:03<00:00, 26.20it/s]
100%|██████████| 10/10 [00:00<00:00, 18.99it/s]



Processing ChromaDB with sentence-t5-base embeddings...
Indexing emails into email_data_sentence_t5_base...


100%|██████████| 97/97 [00:07<00:00, 12.24it/s]
100%|██████████| 10/10 [00:00<00:00, 13.17it/s]


In [6]:
# Print Final Comparison
print("\n=== Embedding Model Comparison ===")
for model_name, metrics in results_summary.items():
    print(f"\nModel: {model_name}")
    print(f"  Recall@10: {metrics['Recall@10']:.4f}")
    print(f"  Precision@10: {metrics['Precision@10']:.4f}")
    print(f"  Mean Reciprocal Rank (MRR): {metrics['MRR']:.4f}")
    print(f"  NDCG@10: {metrics['NDCG@10']:.4f}")
    print(f"  Avg Query Latency: {metrics['Avg Query Latency (ms)']:.2f} ms")
    print(f"  Avg Cosine Similarity: {metrics['Avg Cosine Similarity']:.4f}")
    print(f"  Silhouette Score: {metrics['Silhouette Score']:.4f}")


=== Embedding Model Comparison ===

Model: all-MiniLM-L6-v2
  Recall@10: 0.4400
  Precision@10: 0.4400
  Mean Reciprocal Rank (MRR): 0.8250
  NDCG@10: 0.5736
  Avg Query Latency: 1.64 ms
  Avg Cosine Similarity: 0.2261
  Silhouette Score: 0.2807

Model: bge-base-en
  Recall@10: 0.4800
  Precision@10: 0.4800
  Mean Reciprocal Rank (MRR): 0.9500
  NDCG@10: 0.5679
  Avg Query Latency: 1.70 ms
  Avg Cosine Similarity: 0.7919
  Silhouette Score: 0.2890

Model: sentence-t5-base
  Recall@10: 0.4400
  Precision@10: 0.4400
  Mean Reciprocal Rank (MRR): 0.8167
  NDCG@10: 0.6057
  Avg Query Latency: 1.64 ms
  Avg Cosine Similarity: 0.7681
  Silhouette Score: 0.2424
