# Compare Embedding Function

## Create collections

In [1]:
import chromadb
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import SentenceTransformerEmbeddingFunction
from chromadb.utils.embedding_functions.ollama_embedding_function import OllamaEmbeddingFunction

client = chromadb.Client()
all_MiniLM_L6_v2 = client.get_or_create_collection(
    name="all_MiniLM_L6_v2", 
    embedding_function=SentenceTransformerEmbeddingFunction(
        "sentence-transformers/all-MiniLM-L6-v2", device="cuda") #fp32
)
all_MiniLM_L12_v2 = client.get_or_create_collection(
    name="all_MiniLM_L12_v2", 
    embedding_function=SentenceTransformerEmbeddingFunction(
        "sentence-transformers/all-MiniLM-L12-v2", device="cuda") #fp32
)
nomic_embed = client.get_or_create_collection(
    name="nomic_embed",
    embedding_function=OllamaEmbeddingFunction(
        url="http://localhost:11434/api/embeddings", 
        model_name="nomic-embed-text:latest") #fp16
)


## Define evaluation functions

Copied from https://github.com/ALucek/linear-adapter-embedding/blob/main/Linear_Adapter.ipynb

In [2]:

def reciprocal_rank(retrieved_docs, ground_truth, k):
    try:
        rank = retrieved_docs.index(ground_truth) + 1
        return 1.0 / rank if rank <= k else 0.0
    except ValueError:
        return 0.0


def hit_rate(retrieved_docs, ground_truth, k):
    return 1.0 if ground_truth in retrieved_docs[:k] else 0.0


## Prepare dataset

In [3]:
from utils import get_train_test

train_cq, test_cq = get_train_test("../../data/chunk_question_pairs")
all_chunks = train_cq['chunk'].tolist()
all_chunks.extend(test_cq['chunk'].tolist())

## Embed chunks

In [4]:
from tqdm import tqdm
def insert_documents(collection, all_chunks):
    i = 0
    for chunk in tqdm(all_chunks):
        collection.add(
        documents=[chunk],
        ids=[f"chunk_{i}"]
        )
        i += 1

In [None]:
import time
t0 = time.time()
insert_documents(all_MiniLM_L6_v2, all_chunks)
t1 = time.time()
insert_documents(all_MiniLM_L12_v2, all_chunks)
t2 = time.time()
insert_documents(nomic_embed, all_chunks)
t3 = time.time()
print(t1-t0)
print(t2-t1)
print(t3-t2)

100%|██████████| 24000/24000 [06:20<00:00, 63.04it/s]


## Evaluate result

In [6]:
import numpy as np

def validate_embedding_model(validation_data, collection, k=10):
    hit_rates = []
    reciprocal_ranks = []
    
    for _, row in validation_data.iterrows():
        question = row['question']
        ground_truth = row['chunk']
        
        results = collection.query(
            query_texts=[question],
            n_results=k
        )
        
        retrieved_docs = results["documents"][0]
        
        # Calculate metrics
        hr = hit_rate(retrieved_docs, ground_truth, k)
        rr = reciprocal_rank(retrieved_docs, ground_truth, k)
        
        hit_rates.append(hr)
        reciprocal_ranks.append(rr)
    
    # Calculate average metrics
    avg_hit_rate = np.mean(hit_rates)
    avg_reciprocal_rank = np.mean(reciprocal_ranks)
    
    return {
        'average_hit_rate': avg_hit_rate,
        'average_reciprocal_rank': avg_reciprocal_rank
    }

In [9]:
results = validate_embedding_model(test_cq, all_MiniLM_L6_v2)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.8414583333333333
Mean Reciprocal Rank @10: 0.6692891038359788


In [10]:
results = validate_embedding_model(test_cq, all_MiniLM_L12_v2)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.8245833333333333
Mean Reciprocal Rank @10: 0.6459053406084656


In [11]:
results = validate_embedding_model(test_cq, nomic_embed)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.8635416666666667
Mean Reciprocal Rank @10: 0.7007503306878307
