# Compare Embedding Function

## Create collections

In [1]:
import chromadb
from chromadb.utils.embedding_functions.sentence_transformer_embedding_function import SentenceTransformerEmbeddingFunction
from chromadb.utils.embedding_functions.ollama_embedding_function import OllamaEmbeddingFunction

client = chromadb.PersistentClient()
all_MiniLM_L6_v2 = client.get_or_create_collection(
    name="all_MiniLM_L6_v2", 
    embedding_function=SentenceTransformerEmbeddingFunction(
        "sentence-transformers/all-MiniLM-L6-v2", device="cuda") # fp32 80MB
)
all_MiniLM_L12_v2 = client.get_or_create_collection(
    name="all_MiniLM_L12_v2", 
    embedding_function=SentenceTransformerEmbeddingFunction(
        "sentence-transformers/all-MiniLM-L12-v2", device="cuda") # fp32 120MB
)
all_mpnet_base_v2 =client.get_or_create_collection(
    name="all_mpnet_base_v2", 
    embedding_function=SentenceTransformerEmbeddingFunction(
        "sentence-transformers/all-mpnet-base-v2", device="cuda") # fp32 420MB
)
nomic_embed = client.get_or_create_collection(
    name="nomic_embed",
    embedding_function=OllamaEmbeddingFunction(
        url="http://localhost:11434/api/embeddings", 
        model_name="nomic-embed-text:latest") #fp32 547MB or fp16 274MB
)

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

## Define evaluation functions

Copied from https://github.com/ALucek/linear-adapter-embedding/blob/main/Linear_Adapter.ipynb

In [2]:
def reciprocal_rank(retrieved_docs, ground_truth, k):
    try:
        rank = retrieved_docs.index(ground_truth) + 1
        return 1.0 / rank if rank <= k else 0.0
    except ValueError:
        return 0.0

def hit_rate(retrieved_docs, ground_truth, k):
    return 1.0 if ground_truth in retrieved_docs[:k] else 0.0


## Prepare dataset

In [3]:
from utils import get_train_test

train_cq, test_cq = get_train_test("../../data/chunk_question_pairs")
all_chunks = train_cq['chunk'].tolist()
all_chunks.extend(test_cq['chunk'].tolist())

## Embed chunks

In [4]:
from tqdm import tqdm
def insert_documents(collection, all_chunks):
    i = 0
    for chunk in tqdm(all_chunks):
        collection.add(
        documents=[chunk],
        ids=[f"chunk_{i}"]
        )
        i += 1

In [5]:
import time
t0 = time.time()
insert_documents(all_MiniLM_L6_v2, all_chunks)
t1 = time.time()
insert_documents(all_MiniLM_L12_v2, all_chunks)
t2 = time.time()
insert_documents(all_mpnet_base_v2, all_chunks)
t3 = time.time()
insert_documents(nomic_embed, all_chunks)
t4 = time.time()
print(t1-t0)
print(t2-t1)
print(t3-t2)
print(t4-t3)

100%|██████████| 24000/24000 [04:42<00:00, 84.85it/s] 
100%|██████████| 24000/24000 [06:09<00:00, 65.01it/s]
100%|██████████| 24000/24000 [09:11<00:00, 43.54it/s]
100%|██████████| 24000/24000 [08:53<00:00, 44.99it/s]

282.8640055656433
369.1847939491272
551.1855845451355
533.4819731712341





## Evaluate result

In [6]:
import numpy as np

def validate_embedding_model(validation_data, collection, k=10):
    hit_rates = []
    reciprocal_ranks = []
    
    for _, row in validation_data.iterrows():
        question = row['question']
        ground_truth = row['chunk']
        
        results = collection.query(
            query_texts=[question],
            n_results=k
        )
        
        retrieved_docs = results["documents"][0]
        
        # Calculate metrics
        hr = hit_rate(retrieved_docs, ground_truth, k)
        rr = reciprocal_rank(retrieved_docs, ground_truth, k)
        
        hit_rates.append(hr)
        reciprocal_ranks.append(rr)
    
    # Calculate average metrics
    avg_hit_rate = np.mean(hit_rates)
    avg_reciprocal_rank = np.mean(reciprocal_ranks)
    
    return {
        'average_hit_rate': avg_hit_rate,
        'average_reciprocal_rank': avg_reciprocal_rank
    }

In [7]:
results = validate_embedding_model(test_cq, all_MiniLM_L6_v2, k=10)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

results = validate_embedding_model(test_cq, all_MiniLM_L6_v2, k=5)
print(f"Average Hit Rate @5: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @5: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.84125
Mean Reciprocal Rank @10: 0.6688880621693121
Average Hit Rate @5: 0.781875
Mean Reciprocal Rank @5: 0.6608333333333334


In [8]:
results = validate_embedding_model(test_cq, all_MiniLM_L12_v2, k=10)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

results = validate_embedding_model(test_cq, all_MiniLM_L12_v2, k=5)
print(f"Average Hit Rate @5: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @5: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.8241666666666667
Mean Reciprocal Rank @10: 0.6451802248677247
Average Hit Rate @5: 0.7554166666666666
Mean Reciprocal Rank @5: 0.6360312499999999


In [9]:
results = validate_embedding_model(test_cq, all_mpnet_base_v2, k=10)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

results = validate_embedding_model(test_cq, all_mpnet_base_v2, k=5)
print(f"Average Hit Rate @5: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @5: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.83
Mean Reciprocal Rank @10: 0.6479026124338625
Average Hit Rate @5: 0.7652083333333334
Mean Reciprocal Rank @5: 0.6392708333333333


In [10]:
results = validate_embedding_model(test_cq, nomic_embed, k=10)
print(f"Average Hit Rate @10: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @10: {results['average_reciprocal_rank']}")

results = validate_embedding_model(test_cq, nomic_embed, k=5)
print(f"Average Hit Rate @5: {results['average_hit_rate']}")
print(f"Mean Reciprocal Rank @5: {results['average_reciprocal_rank']}")

Average Hit Rate @10: 0.8620833333333333
Mean Reciprocal Rank @10: 0.6999638723544973
Average Hit Rate @5: 0.8016666666666666
Mean Reciprocal Rank @5: 0.6917743055555555
