# Finding the best Open-Source embedding model, for my particular use case

In [7]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
df = pd.read_csv('../data/models_hf/tmp9pxbn5uo.csv')
df

Unnamed: 0.1,Unnamed: 0,Rank (Borda),Model,Zero-shot,Memory Usage (MB),Number of Parameters,Embedding Dimensions,Max Tokens,Mean (Task),Mean (TaskType),Bitext Mining,Classification,Clustering,Instruction Retrieval,Multilabel Classification,Pair Classification,Reranking,Retrieval,STS
0,0,1,[gte-Qwen2-7B-instruct](https://huggingface.co...,⚠️ NA,29040,7B,3584,32768,62.51,55.93,73.92,61.55,52.77,4.94,25.48,85.13,65.55,60.08,73.98
1,1,2,[Linq-Embed-Mistral](https://huggingface.co/Li...,99%,13563,7B,4096,32768,61.47,54.14,70.34,62.24,50.60,0.94,24.77,80.43,64.37,58.69,74.86
2,2,3,[multilingual-e5-large-instruct](https://huggi...,99%,1068,560M,1024,514,63.22,55.08,80.13,64.94,50.75,-0.40,22.91,80.86,62.61,57.12,76.81
3,3,4,[SFR-Embedding-Mistral](https://huggingface.co...,96%,13563,7B,4096,32768,60.90,53.92,70.00,60.02,51.84,0.16,24.55,80.29,64.19,59.44,74.79
4,4,5,[e5-mistral-7b-instruct](https://huggingface.c...,99%,13563,7B,4096,32768,60.25,53.08,70.58,60.31,50.57,-0.62,22.20,81.12,63.82,55.75,74.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,175,176,[m3e-small](https://huggingface.co/moka-ai/m3e...,99%,Unknown,Unknown,512,512,,,,,,,,,,,
176,176,177,[Ops-MoA-Yuan-embedding-1.0](https://huggingfa...,99%,2000,343M,1536,512,,,,,,,,,,,
177,177,178,[Ops-MoA-Conan-embedding-v1](https://huggingfa...,99%,2000,343M,1536,512,,,,,,,,,,,
178,178,178,[Qodo-Embed-1-7B](https://huggingface.co/Qodo/...,⚠️ NA,29040,7B,3584,32768,,,,,,,,,,,


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Unnamed: 0                 180 non-null    int64  
 1   Rank (Borda)               180 non-null    int64  
 2   Model                      180 non-null    object 
 3   Zero-shot                  180 non-null    object 
 4   Memory Usage (MB)          180 non-null    object 
 5   Number of Parameters       180 non-null    object 
 6   Embedding Dimensions       180 non-null    object 
 7   Max Tokens                 180 non-null    object 
 8   Mean (Task)                99 non-null     float64
 9   Mean (TaskType)            99 non-null     float64
 10  Bitext Mining              101 non-null    float64
 11  Classification             99 non-null     float64
 12  Clustering                 100 non-null    float64
 13  Instruction Retrieval      107 non-null    float64

In [11]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Rank (Borda),Mean (Task),Mean (TaskType),Bitext Mining,Classification,Clustering,Instruction Retrieval,Multilabel Classification,Pair Classification,Reranking,Retrieval,STS
count,180.0,180.0,99.0,99.0,101.0,99.0,100.0,107.0,101.0,99.0,99.0,102.0,100.0
mean,89.5,90.483333,46.154646,39.852424,38.425842,49.977677,38.4365,-1.580654,16.890792,73.734848,47.026869,37.28902,60.1662
std,52.105662,52.081423,8.497908,7.928574,21.270886,6.092785,6.546331,1.786547,3.231985,4.573255,10.247814,14.596642,11.138227
min,0.0,1.0,28.04,24.17,0.9,34.2,21.12,-4.32,10.39,62.58,27.61,6.63,30.07
25%,44.75,45.75,41.47,35.065,22.04,46.305,35.2325,-2.905,14.84,71.25,39.86,31.74,55.215
50%,89.5,90.5,45.0,39.06,28.37,48.18,39.075,-1.86,16.32,72.25,45.89,38.02,60.595
75%,134.25,135.25,52.345,45.785,58.56,54.62,41.4125,-0.66,18.63,77.295,53.82,49.0175,69.805
max,179.0,180.0,63.22,55.93,80.13,64.94,53.65,5.36,25.48,85.13,65.55,66.48,77.81


In [14]:
# Convert object columns to numeric (removing non-numeric characters where needed)
def clean_numeric_column(col):
    return pd.to_numeric(df[col].astype(str).str.replace(r'[^\d.]', '', regex=True), errors='coerce')

# Clean and convert relevant columns
df['Memory Usage (MB)'] = clean_numeric_column('Memory Usage (MB)')
df['Embedding Dimensions'] = clean_numeric_column('Embedding Dimensions')
df['Max Tokens'] = clean_numeric_column('Max Tokens')

# Clean and safely convert 'Number of Parameters' column
df['Number of Parameters (B)'] = pd.to_numeric(
    df['Number of Parameters'].str.replace(r'[^\d.]', '', regex=True),
    errors='coerce'
)

# Drop rows with missing critical fields
df_filtered = df.dropna(subset=['Mean (Task)', 'Mean (TaskType)', 'Memory Usage (MB)', 'Number of Parameters (B)'])

# Sort by Mean(Task) descending (higher is better)
top_models = df_filtered.sort_values(by='Mean (Task)', ascending=False).reset_index(drop=True)

# Show top 10 models with performance & resource usage
top_10_summary = top_models.loc[:9, ['Model', 'Mean (Task)', 'Mean (TaskType)', 'Memory Usage (MB)', 'Number of Parameters (B)', 'Embedding Dimensions', 'Max Tokens']]
top_10_summary

Unnamed: 0,Model,Mean (Task),Mean (TaskType),Memory Usage (MB),Number of Parameters (B),Embedding Dimensions,Max Tokens
0,[multilingual-e5-large-instruct](https://huggi...,63.22,55.08,1068.0,560.0,1024.0,514.0
1,[gte-Qwen2-7B-instruct](https://huggingface.co...,62.51,55.93,29040.0,7.0,3584.0,32768.0
2,[Linq-Embed-Mistral](https://huggingface.co/Li...,61.47,54.14,13563.0,7.0,4096.0,32768.0
3,[bilingual-embedding-large](https://huggingfac...,60.96,52.92,2136.0,559.0,1024.0,514.0
4,[SFR-Embedding-Mistral](https://huggingface.co...,60.9,53.92,13563.0,7.0,4096.0,32768.0
5,[e5-mistral-7b-instruct](https://huggingface.c...,60.25,53.08,13563.0,7.0,4096.0,32768.0
6,[SFR-Embedding-2_R](https://huggingface.co/Sal...,59.8,52.83,13563.0,7.0,4096.0,32768.0
7,[bge-m3](https://huggingface.co/BAAI/bge-m3),59.56,52.18,2167.0,568.0,4096.0,8194.0
8,[gte-Qwen2-1.5B-instruct](https://huggingface....,59.45,52.69,6776.0,1.0,8960.0,32768.0
9,[multilingual-e5-large](https://huggingface.co...,58.55,51.34,2136.0,560.0,1024.0,514.0
