In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('Snowflake/snowflake-arctic-embed-m-v1.5')
input_texts = [
    'make me a sweet protein shake recipe',
    'a chiken recipe with high protein',
    "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
    "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
]
embeddings = model.encode(input_texts, normalize_embeddings=True)

# Compute similarity matrix (cosine similarity since embeddings are normalized)
similarity_matrix = np.dot(embeddings, embeddings.T)

# Display the similarity matrix
print("Similarity Matrix:")
print(similarity_matrix)

# For a prettier display with labels
print("\n\nFormatted Similarity Matrix:")
labels = ['protein query', 'summit query', 'protein passage', 'summit passage']
print(f"{'':20}", end='')
for label in labels:
    print(f"{label:20}", end='')
print()

for i, label_i in enumerate(labels):
    print(f"{label_i:20}", end='')
    for j in range(len(labels)):
        print(f"{similarity_matrix[i][j]:20.4f}", end='')
    print()

  from .autonotebook import tqdm as notebook_tqdm


Similarity Matrix:
[[1.0000002  0.84136367 0.5222373  0.35231265]
 [0.84136367 1.         0.50696504 0.33000165]
 [0.5222373  0.50696504 1.0000001  0.24097013]
 [0.35231265 0.33000165 0.24097013 1.0000002 ]]


Formatted Similarity Matrix:
                    protein query       summit query        protein passage     summit passage      
protein query                     1.0000              0.8414              0.5222              0.3523
summit query                      0.8414              1.0000              0.5070              0.3300
protein passage                   0.5222              0.5070              1.0000              0.2410
summit passage                    0.3523              0.3300              0.2410              1.0000


In [26]:
import pandas as pd

df = pd.read_csv('../experiments/exp1/data/raw/recipes_samples.csv')
df.head()

Unnamed: 0,NAME,TAGS,INGREDIENTS,STEPS,DESCRIPTION,SEARCH_TERMS,FILTERS
0,recipe NAME: sweet potato hash.,"recipe TAGS: 30minutesorless, timetomake, cour...","recipe INGREDIENTS: sweet potatoes, olive oil,...","recipe STEPS: grate the sweet potatoes, heat t...",recipe DESCRIPTION: i love this simple sweet p...,"recipe SEARCH_TERMS: lowsodium, lowcalorie, ve...","recipe FILTERS: vegan, vegetarian, lowcholeste..."
1,recipe NAME: uncle bill s balsamic vinegar dre...,"recipe TAGS: 15minutesorless, timetomake, cour...","recipe INGREDIENTS: balsamic vinegar, lemon ju...","recipe STEPS: in a food processor or blender, ...",recipe DESCRIPTION: this dressing is a nice ch...,"recipe SEARCH_TERMS: lowsodium, lowcarb.","recipe FILTERS: lowcarb, lowprotein, lowsodium."
2,recipe NAME: bread machine apricot nutmeg bread.,"recipe TAGS: timetomake, course, preparation, ...","recipe INGREDIENTS: water, vegetable oil, lemo...",recipe STEPS: put ingredients beginning with w...,recipe DESCRIPTION: always on the look out for...,"recipe SEARCH_TERMS: healthy, bread.","recipe FILTERS: lowcholesterol, lowsaturatedfat."
3,recipe NAME: sesame salad dressing.,"recipe TAGS: 15minutesorless, timetomake, cour...","recipe INGREDIENTS: extra virgin olive oil, sh...",recipe STEPS: combine first 5 ingredients in b...,recipe DESCRIPTION: this has been my favourite...,"recipe SEARCH_TERMS: lowcarb, vegetarian, vega...","recipe FILTERS: vegan, vegetarian, lowcarb."
4,recipe NAME: dinner in a hurry club roll up.,"recipe TAGS: 30minutesorless, timetomake, cour...","recipe INGREDIENTS: avocado, mayonnaise, lime ...","recipe STEPS: peel avocado, cut in half and re...",recipe DESCRIPTION: forget the bread keep a st...,"recipe SEARCH_TERMS: lunch, healthy, glutenfre...","recipe FILTERS: lowcholesterol, glutenfree."


In [6]:
import json 

CONFIG_FILE_PATH = "../config/config.json"

with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

In [10]:
MODELS_CONFIG = config["embedding_models"]

MODELS_LIST = [SentenceTransformer(model_id) for model_id in MODELS_CONFIG]
MODEL_DICT = dict(zip(MODELS_CONFIG, MODELS_LIST))

In [11]:
COLUMNS_TO_EMBEDDE = config["embedding_config"]

In [None]:
df['config_1'] = df["NAME"] + " " + df["TAGS"] + " " + df["SEARCH_TERMS"] + " " + df["FILTERS"] + " " + df["INGREDIENTS"] + " " + df["STEPS"] + " " + df["DESCRIPTION"]


In [21]:
df.drop(columns=['config_1'], inplace=True)

In [25]:
df.head()

Unnamed: 0,NAME,TAGS,INGREDIENTS,STEPS,DESCRIPTION,SEARCH_TERMS,FILTERS,intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN,sentence-transformers/all-MiniLM-L6-v2/config_1_EMB_NUMBER_TOKEN,thenlper/gte-small/config_1_EMB_NUMBER_TOKEN,...,thenlper/gte-small/config_SEARCH_TERMS_EMB_NUMBER_TOKEN,BAAI/bge-small-en-v1.5/config_SEARCH_TERMS_EMB_NUMBER_TOKEN,Snowflake/snowflake-arctic-embed-m/config_SEARCH_TERMS_EMB_NUMBER_TOKEN,Snowflake/snowflake-arctic-embed-m-v1.5/config_SEARCH_TERMS_EMB_NUMBER_TOKEN,intfloat/e5-base-v2/config_FILTERS_EMB_NUMBER_TOKEN,sentence-transformers/all-MiniLM-L6-v2/config_FILTERS_EMB_NUMBER_TOKEN,thenlper/gte-small/config_FILTERS_EMB_NUMBER_TOKEN,BAAI/bge-small-en-v1.5/config_FILTERS_EMB_NUMBER_TOKEN,Snowflake/snowflake-arctic-embed-m/config_FILTERS_EMB_NUMBER_TOKEN,Snowflake/snowflake-arctic-embed-m-v1.5/config_FILTERS_EMB_NUMBER_TOKEN
0,recipe NAME: sweet potato hash.,"recipe TAGS: 30minutesorless, timetomake, cour...","recipe INGREDIENTS: sweet potatoes, olive oil,...","recipe STEPS: grate the sweet potatoes, heat t...",recipe DESCRIPTION: i love this simple sweet p...,"recipe SEARCH_TERMS: lowsodium, lowcalorie, ve...","recipe FILTERS: vegan, vegetarian, lowcholeste...",442,442,442,...,25,25,25,25,32,32,32,32,32,32
1,recipe NAME: uncle bill s balsamic vinegar dre...,"recipe TAGS: 15minutesorless, timetomake, cour...","recipe INGREDIENTS: balsamic vinegar, lemon ju...","recipe STEPS: in a food processor or blender, ...",recipe DESCRIPTION: this dressing is a nice ch...,"recipe SEARCH_TERMS: lowsodium, lowcarb.","recipe FILTERS: lowcarb, lowprotein, lowsodium.",284,284,284,...,13,13,13,13,15,15,15,15,15,15
2,recipe NAME: bread machine apricot nutmeg bread.,"recipe TAGS: timetomake, course, preparation, ...","recipe INGREDIENTS: water, vegetable oil, lemo...",recipe STEPS: put ingredients beginning with w...,recipe DESCRIPTION: always on the look out for...,"recipe SEARCH_TERMS: healthy, bread.","recipe FILTERS: lowcholesterol, lowsaturatedfat.",239,239,239,...,9,9,9,9,14,14,14,14,14,14
3,recipe NAME: sesame salad dressing.,"recipe TAGS: 15minutesorless, timetomake, cour...","recipe INGREDIENTS: extra virgin olive oil, sh...",recipe STEPS: combine first 5 ingredients in b...,recipe DESCRIPTION: this has been my favourite...,"recipe SEARCH_TERMS: lowcarb, vegetarian, vega...","recipe FILTERS: vegan, vegetarian, lowcarb.",301,301,301,...,16,16,16,16,12,12,12,12,12,12
4,recipe NAME: dinner in a hurry club roll up.,"recipe TAGS: 30minutesorless, timetomake, cour...","recipe INGREDIENTS: avocado, mayonnaise, lime ...","recipe STEPS: peel avocado, cut in half and re...",recipe DESCRIPTION: forget the bread keep a st...,"recipe SEARCH_TERMS: lunch, healthy, glutenfre...","recipe FILTERS: lowcholesterol, glutenfree.",371,371,371,...,18,18,18,18,13,13,13,13,13,13


In [27]:
from tqdm import tqdm 



#count number of token for each config and model
for col in df.columns:
        for model_id, model in MODEL_DICT.items():
            if model_id == "intfloat/e5-base-v2":  # Only for E5 model

                embedding_col = f"{model_id}/config_{col}_EMB"
                tokens_col = f"{embedding_col}_NUMBER_TOKEN" 

                number_token = []
                for text in tqdm(df[f'{col}'], desc=f"count number token config_{col} with {model_id}"):
                    num_tokens = len(model.tokenizer.tokenize(text))
                    number_token.append(num_tokens)

                df[f"{embedding_col}_NUMBER_TOKEN"] = number_token

count number token config_NAME with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:00<00:00, 24141.66it/s]
count number token config_TAGS with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:00<00:00, 7352.33it/s]
count number token config_INGREDIENTS with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:00<00:00, 12820.46it/s]
count number token config_STEPS with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:01<00:00, 3949.25it/s]
count number token config_DESCRIPTION with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:00<00:00, 8620.67it/s]
count number token config_SEARCH_TERMS with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:00<00:00, 21250.58it/s]
count number token config_FILTERS with intfloat/e5-base-v2: 100%|██████████| 5000/5000 [00:00<00:00, 22727.29it/s]


In [30]:
import pandas as pd

# Select all columns with 'EMB' in their name
emb_columns = [col for col in df.columns if 'EMB' in col]

print("Columns with EMB:")
for col in emb_columns:
    print(f"  - {col}")

print("\n" + "="*80 + "\n")

# Calculate average token length and count rows exceeding 512
results = []
for col in emb_columns:
    avg_tokens = df[col].mean()
    exceed_512 = (df[col] > 512).sum()
    total_rows = len(df)
    percentage = (exceed_512 / total_rows) * 100
    
    results.append({
        'Column': col,
        'Average Tokens': avg_tokens,
        'Rows > 512': exceed_512,
        'Percentage > 512': percentage
    })
    
    print(f"{col}")
    print(f"  Average tokens: {avg_tokens:.2f}")
    print(f"  Rows exceeding 512 tokens: {exceed_512} ({percentage:.2f}%)")
    print()

# Create a summary dataframe
summary_df = pd.DataFrame(results)
print("="*80)
print("\nSummary Table:")
print(summary_df.to_string(index=False))

# Optional: Show distribution statistics for each column
print("\n" + "="*80 + "\n")
print("Detailed Statistics:")
print(df[emb_columns].describe())

Columns with EMB:
  - intfloat/e5-base-v2/config_NAME_EMB_NUMBER_TOKEN
  - intfloat/e5-base-v2/config_TAGS_EMB_NUMBER_TOKEN
  - intfloat/e5-base-v2/config_INGREDIENTS_EMB_NUMBER_TOKEN
  - intfloat/e5-base-v2/config_STEPS_EMB_NUMBER_TOKEN
  - intfloat/e5-base-v2/config_DESCRIPTION_EMB_NUMBER_TOKEN
  - intfloat/e5-base-v2/config_SEARCH_TERMS_EMB_NUMBER_TOKEN
  - intfloat/e5-base-v2/config_FILTERS_EMB_NUMBER_TOKEN


intfloat/e5-base-v2/config_NAME_EMB_NUMBER_TOKEN
  Average tokens: 10.22
  Rows exceeding 512 tokens: 0 (0.00%)

intfloat/e5-base-v2/config_TAGS_EMB_NUMBER_TOKEN
  Average tokens: 74.49
  Rows exceeding 512 tokens: 0 (0.00%)

intfloat/e5-base-v2/config_INGREDIENTS_EMB_NUMBER_TOKEN
  Average tokens: 33.57
  Rows exceeding 512 tokens: 0 (0.00%)

intfloat/e5-base-v2/config_STEPS_EMB_NUMBER_TOKEN
  Average tokens: 128.53
  Rows exceeding 512 tokens: 21 (0.42%)

intfloat/e5-base-v2/config_DESCRIPTION_EMB_NUMBER_TOKEN
  Average tokens: 50.30
  Rows exceeding 512 tokens: 1 (0.02%)

i