In [1]:
import os
import sys
import pandas as pd
import numpy as np
import json

original_cwd = os.getcwd()

backend_path = os.path.abspath(os.path.join(original_cwd, "../backend"))
added_backend = False

if not any("backend" in p for p in sys.path):
    sys.path.insert(0, backend_path)
    added_backend = True
    print(f"Added backend to sys.path: {backend_path}")
else:
    print("Backend already in sys.path, skipping.")

from shared.snowflake.client import SnowflakeClient

os.chdir(original_cwd)
print(f"Returned to original working directory: {os.getcwd()}")

Added backend to sys.path: c:\Users\hamma\Desktop\project\NutriRAG\backend
Returned to original working directory: c:\Users\hamma\Desktop\project\NutriRAG\experiments


**read the config file**

In [2]:
CONFIG_FILE_PATH = "config/base_config.json"

with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

**get the data**

In [3]:
INPUT_RECIPIES_FILE = config['input_recipies_file']
df_recipes = pd.DataFrame()

#check if the file already exists read it
if os.path.exists(INPUT_RECIPIES_FILE):
    print(f"File {INPUT_RECIPIES_FILE} already exists. Skipping save.")
    df_recipes = pd.read_csv(INPUT_RECIPIES_FILE)
else:
    print(f"Saving DataFrame to {INPUT_RECIPIES_FILE}")
    client = SnowflakeClient()

    conn = client._conn
    table = "RECIPES_SAMPLE_EVAL_EMBEDDING"

    df_recipes = pd.read_sql(f"SELECT * FROM {table}", conn)

    client.close()
    
    df_recipes.to_csv(INPUT_RECIPIES_FILE, index=False)

df_recipes.head()

File data/recipes_samples.csv already exists. Skipping save.


Unnamed: 0,ID,NAME_CLEAND,INGREDIENTS_CLEAND,STEPS_CLEAND,DESCRIPTION_CLEAND
0,421766,recipe name: stuffed mussels in spicy tomato s...,"recipe ingredients: mussels, ground beef, crus...","recipe steps: beard and clean mussels, open mu...",recipe description: stuffed mussels is a favor...
1,148350,recipe name: bob s meat.,"recipe ingredients: puff pastry, bacon, onion,...",recipe steps: remove package of puff pastry fr...,recipe description: this is a wonderfully tast...
2,351663,recipe name: grilled turkey breast.,"recipe ingredients: cider vinegar, garlic, gro...","recipe steps: in a medium bowl, combine the ci...",recipe description: source cooking with the di...
3,45766,recipe name: raisin banana bread.,"recipe ingredients: water, bread flour, banana...","recipe steps: measure carefully, placing all i...",recipe description: sounds like a winner to me...
4,163701,recipe name: lime pound cake 1968.,"recipe ingredients: butter, granulated sugar, ...","recipe steps: grease a ten inch tube pan, line...",recipe description: this pound cake takes baki...


**define a function that clean the columns used for embedding** 

In [4]:
import re

def clean_columns_to_embedd(tag_value: any, col_name: str) -> str:
    """
    Format text of the columns used for embedding

    Args:
        tag_value (any): The input value to clean. Can be a string, list, number, or None.
                        Will be converted to string before processing.
        col_name (str): The label/prefix to add before the cleaned text 
                       (e.g., "NAME", "TAGS", "INGREDIENTS").
    
    Returns:
        str: Cleaned and formatted text in the format "{col_name}: {cleaned_text}."
             Returns empty string if input is None or empty.
    """
    
    if tag_value is None or tag_value == "":
        return ""
    
    text = str(tag_value)
    
    # Remove list brackets and quotes
    text = re.sub(r"[\[\]'\"]", "", text)
    
    text = text.replace("|", ",")
    
    # Convert to lowercase
    text = text.lower()
    
    # Keep only alphanumeric, spaces, and . , ? !
    text = re.sub(r"[^a-z0-9 .,?!]+", "", text)
    
    # Remove excess spaces
    text = re.sub(r" +", " ", text)
    
    # Remove spaces before commas
    text = re.sub(r" ,", ",", text)
    
    # Clean up spaces around punctuation
    text = text.strip()
    
    # Return formatted text
    return f"{col_name}: {text}."

**extract only required columns for embedding**

In [5]:
#extract only required columns for embedding
COLUMNS_TO_CLEAN = config["columns_to_clean"]

for col in COLUMNS_TO_CLEAN:
    col_clean_name = COLUMNS_TO_CLEAN[col]['column_name']
    start_text = COLUMNS_TO_CLEAN[col]['start_text']

    df_recipes[col_clean_name] = df_recipes[col].apply(clean_columns_to_embedd, args=(start_text, ))

df_recipes_cleaned = df_recipes[ [col['column_name'] for col in COLUMNS_TO_CLEAN.values()] ]

#add the id to keep track of the recepies
df_recipes_cleaned['ID'] = df_recipes['ID'] 
df_recipes_cleaned.head()

Unnamed: 0,NAME_CLEAND,INGREDIENTS_CLEAND,STEPS_CLEAND,DESCRIPTION_CLEAND,ID
0,recipe name: recipe name stuffed mussels in sp...,recipe ingredients: recipe ingredients mussels...,recipe steps: recipe steps beard and clean mus...,recipe description: recipe description stuffed...,421766
1,recipe name: recipe name bob s meat..,recipe ingredients: recipe ingredients puff pa...,recipe steps: recipe steps remove package of p...,recipe description: recipe description this is...,148350
2,recipe name: recipe name grilled turkey breast..,recipe ingredients: recipe ingredients cider v...,"recipe steps: recipe steps in a medium bowl, c...",recipe description: recipe description source ...,351663
3,recipe name: recipe name raisin banana bread..,"recipe ingredients: recipe ingredients water, ...","recipe steps: recipe steps measure carefully, ...",recipe description: recipe description sounds ...,45766
4,recipe name: recipe name lime pound cake 1968..,"recipe ingredients: recipe ingredients butter,...",recipe steps: recipe steps grease a ten inch t...,recipe description: recipe description this po...,163701


**create a column for each combinaison of embedding columns**

In [6]:
#initialize a columns for each configuration of columns to embedd
COLUMNS_TO_EMBEDDE = config["columns_embedding"]

for col_config_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    df_recipes_cleaned[col_config_name] = ""

    for col in cols_list:
        column_name_cleaned = COLUMNS_TO_CLEAN[col]['column_name']
        df_recipes_cleaned[col_config_name] += df_recipes_cleaned[f"{column_name_cleaned}"] + " "
        
df_recipes_cleaned.head()

Unnamed: 0,NAME_CLEAND,INGREDIENTS_CLEAND,STEPS_CLEAND,DESCRIPTION_CLEAND,ID,config_1,config_2,config_3,config_4,config_5,config_6
0,recipe name: recipe name stuffed mussels in sp...,recipe ingredients: recipe ingredients mussels...,recipe steps: recipe steps beard and clean mus...,recipe description: recipe description stuffed...,421766,recipe name: recipe name stuffed mussels in sp...,recipe name: recipe name stuffed mussels in sp...,recipe name: recipe name stuffed mussels in sp...,recipe name: recipe name stuffed mussels in sp...,recipe name: recipe name stuffed mussels in sp...,recipe name: recipe name stuffed mussels in sp...
1,recipe name: recipe name bob s meat..,recipe ingredients: recipe ingredients puff pa...,recipe steps: recipe steps remove package of p...,recipe description: recipe description this is...,148350,recipe name: recipe name bob s meat.. recipe i...,recipe name: recipe name bob s meat.. recipe i...,recipe name: recipe name bob s meat.. recipe i...,recipe name: recipe name bob s meat.. recipe s...,recipe name: recipe name bob s meat.. recipe i...,recipe name: recipe name bob s meat.. recipe s...
2,recipe name: recipe name grilled turkey breast..,recipe ingredients: recipe ingredients cider v...,"recipe steps: recipe steps in a medium bowl, c...",recipe description: recipe description source ...,351663,recipe name: recipe name grilled turkey breast...,recipe name: recipe name grilled turkey breast...,recipe name: recipe name grilled turkey breast...,recipe name: recipe name grilled turkey breast...,recipe name: recipe name grilled turkey breast...,recipe name: recipe name grilled turkey breast...
3,recipe name: recipe name raisin banana bread..,"recipe ingredients: recipe ingredients water, ...","recipe steps: recipe steps measure carefully, ...",recipe description: recipe description sounds ...,45766,recipe name: recipe name raisin banana bread.....,recipe name: recipe name raisin banana bread.....,recipe name: recipe name raisin banana bread.....,recipe name: recipe name raisin banana bread.....,recipe name: recipe name raisin banana bread.....,recipe name: recipe name raisin banana bread.....
4,recipe name: recipe name lime pound cake 1968..,"recipe ingredients: recipe ingredients butter,...",recipe steps: recipe steps grease a ten inch t...,recipe description: recipe description this po...,163701,recipe name: recipe name lime pound cake 1968....,recipe name: recipe name lime pound cake 1968....,recipe name: recipe name lime pound cake 1968....,recipe name: recipe name lime pound cake 1968....,recipe name: recipe name lime pound cake 1968....,recipe name: recipe name lime pound cake 1968....


In [7]:
#test
df_recipes_cleaned['config_1'].values[0]

'recipe name: recipe name stuffed mussels in spicy tomato sauce.. recipe ingredients: recipe ingredients mussels, ground beef, crusty bread, broth, flat leaf parsley, garlic clove, ground black pepper, parmesan cheese, egg, salt, tomatoes, garlic cloves, chili pepper, olive oil.. recipe steps: recipe steps beard and clean mussels, open mussels over bowl to collect liquid, set aside, cut stale bread up into cubes, place in bowl and moisten with broth, let stand till bread has softened, squeeze bread well of all liquid, mix all the stuffing ingredients well in a bowl, stuff mussels placing a heaping tablespoon of the stuffing on an open shell, press the two shells shut and wipe off the stuffing that squishes out, make the tomato sauce saut garlic, chili pepper and parsley in the oil over medium heat, add the crushed peeled tomatoes, stir and then add the mussel juice youve kept aside, let sauce simmer for about 15 minutes, gently place mussels into pan, cover and let mussels stew for abo

**load the embedding models**

In [8]:
#load the models
import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import normalize

MODELS_CONFIG = config["models"]

#create a dict {name model : model} 
MODELS_LIST = [SentenceTransformer(model_id) for model_id in MODELS_CONFIG]
MODEL_DICT = dict(zip(MODELS_CONFIG, MODELS_LIST))

  from .autonotebook import tqdm as notebook_tqdm


**define a function that count number of token for each config and model**

In [9]:
def compute_token_size(text: str, model: SentenceTransformer) -> int:
    """
    Compute the number of tokens in the given text using the specified SentenceTransformer model.

    Args:
        text (str): The input text to tokenize.
        model (SentenceTransformer): The SentenceTransformer model used for tokenization.

    Returns:
        int: The number of tokens in the input text.
    """

    tokens = model.tokenizer.tokenize(text)
    
    return len(tokens)

**set the experience id**

In [10]:
EXPERIENCE_ID = config["experiments_specifique_params"]["experiment_id"]

print(EXPERIENCE_ID)

1


**set folder and file path for embedding**

In [11]:
OUTPUT_EMBEDDING_FOLDER = config["output_experiments_dir"].format(
    experiment_id=EXPERIENCE_ID
)

os.makedirs(OUTPUT_EMBEDDING_FOLDER, exist_ok=True)

OUTPUT_EMBEDDING_FILE = config["output_recipies_embedding_file"].format(
    experiment_id=EXPERIENCE_ID
)

print(OUTPUT_EMBEDDING_FOLDER)
print(OUTPUT_EMBEDDING_FILE)

experiments/exp1/
experiments/exp1/recipies_samples_embeddings.csv


**calculate number of token per recipie for each config and model**

In [12]:
from tqdm import tqdm 

df_recipes_embedding = df_recipes_cleaned.copy()

#count number of token for each config and model
for col_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    for model_id, model in MODEL_DICT.items():

        embedding_col = f"{model_id}/{col_name}_EMB"
        tokens_col = f"{embedding_col}_NUMBER_TOKEN" 

        number_token = []
        for text in tqdm(df_recipes_embedding[col_name], desc=f"count number token {col_name} with {model_id}"):
            num_tokens = compute_token_size(text, model)
            number_token.append(num_tokens)

        df_recipes_embedding[f"{embedding_col}_NUMBER_TOKEN"] = number_token

count number token config_1 with intfloat/e5-small-v2:   0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors
count number token config_1 with intfloat/e5-small-v2: 100%|██████████| 1000/1000 [00:00<00:00, 2541.07it/s]
count number token config_1 with intfloat/e5-base-v2:   0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors
count number token config_1 with intfloat/e5-base-v2: 100%|██████████| 1000/1000 [00:00<00:00, 2582.50it/s]
count number token config_1 with sentence-transformers/all-MiniLM-L6-v2:   0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (331 > 256). Runnin

In [13]:
df_recipes_embedding['intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN'].describe()

count    1000.000000
mean      230.147000
std       102.020829
min        61.000000
25%       158.000000
50%       212.000000
75%       280.250000
max       737.000000
Name: intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN, dtype: float64

In [14]:
len(df_recipes_embedding[df_recipes_embedding['intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN'] > 512])

16

In [15]:
#test
model = SentenceTransformer("all-MiniLM-L6-v2")

token_lengths = pd.DataFrame(index=df_recipes.index)

for col in COLUMNS_TO_CLEAN:

    token_lengths[col + "_token_len"] = df_recipes[col].fillna("").astype(str).apply(lambda x: compute_token_size(x, model))

for col in COLUMNS_TO_CLEAN:
    print(f"Column '{col}':")
    print(token_lengths[col + "_token_len"].describe())
    print()

Token indices sequence length is longer than the specified maximum sequence length for this model (405 > 256). Running this sequence through the model will result in indexing errors


Column 'NAME_CLEAND':
count    1000.000000
mean       13.164000
std         2.518223
min         8.000000
25%        11.000000
50%        13.000000
75%        15.000000
max        23.000000
Name: NAME_CLEAND_token_len, dtype: float64

Column 'INGREDIENTS_CLEAND':
count    1000.00000
mean       37.18700
std        13.40805
min        10.00000
25%        27.00000
50%        36.00000
75%        45.00000
max        90.00000
Name: INGREDIENTS_CLEAND_token_len, dtype: float64

Column 'STEPS_CLEAND':
count    1000.000000
mean      128.052000
std        79.443443
min        12.000000
25%        74.000000
50%       110.000000
75%       165.250000
max       633.000000
Name: STEPS_CLEAND_token_len, dtype: float64

Column 'DESCRIPTION_CLEAND':
count    1000.000000
mean       51.744000
std        41.617502
min         8.000000
25%        25.000000
50%        41.000000
75%        64.000000
max       319.000000
Name: DESCRIPTION_CLEAND_token_len, dtype: float64



**define function to calculate embedding of a text**

In [16]:

def compute_embedding(model: SentenceTransformer, texts: list[str]) -> torch.Tensor:
    """
    Compute normalized embeddings for a list of texts using the specified model.

    Args:
        model (SentenceTransformer): The pre-trained sentence transformer model to use.
        texts (list[str]): A list of input texts to compute embeddings for.

    Returns:
        torch.Tensor: A tensor containing the normalized embeddings for the input texts.
    """

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model = model.to(device)

    # Compute embeddings
    embeddings = model.encode(texts, convert_to_tensor=True, device=device)
    
    # Normalize embeddings to unit length
    normalized_embeddings = normalize(embeddings, p=2, dim=1)
    
    return normalized_embeddings

**compute embedding for each config and model**

In [17]:
# #create embedding cols 
from tqdm import tqdm  

for col_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    for model_id, model in MODEL_DICT.items():

        embedding_col = f"{model_id}/{col_name}_EMB"
                
        embeddings = []
        for text in tqdm(df_recipes_embedding[col_name], desc=f"Embedding {col_name} with {model_id}"):
            emb = compute_embedding(model, [text])[0].cpu().numpy()                         
            embeddings.append(emb)
        
        # Save embeddings to new column
        df_recipes_embedding[embedding_col] = embeddings



Embedding config_1 with intfloat/e5-small-v2: 100%|██████████| 1000/1000 [00:13<00:00, 74.97it/s]
Embedding config_1 with intfloat/e5-base-v2: 100%|██████████| 1000/1000 [00:18<00:00, 53.89it/s]
Embedding config_1 with sentence-transformers/all-MiniLM-L6-v2: 100%|██████████| 1000/1000 [00:06<00:00, 148.92it/s]
Embedding config_1 with thenlper/gte-small: 100%|██████████| 1000/1000 [00:11<00:00, 85.95it/s]
Embedding config_1 with thenlper/gte-base: 100%|██████████| 1000/1000 [00:18<00:00, 54.27it/s]
Embedding config_1 with BAAI/bge-small-en: 100%|██████████| 1000/1000 [00:11<00:00, 86.39it/s]
Embedding config_1 with BAAI/bge-base-en: 100%|██████████| 1000/1000 [00:18<00:00, 53.44it/s]
Embedding config_1 with BAAI/llm-embedder: 100%|██████████| 1000/1000 [00:18<00:00, 54.38it/s]
Embedding config_1 with Snowflake/snowflake-arctic-embed-m: 100%|██████████| 1000/1000 [00:18<00:00, 54.52it/s]
Embedding config_1 with Snowflake/snowflake-arctic-embed-m-v1.5: 100%|██████████| 1000/1000 [00:18<00

**write the file of embedding**

In [18]:
os.makedirs(OUTPUT_EMBEDDING_FOLDER, exist_ok=True)

df_recipes_embedding.to_csv(OUTPUT_EMBEDDING_FILE, index=False)