In [64]:
import pandas as pd
import numpy as np
import json
import os
# import snowflake.connector

**define connexion parameters**

In [65]:
from dotenv import load_dotenv
import re

load_dotenv(override=True)

user = os.getenv("USER")
password = os.getenv("PASSWORD")
passcode = os.getenv("PASSCODE")
account = os.getenv("ACCOUNT")
warehouse = os.getenv("WAREHOUSE") 
database = os.getenv("DATABASE")
schema = os.getenv("SCHEMA")
table = os.getenv("TABLE")

print(user, re.sub(r'.', '*', password), passcode, account, warehouse, database, schema, table)

       


**get the data from snowflake**

In [43]:
conn = snowflake.connector.connect(
    user = user,
    password = password,
    passcode = passcode,
    account = account,
    warehouse = warehouse,
    database = database,
    schema = schema
)

df = pd.read_sql(f"SELECT * FROM {table}", conn)
    
conn.close()

NameError: name 'snowflake' is not defined

**open the config file**

In [66]:
CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH")

with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

**write the data locally**

In [67]:
INPUT_RECIPIES_FILE = config['input_recipies_file']

df.to_csv(INPUT_RECIPIES_FILE, index=False)
df.head()

NameError: name 'df' is not defined

**read the data if already exist locally**

In [68]:
#or directly load the file if already exist
df_recipes = pd.read_csv(INPUT_RECIPIES_FILE)
df_recipes.head()

Unnamed: 0,NAME,ID,MINUTES,CONTRIBUTOR_ID,SUBMITTED,TAGS,NUTRITION,N_STEPS,STEPS,DESCRIPTION,INGREDIENTS,N_INGREDIENTS,HAS_IMAGE,IMAGE_URL,INGREDIENTS_RAW_STR,SERVING_SIZE,SERVINGS,SEARCH_TERMS
0,crab filled crescent snacks,94947,70,111448,2004-07-03,"[\n ""time-to-make"",\n ""course"",\n ""main-ing...","[\n 69.2,\n 3,\n 9,\n 6,\n 5,\n 4,\n 3\n]",16,"[\n ""heat over to 375 degrees"",\n ""spray lar...",found in a crescent roll recipe magazine.,"[\n ""crabmeat"",\n ""cream cheese"",\n ""green ...",9,0,,"[\n ""1 (6 ounce) can crabmeat, rinsed,wel...",1010,1,"[\n ""lunch"",\n ""snack""\n]"
1,curried bean salad,429010,20,300249,2010-06-08,"[\n ""curries"",\n ""30-minutes-or-less"",\n ""t...","[\n 256,\n 2,\n 40,\n 18,\n 18,\n 1,\n ...",4,"[\n ""drain & rinse beans"",\n ""stir all ingre...",serve this flavorful and refreshing salad as a...,"[\n ""garbanzo beans"",\n ""black beans"",\n ""o...",12,0,,"[\n ""1 (15 ounce) can garbanzo beans, dra...",271,8,"[\n ""low-calorie"",\n ""vegetarian"",\n ""salad..."
2,delicious steak with onion marinade,277542,25,234062,2008-01-08,"[\n ""lactose"",\n ""30-minutes-or-less"",\n ""t...","[\n 58.6,\n 5,\n 19,\n 0,\n 0,\n 2,\n 2\n]",6,"[\n ""heat the oil in a heavy-based pan and co...","another i've not tried, but looks good! times ...","[\n ""olive oil"",\n ""red onion"",\n ""light br...",5,1,https://img.sndimg.com/food/image/upload/c_thu...,"[\n ""1 tablespoon olive oil, plus extra ...",152,4,"[\n ""dinner"",\n ""lactose-free""\n]"
3,pork tenderloin with hoisin,78450,15,42651,2003-12-10,"[\n ""15-minutes-or-less"",\n ""time-to-make"",\...","[\n 241.5,\n 12,\n 20,\n 45,\n 62,\n 13,...",7,"[\n ""cut pork into 1 / 4-inch slices"",\n ""in...",another keeper from bonnie stern's heartsmart ...,"[\n ""pork tenderloin"",\n ""soy sauce"",\n ""ho...",10,0,,"[\n ""1 1/4 lbs pork tenderloin"",\n ""3 ...",187,4,"[\n ""pork"",\n ""dinner""\n]"
4,mixed baby greens with oranges grapefruit and...,80012,15,1533,2004-01-01,"[\n ""15-minutes-or-less"",\n ""time-to-make"",\...","[\n 212.8,\n 24,\n 30,\n 0,\n 4,\n 11,\n...",2,"[\n ""in a salad bowl combine the lettuce with...",i love grapefruit in a salad and this one is p...,"[\n ""mixed baby greens"",\n ""oranges"",\n ""gr...",8,0,,"[\n ""1 lb mixed baby greens, salad "",\n ...",199,4,"[\n ""vegetarian""\n]"


**define a function that clean the columns used for embedding** 

In [69]:
import re

def clean_columns_to_embedd(tag_value: any, col_name: str) -> str:
    """
    Format text of the columns used for embedding

    Args:
        tag_value (any): The input value to clean. Can be a string, list, number, or None.
                        Will be converted to string before processing.
        col_name (str): The label/prefix to add before the cleaned text 
                       (e.g., "NAME", "TAGS", "INGREDIENTS").
    
    Returns:
        str: Cleaned and formatted text in the format "{col_name}: {cleaned_text}."
             Returns empty string if input is None or empty.
    """
    
    if tag_value is None or tag_value == "":
        return ""
    
    text = str(tag_value)
    
    # Remove list brackets and quotes
    text = re.sub(r"[\[\]'\"]", "", text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Keep only alphanumeric, spaces, and . , ? !
    text = re.sub(r"[^a-z0-9 .,?!]+", "", text)
    
    # Remove excess spaces
    text = re.sub(r" +", " ", text)
    
    # Clean up spaces around punctuation
    text = text.strip()
    
    # Return formatted text
    return f"{col_name}: {text}."

**extract only required columns for embedding**

In [70]:
#extract only required columns for embedding
COLUMNS_TO_CLEAN = config["columns_to_clean"]

for col in COLUMNS_TO_CLEAN:
    col_clean_name = COLUMNS_TO_CLEAN[col]['column_name']
    start_text = COLUMNS_TO_CLEAN[col]['start_text']

    df_recipes[col_clean_name] = df_recipes[col].apply(clean_columns_to_embedd, args=(start_text, ))

df_recipes_cleaned = df_recipes[ [col['column_name'] for col in COLUMNS_TO_CLEAN.values()] ]

#add the id to keep track of the recepies
df_recipes_cleaned['ID'] = df_recipes['ID'] 
df_recipes_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recipes_cleaned['ID'] = df_recipes['ID']


Unnamed: 0,NAME_CLEAND,TAGS_CLEAND,INGREDIENTS_CLEAND,STEPS_CLEAND,DESCRIPTION_CLEAND,ID
0,recipe name: crab filled crescent snacks.,"recipe tags: timetomake, course, mainingredien...","recipe ingredients: crabmeat, cream cheese, gr...","recipe steps: heat over to 375 degrees, spray ...",recipe description: found in a crescent roll r...,94947
1,recipe name: curried bean salad.,"recipe tags: curries, 30minutesorless, timetom...","recipe ingredients: garbanzo beans, black bean...","recipe steps: drain rinse beans, stir all ingr...",recipe description: serve this flavorful and r...,429010
2,recipe name: delicious steak with onion marinade.,"recipe tags: lactose, 30minutesorless, timetom...","recipe ingredients: olive oil, red onion, ligh...",recipe steps: heat the oil in a heavybased pan...,"recipe description: another ive not tried, but...",277542
3,recipe name: pork tenderloin with hoisin.,"recipe tags: 15minutesorless, timetomake, cour...","recipe ingredients: pork tenderloin, soy sauce...","recipe steps: cut pork into 1 4inch slices, in...",recipe description: another keeper from bonnie...,78450
4,recipe name: mixed baby greens with oranges gr...,"recipe tags: 15minutesorless, timetomake, cour...","recipe ingredients: mixed baby greens, oranges...",recipe steps: in a salad bowl combine the lett...,recipe description: i love grapefruit in a sal...,80012


**create a column for each combinaison of embedding columns**

In [71]:
#initialize a columns for each configuration of columns to embedd
COLUMNS_TO_EMBEDDE = config["columns_embedding"]

for col_config_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    df_recipes_cleaned[col_config_name] = ""

    for col in cols_list:
        column_name_cleaned = COLUMNS_TO_CLEAN[col]['column_name']
        df_recipes_cleaned[col_config_name] += df_recipes_cleaned[f"{column_name_cleaned}"] + " "
        
df_recipes_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recipes_cleaned[col_config_name] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recipes_cleaned[col_config_name] += df_recipes_cleaned[f"{column_name_cleaned}"] + " "
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_recipes_cleaned[col_config_name] = ""
A value is trying to be set on a 

Unnamed: 0,NAME_CLEAND,TAGS_CLEAND,INGREDIENTS_CLEAND,STEPS_CLEAND,DESCRIPTION_CLEAND,ID,config_1,config_2,config_3,config_4,config_5,config_6
0,recipe name: crab filled crescent snacks.,"recipe tags: timetomake, course, mainingredien...","recipe ingredients: crabmeat, cream cheese, gr...","recipe steps: heat over to 375 degrees, spray ...",recipe description: found in a crescent roll r...,94947,recipe name: crab filled crescent snacks. reci...,"recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien..."
1,recipe name: curried bean salad.,"recipe tags: curries, 30minutesorless, timetom...","recipe ingredients: garbanzo beans, black bean...","recipe steps: drain rinse beans, stir all ingr...",recipe description: serve this flavorful and r...,429010,recipe name: curried bean salad. recipe tags: ...,"recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom..."
2,recipe name: delicious steak with onion marinade.,"recipe tags: lactose, 30minutesorless, timetom...","recipe ingredients: olive oil, red onion, ligh...",recipe steps: heat the oil in a heavybased pan...,"recipe description: another ive not tried, but...",277542,recipe name: delicious steak with onion marina...,"recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom..."
3,recipe name: pork tenderloin with hoisin.,"recipe tags: 15minutesorless, timetomake, cour...","recipe ingredients: pork tenderloin, soy sauce...","recipe steps: cut pork into 1 4inch slices, in...",recipe description: another keeper from bonnie...,78450,recipe name: pork tenderloin with hoisin. reci...,"recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour..."
4,recipe name: mixed baby greens with oranges gr...,"recipe tags: 15minutesorless, timetomake, cour...","recipe ingredients: mixed baby greens, oranges...",recipe steps: in a salad bowl combine the lett...,recipe description: i love grapefruit in a sal...,80012,recipe name: mixed baby greens with oranges gr...,"recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour..."


In [49]:
#test
df_recipes_cleaned['config_1'].values[0]

'recipe name: crab filled crescent snacks. recipe tags: timetomake, course, mainingredient, preparation, occasion, lunch, snacks, seafood, oven, potluck, picnic, crab, dietary, shellfish, togo, equipment, 4hoursorless. recipe ingredients: crabmeat, cream cheese, green onions, garlic salt, refrigerated crescent dinner rolls, egg yolk, water, sesame seeds, sweet and sour sauce. recipe steps: heat over to 375 degrees, spray large cookie sheet with nonstick cooking spray, in small bowl , combine crabmeat , cream cheese , onions and garlic salt and mix well, unroll both cans of dough, separate into 16 triangles, cut each triangle in half lengthwise to make 32 triangles, place 1 teaspoon crab mixture on center of each triangle about 1 inch from short side of triangle, fold short ends of each triangle over filling, pinch sides to seal, roll up, place on sprayed cookie sheet, in small bowl , combine egg yolk and water and mix well, brush egg mixture over snacks, sprinkle with sesame seed, bake

**load the embedding models**

In [72]:
#load the models
import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import normalize

MODELS_CONFIG = config["models"]

#create a dict {name model : model} 
MODELS_LIST = [SentenceTransformer(model_id) for model_id in MODELS_CONFIG]
MODEL_DICT = dict(zip(MODELS_CONFIG, MODELS_LIST))

**define a function that count number of token for each config and model**

In [73]:
def compute_token_size(text: str, model: SentenceTransformer) -> int:
    """
    Compute the number of tokens in the given text using the specified SentenceTransformer model.

    Args:
        text (str): The input text to tokenize.
        model (SentenceTransformer): The SentenceTransformer model used for tokenization.

    Returns:
        int: The number of tokens in the input text.
    """

    tokens = model.tokenizer.tokenize(text)
    
    return len(tokens)

**set the experience id**

In [74]:
EXPERIENCE_ID = config["experiments_specifique_params"]["experiment_id"]

print(EXPERIENCE_ID)

1


**set folder and file path for embedding**

In [75]:
OUTPUT_EMBEDDING_FOLDER = config["output_experiments_dir"].format(
    experiment_id=EXPERIENCE_ID
)

os.makedirs(OUTPUT_EMBEDDING_FOLDER, exist_ok=True)

OUTPUT_EMBEDDING_FILE = config["output_recipies_embedding_file"].format(
    experiment_id=EXPERIENCE_ID
)

print(OUTPUT_EMBEDDING_FOLDER)
print(OUTPUT_EMBEDDING_FILE)

experiments/exp1/
experiments/exp1/recipies_samples_embeddings.csv


**calculate number of token per recipie for each config and model**

In [76]:
from tqdm import tqdm 

df_recipes_embedding = df_recipes_cleaned.copy()

#count number of token for each config and model
for col_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    for model_id, model in MODEL_DICT.items():

        embedding_col = f"{model_id}/{col_name}_EMB"
        tokens_col = f"{embedding_col}_NUMBER_TOKEN" 

        number_token = []
        for text in tqdm(df_recipes_embedding[col_name], desc=f"count number token {col_name} with {model_id}"):
            num_tokens = compute_token_size(text, model)
            number_token.append(num_tokens)

        df_recipes_embedding[f"{embedding_col}_NUMBER_TOKEN"] = number_token

count number token config_1 with intfloat/e5-base-v2:   0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
count number token config_1 with intfloat/e5-base-v2: 100%|██████████| 1000/1000 [00:00<00:00, 1617.85it/s]
count number token config_1 with sentence-transformers/all-MiniLM-L6-v2:   0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (265 > 256). Running this sequence through the model will result in indexing errors
count number token config_1 with sentence-transformers/all-MiniLM-L6-v2: 100%|██████████| 1000/1000 [00:00<00:00, 1626.02it/s]
count number token config_1 with BAAI/bge-base-en-v1.5:   0%|          | 0/1000 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model 

In [77]:
df_recipes_embedding['intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN'].describe()

count    1000.000000
mean      293.170000
std       107.572873
min       101.000000
25%       220.000000
50%       275.000000
75%       341.000000
max       989.000000
Name: intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN, dtype: float64

In [78]:
len(df_recipes_embedding[df_recipes_embedding['intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN'] > 512])

38

**define function to calculate embedding of a text**

In [79]:

def compute_embedding(model: SentenceTransformer, texts: list[str]) -> torch.Tensor:
    """
    Compute normalized embeddings for a list of texts using the specified model.

    Args:
        model (SentenceTransformer): The pre-trained sentence transformer model to use.
        texts (list[str]): A list of input texts to compute embeddings for.

    Returns:
        torch.Tensor: A tensor containing the normalized embeddings for the input texts.
    """

    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model = model.to(device)

    # Compute embeddings
    embeddings = model.encode(texts, convert_to_tensor=True, device=device)
    
    # Normalize embeddings to unit length
    normalized_embeddings = normalize(embeddings, p=2, dim=1)
    
    return normalized_embeddings

**compute embedding for each config and model**

In [80]:
# #create embedding cols 
from tqdm import tqdm  

for col_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    for model_id, model in MODEL_DICT.items():

        embedding_col = f"{model_id}/{col_name}_EMB"
                
        embeddings = []
        for text in tqdm(df_recipes_embedding[col_name], desc=f"Embedding {col_name} with {model_id}"):
            emb = compute_embedding(model, [text])[0].cpu().numpy()                         
            embeddings.append(emb)
        
        # Save embeddings to new column
        df_recipes_embedding[embedding_col] = embeddings



Embedding config_1 with intfloat/e5-base-v2: 100%|██████████| 1000/1000 [01:44<00:00,  9.61it/s]
Embedding config_1 with sentence-transformers/all-MiniLM-L6-v2: 100%|██████████| 1000/1000 [00:26<00:00, 37.32it/s]
Embedding config_1 with BAAI/bge-base-en-v1.5: 100%|██████████| 1000/1000 [01:35<00:00, 10.52it/s]
Embedding config_1 with Snowflake/snowflake-arctic-embed-m: 100%|██████████| 1000/1000 [01:49<00:00,  9.10it/s]
Embedding config_1 with Snowflake/snowflake-arctic-embed-m-v1.5: 100%|██████████| 1000/1000 [01:50<00:00,  9.03it/s]
Embedding config_2 with intfloat/e5-base-v2: 100%|██████████| 1000/1000 [00:59<00:00, 16.74it/s]
Embedding config_2 with sentence-transformers/all-MiniLM-L6-v2: 100%|██████████| 1000/1000 [00:28<00:00, 34.91it/s]
Embedding config_2 with BAAI/bge-base-en-v1.5: 100%|██████████| 1000/1000 [00:52<00:00, 19.19it/s]
Embedding config_2 with Snowflake/snowflake-arctic-embed-m: 100%|██████████| 1000/1000 [00:51<00:00, 19.30it/s]
Embedding config_2 with Snowflake/s

**write the file of embedding**

In [81]:
os.makedirs(OUTPUT_EMBEDDING_FOLDER, exist_ok=True)

df_recipes_embedding.to_csv(OUTPUT_EMBEDDING_FILE, index=False)