In [None]:
import pandas as pd
import numpy as np
import json
import os
import snowflake.connector

**define connection parameters**

In [None]:
from dotenv import load_dotenv
import re

# Load variables from the local .env file. override=True makes values from
# .env take precedence over variables already set in the process environment.
load_dotenv(override=True)

# NOTE(review): "USER" is also commonly defined by the operating system/shell,
# so if the .env file omits it the OS login name may be picked up silently —
# confirm the .env file always defines it.
user = os.getenv("USER")
password = os.getenv("PASSWORD")
passcode = os.getenv("PASSCODE")
account = os.getenv("ACCOUNT")
warehouse = os.getenv("WAREHOUSE")
database = os.getenv("DATABASE")
schema = os.getenv("SCHEMA")
table = os.getenv("TABLE")


def _mask_secret(secret):
    """Return a same-length string of '*' for display, or None when the secret is unset."""
    return None if secret is None else "*" * len(secret)


# Never echo secrets in the notebook output: both the password and the passcode
# are masked, and unset values are shown as None instead of raising a TypeError.
print(user, _mask_secret(password), _mask_secret(passcode), account, warehouse, database, schema, table)

**get the data from snowflake**

In [None]:
# Open a Snowflake session with the credentials read from the environment.
conn = snowflake.connector.connect(
    user = user,
    password = password,
    passcode = passcode,
    account = account,
    warehouse = warehouse,
    database = database,
    schema = schema
)

# Always release the connection, even when the query fails, so that a failed
# run does not leave a session open on the warehouse.
try:
    # `table` comes from the local .env file (trusted configuration), which is
    # why it is interpolated directly into the statement.
    df = pd.read_sql(f"SELECT * FROM {table}", conn)
finally:
    conn.close()

**open the config file**

In [None]:
# Path of the JSON configuration file, provided through the environment.
CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH")

# Fail early with an explicit message instead of the obscure TypeError that
# open(None) would raise when the variable is missing.
if not CONFIG_FILE_PATH:
    raise RuntimeError(
        "The CONFIG_FILE_PATH environment variable is not set; cannot locate the config file."
    )

# Parse the whole configuration once; later cells read their settings from `config`.
with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

**write the data locally**

In [None]:
# Destination path of the raw recipes extract, taken from the config file.
INPUT_RECIPIES_FILE = config['input_recipies_file']

# Persist the extract without the index column, then preview the first rows.
df.to_csv(path_or_buf=INPUT_RECIPIES_FILE, index=False)
df.head(n=5)

**read the data if it already exists locally**

In [None]:
# Alternative entry point: reload the extract from the local CSV file
# instead of querying the warehouse again.
df_recipes = pd.read_csv(filepath_or_buffer=INPUT_RECIPIES_FILE)
df_recipes.head(n=5)

**define a function that cleans the columns used for embedding**

In [None]:
import re

def clean_columns_to_embedd(tag_value: object, col_name: str) -> str:
    """
    Normalize a raw column value into a labelled sentence used for embedding.

    The value is converted to text, stripped of list brackets and quotes,
    lower-cased, reduced to the characters ``a-z 0-9 space . , ? !`` and
    prefixed with ``col_name``.

    Args:
        tag_value (object): The input value to clean. Can be a string, list,
            number, None or a float NaN (how pandas represents a missing cell).
            It is converted to a string before processing.
        col_name (str): The label/prefix to add before the cleaned text
            (e.g., "NAME", "TAGS", "INGREDIENTS").

    Returns:
        str: Cleaned text in the format "{col_name}: {cleaned_text}.".
             Returns an empty string when the input is None, NaN, an empty
             string, or when nothing is left after cleaning (e.g. "[]").
    """

    # Missing values: None, empty string, or a float NaN. NaN is the only float
    # that is not equal to itself, so no extra import is needed to detect it.
    if tag_value is None:
        return ""
    if isinstance(tag_value, str) and tag_value == "":
        return ""
    if isinstance(tag_value, float) and tag_value != tag_value:
        return ""

    text = str(tag_value)

    # Remove list brackets and quotes
    text = re.sub(r"[\[\]'\"]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Turn newlines/tabs into plain spaces first; otherwise the character
    # filter below would delete them and glue neighbouring words together.
    text = re.sub(r"\s+", " ", text)

    # Keep only alphanumeric, spaces, and . , ? !
    text = re.sub(r"[^a-z0-9 .,?!]+", "", text)

    # Collapse runs of spaces and trim both ends
    text = re.sub(r" +", " ", text).strip()

    # Nothing meaningful left: do not emit a bare "LABEL: ." sentence
    if not text:
        return ""

    # Return formatted text
    return f"{col_name}: {text}."

**extract only required columns for embedding**

In [None]:
# Build the cleaned text columns required for embedding.
# `columns_to_clean` maps each source column to a dict with the name of the
# cleaned column ('column_name') and the label prefixed to its text ('start_text').
COLUMNS_TO_CLEAN = config["columns_to_clean"]

for col, col_settings in COLUMNS_TO_CLEAN.items():
    col_clean_name = col_settings['column_name']
    start_text = col_settings['start_text']

    df_recipes[col_clean_name] = df_recipes[col].apply(clean_columns_to_embedd, args=(start_text, ))

cleaned_column_names = [col_settings['column_name'] for col_settings in COLUMNS_TO_CLEAN.values()]

# Take an explicit copy: assigning new columns to a bare selection of
# df_recipes triggers SettingWithCopyWarning and leaves it ambiguous whether
# the original frame is modified.
df_recipes_cleaned = df_recipes[cleaned_column_names].copy()

# Add the id to keep track of the recipes
df_recipes_cleaned['ID'] = df_recipes['ID']
df_recipes_cleaned.head()

**create a column for each combination of embedding columns**

In [None]:
# For every embedding configuration, create one text column that concatenates
# the cleaned columns listed for it, each followed by a single space.
COLUMNS_TO_EMBEDDE = config["columns_embedding"]

for col_config_name, cols_list in COLUMNS_TO_EMBEDDE.items():
    # Start from an empty string in every row
    df_recipes_cleaned[col_config_name] = ""

    for source_key in cols_list:
        cleaned_name = COLUMNS_TO_CLEAN[source_key]['column_name']
        piece = df_recipes_cleaned[cleaned_name] + " "
        df_recipes_cleaned[col_config_name] = df_recipes_cleaned[col_config_name] + piece

df_recipes_cleaned.head()

In [None]:
# Sanity check: display the combined text of the first recipe for 'config_1'.
df_recipes_cleaned['config_1'].iloc[0]

**load the embedding models**

In [None]:
#load the models
import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import normalize

# Model identifiers to load, as listed in the config file.
MODELS_CONFIG = config["models"]

# Instantiate every model once, then index them by identifier: {model id: model}
MODELS_LIST = list(map(SentenceTransformer, MODELS_CONFIG))
MODEL_DICT = {model_name: loaded_model for model_name, loaded_model in zip(MODELS_CONFIG, MODELS_LIST)}

**define a function that counts the number of tokens for each config and model**

In [None]:
def compute_token_size(text: str, model: SentenceTransformer) -> int:
    """
    Count the tokens produced for ``text`` by the tokenizer attached to ``model``.

    Args:
        text (str): The text to tokenize.
        model (SentenceTransformer): Model whose ``tokenizer.tokenize`` method is used.

    Returns:
        int: Length of the token sequence returned by ``model.tokenizer.tokenize(text)``.
    """

    return len(model.tokenizer.tokenize(text))

**set the experiment id**

In [None]:
# Identifier of the current experiment, read from the experiment-specific
# section of the config file; it is substituted into the output paths below.
experiment_params = config["experiments_specifique_params"]
EXPERIENCE_ID = experiment_params["experiment_id"]

print(EXPERIENCE_ID)

**set folder and file path for embedding**

In [None]:
# Both config entries are templates containing an `{experiment_id}` placeholder.
embedding_dir_template = config["output_embedding_dir"]
embedding_file_template = config["output_recipies_embedding_file"]

OUTPUT_EMBEDDING_FOLDER = embedding_dir_template.format(experiment_id=EXPERIENCE_ID)

# Create the experiment folder up front (no error if it already exists).
os.makedirs(OUTPUT_EMBEDDING_FOLDER, exist_ok=True)

OUTPUT_EMBEDDING_FILE = embedding_file_template.format(experiment_id=EXPERIENCE_ID)

# Show the resolved locations, one per line.
print(OUTPUT_EMBEDDING_FOLDER, OUTPUT_EMBEDDING_FILE, sep="\n")

**calculate the number of tokens per recipe for each config and model**

In [None]:
from tqdm import tqdm 

# Work on a copy so the cleaned frame stays untouched.
df_recipes_embedding = df_recipes_cleaned.copy()

# Add one token-count column per (column configuration, model) pair.
for col_name in COLUMNS_TO_EMBEDDE:
    for model_id, model in MODEL_DICT.items():

        embedding_col = f"{model_id}/{col_name}_EMB"
        tokens_col = f"{embedding_col}_NUMBER_TOKEN"

        progress = tqdm(df_recipes_embedding[col_name], desc=f"count number token {col_name} with {model_id}")
        df_recipes_embedding[tokens_col] = [compute_token_size(text, model) for text in progress]

In [None]:
# Summary statistics of the token counts for model 'intfloat/e5-base-v2' on 'config_1'.
token_counts_e5_config_1 = df_recipes_embedding['intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN']
token_counts_e5_config_1.describe()

In [None]:
# Number of recipes whose token count exceeds 512
# (presumably the model's maximum input length — TODO confirm).
int((df_recipes_embedding['intfloat/e5-base-v2/config_1_EMB_NUMBER_TOKEN'] > 512).sum())

**define function to calculate embedding of a text**

In [None]:
def compute_embedding(model: SentenceTransformer, texts: list[str]) -> torch.Tensor:
    """
    Encode ``texts`` with ``model`` and return the L2-normalized embeddings.

    The model is moved to the GPU when CUDA is available, otherwise to the CPU,
    before encoding.

    Args:
        model (SentenceTransformer): The sentence transformer model used for encoding.
        texts (list[str]): The input texts to embed.

    Returns:
        torch.Tensor: The embeddings returned by ``model.encode``, normalized to
        unit L2 norm along dimension 1.
    """

    # Prefer the GPU whenever one is visible to torch
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encode on the selected device, keeping the result as a tensor
    raw_embeddings = model.to(target_device).encode(texts, convert_to_tensor=True)

    # Scale each embedding to unit length
    return normalize(raw_embeddings, p=2, dim=1)

**compute embedding for each config and model**

In [None]:
# #create embedding cols 
from tqdm import tqdm  

# Number of texts sent to the model per call. Encoding row by row pays the
# per-call overhead (device move, forward pass, GPU->CPU transfer) once per
# recipe; fixed-size chunks amortize it while keeping memory bounded.
EMBEDDING_BATCH_SIZE = 64

for col_name in COLUMNS_TO_EMBEDDE:
    for model_id, model in MODEL_DICT.items():

        embedding_col = f"{model_id}/{col_name}_EMB"

        texts = df_recipes_embedding[col_name].tolist()

        embeddings = []
        batch_starts = range(0, len(texts), EMBEDDING_BATCH_SIZE)
        for start in tqdm(batch_starts, desc=f"Embedding {col_name} with {model_id}"):
            batch_texts = texts[start:start + EMBEDDING_BATCH_SIZE]
            batch_embeddings = compute_embedding(model, batch_texts).cpu().numpy()
            # Iterating a 2-D array yields one 1-D embedding per input text,
            # in the same order as the rows of the frame.
            embeddings.extend(batch_embeddings)

        # Save embeddings to new column (one numpy vector per row)
        df_recipes_embedding[embedding_col] = embeddings



**write the embeddings file**

In [None]:
# Make sure the experiment folder exists before writing (no-op if it already does).
os.makedirs(name=OUTPUT_EMBEDDING_FOLDER, exist_ok=True)

# NOTE(review): columns holding numpy arrays are presumably written to CSV as
# their text representation — confirm that downstream readers parse that format.
df_recipes_embedding.to_csv(path_or_buf=OUTPUT_EMBEDDING_FILE, index=False)