In [322]:
import pandas as pd
import numpy as np
import os
import json

**load the config file**

In [323]:
import dotenv

# override=True lets values from the .env file take precedence over variables
# that are already set in the process environment.
dotenv.load_dotenv(override=True)

CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH")

# Fail early with an actionable message: os.getenv returns None when the
# variable is missing, and open(None) would only raise an opaque TypeError.
if not CONFIG_FILE_PATH:
    raise RuntimeError(
        "CONFIG_FILE_PATH is not set; define it in the environment or in the .env file."
    )

# The whole pipeline is driven by this JSON configuration.
with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

**set the experiment id**

In [324]:
# Identifier of the current experiment, read from the experiment-specific
# section of the config. It is passed as `experiment_id` to the `.format(...)`
# calls that resolve the config's path templates.
EXPERIENCE_ID = config["experiments_specifique_params"]["experiment_id"]

print(EXPERIENCE_ID)

1


**load the embedding file**

In [325]:
INPUT_EMBEDDINGS_FILE = config["output_recipies_embedding_file"].format(
    experiment_id=EXPERIENCE_ID
)
df_recipes_cleaned = pd.read_csv(INPUT_EMBEDDINGS_FILE)

# Embedding columns are recognised by their "_EMB" suffix.
emb_columns = [col for col in df_recipes_cleaned.columns if col.endswith('_EMB')]
print(f"Found embedding columns: {emb_columns}")

# The CSV holds each vector as text ("[v1 v2 ...]"); parse it back to float32.
for col in emb_columns:
    df_recipes_cleaned[col] = df_recipes_cleaned[col].apply(
        lambda x: np.fromstring(x.strip('[]'), sep=' ', dtype=np.float32)
    )

    # np.fromstring may return a shortened vector when the text is malformed
    # (NumPy only emits a warning), so make sure every row of the column was
    # parsed to the same dimension before the vectors are used downstream.
    embedding_dims = set(df_recipes_cleaned[col].map(len))
    if len(embedding_dims) > 1:
        raise ValueError(
            f"Inconsistent embedding dimensions in column {col!r}: {sorted(embedding_dims)}"
        )

# Positional access (.iloc) does not depend on the index labels, and the guard
# avoids an IndexError when the file contains no rows.
if not df_recipes_cleaned.empty:
    for col in emb_columns:
        print(f"{col} -> first embedding shape: {df_recipes_cleaned[col].iloc[0].shape}")

Found embedding columns: ['intfloat/e5-base-v2/config_1_EMB', 'sentence-transformers/all-MiniLM-L6-v2/config_1_EMB', 'BAAI/bge-base-en-v1.5/config_1_EMB', 'Snowflake/snowflake-arctic-embed-m/config_1_EMB', 'Snowflake/snowflake-arctic-embed-m-v1.5/config_1_EMB', 'intfloat/e5-base-v2/config_2_EMB', 'sentence-transformers/all-MiniLM-L6-v2/config_2_EMB', 'BAAI/bge-base-en-v1.5/config_2_EMB', 'Snowflake/snowflake-arctic-embed-m/config_2_EMB', 'Snowflake/snowflake-arctic-embed-m-v1.5/config_2_EMB', 'intfloat/e5-base-v2/config_3_EMB', 'sentence-transformers/all-MiniLM-L6-v2/config_3_EMB', 'BAAI/bge-base-en-v1.5/config_3_EMB', 'Snowflake/snowflake-arctic-embed-m/config_3_EMB', 'Snowflake/snowflake-arctic-embed-m-v1.5/config_3_EMB', 'intfloat/e5-base-v2/config_4_EMB', 'sentence-transformers/all-MiniLM-L6-v2/config_4_EMB', 'BAAI/bge-base-en-v1.5/config_4_EMB', 'Snowflake/snowflake-arctic-embed-m/config_4_EMB', 'Snowflake/snowflake-arctic-embed-m-v1.5/config_4_EMB', 'intfloat/e5-base-v2/config_5_

**load the models**

In [326]:
# Instantiate every embedding model listed in the config.

import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import normalize

MODELS_CONFIG = config["models"]
COLUMNS_TO_EMBEDDE = config["columns_embedding"]

# Load each model once, then index the instances by their model id.
MODELS_LIST = list(map(SentenceTransformer, MODELS_CONFIG))
MODEL_DICT = {
    model_id: loaded_model
    for model_id, loaded_model in zip(MODELS_CONFIG, MODELS_LIST)
}

**get the ground truth for query retrieval**

In [327]:
# Ground truth for retrieval: each record carries a "query_text" and the list
# of relevant "documents" ids that the evaluation loop reads.
QUERY_FILE_PATH = config["query_file_path"]

with open(QUERY_FILE_PATH, mode="r", encoding="utf-8") as f:
    query_documents_dicts = json.loads(f.read())

print(QUERY_FILE_PATH, query_documents_dicts, sep="\n")

data/query/query_test.json
[{'query_text': 'easy breakfast smoothie vegan protein-packed', 'documents': [105717, 195909, 251917, 343717, 288453, 39520, 126940, 124000]}]


**set retrieval metric folder and file paths**

In [328]:
# Resolve every output location for this experiment from the config templates.
METRIC_FOLDER_PATH = config["output_retrival_dir"].format(
    experiment_id=EXPERIENCE_ID
)

RETRIVED_DOCUMENTS_FILE = config["output_retrival_file"].format(
    experiment_id=EXPERIENCE_ID
)

METRIC_QUERY_FILE = config["output_query_metrics_file"].format(
    experiment_id=EXPERIENCE_ID
)

METRIC_PER_QUERY_FILE = config["output_per_query_file"].format(
    experiment_id=EXPERIENCE_ID
)

# Create the metrics folder if it does not exist.
os.makedirs(METRIC_FOLDER_PATH, exist_ok=True)

# The output files are configured independently of the metrics folder, so make
# sure each file's own parent folder exists too; otherwise the final writes
# would fail with FileNotFoundError. A bare file name has an empty dirname,
# which maps to the current directory.
for _output_path in (RETRIVED_DOCUMENTS_FILE, METRIC_QUERY_FILE, METRIC_PER_QUERY_FILE):
    os.makedirs(os.path.dirname(_output_path) or ".", exist_ok=True)


print(METRIC_FOLDER_PATH)
print(METRIC_QUERY_FILE)
print(METRIC_PER_QUERY_FILE)

experiments/exp1/metrics
experiments/exp1/metrics/retrival_metrics.json
experiments/exp1/metrics/retrival_per_query.json


**define a function to retrieve documents**

In [329]:
def retrieve_documents(query: str, model: SentenceTransformer, documents: list, df: pd.DataFrame, top_k: int) -> pd.DataFrame:
    """
    Retrieve the top-k documents most similar to the query.

    The query is encoded with ``model``; both the query and the document
    embeddings are L2-normalised, so their dot product is the cosine similarity.

    Args:
        query (str): The query text to encode.
        model (SentenceTransformer): Model used to encode the query.
        documents (list): One embedding vector per document. Entry ``i`` must
            correspond to row ``i`` of ``df`` because rows are selected by position.
        df (pd.DataFrame): Frame holding the document rows.
        top_k (int): Maximum number of documents to return. It is clamped to the
            number of documents, since ``torch.topk`` raises when ``k`` is larger.

    Returns:
        pd.DataFrame: A copy of the selected rows, ordered by decreasing
        similarity, with an extra ``similarity_score`` column.
    """
    n_documents = len(documents)

    # Nothing to rank: return an empty frame with the expected extra column.
    if n_documents == 0:
        empty_df = df.iloc[[]].copy()
        empty_df["similarity_score"] = []
        return empty_df

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    # Generate the query embedding (shape: 1 x dim) and normalise it
    query_embedding = model.encode([query], convert_to_tensor=True, device=device)
    query_embedding = normalize(query_embedding, p=2, dim=1)

    # Stack all document vectors in one NumPy call and convert once, instead of
    # creating one tensor per document.
    document_embeddings = torch.as_tensor(
        np.stack(documents), dtype=query_embedding.dtype, device=device
    )
    document_embeddings = normalize(document_embeddings, p=2, dim=1)

    # Cosine similarity between the query and every document
    cosine_scores = torch.matmul(query_embedding, document_embeddings.T).squeeze(0)

    # Top-k, clamped so a top_k larger than the corpus returns every document
    top_k_results = torch.topk(cosine_scores, k=min(top_k, n_documents))
    top_k_indices = top_k_results.indices.tolist()
    top_k_scores = top_k_results.values.tolist()

    # Positional selection: the indices refer to positions in `documents`
    retrieved_df = df.iloc[top_k_indices].copy()
    retrieved_df["similarity_score"] = top_k_scores

    return retrieved_df


**define functions that calculate metrics**

In [330]:
# compute precision at K
def compute_precision_at_k(retrieved_docs: list, relevant_docs: list, k: int) -> float:
    """
    Compute Precision at K for the retrieved documents.

    Args:
        retrieved_docs (list): List of retrieved document IDs, best match first.
        relevant_docs (list): List of relevant document IDs.
        k (int): The cutoff rank K. Must be a positive integer.

    Returns:
        float: Number of relevant documents among the first K retrieved ones,
        divided by K. The denominator is K even when fewer than K documents
        were retrieved.

    Raises:
        ValueError: If ``k`` is not positive. Precision is undefined for K = 0,
            and a negative K would make the slice below drop documents from
            the end instead of applying a cutoff.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Get the top-k retrieved documents
    top_k_retrieved = retrieved_docs[:k]

    # Count the number of relevant documents in the top-k retrieved documents
    relevant_retrieved_count = sum(1 for doc_id in top_k_retrieved if doc_id in relevant_docs)

    # Calculate Precision at K
    precision_at_k = relevant_retrieved_count / k

    return precision_at_k

# Recall@K
def compute_recall_at_k(retrieved_docs: list, relevant_docs: list, k: int) -> float:
    """
    Compute Recall at K: the share of relevant documents found in the top K.

    Args:
        retrieved_docs (list): Retrieved document IDs, best match first.
        relevant_docs (list): Relevant (ground-truth) document IDs.
        k (int): The cutoff rank K.

    Returns:
        float: Relevant documents found within the first K results divided by
        the total number of relevant documents; 0.0 when there are none.
    """
    # Keep only the retrieved ids that belong to the ground truth
    matches = [doc_id for doc_id in retrieved_docs[:k] if doc_id in relevant_docs]

    # Without any relevant document, recall is reported as 0.0
    if not relevant_docs:
        return 0.0

    return len(matches) / len(relevant_docs)

# Hit rate@K
def compute_hit_rate_at_k(retrieved_docs: list, relevant_docs: list, k: int) -> float:
    """
    Compute Hit Rate at K: whether the top K contains any relevant document.

    Args:
        retrieved_docs (list): Retrieved document IDs, best match first.
        relevant_docs (list): Relevant (ground-truth) document IDs.
        k (int): The cutoff rank K.

    Returns:
        float: 1.0 if at least one of the first K retrieved documents is
        relevant, otherwise 0.0.
    """
    # Stop at the first relevant document found within the cutoff
    for doc_id in retrieved_docs[:k]:
        if doc_id in relevant_docs:
            return 1.0

    return 0.0

**define functions that calculate mean metrics over all queries**

In [331]:
def compute_mean_metrics(metrics: list) -> dict:
    """
    Compute the mean of every metric over a list of per-query results.

    Args:
        metrics (list): A list of dictionaries mapping metric names to values.
            The metric names are taken from the first dictionary, so every
            dictionary must contain those keys.

    Returns:
        dict: A dictionary mapping ``mean_<metric name>`` to the mean of that
        metric as a plain Python float. Empty when ``metrics`` is empty.
    """
    # With no per-query results there is nothing to average; return an empty
    # dict instead of failing on metrics[0].
    if not metrics:
        return {}

    return {
        f'mean_{metric_name}': float(np.mean([m[metric_name] for m in metrics]))
        for metric_name in metrics[0]
    }

**define a function that stores metrics**

In [332]:
def store_metrics(output_json: dict, top_k: int, emb_col: str, metrics: dict, query_text=None):
    """
    Record metrics in the nested results dictionary (no file is written here).

    The dictionary is filled in place with the layout
    ``output_json[top_k][emb_col][query_text][metric_name]`` for per-query
    metrics, or ``output_json[top_k][emb_col][metric_name]`` when
    ``query_text`` is None (metrics aggregated over all queries).

    Args:
        output_json (dict): The dictionary that accumulates the metrics.
        top_k (int): The value of K the metrics were computed for.
        emb_col (str): The embedding column used.
        metrics (dict): Metric names mapped to their values.
        query_text (str, optional): The query text. Defaults to None, which
            means the metrics are aggregated over all queries.
    """
    # Create the top_k / embedding-column levels on first use
    column_entry = output_json.setdefault(top_k, {}).setdefault(emb_col, {})

    # Per-query metrics sit one level deeper, keyed by the query text
    if query_text is None:
        destination = column_entry
    else:
        destination = column_entry.setdefault(query_text, {})

    for metric_name, value in metrics.items():
        destination[metric_name] = value

**define a function that stores retrieved documents**

In [333]:
# store retrieved documents in the results dictionary
def store_retrived_documents(retrived_json: dict, top_k: int, query_text: str, emb_col: str, retrieved_df: pd.DataFrame = None) -> None:
    """
    Record the retrieved documents in the nested results dictionary.

    No file is written here; the ranked list is stored in place under
    ``retrived_json[top_k][emb_col][query_text]``.

    Args:
        retrived_json (dict): Dictionary that accumulates the retrieved documents.
        top_k (int): The number of top documents retrieved.
        query_text (str): The input query string.
        emb_col (str): The embedding column name.
        retrieved_df (pd.DataFrame, optional): Retrieved rows, best match
            first. It must provide the ID and ``*_CLEAND`` columns read below.
            When omitted, the module-level ``retrieved_df`` is used so that
            existing calls without this argument keep working.

    Raises:
        ValueError: If no frame is passed and no module-level ``retrieved_df``
            is available.
    """
    if retrieved_df is None:
        # Legacy call path: fall back to the notebook-level variable that this
        # function used to read implicitly.
        retrieved_df = globals().get("retrieved_df")
        if retrieved_df is None:
            raise ValueError(
                "No retrieved documents available: pass `retrieved_df` explicitly."
            )

    # Ranks are 1-based and follow the row order of the frame
    results_list = [
        {
            "RANK": rank,
            "ID": row["ID"],
            "NAME": row["NAME_CLEAND"],
            "TAGS": row["TAGS_CLEAND"],
            "INGREDIENTS": row["INGREDIENTS_CLEAND"],
            "STEPS": row["STEPS_CLEAND"],
            "DESCRIPTION": row["DESCRIPTION_CLEAND"],
        }
        for rank, (_, row) in enumerate(retrieved_df.iterrows(), start=1)
    ]

    # Store once, after the list is complete, so an empty result is recorded
    # as an empty list instead of being skipped.
    retrived_json.setdefault(top_k, {}).setdefault(emb_col, {})[query_text] = results_list

**get the top_k**

In [334]:
# Cutoff values K to evaluate, read from the experiment-specific section of the
# config. The evaluation loop iterates over this value, so it is expected to be
# a list of integers (e.g. [200]).
TOP_K_LIST = config["experiments_specifique_params"]["top_k"]

print(TOP_K_LIST)

[200]


**calculate the metrics**

In [335]:
emb_cols = [col for col in df_recipes_cleaned.columns if col.endswith("_EMB")]

# Result containers, all keyed as [top_k][embedding column][...]
output_json_per_query = {}
output_json_all_query = {}
output_retrived_result_per_query = {}


for top_k in TOP_K_LIST:

    for emb_col in emb_cols:

        # Convert embedding column to a list of numpy arrays
        documents = df_recipes_cleaned[emb_col].apply(np.array).to_list()

        # The column name is "<model name>/<config name>_EMB"; the model name
        # itself may contain "/" (e.g. Snowflake/snowflake-arctic-embed-m).
        model_name = "/".join(emb_col.split("/")[:-1])

        model = MODEL_DICT[model_name]

        metrics_all_queries = []

        for query in query_documents_dicts:
            query_text = query['query_text']
            document_ids = query['documents']

            # NOTE: store_retrived_documents picks up the module-level
            # `retrieved_df`, so this name must stay bound right before the call.
            retrieved_df = retrieve_documents(query_text, model, documents, df_recipes_cleaned, top_k=top_k)

            # Record the retrieved documents for every embedding column, so the
            # retrieved-documents file written below is actually populated.
            store_retrived_documents(
                retrived_json=output_retrived_result_per_query,
                top_k=top_k,
                query_text=query_text,
                emb_col=emb_col
            )

            # Build the ranked ID list once and share it between the metrics
            retrieved_ids = retrieved_df['ID'].tolist()

            #calculate metrics
            metrics_per_query = {
                'precision_at_k': compute_precision_at_k(retrieved_ids, document_ids, top_k),
                'recall_at_k': compute_recall_at_k(retrieved_ids, document_ids, top_k),
                'hit_rate_at_k': compute_hit_rate_at_k(retrieved_ids, document_ids, top_k),
            }

            # keep the per-query metrics for the aggregation below
            metrics_all_queries.append(metrics_per_query)

            #store metrics per query
            store_metrics(
                output_json=output_json_per_query,
                top_k=top_k,
                query_text=query_text,
                emb_col=emb_col,
                metrics=metrics_per_query
            )

        # calculate the mean of each metric over all queries
        mean_metrics = compute_mean_metrics(metrics=metrics_all_queries)

        #store aggregated metrics over all queries
        store_metrics(
            output_json=output_json_all_query,
            top_k=top_k,
            emb_col=emb_col,
            metrics=mean_metrics
        )

# Write all results once at the end: retrieved documents per query
with open(RETRIVED_DOCUMENTS_FILE, "w", encoding="utf-8") as f:
    json.dump(output_retrived_result_per_query, f, indent=4)

# Metrics for each individual query
with open(METRIC_PER_QUERY_FILE, "w", encoding="utf-8") as f:
    json.dump(output_json_per_query, f, indent=4)

# Metrics aggregated over all queries
with open(METRIC_QUERY_FILE, "w", encoding="utf-8") as f:
    json.dump(output_json_all_query, f, indent=4)

**write the experiment-specific config file**

In [336]:
OUPUT_EXPERIMENT_DIR = config["output_experiments_dir"].format(
    experiment_id=EXPERIENCE_ID
)

# Make sure the experiment folder exists; open(..., "w") does not create
# missing parent directories.
os.makedirs(OUPUT_EXPERIMENT_DIR, exist_ok=True)

# Save a copy of the config used for this run next to its results
with open(os.path.join(OUPUT_EXPERIMENT_DIR, "config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)