In [110]:
import pandas as pd
import numpy as np
import os
import json

**load the config file**

In [111]:
import dotenv

# Load variables from the .env file; override=True lets values from .env
# replace variables that already exist in the process environment.
dotenv.load_dotenv(override=True)

CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH")
if not CONFIG_FILE_PATH:
    # Fail fast with an explicit message: open(None) would otherwise raise
    # an opaque TypeError that does not mention the missing variable.
    raise RuntimeError(
        "CONFIG_FILE_PATH is not set; define it in the environment or in the .env file."
    )

# The experiment configuration (paths, models, column groups) is a JSON document.
with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

**load clustering metrics**

In [112]:
# Experiment identifier substituted into the "{experiment_id}" placeholder of
# the path templates stored in `config`.
EXPERIENCE_ID = os.getenv("EXPERIENCE_ID")
if not EXPERIENCE_ID:
    # An unset variable would silently format every path as ".../expNone/...".
    raise RuntimeError(
        "EXPERIENCE_ID is not set; define it in the environment or in the .env file."
    )

In [113]:
# Embedding model identifiers and the named groups of columns that were embedded.
MODELS_NAME = config["models"]
COLUMNS_TO_EMBEDDED = config["columns_embedding"]

# Path template of a clustering-metrics file, kept unformatted here.
METRICS_FILE = config["metric_clustering_file"]

# Clustering output directory resolved for the current experiment.
METRICS_CLUSTERING_DIR = config["output_clustering_dir"].format(experiment_id=EXPERIENCE_ID)

In [114]:
# Maps "<model>/<config>_EMB" to the metrics dict read from that combination's
# clustering JSON file. Combinations whose file cannot be read are reported
# and left out of the mapping.
embedding_config_metrics = {}

for col_name in COLUMNS_TO_EMBEDDED:
    for model in MODELS_NAME:

        embedding_col = f"{model}/{col_name}_EMB"

        # Keep the formatted path in a local name only, so the module-level
        # METRICS_FILE constant keeps holding the unformatted template.
        json_path = config["metric_clustering_file"].format(
            experiment_id=EXPERIENCE_ID,
            model_config=embedding_col,
        )

        print(json_path)

        try:
            with open(json_path, "r", encoding="utf-8") as f:
                embedding_config_metrics[embedding_col] = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: covers
            # json.JSONDecodeError and UnicodeDecodeError (both subclasses).
            print(f"‚ùå Error loading {json_path}: {e}")

metrics/clustering/exp1/Snowflake/snowflake-arctic-embed-m-v1.5/config_1_EMB/consencus_clustering.json
metrics/clustering/exp1/Snowflake/snowflake-arctic-embed-m/config_1_EMB/consencus_clustering.json
metrics/clustering/exp1/intfloat/e5-base-v2/config_1_EMB/consencus_clustering.json
metrics/clustering/exp1/sentence-transformers/all-MiniLM-L6-v2/config_1_EMB/consencus_clustering.json
metrics/clustering/exp1/BAAI/bge-base-en-v1.5/config_1_EMB/consencus_clustering.json
metrics/clustering/exp1/Snowflake/snowflake-arctic-embed-m-v1.5/config_2_EMB/consencus_clustering.json
metrics/clustering/exp1/Snowflake/snowflake-arctic-embed-m/config_2_EMB/consencus_clustering.json
metrics/clustering/exp1/intfloat/e5-base-v2/config_2_EMB/consencus_clustering.json
metrics/clustering/exp1/sentence-transformers/all-MiniLM-L6-v2/config_2_EMB/consencus_clustering.json
metrics/clustering/exp1/BAAI/bge-base-en-v1.5/config_2_EMB/consencus_clustering.json
metrics/clustering/exp1/Snowflake/snowflake-arctic-embed-m

In [115]:
def _metric(vals, key):
    """Return ``vals[key]``, or NaN when that metric is absent from the loaded JSON."""
    return vals.get(key, float("nan"))


# Order configurations by clustering score, best first. Entries without a
# score are sorted as if their score were 0.
ranked_results = sorted(
    embedding_config_metrics.items(),
    key=lambda item: item[1].get("score_clustering", 0),
    reverse=True,
)

# ---- Display ----
print("üèÜ Ranking of Embedding Configurations (Best ‚Üí Worst)\n")
for rank, (name, vals) in enumerate(ranked_results, start=1):
    # The sort above tolerates missing metrics, so the display must as well:
    # absent values print as "nan" instead of raising KeyError.
    intra_cos = _metric(vals, "intra_cluster_cosine_mean")
    inter_cos = _metric(vals, "inter_cluster_cosine_mean")
    print(
        f"{rank:2d}. {name:40} "
        f"score={_metric(vals, 'score_clustering'):.4f}  "
        f"silhouette={_metric(vals, 'silhouette'):.4f}  "
        f"intra_cluster_cos={intra_cos:.4f}  "
        f"inter_cluster_cos={inter_cos:.4f}  "
        f"cosine gap = {intra_cos - inter_cos:.4f}  "
        f"k={vals.get('n_clusters', 'n/a')}"
    )

üèÜ Ranking of Embedding Configurations (Best ‚Üí Worst)

 1. BAAI/bge-base-en-v1.5/config_4_EMB       score=0.5969  silhouette=0.0826  intra_cluster_cos=0.8187  inter_cluster_cos=0.7537  cosine gap = 0.0650  k=13
 2. intfloat/e5-base-v2/config_4_EMB         score=0.5911  silhouette=0.0765  intra_cluster_cos=0.9054  inter_cluster_cos=0.8808  cosine gap = 0.0246  k=7
 3. BAAI/bge-base-en-v1.5/config_5_EMB       score=0.5831  silhouette=0.0605  intra_cluster_cos=0.7807  inter_cluster_cos=0.7232  cosine gap = 0.0575  k=13
 4. sentence-transformers/all-MiniLM-L6-v2/config_4_EMB score=0.5825  silhouette=0.0233  intra_cluster_cos=0.7918  inter_cluster_cos=0.7237  cosine gap = 0.0681  k=11
 5. intfloat/e5-base-v2/config_2_EMB         score=0.5815  silhouette=0.0597  intra_cluster_cos=0.8794  inter_cluster_cos=0.8555  cosine gap = 0.0239  k=7
 6. intfloat/e5-base-v2/config_5_EMB         score=0.5814  silhouette=0.0650  intra_cluster_cos=0.8857  inter_cluster_cos=0.8609  cosine gap = 0.0248  k

**retrieve documents relevant to the query**

In [116]:
#load the data
import ast

# CSV holding the cleaned recipes together with their stored embeddings.
INPUT_EMBEDDINGS_FILE = config["output_recipies_embedding_file"].format(experiment_id=EXPERIENCE_ID)
df_recipes_cleaned = pd.read_csv(INPUT_EMBEDDINGS_FILE)

# Embedding columns are recognised by their "_EMB" suffix.
emb_columns = list(
    filter(lambda name: name.endswith('_EMB'), df_recipes_cleaned.columns)
)

# The CSV stores each vector as text: drop the surrounding brackets and parse
# the remainder as space-separated numbers into a NumPy array.
for col in emb_columns:
    df_recipes_cleaned[col] = df_recipes_cleaned[col].apply(
        lambda cell: np.fromstring(cell.strip('[]'), sep=' ')
    )

df_recipes_cleaned.head()

Unnamed: 0,NAME_CLEAND,TAGS_CLEAND,INGREDIENTS_CLEAND,STEPS_CLEAND,DESCRIPTION_CLEAND,config_1,config_2,config_3,config_4,config_5,...,Snowflake/snowflake-arctic-embed-m-v1.5/config_5_EMB,Snowflake/snowflake-arctic-embed-m/config_5_EMB,intfloat/e5-base-v2/config_5_EMB,sentence-transformers/all-MiniLM-L6-v2/config_5_EMB,BAAI/bge-base-en-v1.5/config_5_EMB,Snowflake/snowflake-arctic-embed-m-v1.5/config_6_EMB,Snowflake/snowflake-arctic-embed-m/config_6_EMB,intfloat/e5-base-v2/config_6_EMB,sentence-transformers/all-MiniLM-L6-v2/config_6_EMB,BAAI/bge-base-en-v1.5/config_6_EMB
0,recipe name: crab filled crescent snacks.,"recipe tags: timetomake, course, mainingredien...","recipe ingredients: crabmeat, cream cheese, gr...","recipe steps: heat over to 375 degrees, spray ...",recipe description: found in a crescent roll r...,recipe name: crab filled crescent snacks. reci...,"recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien...","recipe tags: timetomake, course, mainingredien...",...,"[-0.000112489972, 0.092555888, 0.044074107, -0...","[-0.0325349346, 0.0726739094, 0.012400113, -0....","[-0.014183538, -0.0204121303, -0.0474909544, -...","[-0.0334200487, 0.0660522282, -0.00879483018, ...","[-0.02223087, 0.0171403, -0.05698212, 0.016229...","[0.0271100886, 0.0519786365, 0.0155910579, -0....","[-0.03760105, 0.0446092524, -0.0198613591, -0....","[-0.0239876248, -0.0387270637, -0.0681184903, ...","[-0.0313935541, 0.0931395069, -0.0168046467, -...","[-0.017173633, -0.00434506917, -0.0693777129, ..."
1,recipe name: curried bean salad.,"recipe tags: curries, 30minutesorless, timetom...","recipe ingredients: garbanzo beans, black bean...","recipe steps: drain rinse beans, stir all ingr...",recipe description: serve this flavorful and r...,recipe name: curried bean salad. recipe tags: ...,"recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom...","recipe tags: curries, 30minutesorless, timetom...",...,"[0.0383950397, 0.0181233715, 0.11421448, -0.00...","[0.02366637, 0.00079082, 0.05553786, 0.0148864...","[-0.01237197, -0.04495471, -0.02482142, -0.009...","[-0.0552039146, -0.0277715605, 0.00933567807, ...","[-0.000431435183, -0.0547247566, 0.00185950252...","[0.0406921282, -0.0286415275, 0.0669359937, 0....","[-0.00192602968, -0.032134261, 0.0460976809, 0...","[-0.01952695, -0.05960128, -0.040347, -0.00886...","[-0.0794401765, -0.038131129, -0.00477592088, ...","[-0.0123381084, -0.0830701962, -0.007970348, -..."
2,recipe name: delicious steak with onion marinade.,"recipe tags: lactose, 30minutesorless, timetom...","recipe ingredients: olive oil, red onion, ligh...",recipe steps: heat the oil in a heavybased pan...,"recipe description: another ive not tried, but...",recipe name: delicious steak with onion marina...,"recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom...","recipe tags: lactose, 30minutesorless, timetom...",...,"[0.0182225239, 0.0382581875, 0.136593059, 0.01...","[-0.0253467523, 0.0263651982, 0.00715303421, 0...","[-0.0444376282, -0.0375359692, -0.0298079625, ...","[-0.0692065954, -0.0641314313, 0.00777433813, ...","[-0.0251103155, 0.00192668661, -0.00584613858,...","[0.0629829466, 0.0026503047, 0.117520534, -0.0...","[-0.02078526, 0.02693653, 0.01444969, 0.005435...","[-0.0248615984, -0.0340739489, -0.0317234546, ...","[-0.0537171178, -0.0656957775, 0.0394559428, 0...","[0.00241703098, -0.00614305167, -0.00425187266..."
3,recipe name: pork tenderloin with hoisin.,"recipe tags: 15minutesorless, timetomake, cour...","recipe ingredients: pork tenderloin, soy sauce...","recipe steps: cut pork into 1 4inch slices, in...",recipe description: another keeper from bonnie...,recipe name: pork tenderloin with hoisin. reci...,"recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...",...,"[0.0204694644, 0.0677850842, 0.0440402813, -0....","[-0.0246525072, 0.00834974833, 0.012334385, 0....","[-0.000314318313, -0.0302229393, -0.0557601079...","[-0.0635362044, 0.0556964725, 0.058865495, -0....","[-0.028353557, 0.00101052935, -0.00194109231, ...","[0.00907324068, 0.0125132818, 0.0400328673, 0....","[-0.026464982, 0.000221537906, 0.0159761515, 0...","[-0.00365850609, -0.0452148579, -0.0532100387,...","[-0.069036603, 0.0444690734, -0.0180649757, 0....","[-0.0294371713, -0.03445106, -0.0103820097, 0...."
4,recipe name: mixed baby greens with oranges gr...,"recipe tags: 15minutesorless, timetomake, cour...","recipe ingredients: mixed baby greens, oranges...",recipe steps: in a salad bowl combine the lett...,recipe description: i love grapefruit in a sal...,recipe name: mixed baby greens with oranges gr...,"recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...","recipe tags: 15minutesorless, timetomake, cour...",...,"[0.0274216328, -0.00814707391, 0.076023072, 0....","[0.0114473281, 0.00293102534, 0.00191029755, 0...","[-0.0221814215, -0.0495240167, -0.0567183867, ...","[-0.025078008, 0.013611393, 0.00171509013, 0.0...","[-0.00976417586, -0.0103993863, -0.0483738706,...","[0.0227223989, -0.0190538112, 0.0975191668, 0....","[-0.0128676854, -0.00735520106, 0.00878883898,...","[-0.0317776389, -0.0562590286, -0.0586722381, ...","[-0.0271002483, 0.0166807566, -0.0290076192, 0...","[-0.0264904667, -0.035189338, -0.0516120046, 0..."


In [117]:
#load models

import torch
from sentence_transformers import SentenceTransformer
from torch.nn.functional import normalize

# Model identifiers and column groups, both taken from the experiment config.
COLUMNS_TO_EMBEDDE = config["columns_embedding"]
MODELS_CONFIG = config["models"]

# Instantiate one SentenceTransformer per identifier, in config order,
# then index the instances by their identifier: {model name: model}.
MODELS_LIST = list(map(SentenceTransformer, MODELS_CONFIG))
MODEL_DICT = {
    model_id: loaded_model
    for model_id, loaded_model in zip(MODELS_CONFIG, MODELS_LIST)
}

In [118]:
#create query dict

QUERY_FILE_PATH = config["query_file_path"]

# The query file is JSON; it is iterated below as a sequence of per-query mappings.
with open(QUERY_FILE_PATH, "r", encoding="utf-8") as f:
    config_query = json.load(f)

query_dicts = config_query

# For each column configuration, render every query as one string of
# "<field>: <value>" parts joined by ". ", keeping only the configuration's
# fields that the query actually provides.
document_queries = {
    configuration: [
        ". ".join(f"{field}: {q[field]}" for field in fields if field in q)
        for q in query_dicts
    ]
    for configuration, fields in COLUMNS_TO_EMBEDDE.items()
}


In [119]:
# Display the loaded configuration (paths, models, column groups) for reference.
config

{'input_recipies_file': 'data/recipes_samples.csv',
 'output_recipies_embedding_file': 'data/embedding/exp{experiment_id}/recipies_samples_embeddings.csv',
 'output_clustering_dir': 'metrics/clustering/exp{experiment_id}/',
 'metric_clustering_file': 'metrics/clustering/exp{experiment_id}/{model_config}/consencus_clustering.json',
 'output_retrival_dir': 'metrics/retrival/exp{experiment_id}/',
 'output_query_metrics_file': 'metrics/retrival/exp{experiment_id}/retrival_metrics.json',
 'output_per_query_file': 'metrics/retrival/exp{experiment_id}/retrival_per_query.json',
 'query_file_path': 'config/query_test.json',
 'models': ['Snowflake/snowflake-arctic-embed-m-v1.5',
  'Snowflake/snowflake-arctic-embed-m',
  'intfloat/e5-base-v2',
  'sentence-transformers/all-MiniLM-L6-v2',
  'BAAI/bge-base-en-v1.5'],
 'columns_embedding': {'config_1': ['NAME',
   'TAGS',
   'INGREDIENTS',
   'STEPS',
   'DESCRIPTION'],
  'config_2': ['TAGS', 'INGREDIENTS', 'DESCRIPTION'],
  'config_3': ['TAGS', 'STE

In [122]:
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Retrieval evaluation: for every embedding column, encode each test query with
# the matching model, retrieve the TOP_K most similar recipes by cosine
# similarity and record two metrics per query:
#   - avg_topk_similarity:   mean similarity between the query and its results
#   - inter_topk_similarity: mean pairwise similarity among the results
# Per-column averages and the per-query details are written to two JSON files.

# Number of recipes retrieved per query.
TOP_K = 10

OUTPUT_RETRIVAL_METRICS_PATH = config["output_retrival_dir"].format(
    experiment_id=EXPERIENCE_ID
)

OUTPUT_RETRIVAL_METRICS_FILE = config["output_query_metrics_file"].format(
    experiment_id=EXPERIENCE_ID
)
OUTPUT_PER_QUERY_FILE = config["output_per_query_file"].format(
    experiment_id=EXPERIENCE_ID
)

os.makedirs(OUTPUT_RETRIVAL_METRICS_PATH, exist_ok=True)

summary_results = {}
query_results = {}

# Select embedding columns
emb_cols = [col for col in df_recipes_cleaned.columns if col.endswith("_EMB")]

for col in tqdm(emb_cols, desc="Processing embeddings"):
    # Column names look like "<model id>/<config name>_EMB"; the model id may
    # itself contain "/", so split on the last separator only.
    model_name, _, config_part = col.rpartition("/")
    config_name = config_part.replace("_EMB", "")

    if model_name not in MODEL_DICT:
        print(f"‚ö†Ô∏è Model not found in MODEL_DICT: {model_name}")
        continue
    model = MODEL_DICT[model_name]

    # Stack the per-row vectors into one (n_recipes, dim) matrix; done after
    # the model check so skipped columns cost nothing.
    recipe_embeddings = np.vstack(df_recipes_cleaned[col].apply(np.array).to_list())

    # Get queries for this config
    config_queries = document_queries[config_name]

    per_query_metrics = []

    for q_idx, query_text in enumerate(config_queries):
        # Encode query using the correct model
        query_emb = model.encode(query_text, normalize_embeddings=True).reshape(1, -1)

        # Compute cosine similarity between the query and every recipe
        sims = cosine_similarity(query_emb, recipe_embeddings)[0]

        # Top-K indices, most similar first
        top_idx = sims.argsort()[::-1][:TOP_K]
        top_sims = sims[top_idx]

        # Get recipe names
        top_recipes = df_recipes_cleaned.iloc[top_idx][
            ["NAME_CLEAND", "TAGS_CLEAND", "INGREDIENTS_CLEAND"]
        ].to_dict(orient="records")

        # Add similarity to each retrieved recipe
        for rec, sim in zip(top_recipes, top_sims):
            rec["similarity"] = float(sim)

        # Average top-K similarity
        avg_topk_sim = float(np.mean(top_sims))

        # Inter-top-K similarity: mean of the off-diagonal entries. Use the
        # number of recipes actually retrieved, which is smaller than TOP_K
        # when the dataset has fewer than TOP_K rows.
        n_retrieved = len(top_idx)
        if n_retrieved > 1:
            inter_topk_sim_matrix = cosine_similarity(recipe_embeddings[top_idx])
            off_diagonal_sum = inter_topk_sim_matrix.sum() - np.trace(inter_topk_sim_matrix)
            inter_topk_sim = float(off_diagonal_sum / (n_retrieved * (n_retrieved - 1)))
        else:
            # Pairwise similarity is undefined for a single result.
            inter_topk_sim = float("nan")

        per_query_metrics.append({
            "query_text": query_text,
            "top_k_retrieval": top_recipes,
            "avg_topk_similarity": avg_topk_sim,
            "inter_topk_similarity": inter_topk_sim
        })

    # Aggregate metrics over all queries
    avg_topk_all = float(np.mean([m["avg_topk_similarity"] for m in per_query_metrics]))
    avg_inter_topk_all = float(np.mean([m["inter_topk_similarity"] for m in per_query_metrics]))

    summary_results[col] = {
        "mean_avg_topk_similarity": avg_topk_all,
        "mean_inter_topk_similarity": avg_inter_topk_all
    }

    query_results[col] = per_query_metrics

# Save summary JSON (metrics per config)
with open(OUTPUT_RETRIVAL_METRICS_FILE, "w", encoding="utf-8") as f:
    json.dump(summary_results, f, indent=2)

# Save detailed per-query JSON with names and similarities
with open(OUTPUT_PER_QUERY_FILE, "w", encoding="utf-8") as f:
    json.dump(query_results, f, indent=2)

Processing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:18<00:00,  1.58it/s]
