In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
import umap.umap_ as umap
import hdbscan
import plotly.express as px
import time, json, os

**load the config file**

In [22]:
import dotenv

# Load variables from the .env file into the process environment.
# override=True makes values from the .env file replace variables that are
# already set in the environment.
dotenv.load_dotenv(override=True)

CONFIG_FILE_PATH = os.getenv("CONFIG_FILE_PATH")
if not CONFIG_FILE_PATH:
    # Fail here with an actionable message rather than letting open(None)
    # raise an opaque TypeError below.
    raise RuntimeError(
        "CONFIG_FILE_PATH is not set; define it in the environment or in the .env file."
    )

# The config is a JSON document; later cells read path templates from it.
with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as f:
    config = json.load(f)

**load the data**

In [23]:
EXPERIENCE_ID = os.getenv("EXPERIENCE_ID")
if not EXPERIENCE_ID:
    # Without this check a missing variable would be formatted into the path
    # as the literal text "None" and the wrong file would be looked up.
    raise RuntimeError(
        "EXPERIENCE_ID is not set; define it in the environment or in the .env file."
    )

# The config stores a path template with an {experiment_id} placeholder.
INPUT_EMBEDDINGS_FILE = config["output_recipies_embedding_file"].format(
    experiment_id=EXPERIENCE_ID
)

# One row per recipe; the embedding columns are still text at this point and
# are parsed into arrays in the next cell.
df_recipes_cleaned = pd.read_csv(INPUT_EMBEDDINGS_FILE)

In [24]:
import ast  # NOTE(review): not used by any visible cell - confirm before removing

# Embedding columns are identified by their "_EMB" suffix.
emb_columns = [col for col in df_recipes_cleaned.columns if col.endswith('_EMB')]

# Each cell holds the text form of a vector ("[0.1 0.2 ...]"); strip the
# brackets and parse the whitespace-separated numbers into a float array.
# Cells that are no longer strings are left untouched so that re-running this
# cell on the already-parsed frame is a no-op instead of an AttributeError.
for col in emb_columns:
    df_recipes_cleaned[col] = df_recipes_cleaned[col].apply(
        lambda x: np.fromstring(x.strip('[]'), sep=' ') if isinstance(x, str) else x
    )

**define clustering algorithms**

In [25]:
# Output directory for this experiment: the template stored in the config is
# filled in with the experiment id.
OUT_DIR = config["output_clustering_dir"].format(experiment_id=EXPERIENCE_ID)

# --------------------------
# UMAP GRID
# --------------------------
# One dict per UMAP configuration to evaluate. Only the first entry is active;
# the commented-out entries are alternative settings that are currently disabled.
UMAP_GRID = [
    dict(n_neighbors=10, min_dist=0.1, n_components=20, metric="cosine"),
    # dict(n_neighbors=15, min_dist=0.1, n_components=20, metric="cosine"),
    # dict(n_neighbors=30, min_dist=0.1, n_components=30, metric="cosine"),
    # dict(n_neighbors=15, min_dist=0.3, n_components=30, metric="cosine"),
    # dict(n_neighbors=50, min_dist=0.5, n_components=50, metric="cosine"),
]

# --------------------------
# Clustering GRID
# --------------------------
# Every entry is {"name": <algorithm>, "params": {...}}. Entries are generated
# per algorithm family; the order of entries and of the keys inside "params"
# is kept stable because both end up in the results JSON.
CLUSTERING_GRID = [
    # KMeans
    *(
        {"name": "kmeans", "params": {"n_clusters": k}}
        for k in (6, 8, 10, 12, 15)
    ),

    # Agglomerative
    *(
        {"name": "agglomerative", "params": {"n_clusters": k, "linkage": linkage, "metric": "cosine"}}
        for k, linkage in ((6, "average"), (8, "average"), (10, "complete"), (12, "complete"))
    ),

    # HDBSCAN - density-based
    *(
        {"name": "hdbscan", "params": {"min_cluster_size": size, "min_samples": samples}}
        for size, samples in ((5, 1), (10, 1), (15, 5), (20, 5))
    ),

    # DBSCAN - optional alternative
    *(
        {"name": "dbscan", "params": {"eps": eps, "min_samples": samples, "metric": "cosine"}}
        for eps, samples in ((0.2, 3), (0.3, 5))
    ),

    # Spectral clustering
    *(
        {"name": "spectral", "params": {"n_clusters": k}}
        for k in (8, 10, 12)
    ),
]


**calculate metrics**

In [26]:
def compute_metrics(emb_for_metric, labels):
    """
    Compute internal clustering-quality metrics for one labelling.

    Points labelled ``-1`` are treated as noise. They are counted in
    ``n_points`` but left out of every quality metric, so that the sklearn
    scores do not treat "noise" as one more cluster while ``n_clusters`` and
    the cosine means ignore it.

    Parameters
    ----------
    emb_for_metric : array-like
        Vectors the metrics are computed on, one row per point.
    labels : array-like
        Cluster label of each point, aligned with the rows of
        ``emb_for_metric``; ``-1`` marks noise.

    Returns
    -------
    dict
        ``n_clusters`` (distinct non-noise labels), ``n_points`` (all points),
        ``silhouette`` (cosine metric), ``davies_bouldin``,
        ``calinski_harabasz``, ``intra_cluster_cosine_mean`` and
        ``inter_cluster_cosine_mean``. A metric is NaN when it is undefined
        (fewer than two clusters for the sklearn scores, nothing to average
        for the cosine means) or when its computation raises.
    """
    nan = float("nan")
    labels = np.array(labels)

    cluster_ids = [cl for cl in np.unique(labels) if cl != -1]
    n_clusters = len(cluster_ids)

    res = {"n_clusters": n_clusters, "n_points": len(labels)}

    def _safe_score(scorer):
        # Best-effort evaluation of one scorer on the non-noise points only.
        if n_clusters < 2:
            return nan
        try:
            assigned = labels != -1
            points = np.asarray(emb_for_metric)[assigned]
            # float() keeps the value JSON-serialisable whatever numpy scalar
            # type the scorer returns.
            return float(scorer(points, labels[assigned]))
        except Exception:
            return nan

    res["silhouette"] = _safe_score(
        lambda X, y: silhouette_score(X, y, metric="cosine")
    )
    res["davies_bouldin"] = _safe_score(
        lambda X, y: davies_bouldin_score(X, y)
    )
    res["calinski_harabasz"] = _safe_score(
        lambda X, y: calinski_harabasz_score(X, y)
    )

    try:
        assigned = labels != -1
        kept_labels = labels[assigned]
        cos_sim = cosine_similarity(np.asarray(emb_for_metric)[assigned])
        members = [np.flatnonzero(kept_labels == cl) for cl in cluster_ids]

        # ---- Intra-cluster ----
        # Mean over clusters of the mean pairwise similarity between distinct
        # members; the strict upper triangle drops the self-similarities.
        intra_sims = []
        for idx in members:
            if len(idx) > 1:
                block = cos_sim[np.ix_(idx, idx)]
                intra_sims.append(np.mean(block[np.triu_indices_from(block, k=1)]))

        res["intra_cluster_cosine_mean"] = (
            float(np.mean(intra_sims)) if len(intra_sims) > 0 else nan
        )

        # ---- Inter-cluster ----
        # Mean over unordered cluster pairs of the mean cross-cluster similarity.
        inter_sims = [
            np.mean(cos_sim[np.ix_(members[a], members[b])].reshape(-1))
            for a in range(len(members))
            for b in range(a + 1, len(members))
        ]

        res["inter_cluster_cosine_mean"] = (
            float(np.mean(inter_sims)) if len(inter_sims) > 0 else nan
        )

    except Exception:
        res["intra_cluster_cosine_mean"] = nan
        res["inter_cluster_cosine_mean"] = nan

    return res


In [27]:
def calculate_clustering_weight(sil, db, ch, intra, inter):
    """
    Combine five clustering metrics into a single quality weight.

    Each metric is first mapped to a "higher is better" score and the scores
    are then blended with fixed coefficients (0.3 silhouette, 0.1
    Davies-Bouldin, 0.1 Calinski-Harabasz, 0.3 intra-cluster cosine, 0.2
    inter-cluster cosine).

    Parameters
    ----------
    sil : float
        Silhouette score.
    db : float
        Davies-Bouldin index (lower is better).
    ch : float
        Calinski-Harabasz index (higher is better).
    intra : float
        Intra-cluster cosine similarity (higher is better).
    inter : float
        Inter-cluster cosine similarity (lower is better).

    Returns
    -------
    float
        Weight clamped to [0, 1]; 0.0 as soon as any input is NaN.
    """
    # A clustering with a missing metric gets the lowest possible weight.
    for value in (sil, db, ch, intra, inter):
        if np.isnan(value):
            return 0.0

    # Silhouette and intra-cluster cosine: shift/scale [-1, 1] onto [0, 1].
    sil_score = (sil + 1) / 2
    intra_score = (intra + 1) / 2

    # Davies-Bouldin: lower is better, so use the reciprocal of (1 + db).
    db_score = 1.0 / (1.0 + db)

    # Calinski-Harabasz: log-compress, scale by 10 and cap at 1.
    ch_score = min(np.log1p(max(ch, 0)) / 10, 1.0)

    # Inter-cluster cosine: lower is better, so invert the [0, 1] mapping and
    # clamp the result.
    inter_score = max(min(1.0 - ((inter + 1) / 2), 1.0), 0.0)

    weighted_terms = (
        (0.3, sil_score),
        (0.1, db_score),
        (0.1, ch_score),
        (0.3, intra_score),
        (0.2, inter_score),
    )

    total = 0.0
    for coefficient, score in weighted_terms:
        total += coefficient * score

    # Clamp the blended score to [0, 1].
    if total < 0.0:
        total = 0.0
    elif total > 1.0:
        total = 1.0
    return float(total)


**execute the clustering**

In [28]:
import warnings


def _fit_cluster_labels(method, params, emb_reduced):
    """Fit the clustering algorithm named ``method`` and return its labels.

    An unrecognised ``method`` yields an all ``-1`` (noise) labelling, so the
    run is still recorded in the results but can never be selected.
    """
    if method == "kmeans":
        model = KMeans(n_clusters=params["n_clusters"], random_state=42, n_init=10)

    elif method == "agglomerative":
        model = AgglomerativeClustering(
            n_clusters=params["n_clusters"],
            metric=params.get("metric", "cosine"),
            linkage=params.get("linkage", "average")
        )

    elif method == "hdbscan":
        # The metric is fixed to euclidean here; a "metric" entry in params
        # is not consulted for this algorithm.
        model = hdbscan.HDBSCAN(
            min_cluster_size=params.get("min_cluster_size", 10),
            min_samples=params.get("min_samples", 1),
            metric='euclidean'
        )

    elif method == "dbscan":
        model = DBSCAN(
            eps=params.get("eps", 0.2),
            min_samples=params.get("min_samples", 3),
            metric=params.get("metric", "cosine")
        )

    elif method == "spectral":
        model = SpectralClustering(
            n_clusters=params["n_clusters"],
            affinity="nearest_neighbors",
            random_state=42,
            assign_labels="kmeans"
        )

    else:
        return np.array([-1] * len(emb_reduced))

    return model.fit_predict(emb_reduced)


def run_clustering_grid(
    emebdding_config,
    df_vectors,
    umap_grid,
    clustering_grid,
    output_path=OUT_DIR
):
    """
    Runs UMAP + clustering grid search, computes metrics and weights,
    and writes the output to JSON.

    For every UMAP configuration the vectors are reduced once, then every
    clustering configuration is fitted on the reduced vectors and scored with
    ``compute_metrics`` / ``calculate_clustering_weight``. Every run is written
    to ``{output_path}/{emebdding_config}/clustering.json``; only runs with
    more than one non-negative label and a positive weight are returned.

    Warnings raised while fitting are suppressed, but only for the duration
    of this call: the caller's warning filters are restored on exit.

    Parameters
    ----------
    emebdding_config : str
        Config of embedding model and columns used for embedding; used as the
        sub-directory name under ``output_path`` (the misspelt parameter name
        is kept for caller compatibility).
    df_vectors : np.ndarray or DataFrame
        Your high-dimensional recipe vectors.
    umap_grid : list of dict
        Grid of UMAP configurations (``n_neighbors``, ``min_dist``,
        ``n_components`` and optionally ``metric``).
    clustering_grid : list of dict
        Grid of clustering configurations, each ``{"name": ..., "params": ...}``.
    output_path : str
        Directory under which the results JSON will be saved.

    Returns
    -------
    tuple (all_weights, all_labels)
        Two parallel lists: the quality weight and the label array of every
        retained clustering run.
    """

    recipe_vectors = np.array(df_vectors)
    results_json = []
    all_labels = []
    all_weights = []

    # Scope the suppression to this call instead of silencing warnings for
    # the rest of the kernel session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        for umap_cfg in tqdm(umap_grid, total=len(umap_grid), desc="executing dimension reduction + clustering"):
            reducer = umap.UMAP(
                n_neighbors=umap_cfg["n_neighbors"],
                min_dist=umap_cfg["min_dist"],
                n_components=umap_cfg["n_components"],
                metric=umap_cfg.get("metric", "cosine"),
                random_state=42
            )
            emb_reduced = reducer.fit_transform(recipe_vectors)

            for cl_cfg in clustering_grid:
                method = cl_cfg["name"]
                params = cl_cfg["params"].copy()

                # ===== CLUSTERING =====
                labels = _fit_cluster_labels(method, params, emb_reduced)

                # ===== METRICS =====
                metrics = compute_metrics(emb_reduced, labels)
                sil = metrics["silhouette"]
                db = metrics["davies_bouldin"]
                ch = metrics["calinski_harabasz"]
                intra = metrics["intra_cluster_cosine_mean"]
                inter = metrics["inter_cluster_cosine_mean"]

                # ===== WEIGHT =====
                weight = calculate_clustering_weight(sil, db, ch, intra, inter)

                # Keep only runs that found at least two real clusters and
                # whose metrics could all be computed (weight > 0).
                if len(np.unique(labels[labels >= 0])) > 1 and weight > 0:
                    all_labels.append(labels)
                    all_weights.append(weight)

                # Metrics are cast to plain floats so json.dump accepts them
                # whatever numpy scalar type they arrive as. Undefined metrics
                # are written as NaN.
                results_json.append({
                    "model_name": "recipes_clustering_grid",
                    "umap": umap_cfg,
                    "clustering": {"method": method, "params": params},
                    "results": {
                        "n_clusters_found": metrics["n_clusters"],
                        "silhouette": float(sil),
                        "davies_bouldin": float(db),
                        "calinski_harabasz": float(ch),
                        "intra_cluster_cosine_mean": float(intra),
                        "inter_cluster_cosine_mean": float(inter),
                        "quality_weight": float(weight)
                    }
                })

    os.makedirs(f"{output_path}/{emebdding_config}", exist_ok=True)
    with open(f"{output_path}/{emebdding_config}/clustering.json", "w", encoding="utf-8") as f:
        json.dump(results_json, f, indent=2)

    return all_weights, all_labels


**execute consensus**

In [29]:
def claculate_consencus_stability(
    best_consensus_labels,
    similarity_matrix
):
    """
    Mean within-cluster similarity of a consensus labelling.

    For every point, the similarity to the *other* members of its own cluster
    is averaged (the point's own diagonal entry is excluded); points that are
    alone in their cluster score 0. The result is the mean of these
    per-point scores.

    Parameters
    ----------
    best_consensus_labels : array-like
        Cluster label of each point. Lists are accepted as well as arrays.
    similarity_matrix : array-like, square
        Pairwise similarity between points, aligned with the labels.

    Returns
    -------
    float
        Mean per-point stability, or NaN when there are no points.

    Raises
    ------
    ValueError
        If the number of labels does not match the size of the matrix.
    """
    # Coerce to arrays: with a plain list, `labels == cluster` would be a
    # single False and every point would silently score 0.
    labels = np.asarray(best_consensus_labels)
    similarity = np.asarray(similarity_matrix, dtype=float)

    n_samples = len(similarity)
    if len(labels) != n_samples:
        raise ValueError(
            f"got {len(labels)} labels for a {n_samples}-row similarity matrix"
        )
    if n_samples == 0:
        return float("nan")

    # Singletons keep the initial score of 0.
    stability_scores = np.zeros(n_samples, dtype=float)

    for cluster in np.unique(labels):
        members = np.flatnonzero(labels == cluster)
        if len(members) < 2:
            continue
        block = similarity[np.ix_(members, members)]
        # Row sum over the cluster minus the self term, averaged over the
        # other members of the cluster.
        stability_scores[members] = (
            block.sum(axis=1) - np.diag(block)
        ) / (len(members) - 1)

    return float(stability_scores.mean())

In [30]:
def concensure_clustering(
    emebdding_config,
    df_vectors,
    all_weights, 
    all_labels,
    output_path=OUT_DIR,
    max_cluster=30
):
    """
    Build a weighted consensus clustering from several base clusterings.

    A co-association matrix is accumulated: two points gain ``weight`` every
    time a base clustering puts them in the same (non-noise) cluster. It is
    normalised by the total weight, turned into a distance (``1 - similarity``)
    and clustered with average-linkage agglomerative clustering for each
    candidate ``k``; the ``k`` with the best silhouette on that distance
    matrix wins. Metrics for the winning labelling are written to
    ``{output_path}/{emebdding_config}/consencus_clustering.json``.

    Parameters
    ----------
    emebdding_config : str
        The configuration of model and columns used for embedding; used as the
        sub-directory name under ``output_path`` (the misspelt parameter name
        is kept for caller compatibility).
    df_vectors : np.ndarray or DataFrame
        Your high-dimensional recipe vectors; used for the final metrics.
    all_weights : list of float
        Quality weight of each base clustering.
    all_labels : list of array-like
        Labels of the points for each base clustering; ``-1`` marks noise.
    output_path : str
        Directory under which the results JSON will be saved.
    max_cluster : int, default 30
        Exclusive upper bound of the ``k`` search: candidates are
        ``2 .. min(max_cluster, n_samples) - 1``.

    Returns
    -------
    np.ndarray
        Consensus label of every point for the selected ``k``.

    Raises
    ------
    ValueError
        If there is no base clustering with positive total weight, or if there
        are too few samples for any candidate ``k``.
    """

    n_samples = len(df_vectors)
    all_weights = np.asarray(all_weights, dtype=float)
    n_clusterings = len(all_labels)

    total_weight = all_weights.sum()
    if n_clusterings == 0 or not total_weight > 0:
        raise ValueError(
            "consensus needs at least one base clustering with a positive weight"
        )

    candidate_ks = range(2, min(max_cluster, n_samples))
    if len(candidate_ks) == 0:
        raise ValueError(
            f"no candidate k to evaluate for n_samples={n_samples}, max_cluster={max_cluster}"
        )

    # Weighted co-association matrix, accumulated with array operations
    # instead of a Python loop over every pair of points.
    co_occur = np.zeros((n_samples, n_samples), dtype=float)

    for labels, weight in tqdm(zip(all_labels, all_weights),
                               total=n_clusterings, desc="calculating_consensus_matrix"):
        labels = np.asarray(labels)
        assigned = labels != -1
        # Equal labels with point i assigned implies point j is assigned too,
        # so masking the rows is enough to drop every noise pair.
        same_cluster = (labels[:, None] == labels[None, :]) & assigned[:, None]
        co_occur += weight * same_cluster

    # Self pairs carry no information; keep the diagonal at zero.
    np.fill_diagonal(co_occur, 0.0)

    similarity_matrix = co_occur / total_weight
    # Clip guards against tiny negative distances from floating-point
    # round-off; the zero diagonal is required for a precomputed distance
    # matrix passed to silhouette_score.
    distance_matrix = np.clip(1.0 - similarity_matrix, 0.0, 1.0)
    np.fill_diagonal(distance_matrix, 0.0)

    # Select best k using silhouette score
    best_sil = -np.inf
    best_k = None
    best_consensus_labels = None

    for k in tqdm(candidate_ks, total=len(candidate_ks), desc="calculating_optimal_K"):
        # metric="precomputed": the input is a pairwise distance matrix, not
        # a table of feature vectors.
        clusterer = AgglomerativeClustering(
            n_clusters=k, metric="precomputed", linkage="average"
        )

        consensus_labels = clusterer.fit_predict(distance_matrix)
        sil = silhouette_score(distance_matrix, consensus_labels, metric="precomputed")

        if sil > best_sil:
            best_sil = sil
            best_k = k
            best_consensus_labels = consensus_labels

    if best_consensus_labels is None:
        raise ValueError("no candidate k produced a valid silhouette score")

    consensus_metrics = compute_metrics(df_vectors, best_consensus_labels)
    stability = float(claculate_consencus_stability(best_consensus_labels, similarity_matrix))
    # Quality weight of the consensus labelling plus a small stability bonus.
    score_clustering = float(
        calculate_clustering_weight(
            consensus_metrics["silhouette"],
            consensus_metrics["davies_bouldin"],
            consensus_metrics["calinski_harabasz"],
            consensus_metrics["intra_cluster_cosine_mean"],
            consensus_metrics["inter_cluster_cosine_mean"],
        ) + stability * 0.1
    )

    consensus_result = {
        "n_clusters": best_k,
        "silhouette": consensus_metrics["silhouette"],
        "davies_bouldin": consensus_metrics["davies_bouldin"],
        "calinski_harabasz": consensus_metrics["calinski_harabasz"],
        "intra_cluster_cosine_mean": consensus_metrics["intra_cluster_cosine_mean"],
        "inter_cluster_cosine_mean": consensus_metrics["inter_cluster_cosine_mean"],
        "stability": stability,
        "score_clustering": score_clustering
    }

    os.makedirs(f"{output_path}/{emebdding_config}", exist_ok=True)

    with open(f"{output_path}/{emebdding_config}/consencus_clustering.json", "w", encoding="utf-8") as f:
        json.dump(consensus_result, f, indent=2)

    return best_consensus_labels

**run the clustering grid and the consensus clustering for each embedding column**

In [32]:
# Use the same "_EMB" suffix as the cell that parses the embedding columns, so
# only columns that were converted to arrays are processed here.
emb_cols = [col for col in df_recipes_cleaned.columns if col.endswith("_EMB")]

# Consensus labels per embedding column.
labels_config = {}

# NOTE: the loop variables `emnbedding_vector` and `best_consensus_labels` are
# read again by the visualisation cells, so their (misspelt) names are kept.
for col in emb_cols:
    print(f"____{col}____")
    # Stack the per-row arrays into one (n_rows, dim) matrix.
    emnbedding_vector = np.vstack(df_recipes_cleaned[col].values)
    all_weights, all_labels = run_clustering_grid(col, emnbedding_vector, UMAP_GRID, CLUSTERING_GRID, OUT_DIR)
    best_consensus_labels = concensure_clustering(col, emnbedding_vector, all_weights, all_labels, OUT_DIR) 
    labels_config[col] = best_consensus_labels


____Snowflake/snowflake-arctic-embed-m-v1.5/config_1_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.64it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.03it/s]


____Snowflake/snowflake-arctic-embed-m/config_1_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.48it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.20it/s]


____intfloat/e5-base-v2/config_1_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.78s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.74it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.15it/s]


____sentence-transformers/all-MiniLM-L6-v2/config_1_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.67s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.85it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.21it/s]


____BAAI/bge-base-en-v1.5/config_1_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.87s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.25it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.19it/s]


____Snowflake/snowflake-arctic-embed-m-v1.5/config_2_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.16it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.08it/s]


____Snowflake/snowflake-arctic-embed-m/config_2_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.96s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.81it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.08it/s]


____intfloat/e5-base-v2/config_2_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.83s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.10it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.30it/s]


____sentence-transformers/all-MiniLM-L6-v2/config_2_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.73s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.90it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.25it/s]


____BAAI/bge-base-en-v1.5/config_2_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.31it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.23it/s]


____Snowflake/snowflake-arctic-embed-m-v1.5/config_3_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.92s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.38it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.14it/s]


____Snowflake/snowflake-arctic-embed-m/config_3_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:03<00:00,  3.02s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.52it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.17it/s]


____intfloat/e5-base-v2/config_3_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.85s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.80it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.23it/s]


____sentence-transformers/all-MiniLM-L6-v2/config_3_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.93it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.31it/s]


____BAAI/bge-base-en-v1.5/config_3_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.91s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.03it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.26it/s]


____Snowflake/snowflake-arctic-embed-m-v1.5/config_4_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.84s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.17it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.24it/s]


____Snowflake/snowflake-arctic-embed-m/config_4_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.83it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.23it/s]


____intfloat/e5-base-v2/config_4_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.81s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.47it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.18it/s]


____sentence-transformers/all-MiniLM-L6-v2/config_4_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.69s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.85it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.21it/s]


____BAAI/bge-base-en-v1.5/config_4_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.81s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.37it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.14it/s]


____Snowflake/snowflake-arctic-embed-m-v1.5/config_5_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.90it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.07it/s]


____Snowflake/snowflake-arctic-embed-m/config_5_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.93s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.03it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.15it/s]


____intfloat/e5-base-v2/config_5_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.98s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.23it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  5.96it/s]


____sentence-transformers/all-MiniLM-L6-v2/config_5_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.77s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.65it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.08it/s]


____BAAI/bge-base-en-v1.5/config_5_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.89s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  9.75it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.09it/s]


____Snowflake/snowflake-arctic-embed-m-v1.5/config_6_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:03<00:00,  3.03s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.20it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.13it/s]


____Snowflake/snowflake-arctic-embed-m/config_6_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.32it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.03it/s]


____intfloat/e5-base-v2/config_6_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:03<00:00,  3.05s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.73it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.15it/s]


____sentence-transformers/all-MiniLM-L6-v2/config_6_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.87s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00,  8.31it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.20it/s]


____BAAI/bge-base-en-v1.5/config_6_EMB____


executing dimension reduction + clustering: 100%|██████████| 1/1 [00:02<00:00,  2.95s/it]
calculating_consensus_matrix: 100%|██████████| 16/16 [00:01<00:00, 10.24it/s]
calculating_optimal_K:  93%|█████████▎| 28/30 [00:04<00:00,  6.21it/s]


**visualise the clusters**

In [33]:
# Visualise the last embedding column that was clustered. The column is
# selected explicitly instead of relying on a variable left over from the
# clustering loop.
VIZ_EMB_COL = emb_cols[-1]
viz_vectors = np.vstack(df_recipes_cleaned[VIZ_EMB_COL].values)

# 2-D projection (UMAP's default number of components) used for plotting only.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
embedding_2d = reducer.fit_transform(viz_vectors)

df_recipes_cleaned['UMAP_1'] = embedding_2d[:, 0]
df_recipes_cleaned['UMAP_2'] = embedding_2d[:, 1]

In [35]:
import plotly.express as px
import plotly.io as pio

# Figures are opened in the system web browser rather than inline.
pio.renderers.default = "browser"

# Consensus labels of the last embedding column that was clustered, looked up
# explicitly instead of relying on a variable left over from the loop.
viz_labels = labels_config[emb_cols[-1]]

fig = px.scatter(
    df_recipes_cleaned,
    x='UMAP_1',
    y='UMAP_2',
    # Cast to str so the labels are coloured as discrete categories rather
    # than on a continuous scale.
    color=viz_labels.astype(str),
    hover_data=['NAME_CLEAND'],
    title='UMAP projection of recipe embeddings'
)

fig.show()