In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the ingredient table. Later cells read its 'Descrip', 'NDB_No',
# PCA_macro_*/PCA_micro_* and Cluster_macro/Cluster_micro columns.
df = pd.read_csv('ingredients_with_clusters.csv')

# Tests de base

In [3]:
# Preview the first row to check which columns were loaded.
df.head(1)

Unnamed: 0,NDB_No,Descrip,Energy_kcal,Protein_g,Saturated_fats_g,Fat_g,Carb_g,Fiber_g,Sugar_g,Calcium_mg,...,VitD2_mcg,PCA_micro_1,PCA_micro_2,Cluster_micro,Dist_micro_centroid,PCA_macro_1,PCA_macro_2,PCA_macro_3,Cluster_macro,Dist_macro_centroid
0,1001,butter with salt,717.0,0.85,51.368,81.11,0.06,0.0,0.06,24.0,...,0.0,-1.392859,0.822349,0,1.592123,7.123991,-4.674879,-2.088006,1,4.474671


In [4]:
def get_neighbors(df, ingredient_name, micro_weight=0.3, macro_weight=0.7, k=5):
    """
    Find the k closest substitutes for an ingredient in PCA space.

    Three rankings are produced, all based on Euclidean distance to the
    queried ingredient and all excluding rows whose 'Descrip' equals the
    queried ingredient's 'Descrip':

    - macro: distance over the PCA_macro_* columns, restricted to rows in
      the same 'Cluster_macro';
    - micro: distance over the PCA_micro_* columns, restricted to rows in
      the same 'Cluster_micro';
    - global: ``macro_weight * dist_macro + micro_weight * dist_micro`` over
      all remaining rows (no cluster restriction).

    Args:
        df (pd.DataFrame): table with 'Descrip', 'PCA_macro_1..3',
            'PCA_micro_1..2', 'Cluster_macro' and 'Cluster_micro' columns.
        ingredient_name (str): ingredient to look up (case-insensitive match
            on 'Descrip'; the first matching row is used).
        micro_weight (float): weight of the micro distance in the global score.
        macro_weight (float): weight of the macro distance in the global score.
        k (int): maximum number of neighbors returned per ranking.

    Returns:
        dict: keys 'input_ingredient' (the matched 'Descrip'),
        'macro_neighbors', 'micro_neighbors' and 'best_overall'; each
        neighbor list holds records ``{'Descrip': ..., <score column>: ...}``
        sorted by increasing distance.

    Raises:
        IndexError: if no row matches ``ingredient_name``. The exception type
            is the one ``.iloc[0]`` used to raise; only the message is new.
    """
    macro_cols = ['PCA_macro_1', 'PCA_macro_2', 'PCA_macro_3']
    micro_cols = ['PCA_micro_1', 'PCA_micro_2']

    matches = df[df['Descrip'].str.lower() == ingredient_name.lower()]
    if matches.empty:
        raise IndexError(
            f"Ingredient {ingredient_name!r} not found in column 'Descrip'"
        )
    row = matches.iloc[0]

    # A row of a mixed-dtype frame is an object Series: cast to float so the
    # arithmetic below runs on a numeric array.
    macro_vec = row[macro_cols].to_numpy(dtype=float)
    micro_vec = row[micro_cols].to_numpy(dtype=float)

    # Every ranking excludes the queried ingredient, so compute each distance
    # once (vectorized) on the remaining rows instead of row-by-row per ranking.
    candidates = df[df['Descrip'] != row['Descrip']]
    dist_macro = np.linalg.norm(
        candidates[macro_cols].to_numpy(dtype=float) - macro_vec, axis=1)
    dist_micro = np.linalg.norm(
        candidates[micro_cols].to_numpy(dtype=float) - micro_vec, axis=1)
    candidates = candidates.assign(
        dist_macro=dist_macro,
        dist_micro=dist_micro,
        global_score=macro_weight * dist_macro + micro_weight * dist_micro,
    )

    def top_k(frame, score_col):
        # k smallest scores as a list of {'Descrip', score_col} records.
        return (
            frame
            .sort_values(score_col)
            .head(k)[['Descrip', score_col]]
            .to_dict('records')
        )

    same_macro = candidates[candidates['Cluster_macro'] == row['Cluster_macro']]
    same_micro = candidates[candidates['Cluster_micro'] == row['Cluster_micro']]

    return {
        "input_ingredient": row['Descrip'],
        "macro_neighbors": top_k(same_macro, 'dist_macro'),
        "micro_neighbors": top_k(same_micro, 'dist_micro'),
        "best_overall": top_k(candidates, 'global_score')
    }


In [5]:
# Smoke test: show the three neighbor rankings for one known ingredient.
res = get_neighbors(df, "butter with salt", k=5)

print(
    "\n".join(
        f"{label} {res[key]}"
        for label, key in (
            ("Voisin macro :", "macro_neighbors"),
            ("Voisin micro :", "micro_neighbors"),
            ("Meilleur substitut :", "best_overall"),
        )
    )
)

Voisin macro : [{'Descrip': 'butter without salt', 'dist_macro': 0.08311183283280646}, {'Descrip': 'coconut meat dried (desiccated) crmd', 'dist_macro': 0.46003062080264095}, {'Descrip': 'pork frsh var meatsby-products leaf fat raw', 'dist_macro': 0.5442106197814814}, {'Descrip': 'animal fat bacon grease', 'dist_macro': 0.5461557908880208}, {'Descrip': 'lard', 'dist_macro': 0.5644776879698454}]
Voisin micro : [{'Descrip': 'butter whipped w salt', 'dist_micro': 0.010498650841146831}, {'Descrip': 'butter without salt', 'dist_micro': 0.07267203806817915}, {'Descrip': 'margarine reg hard soybn (hydr)', 'dist_micro': 0.07747109381271822}, {'Descrip': 'margarine-like veg oil sprd 60 fat sticktubbottle w salt', 'dist_micro': 0.0907549153088407}, {'Descrip': 'margarine-like veg oil sprd 20 fat w salt', 'dist_micro': 0.09234344545631483}]
Meilleur substitut : [{'Descrip': 'butter without salt', 'global_score': 0.07997989440341827}, {'Descrip': 'butter whipped w salt', 'global_score': 0.48410392

In [6]:
def distance_between(df, ing1, ing2, micro_weight=0.3, macro_weight=0.7):
    """
    Compute the PCA-space distance between two ingredients.

    Args:
        df (pd.DataFrame): table with 'Descrip', 'PCA_macro_1..3' and
            'PCA_micro_1..2' columns.
        ing1 (str): first ingredient (case-insensitive match on 'Descrip';
            the first matching row is used).
        ing2 (str): second ingredient, matched the same way.
        micro_weight (float): weight of the micro distance in the global one.
        macro_weight (float): weight of the macro distance in the global one.

    Returns:
        dict: the two matched 'Descrip' values plus 'macro_distance' and
        'micro_distance' (Euclidean distances over the respective PCA
        columns) and 'global_distance'
        (``macro_weight * macro_distance + micro_weight * micro_distance``).

    Raises:
        IndexError: if either ingredient has no matching row. The exception
            type is the one ``.iloc[0]`` used to raise; only the message is new.
    """
    macro_cols = ['PCA_macro_1', 'PCA_macro_2', 'PCA_macro_3']
    micro_cols = ['PCA_micro_1', 'PCA_micro_2']

    def find_row(name):
        # Case-insensitive lookup only; whitespace is NOT normalized, so a
        # name stored with trailing/double spaces must be passed verbatim.
        matches = df[df['Descrip'].str.lower() == name.lower()]
        if matches.empty:
            raise IndexError(
                f"Ingredient {name!r} not found in column 'Descrip'"
            )
        return matches.iloc[0]

    row1 = find_row(ing1)
    row2 = find_row(ing2)

    def euclid(cols):
        # Rows of a mixed-dtype frame are object Series: cast to float first.
        return np.linalg.norm(
            row1[cols].to_numpy(dtype=float) - row2[cols].to_numpy(dtype=float)
        )

    dist_macro = euclid(macro_cols)
    dist_micro = euclid(micro_cols)
    dist_global = macro_weight * dist_macro + micro_weight * dist_micro

    return {
        "ingredient_1": row1["Descrip"],
        "ingredient_2": row2["Descrip"],
        "macro_distance": dist_macro,
        "micro_distance": dist_micro,
        "global_distance": dist_global
    }


In [7]:
# Example: macro / micro / weighted distance between two butter entries.
distance_between(df, "butter with salt", "butter whipped w salt")

{'ingredient_1': 'butter with salt',
 'ingredient_2': 'butter whipped w salt',
 'macro_distance': np.float64(0.6870776086006435),
 'micro_distance': np.float64(0.010498650841146831),
 'global_distance': np.float64(0.48410392127279445)}

# Tests contraintes

In [8]:
# PCA coordinates and cluster labels copied from `df` onto the tagged table.
cols_to_add = [
    'PCA_macro_1', 'PCA_macro_2', 'PCA_macro_3',
    'PCA_micro_1', 'PCA_micro_2',
    'Cluster_macro', 'Cluster_micro',
]

# Read the tagged ingredient table, align its key column name with `df`
# ('NDB_0' -> 'NDB_No'), then left-join so every tagged row is kept.
a = (
    pd.read_csv('bd ingredients with tags.csv')
    .rename(columns={'NDB_0': 'NDB_No'})
    .merge(df[['NDB_No', *cols_to_add]], on='NDB_No', how='left')
)

In [9]:
# Preview the first merged row to check that the PCA / cluster columns were added.
a.head(1)

Unnamed: 0,NDB_No,Descrip,Energy_kcal,Protein_g,Saturated_fats_g,Fat_g,Carb_g,Fiber_g,Sugar_g,Calcium_mg,...,is_vegetarian,is_grain,Descrip.1,PCA_macro_1,PCA_macro_2,PCA_macro_3,PCA_micro_1,PCA_micro_2,Cluster_macro,Cluster_micro
0,1001,butter with salt,717.0,0.85,51.368,81.11,0.06,0.0,0.06,24.0,...,1,0,butter with salt,7.123991,-4.674879,-2.088006,-1.392859,0.822349,1,0


In [10]:
# Tag columns used for constraint filtering: 'is_lactose', 'is_seafood', 'is_gluten', 'is_vegetable', 'contains_nuts', 'is_sweetener', 'is_vegetarian', 'is_grain'

In [19]:
from transform import TransformConstraints, TransformationType

def get_neighbors(df_init, ingredient_name, constraints: "TransformConstraints | None" = None, micro_weight=0.3, macro_weight=0.7, k=5):
    """
    Find the k closest substitutes for an ingredient in PCA space, keeping
    only candidates that satisfy the active dietary constraints.

    Three rankings are produced, all based on Euclidean distance to the
    queried ingredient and all excluding rows whose 'Descrip' equals the
    queried ingredient's 'Descrip':

    - macro: distance over the PCA_macro_* columns, restricted to rows in
      the same 'Cluster_macro';
    - micro: distance over the PCA_micro_* columns, restricted to rows in
      the same 'Cluster_micro';
    - global: ``macro_weight * dist_macro + micro_weight * dist_micro`` over
      all remaining rows (no cluster restriction).

    Args:
        df_init (pd.DataFrame): table with 'Descrip', the PCA_* and Cluster_*
            columns, and the tag column of every active constraint.
            It is not modified.
        ingredient_name (str): ingredient to look up (case-insensitive match
            on 'Descrip'; the first matching row is used). The queried
            ingredient itself is never removed by the constraint filter.
        constraints: object whose truthy attributes among 'no_lactose',
            'no_gluten', 'no_nuts', 'vegetarian' and 'vegan' enable the
            matching filter; missing attributes count as disabled.
            ``None`` (the default) disables all filtering.
        micro_weight (float): weight of the micro distance in the global score.
        macro_weight (float): weight of the macro distance in the global score.
        k (int): maximum number of neighbors returned per ranking.

    Returns:
        dict: keys 'input_ingredient' (the matched 'Descrip'),
        'macro_neighbors', 'micro_neighbors' and 'best_overall'; each
        neighbor list holds records ``{'Descrip': ..., <score column>: ...}``
        sorted by increasing distance.

    Raises:
        IndexError: if no row matches ``ingredient_name``. The exception type
            is the one ``.iloc[0]`` used to raise; only the message is new.
        KeyError: if the tag column of an active constraint is absent.
    """
    # constraint attribute -> (tag column, value a candidate must have)
    # NOTE(review): 'vegan' filters on 'is_vegetable'; presumably that column
    # flags vegetables rather than vegan foods in general — confirm that this
    # mapping is intended.
    CONSTRAINT_TO_COLUMN = {
        "no_lactose":     ("is_lactose", 0),
        "no_gluten":      ("is_gluten", 0),
        "no_nuts":        ("contains_nuts", 0),
        "vegetarian":     ("is_vegetarian", 1),
        "vegan":          ("is_vegetable", 1)
    }

    macro_cols = ['PCA_macro_1', 'PCA_macro_2', 'PCA_macro_3']
    micro_cols = ['PCA_micro_1', 'PCA_micro_2']

    is_query = (df_init['Descrip'].str.lower() == ingredient_name.lower()).to_numpy()
    if not is_query.any():
        raise IndexError(
            f"Ingredient {ingredient_name!r} not found in column 'Descrip'"
        )

    # Combine all active constraints into one mask; the queried ingredient is
    # always kept so it can be looked up even when it violates a constraint.
    allowed = np.ones(len(df_init), dtype=bool)
    for constraint_name, (col, allowed_val) in CONSTRAINT_TO_COLUMN.items():
        if getattr(constraints, constraint_name, False):
            allowed &= (df_init[col] == allowed_val).to_numpy()
    df = df_init[allowed | is_query]

    row = df[df['Descrip'].str.lower() == ingredient_name.lower()].iloc[0]

    # A row of a mixed-dtype frame is an object Series: cast to float so the
    # arithmetic below runs on a numeric array.
    macro_vec = row[macro_cols].to_numpy(dtype=float)
    micro_vec = row[micro_cols].to_numpy(dtype=float)

    # Every ranking excludes the queried ingredient, so compute each distance
    # once (vectorized) on the remaining rows instead of row-by-row per ranking.
    candidates = df[df['Descrip'] != row['Descrip']]
    dist_macro = np.linalg.norm(
        candidates[macro_cols].to_numpy(dtype=float) - macro_vec, axis=1)
    dist_micro = np.linalg.norm(
        candidates[micro_cols].to_numpy(dtype=float) - micro_vec, axis=1)
    candidates = candidates.assign(
        dist_macro=dist_macro,
        dist_micro=dist_micro,
        global_score=macro_weight * dist_macro + micro_weight * dist_micro,
    )

    def top_k(frame, score_col):
        # k smallest scores as a list of {'Descrip', score_col} records.
        return (
            frame
            .sort_values(score_col)
            .head(k)[['Descrip', score_col]]
            .to_dict('records')
        )

    same_macro = candidates[candidates['Cluster_macro'] == row['Cluster_macro']]
    same_micro = candidates[candidates['Cluster_micro'] == row['Cluster_micro']]

    return {
        "input_ingredient": row['Descrip'],
        "macro_neighbors": top_k(same_macro, 'dist_macro'),
        "micro_neighbors": top_k(same_micro, 'dist_micro'),
        "best_overall": top_k(candidates, 'global_score')
    }


In [21]:
# Example request: a substitution with the no_lactose and vegan flags set.
constraints = TransformConstraints(
    transformation=TransformationType.SUBSTITUTION,
    no_lactose=True,
    vegan=True
)


# Ask for the 3 closest substitutes of the sausage among the rows of `a`
# that pass the active constraints.
get_neighbors(a, "sausage italian pork raw", constraints, k=3)

{'input_ingredient': 'sausage italian pork raw',
 'macro_neighbors': [{'Descrip': 'basil fresh',
   'dist_macro': 3.4276251896073986},
  {'Descrip': 'button mushroom  fresh ', 'dist_macro': 3.4279882043189995},
  {'Descrip': 'onions raw', 'dist_macro': 3.4968690503772155}],
 'micro_neighbors': [{'Descrip': 'basil fresh',
   'dist_micro': 0.1132930555039753},
  {'Descrip': 'button mushroom  fresh ', 'dist_micro': 0.8944636565730697},
  {'Descrip': 'onions raw', 'dist_micro': 1.312781168244727}],
 'best_overall': [{'Descrip': 'basil fresh',
   'global_score': 2.4333255493763715},
  {'Descrip': 'button mushroom  fresh ', 'global_score': 2.66793083999522},
  {'Descrip': 'onions raw', 'global_score': 2.841642685737469}]}

In [23]:
# Inspect the row(s) of the queried ingredient (tags, PCA coordinates, clusters).
a[a['Descrip'] == "sausage italian pork raw"]

Unnamed: 0,NDB_No,Descrip,Energy_kcal,Protein_g,Saturated_fats_g,Fat_g,Carb_g,Fiber_g,Sugar_g,Calcium_mg,...,is_vegetarian,is_grain,Descrip.1,PCA_macro_1,PCA_macro_2,PCA_macro_3,PCA_micro_1,PCA_micro_2,Cluster_macro,Cluster_micro
11,7036,sausage italian pork raw,346.0,14.25,11.27,31.33,0.65,0.0,0.0,18.0,...,0,0,sausage italian pork raw,1.26118,-1.835476,-0.285404,-0.201408,-0.007535,0,0


In [24]:
# Inspect the row(s) of the top-ranked substitute returned above.
a[a['Descrip'] == "basil fresh"]

Unnamed: 0,NDB_No,Descrip,Energy_kcal,Protein_g,Saturated_fats_g,Fat_g,Carb_g,Fiber_g,Sugar_g,Calcium_mg,...,is_vegetarian,is_grain,Descrip.1,PCA_macro_1,PCA_macro_2,PCA_macro_3,PCA_micro_1,PCA_micro_2,Cluster_macro,Cluster_micro
23,2044,basil fresh,23.0,3.15,0.041,0.64,2.65,1.6,0.3,177.0,...,1,0,basil fresh,-1.57519,0.075331,-0.514397,-0.301731,-0.060171,0,0
27,2044,basil fresh,23.0,3.15,0.041,0.64,2.65,1.6,0.3,177.0,...,1,0,basil fresh,-1.57519,0.075331,-0.514397,-0.301731,-0.060171,0,0


In [25]:
# Cross-check: recompute the distance from `df` to compare with the
# get_neighbors result obtained on `a`.
distance_between(df, "basil fresh", "sausage italian pork raw")

{'ingredient_1': 'basil fresh',
 'ingredient_2': 'sausage italian pork raw',
 'macro_distance': np.float64(3.4276251896073986),
 'micro_distance': np.float64(0.1132930555039753),
 'global_distance': np.float64(2.4333255493763715)}

In [26]:
# Same cross-check for the second-ranked substitute; the name must be passed
# with its exact spacing because the lookup only ignores case.
distance_between(df, "button mushroom  fresh ", "sausage italian pork raw")

{'ingredient_1': 'button mushroom  fresh ',
 'ingredient_2': 'sausage italian pork raw',
 'macro_distance': np.float64(3.4279882043189995),
 'micro_distance': np.float64(0.8944636565730697),
 'global_distance': np.float64(2.66793083999522)}

In [None]:
# in : 

In [None]:
# Input (recette, contraintes) -> Quel_ingr_a_substituer() -> Substituer_ledit_ingr() -> calcul nouveau score santé
# -> adapter recette (LLM) -> Renvoie nouv_recipe (au grp 5)

In [None]:
# Quel_ingr_a_substituer()
# if contraintes.ing_a_substituer != None : garder celui donné par l'utilisateur
# else : utiliser un LLM pour choisir


In [None]:
# Substituer_ledit_ingr(ing, contraintes)
# ma fonction

In [None]:
# Grâce au cluster des Micro/Macro, je prends la liste des 10 aliments possibles pour substituer.
# Je prends ma recette de base et je la place sur le cluster (enfin elle y est déjà..) pour voir
# quelles autres recettes sont similaires (ce sera donc la similarité sur des GROUPES d'ingrédients).
# Maintenant je prends le premier aliment de ma liste de 10 qui se trouve dans les recettes similaires