# Exploratory Data Analysis

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:
# Load the raw recipes dataset and preview its first rows
recipes_csv_path = "../data/RAW_recipes.csv"
recipes = pd.read_csv(recipes_csv_path)
recipes.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


# Analyse des ingrédients

L'objectif est de construire 7 variables synthétiques, définies a priori, à partir des ingrédients de chaque recette.

In [3]:
import ast

# Collect every distinct ingredient name used across all recipes.
# The 'ingredients' column holds each list as a string, so it is parsed with
# ast.literal_eval and then flattened to one ingredient per row.
all_ingredients = recipes['ingredients'].apply(ast.literal_eval).explode()
# explode() turns an empty list into NaN; drop those rows so that only real
# ingredient names end up in the list that is encoded afterwards.
ingredients = all_ingredients.dropna().unique().tolist()

On calcule l'embedding (vecteur de représentation) de chaque ingrédient distinct.

In [4]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-embedding model to load
embedding_model_name = "all-mpnet-base-v2"
model = SentenceTransformer(embedding_model_name)

# Encode the list of distinct ingredient names
embeddings = model.encode(ingredients)


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Semantic axes used to score ingredients. Each key is an axis name and each
# value is a (positive-pole phrase, negative-pole phrase) pair: axis_vector()
# encodes both phrases and subtracts the second from the first.
axes_phrases = {
    "sweet_savory": ("sweet dessert flavor", "savory meal flavor"),
    "spicy_mild": ("spicy hot food", "mild gentle flavor"),
    "lowcal_rich": ("low-calorie healthy food", "rich and fatty dish"),
    "vegetarian_meat": ("vegetarian food without meat", "meat-based dish"),
    "solid_liquid": ("solid food", "liquid food or drink"),
    "raw_processed": ("raw natural ingredient", "processed or prepared food"),
    "western_exotic": ("typical western food", "exotic or asian food"),
}

def axis_vector(model, pos_name, neg_name):
    """Return the direction of a semantic axis defined by two phrases.

    Both phrases are passed to ``model.encode``; the result obtained for
    ``neg_name`` is subtracted from the one obtained for ``pos_name``.
    """
    positive_pole = model.encode(pos_name)
    negative_pole = model.encode(neg_name)
    return positive_pole - negative_pole

# One direction vector per semantic axis, keyed by axis name
axis_vecs = {}
for axe_name, (pos_name, neg_name) in axes_phrases.items():
    axis_vecs[axe_name] = axis_vector(model, pos_name, neg_name)

axes_names = list(axis_vecs)
# Stack the axis vectors in the same order as axes_names: shape (n_axes, embedding dim)
axis_matrix = np.stack([axis_vecs[name] for name in axes_names])

# Scale every row to unit length so that a plain dot product is a cosine
emb_row_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
emb_norm = embeddings / emb_row_norms
axis_row_norms = np.linalg.norm(axis_matrix, axis=1, keepdims=True)
axis_norm = axis_matrix / axis_row_norms

# cos_sim_matrix[i, j] = cosine(ingredient i, axis j); shape (n_ingredients, n_axes)
cos_sim_matrix = emb_norm @ axis_norm.T

# Label rows with ingredient names and columns with axis names
scores_df = pd.DataFrame(cos_sim_matrix, index=ingredients, columns=axes_names)


In [6]:
# Add, for every axis, the mean score of each recipe's ingredients.
# The stringified ingredient lists are parsed once here rather than inside the
# per-axis loop, which would re-run ast.literal_eval on every recipe for each axis.
ingredient_lists = recipes['ingredients'].apply(ast.literal_eval)

for axe in axes_names:
    # Plain dict lookup: ingredient name -> score on this axis
    score_map = scores_df[axe].to_dict()
    # A recipe with no ingredients gets NaN explicitly, not via np.mean([])
    # (which yields the same NaN but emits a RuntimeWarning).
    recipes[f'score_{axe}'] = ingredient_lists.apply(
        lambda ingr_list: np.mean([score_map[ingr] for ingr in ingr_list]) if ingr_list else np.nan
    )

In [7]:
# Export the recipe id together with the per-axis score columns
score_columns = [f'score_{axe}' for axe in axes_names]
recipes[["id"] + score_columns].to_csv('../data/features_axes_ingredients.csv', index=False)