In [58]:
import kagglehub
import pandas as pd 
import os
import yaml

# Load the notebook configuration; the dataset name and file list are read from it further down.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's default locale encoding.
with open("config.yaml", 'r', encoding="utf-8") as f:
    config = yaml.safe_load(f)

def fetch_data(dataset_name, version=None)->str:
    """Download a Kaggle dataset through kagglehub.

    Args:
        dataset_name: Dataset handle in ``<owner>/<dataset>`` form.
        version: Optional dataset version to pin. Falsy values (``None``,
            ``0``, ``""``) mean "do not pin a version".

    Returns:
        The value returned by ``kagglehub.dataset_download`` for the handle.
    """
    if version:
        # kagglehub parses versioned dataset handles as
        # "<owner>/<dataset>/versions/<n>"; a ":<n>" suffix would be treated
        # as part of the dataset slug instead of a version.
        dataset_name = f"{dataset_name}/versions/{version}"
    return kagglehub.dataset_download(dataset_name)

def load_data(path,files)->"dict[str, pd.DataFrame]":
    """Read several CSV files from one directory.

    Args:
        path: Directory that contains the files.
        files: Iterable of file names, each relative to ``path``.

    Returns:
        A dict mapping each file name (exactly as given in ``files``) to the
        DataFrame produced by ``pd.read_csv`` for that file.

    Raises:
        FileNotFoundError: If any requested file does not exist under ``path``.
    """
    data_frames = {}
    for file in files:
        file_path = os.path.join(path, file)
        # Fail on the first missing file rather than returning a partial mapping.
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"{file} introuvable dans le {path}")
        data_frames[file] = pd.read_csv(file_path)
    return data_frames

# Load the data at global (kernel) level so the following cells can use it.
dataset_path = fetch_data(config['dataset']['name'])
dfs = load_data(dataset_path, config['dataset']['files'])
# Index directly rather than using dict.get(): both frames are dereferenced
# right below (``.shape``), so a file missing from the config should fail here
# with a KeyError naming the file instead of a later AttributeError on None.
recipes_df = dfs['RAW_recipes.csv']
interactions_df = dfs['RAW_interactions.csv']

# Preview every loaded frame and print the shapes of the two main tables.
# The output captured under this cell shows the guard is true when the cell runs in the notebook.
if __name__ == "__main__":
    for name, df in dfs.items():
        print(f"Data from {name}:")
        print(df.head())
    # Both frames must be loaded at this point; `.shape` on None would raise.
    print(" Recipes shape:", recipes_df.shape)
    print(" Interactions shape:", interactions_df.shape)

Data from RAW_interactions.csv:
   user_id  recipe_id        date  rating  \
0    38094      40893  2003-02-17       4   
1  1293707      40893  2011-12-21       5   
2     8937      44394  2002-12-01       4   
3   126440      85009  2010-02-27       5   
4    57222      85009  2011-10-01       5   

                                              review  
0  Great with a salad. Cooked on top of stove for...  
1  So simple, so delicious! Great for chilly fall...  
2  This worked very well and is EASY.  I used not...  
3  I made the Mexican topping and took it to bunk...  
4  Made the cheddar bacon topping, adding a sprin...  
Data from RAW_recipes.csv:
                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2                   all in the kitchen  chili  112140      130   
3                          alouette  potatoes   59389       45   
4         

In [29]:
# Reload the module so that edits made to data_prepro.py are picked up.
import importlib
import data_prepro
importlib.reload(data_prepro)
from data_prepro import IngredientPreprocessor

# Load the ingredient map CSV directly from the current working directory.
preproc = IngredientPreprocessor("ingr_map.csv")
print("IngredientPreprocessor initialisé avec le fichier CSV")

# Check that the ingredient map was actually loaded.
print(f"Nombre d'ingrédients dans la carte: {len(preproc.raw_to_normalized)}")

# Try an ingredient that is expected to be in the map. The printed label now
# shows the exact string that was passed in (it previously omitted the
# leading "4 ", which made the output misleading).
test_input = "4 extra virgin olive oil"
test_result = preproc.normalize_ingredient(test_input)
print(f"Test normalization: '{test_input}' -> '{test_result}'")

# Try a few more ingredients.
test_ingredients = ["large eggs", "all purpose flour", "unsalted butter"]
for ing in test_ingredients:
    result = preproc.normalize_ingredient(ing)
    print(f"'{ing}' -> '{result}'")

INFO:data_prepro:Ingredient map loaded successfully.


IngredientPreprocessor initialisé avec le fichier CSV
Nombre d'ingrédients dans la carte: 11659
Test normalization: 'extra virgin olive oil' -> 'olive oil'
'large eggs' -> 'large eggs'
'all purpose flour' -> 'all purpose flour'
'unsalted butter' -> 'unsalted butter'


In [20]:
#recipes_df.head()
# Summarize the recipes table: column names, non-null counts and dtypes.
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [21]:
#steps_list=list(recipes_df['steps'])
#steps_list[0][0]
# `steps` will be converted to a list object.
# Only the last expression of a cell is displayed; the next two lines are
# evaluated and their values discarded.
recipes_df['description'][0]
recipes_df['nutrition'][0]
#recipes_df['ingredients'][0]
#recipes_df['tags'][0]
# First character of the first `ingredients` value. The '[' displayed by this
# cell indicates the column holds strings rather than Python lists.
recipes_df['ingredients'][0][0]

'['

In [22]:
# Summarize the interactions table: column names, non-null counts and dtypes.
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [46]:
# Display the first rows of the interactions table.
interactions_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [23]:
# 2x2 correlation matrix between `minutes` and `n_ingredients` (DataFrame.corr with its default method).
correlation = recipes_df[['minutes', 'n_ingredients']].corr()
# iloc[0, 1] is the off-diagonal entry, i.e. minutes vs n_ingredients.
print(f"Corrélation temps/ingrédients: {correlation.iloc[0,1]:.3f}")

Corrélation temps/ingrédients: -0.001


In [None]:
# NOTE(review): the commented-out line below splits on '||', but the value
# displayed by this cell looks like a stringified Python list — confirm the
# actual format before reviving it.
#recipes_df['steps_count'] = recipes_df['steps'].apply(lambda x: len(x.split('||')) if pd.notnull(x) else 0)
# Show the raw `steps` value of the entry labelled 1.
recipes_df['steps'][1]

"['preheat oven to 425 degrees f', 'press dough into the bottom and sides of a 12 inch pizza pan', 'bake for 5 minutes until set but not browned', 'cut sausage into small pieces', 'whisk eggs and milk in a bowl until frothy', 'spoon sausage over baked crust and sprinkle with cheese', 'pour egg mixture slowly over sausage and cheese', 's& p to taste', 'bake 15-20 minutes or until eggs are set and crust is brown']"

In [10]:
# (rows, columns) of the recipes table.
recipes_df.shape

(231637, 12)

In [None]:
#print(fastest[['name', 'minutes', 'n_ingredients']])
#recipes_df[recipes_df['minutes'] == 0].count()
# Idea: impute the missing `minutes` values using the number of ingredients.
# This cell only displays the first recipes whose `minutes` equals 0; no imputation is performed here.
recipes_df[recipes_df['minutes'] == 0].head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
5,apple a day milk shake,5289,0,1533,1999-12-06,"['15-minutes-or-less', 'time-to-make', 'course...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,"['combine ingredients in blender', 'cover and ...",,"['milk', 'vanilla ice cream', 'frozen apple ju...",4
2451,acorn magic delights,1712,0,1534,1999-10-01,"['15-minutes-or-less', 'time-to-make', 'course...","[148.4, 15.0, 28.0, 2.0, 3.0, 21.0, 4.0]",13,"['melt the butter or margarine over low heat',...",,"['butter', 'brown sugar', 'pecans', 'all-purpo...",7
3079,albanian byrek,4880,0,1534,1999-11-24,"['15-minutes-or-less', 'time-to-make', 'course...","[354.4, 42.0, 25.0, 59.0, 37.0, 37.0, 2.0]",14,"['prepare the dough with flour , 1 and a half ...","the directions to this are vague, but maybe yo...","['flour', 'water', 'oil', 'vinegar', 'salt', '...",9
3193,alfredo sauce with pasta,3258,0,1534,1999-10-10,"['15-minutes-or-less', 'time-to-make', 'course...","[1902.9, 287.0, 5.0, 140.0, 104.0, 583.0, 3.0]",8,['cook noodles or fettuccine according to pack...,,"['butter', 'heavy cream', 'parmesan cheese', '...",6
3259,alice s doughnuts,2284,0,1752,1999-10-18,"['15-minutes-or-less', 'time-to-make', 'course...","[107.3, 6.0, 20.0, 3.0, 3.0, 10.0, 4.0]",17,"['in a large bowl , beat the eggs until foamy'...",,"['eggs', 'sugar', 'milk', 'shortening', 'vanil...",9


In [None]:
#fastest = recipes_df.nsmallest(10, 'minutes')


In [24]:
#recipes_df['ingredients'].head()
import ast
# Parse the stringified `ingredients` value of the entry labelled 15 into a
# Python object with ast.literal_eval, then show its first element.
ingredients = ast.literal_eval(recipes_df['ingredients'][15])
ingredients[0]

'lean pork chops'

In [25]:
# Ingredient categories: category name -> list of keywords.
# Note that some keywords appear under two categories: 'butter' is listed in
# both 'dairy' and 'oils', and 'pepper' in both 'vegetables' and 'spices'.
categories = dict(
    proteins=[
        'chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna',
        'shrimp', 'turkey', 'lamb', 'egg', 'tofu', 'tempeh',
    ],
    dairy=['milk', 'cheese', 'butter', 'cream', 'yogurt', 'sour cream'],
    vegetables=[
        'tomato', 'onion', 'garlic', 'carrot', 'potato', 'broccoli',
        'spinach', 'pepper', 'mushroom', 'lettuce', 'cucumber',
    ],
    fruits=['apple', 'banana', 'orange', 'lemon', 'strawberry', 'blueberry'],
    grains=['flour', 'rice', 'pasta', 'bread', 'oat', 'quinoa', 'wheat'],
    spices=[
        'salt', 'pepper', 'cumin', 'paprika', 'cinnamon',
        'basil', 'oregano', 'thyme', 'rosemary',
    ],
    oils=['olive oil', 'vegetable oil', 'coconut oil', 'butter'],
    sweeteners=['sugar', 'honey', 'maple syrup', 'brown sugar'],
)


In [30]:
# Work on a copy of the first 500 recipes so the new column is not added to recipes_df.
subset = recipes_df.head(500).copy()
# Apply preproc.parse_and_clean to each raw `ingredients` value
# (presumably parses the stringified list and normalizes each entry — see data_prepro).
subset["clean_ingredients"] = subset["ingredients"].apply(preproc.parse_and_clean)
# Show raw and cleaned values side by side.
subset[["ingredients", "clean_ingredients"]].head()

Unnamed: 0,ingredients,clean_ingredients
0,"['winter squash', 'mexican seasoning', 'mixed ...","[olive oil, butter, honey, winter squash, salt..."
1,"['prepared pizza crust', 'sausage patty', 'egg...","[sausage, egg, milk, pizza crust, salt and pep..."
2,"['ground beef', 'yellow onions', 'diced tomato...","[yellow onion, ground cumin, lettuce, ground b..."
3,"['spreadable cheese with garlic and herbs', 'n...","[olive oil, yellow bell pepper, red bell peppe..."
4,"['tomato juice', 'apple cider vinegar', 'sugar...","[pepper, clove oil, cinnamon oil, sugar, salt,..."


In [31]:
# Apply preproc.categorize to each cleaned ingredient list; the displayed values are dicts of category -> ingredients.
subset["ingredient_categories"] = subset["clean_ingredients"].apply(preproc.categorize)
subset["ingredient_categories"].head()

0    {'oils': ['olive oil', 'butter'], 'sweeteners'...
1    {'other': ['sausage', 'pizza crust'], 'protein...
2    {'vegetables': ['yellow onion', 'lettuce', 'ro...
3    {'oils': ['olive oil'], 'spices': ['yellow bel...
4    {'spices': ['pepper', 'cinnamon oil', 'salt'],...
Name: ingredient_categories, dtype: object

In [36]:
# Spot-check normalize_ingredient on a handful of raw ingredient names.
tests = ["extra virgin olive oil", "large eggs", "fresh basil leaves", "granulated sugar", "unsalted butter"]
for t in tests:
    print(t, "->", preproc.normalize_ingredient(t))

extra virgin olive oil -> olive oil
large eggs -> large eggs
fresh basil leaves -> fresh basil leaf
granulated sugar -> granulated sugar
unsalted butter -> unsalted butter


In [37]:
from collections import Counter
# Flatten the per-recipe cleaned lists into one list, then show the 15 most frequent ingredients.
all_clean = [ing for lst in subset["clean_ingredients"] for ing in lst]
# NOTE(review): the captured output contains the token 'flmy' — presumably a
# mangled 'flour' coming from the ingredient map; verify against ingr_map.csv.
Counter(all_clean).most_common(15)

[('salt', 178),
 ('egg', 125),
 ('butter', 111),
 ('onion', 109),
 ('sugar', 80),
 ('water', 76),
 ('milk', 74),
 ('flmy', 69),
 ('pepper', 60),
 ('garlic clove', 57),
 ('olive oil', 54),
 ('brown sugar', 46),
 ('vanilla', 42),
 ('baking powder', 39),
 ('baking soda', 37)]

In [38]:
# Descriptive statistics of the number of cleaned ingredients per recipe in the subset.
subset["clean_ingredients"].apply(len).describe()

count    500.000000
mean       9.092000
std        4.042303
min        2.000000
25%        6.000000
50%        9.000000
75%       11.250000
max       23.000000
Name: clean_ingredients, dtype: float64

In [42]:
# Normalize a few raw ingredient strings that include a quantity, capitalization and descriptors.
raw_ingredients = ["2 Cups All-Purpose Flour", "Fresh Garlic", "Olive Oil", "Ground black pepper", "Diced Tomatoes"]
cleaned = [preproc.normalize_ingredient(x) for x in raw_ingredients]
# NOTE(review): in the captured output the unit survives
# ('cups all-purpose flour') — confirm whether units are meant to be stripped.
cleaned

['cups all-purpose flour', 'garlic', 'olive oil', 'black pepper', 'tomatoes']

In [43]:
# Same spot-check as an earlier cell, re-run at a later execution count.
# NOTE(review): the captured outputs of the two runs differ (e.g. 'extra virgin olive oil'),
# presumably because data_prepro was edited and reloaded in between — confirm which behavior is intended.
tests = ["extra virgin olive oil", "large eggs", "fresh basil leaves", "granulated sugar", "unsalted butter"]
for t in tests:
    print(t, "->", preproc.normalize_ingredient(t))

extra virgin olive oil -> extra virgin olive oil
large eggs -> large eggs
fresh basil leaves -> basil leaves
granulated sugar -> granulated sugar
unsalted butter -> unsalted butter


Fin de la première étape de prétraitement. Nous avons réussi à convertir le fichier pkl en csv et à l'exploiter pour normaliser nos données textuelles.

Nous allons maintenant appliquer la fonction de normalisation à la variable ingredients de notre jeu de données, qui se trouve dans recipes_df. 

In [75]:
# Apply the normalization to the whole `ingredients` column.
# The result is added directly as a new column of the existing DataFrame.
print("Normalisation en cours...")

# Apply parse_and_clean to every recipe.
recipes_df["normalized_ingredients"] = recipes_df["ingredients"].apply(preproc.parse_and_clean)

print("✅ Normalisation terminée!")
print(f"Forme du dataset: {recipes_df.shape}")


Normalisation en cours...
✅ Normalisation terminée!
Forme du dataset: (231637, 13)
✅ Normalisation terminée!
Forme du dataset: (231637, 13)


In [76]:
# Compare a few examples before/after normalization.
print("\n📋 Exemples avant/après normalisation:")
for i in range(3):
    print(f"\n--- Recette {i+1} ---")
    print(f"Avant: {recipes_df.iloc[i]['ingredients']}")
    print(f"Après: {recipes_df.iloc[i]['normalized_ingredients']}")
    
# Display name, raw and normalized ingredients side by side for the first rows.
recipes_df[["name", "ingredients", "normalized_ingredients"]].head()


📋 Exemples avant/après normalisation:

--- Recette 1 ---
Avant: ['winter squash', 'mexican seasoning', 'mixed spice', 'honey', 'butter', 'olive oil', 'salt']
Après: ['olive oil', 'butter', 'honey', 'winter squash', 'salt', 'mixed spice', 'mexican seasoning']

--- Recette 2 ---
Avant: ['prepared pizza crust', 'sausage patty', 'eggs', 'milk', 'salt and pepper', 'cheese']
Après: ['eggs', 'milk', 'salt and pepper', 'cheese', 'sausage patty', 'prepared pizza crust']

--- Recette 3 ---
Avant: ['ground beef', 'yellow onions', 'diced tomatoes', 'tomato paste', 'tomato soup', 'rotel tomatoes', 'kidney beans', 'water', 'chili powder', 'ground cumin', 'salt', 'lettuce', 'cheddar cheese']
Après: ['lettuce', 'yellow onions', 'chili powder', 'tomatoes', 'cumin', 'rotel tomatoes', 'water', 'tomato paste', 'cheddar cheese', 'salt', 'kidney beans', 'tomato soup', 'beef']


Unnamed: 0,name,ingredients,normalized_ingredients
0,arriba baked winter squash mexican style,"['winter squash', 'mexican seasoning', 'mixed ...","[olive oil, butter, honey, winter squash, salt..."
1,a bit different breakfast pizza,"['prepared pizza crust', 'sausage patty', 'egg...","[eggs, milk, salt and pepper, cheese, sausage ..."
2,all in the kitchen chili,"['ground beef', 'yellow onions', 'diced tomato...","[lettuce, yellow onions, chili powder, tomatoe..."
3,alouette potatoes,"['spreadable cheese with garlic and herbs', 'n...","[olive oil, yellow bell pepper, new potatoes, ..."
4,amish tomato ketchup for canning,"['tomato juice', 'apple cider vinegar', 'sugar...","[pepper, clove oil, cinnamon oil, sugar, salt,..."


In [84]:
# Analyze the results of the normalization.
print("📊 Analyse des ingrédients normalisés:")
print(f"Nombre total de recettes: {len(recipes_df)}")

# Average number of normalized ingredients per recipe.
avg_ingredients = recipes_df["normalized_ingredients"].apply(len).mean()
print(f"Nombre moyen d'ingrédients par recette: {avg_ingredients:.1f}")

# Top 20 most frequent ingredients after normalization.
from collections import Counter
# Flatten all per-recipe lists into a single list before counting.
all_normalized_ingredients = [ing for ingredients_list in recipes_df["normalized_ingredients"] for ing in ingredients_list]
most_common = Counter(all_normalized_ingredients).most_common(20)

print("\n🥇 Top 20 des ingrédients les plus fréquents:")
for i, (ingredient, count) in enumerate(most_common, 1):
    print(f"{i:2d}. {ingredient:<20} : {count:>6,} fois")


# Show the DataFrame's columns.
print(f"\n📋 Colonnes du DataFrame: {list(recipes_df.columns)}")

📊 Analyse des ingrédients normalisés:
Nombre total de recettes: 231637
Nombre moyen d'ingrédients par recette: 9.0

🥇 Top 20 des ingrédients les plus fréquents:
 1. salt                 : 85,746 fois
 2. butter               : 54,975 fois
 3. sugar                : 44,535 fois
 4. onion                : 39,786 fois
 5. water                : 34,926 fois
 6. eggs                 : 33,761 fois
 7. olive oil            : 32,822 fois
 8. garlic cloves        : 26,723 fois
 9. pepper               : 26,633 fois
10. flour                : 26,266 fois
11. milk                 : 25,799 fois
12. black pepper         : 24,271 fois
13. lemon juice          : 19,506 fois
14. cinnamon             : 19,316 fois
15. garlic               : 19,072 fois
16. brown sugar          : 18,655 fois
17. all-purpose flour    : 17,659 fois
18. baking powder        : 17,504 fois
19. egg                  : 17,304 fois
20. tomatoes             : 16,602 fois

📋 Colonnes du DataFrame: ['name', 'id', 'minutes', 'contri

In [81]:
# Smoke tests for the NutritionPreprocessor class.
print("🧪 TEST DE LA CLASSE NutritionPreprocessor")
print("=" * 50)

# Reload the module so that edits made to data_prepro.py are picked up.
# `data_prepro` is imported here explicitly so the reload does not rely on an
# earlier cell having bound that name, and the class is imported only AFTER
# the reload so a stale module lacking it cannot make this cell fail.
import importlib
import data_prepro
importlib.reload(data_prepro)
from data_prepro import NutritionPreprocessor

# Create a preprocessor instance; every call below goes through it.
nutrition_processor = NutritionPreprocessor()

# Test 1: parse a well-formed nutrition string.
# NOTE(review): the key order returned by parse_nutrition in the captured
# output (calories, fat, total_fat, carbohydrates, sugar, protein, sodium)
# should be checked against the dataset's documented `nutrition` field order.
print("\n📊 Test 1: Parsing nutrition normale")
test_nutrition_str = "[200.5, 10.2, 15.8, 25.0, 5.5, 12.3, 800.0]"
print(f"Input: {test_nutrition_str}")
parsed_nutrition = nutrition_processor.parse_nutrition(test_nutrition_str)
print(f"Output: {parsed_nutrition}")

# Test 2: compute the health score.
# NOTE(review): every score in the captured output equals 1 — confirm that
# compute_health_score is not saturating.
print(f"\n🏥 Test 2: Calcul du health score")
health_score = nutrition_processor.compute_health_score(parsed_nutrition)
print(f"Health Score: {health_score}")

# Test 3: example using real data from the dataset.
print(f"\n🥗 Test 3: Données réelles du dataset")
if 'nutrition' in recipes_df.columns:
    real_nutrition_str = recipes_df['nutrition'].iloc[0]
    print(f"Nutrition originale: {real_nutrition_str}")
    
    real_parsed = nutrition_processor.parse_nutrition(real_nutrition_str)
    print(f"Parsed: {real_parsed}")
    
    real_health_score = nutrition_processor.compute_health_score(real_parsed)
    print(f"Health Score: {real_health_score}")

# Test 4: error case - malformed string. In the captured output the parser
# logs an error and returns an empty dict for this input.
print(f"\n❌ Test 4: Gestion d'erreurs")
malformed_str = "[200.5, 10.2, invalid, 25.0]"
print(f"Input malformé: {malformed_str}")
error_result = nutrition_processor.parse_nutrition(malformed_str)
print(f"Résultat: {error_result}")

# Test 5: run over several rows of the dataset.
print(f"\n📈 Test 5: Analyse de plusieurs recettes")
sample_size = 10
nutrition_results = []

for i in range(min(sample_size, len(recipes_df))):
    nutrition_str = recipes_df['nutrition'].iloc[i]
    parsed = nutrition_processor.parse_nutrition(nutrition_str)
    if parsed:  # parsing succeeded (non-empty result)
        health_score = nutrition_processor.compute_health_score(parsed)
        nutrition_results.append({
            # Positional row index, not the dataset's recipe `id` column.
            'recipe_id': i,
            'calories': parsed.get('calories', 0),
            'protein': parsed.get('protein', 0),
            'sugar': parsed.get('sugar', 0),
            'health_score': health_score
        })

# Print the results as a fixed-width table.
print(f"\n📋 Résultats pour {len(nutrition_results)} recettes:")
print(f"{'ID':<3} {'Calories':<8} {'Protein':<7} {'Sugar':<6} {'Health Score':<12}")
print("-" * 40)
for result in nutrition_results:
    print(f"{result['recipe_id']:<3} {result['calories']:<8.1f} {result['protein']:<7.1f} "
          f"{result['sugar']:<6.1f} {result['health_score']:<12.2f}")

# Test 6: compare different nutritional profiles.
print(f"\n🔬 Test 6: Comparaison de profils nutritionnels")

test_profiles = [
    {
        'name': 'Recette saine',
        'nutrition': [300, 8, 12, 40, 5, 20, 500]  # low calories, good protein
    },
    {
        'name': 'Recette riche',
        'nutrition': [800, 35, 45, 60, 15, 15, 1500]  # high calories and sodium
    },
    {
        'name': 'Dessert sucré',
        'nutrition': [450, 20, 25, 55, 35, 8, 200]  # high sugar
    }
]

for profile in test_profiles:
    # Simulate a nutrition string as it appears in the dataset.
    nutrition_str = str(profile['nutrition'])
    parsed = nutrition_processor.parse_nutrition(nutrition_str)
    # Call through the instance, like every other call in this cell, so the
    # cell works whether or not compute_health_score is a staticmethod.
    health_score = nutrition_processor.compute_health_score(parsed)
    
    print(f"\n{profile['name']}:")
    print(f"  Calories: {parsed['calories']}")
    print(f"  Protéines: {parsed['protein']}g")
    print(f"  Sucre: {parsed['sugar']}g")
    print(f"  Sodium: {parsed['sodium']}mg")
    print(f"  🏥 Health Score: {health_score}")

print(f"\n✅ Tests terminés!")



ERROR:data_prepro:Erreur parsing nutrition: malformed node or string on line 1: <ast.Name object at 0xfffec2f3a2f0>


🧪 TEST DE LA CLASSE NutritionPreprocessor

📊 Test 1: Parsing nutrition normale
Input: [200.5, 10.2, 15.8, 25.0, 5.5, 12.3, 800.0]
Output: {'calories': 200.5, 'fat': 10.2, 'total_fat': 15.8, 'carbohydrates': 25.0, 'sugar': 5.5, 'protein': 12.3, 'sodium': 800.0}

🏥 Test 2: Calcul du health score
Health Score: 1

🥗 Test 3: Données réelles du dataset
Nutrition originale: [51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]
Parsed: {'calories': 51.5, 'fat': 0.0, 'total_fat': 13.0, 'carbohydrates': 0.0, 'sugar': 2.0, 'protein': 0.0, 'sodium': 4.0}
Health Score: 1

❌ Test 4: Gestion d'erreurs
Input malformé: [200.5, 10.2, invalid, 25.0]
Résultat: {}

📈 Test 5: Analyse de plusieurs recettes

📋 Résultats pour 10 recettes:
ID  Calories Protein Sugar  Health Score
----------------------------------------
0   51.5     0.0     2.0    1.00        
1   173.4    35.0    22.0   1.00        
2   269.8    27.0    39.0   1.00        
3   368.1    8.0     14.0   1.00        
4   352.9    0.0     3.0    1.00        
5   1

In [83]:
# Re-inspect the recipes table; the captured output now lists the added `normalized_ingredients` column.
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   name                    231636 non-null  object
 1   id                      231637 non-null  int64 
 2   minutes                 231637 non-null  int64 
 3   contributor_id          231637 non-null  int64 
 4   submitted               231637 non-null  object
 5   tags                    231637 non-null  object
 6   nutrition               231637 non-null  object
 7   n_steps                 231637 non-null  int64 
 8   steps                   231637 non-null  object
 9   description             226658 non-null  object
 10  ingredients             231637 non-null  object
 11  n_ingredients           231637 non-null  int64 
 12  normalized_ingredients  231637 non-null  object
dtypes: int64(5), object(8)
memory usage: 23.0+ MB


In [None]:
# Quick check of preproc.categorize on a hand-written list of ingredients.
categorized_test = preproc.categorize(["chicken", "olive oil", "salt", "tomato", "basil", "sugar", "flour"])
print(categorized_test)

{'proteins': ['chicken'], 'oils': ['olive oil'], 'spices': ['salt', 'basil'], 'vegetables': ['tomato'], 'sweeteners': ['sugar'], 'grains': ['flour']}


In [None]:
# Apply preproc.categorize to every recipe's normalized ingredient list; the result is kept in a separate Series.
categorized = recipes_df["normalized_ingredients"].apply(preproc.categorize)

In [90]:
# Normalized ingredient list of the entry labelled 0, for comparison with its categorization.
recipes_df["normalized_ingredients"][0]

['olive oil',
 'butter',
 'honey',
 'winter squash',
 'salt',
 'mixed spice',
 'mexican seasoning']

In [89]:
# Category -> ingredients mapping computed for the entry labelled 0.
categorized[0]

{'oils': ['olive oil', 'butter'],
 'sweeteners': ['honey'],
 'other': ['winter squash', 'mixed spice', 'mexican seasoning'],
 'spices': ['salt']}