In [3]:
import kagglehub
import pandas as pd 
import os
import yaml

with open("config.yaml", 'r') as f:
    config = yaml.safe_load(f)

def fetch_data(dataset_name, version=None)->str:
    if version:
        dataset_name = f"{dataset_name}:{version}"
    return kagglehub.dataset_download(dataset_name)

def load_data(path,files)->pd.DataFrame:
    data_frames = {}
    for file in files : 
        file_path = os.path.join(path, file)
        if os.path.exists(file_path):
            data_frames[file] = pd.read_csv(file_path)
        else:
            raise FileNotFoundError(f"{file} introuvable dans le {path}")
    return data_frames

# Charger les données au niveau global
dataset_path = fetch_data(config['dataset']['name'])
dfs = load_data(dataset_path, config['dataset']['files'])
recipes_df = dfs.get('RAW_recipes.csv')
interactions_df = dfs.get('RAW_interactions.csv')

if __name__ == "__main__":
    for name, df in dfs.items():
        print(f"Data from {name}:")
        print(df.head())
    print(" Recipes shape:", recipes_df.shape)
    print(" Interactions shape:", interactions_df.shape)

Data from RAW_interactions.csv:
   user_id  recipe_id        date  rating  \
0    38094      40893  2003-02-17       4   
1  1293707      40893  2011-12-21       5   
2     8937      44394  2002-12-01       4   
3   126440      85009  2010-02-27       5   
4    57222      85009  2011-10-01       5   

                                              review  
0  Great with a salad. Cooked on top of stove for...  
1  So simple, so delicious! Great for chilly fall...  
2  This worked very well and is EASY.  I used not...  
3  I made the Mexican topping and took it to bunk...  
4  Made the cheddar bacon topping, adding a sprin...  
Data from RAW_recipes.csv:
                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2                   all in the kitchen  chili  112140      130   
3                          alouette  potatoes   59389       45   
4         

In [3]:
#recipes_df.head()
recipes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231637 entries, 0 to 231636
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   name            231636 non-null  object
 1   id              231637 non-null  int64 
 2   minutes         231637 non-null  int64 
 3   contributor_id  231637 non-null  int64 
 4   submitted       231637 non-null  object
 5   tags            231637 non-null  object
 6   nutrition       231637 non-null  object
 7   n_steps         231637 non-null  int64 
 8   steps           231637 non-null  object
 9   description     226658 non-null  object
 10  ingredients     231637 non-null  object
 11  n_ingredients   231637 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 21.2+ MB


In [6]:
#steps_list=list(recipes_df['steps'])
#steps_list[0][0]
# steps va etre convertit en objet list 
recipes_df['description'][0]
recipes_df['nutrition'][0]
#recipes_df['ingredients'][0]
#recipes_df['tags'][0]
recipes_df['ingredients'][0][0]

'['

In [34]:
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1132367 entries, 0 to 1132366
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   user_id    1132367 non-null  int64 
 1   recipe_id  1132367 non-null  int64 
 2   date       1132367 non-null  object
 3   rating     1132367 non-null  int64 
 4   review     1132198 non-null  object
dtypes: int64(3), object(2)
memory usage: 43.2+ MB


In [46]:
interactions_df.head()

Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [2]:
correlation = recipes_df[['minutes', 'n_ingredients']].corr()
print(f"Corrélation temps/ingrédients: {correlation.iloc[0,1]:.3f}")

Corrélation temps/ingrédients: -0.001


In [None]:
#recipes_df['steps_count'] = recipes_df['steps'].apply(lambda x: len(x.split('||')) if pd.notnull(x) else 0)
recipes_df['steps'][1]

"['preheat oven to 425 degrees f', 'press dough into the bottom and sides of a 12 inch pizza pan', 'bake for 5 minutes until set but not browned', 'cut sausage into small pieces', 'whisk eggs and milk in a bowl until frothy', 'spoon sausage over baked crust and sprinkle with cheese', 'pour egg mixture slowly over sausage and cheese', 's& p to taste', 'bake 15-20 minutes or until eggs are set and crust is brown']"

In [10]:
recipes_df.shape

(231637, 12)

In [None]:
#print(fastest[['name', 'minutes', 'n_ingredients']])
#recipes_df[recipes_df['minutes'] == 0].count()
# imputing missing values de minutes en utilisant la variable nombre d'ingrédients 
recipes_df[recipes_df['minutes'] == 0].head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
5,apple a day milk shake,5289,0,1533,1999-12-06,"['15-minutes-or-less', 'time-to-make', 'course...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,"['combine ingredients in blender', 'cover and ...",,"['milk', 'vanilla ice cream', 'frozen apple ju...",4
2451,acorn magic delights,1712,0,1534,1999-10-01,"['15-minutes-or-less', 'time-to-make', 'course...","[148.4, 15.0, 28.0, 2.0, 3.0, 21.0, 4.0]",13,"['melt the butter or margarine over low heat',...",,"['butter', 'brown sugar', 'pecans', 'all-purpo...",7
3079,albanian byrek,4880,0,1534,1999-11-24,"['15-minutes-or-less', 'time-to-make', 'course...","[354.4, 42.0, 25.0, 59.0, 37.0, 37.0, 2.0]",14,"['prepare the dough with flour , 1 and a half ...","the directions to this are vague, but maybe yo...","['flour', 'water', 'oil', 'vinegar', 'salt', '...",9
3193,alfredo sauce with pasta,3258,0,1534,1999-10-10,"['15-minutes-or-less', 'time-to-make', 'course...","[1902.9, 287.0, 5.0, 140.0, 104.0, 583.0, 3.0]",8,['cook noodles or fettuccine according to pack...,,"['butter', 'heavy cream', 'parmesan cheese', '...",6
3259,alice s doughnuts,2284,0,1752,1999-10-18,"['15-minutes-or-less', 'time-to-make', 'course...","[107.3, 6.0, 20.0, 3.0, 3.0, 10.0, 4.0]",17,"['in a large bowl , beat the eggs until foamy'...",,"['eggs', 'sugar', 'milk', 'shortening', 'vanil...",9


In [None]:
#fastest = recipes_df.nsmallest(10, 'minutes')


In [13]:
#recipes_df['ingredients'].head()
import ast
ingredients = ast.literal_eval(recipes_df['ingredients'][15])
ingredients[0]

'lean pork chops'

In [None]:
# catégories des ingrédients
categories = {
        'proteins': ['chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna', 'shrimp', 
                     'turkey', 'lamb', 'egg', 'tofu', 'tempeh'],
        'dairy': ['milk', 'cheese', 'butter', 'cream', 'yogurt', 'sour cream'],
        'vegetables': ['tomato', 'onion', 'garlic', 'carrot', 'potato', 'broccoli',
                       'spinach', 'pepper', 'mushroom', 'lettuce', 'cucumber'],
        'fruits': ['apple', 'banana', 'orange', 'lemon', 'strawberry', 'blueberry'],
        'grains': ['flour', 'rice', 'pasta', 'bread', 'oat', 'quinoa', 'wheat'],
        'spices': ['salt', 'pepper', 'cumin', 'paprika', 'cinnamon', 'basil', 
                   'oregano', 'thyme', 'rosemary'],
        'oils': ['olive oil', 'vegetable oil', 'coconut oil', 'butter'],
        'sweeteners': ['sugar', 'honey', 'maple syrup', 'brown sugar']
    }


In [2]:
import pickle
import pandas as pd

# Charger le dict depuis pickle
with open("ingr_map.pkl", "rb") as f:
    ingr_map = pickle.load(f)

# Vérifier le type de ingr_map
print(type(ingr_map))  # doit afficher <class 'dict'>

ingr_map_df = pd.DataFrame(list(ingr_map.items()), columns=['ingredient', 'category'])
ingr_map_df.head()

ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'

In [7]:
ingr_map.head()

Unnamed: 0,raw_ingr,raw_words,processed,len_proc,replaced,count,id
0,"medium heads bibb or red leaf lettuce, washed,...",13,"medium heads bibb or red leaf lettuce, washed,...",73,lettuce,4507,4308
1,mixed baby lettuces and spring greens,6,mixed baby lettuces and spring green,36,lettuce,4507,4308
2,romaine lettuce leaf,3,romaine lettuce leaf,20,lettuce,4507,4308
3,iceberg lettuce leaf,3,iceberg lettuce leaf,20,lettuce,4507,4308
4,red romaine lettuce,3,red romaine lettuce,19,lettuce,4507,4308


In [9]:
import pickle
import pandas as pd

# Charger le fichier pickle depuis le bon chemin
pkl_path = os.path.join(dataset_path, 'ingr_map.pkl')
with open(pkl_path, 'rb') as f:
    ingr_map = pickle.load(f)

# Transformer en DataFrame
ingr_map_df = pd.DataFrame(list(ingr_map.items()), columns=['raw_ingredient', 'mapped_ingredient'])

# Afficher le résultat
print(f"Shape: {ingr_map_df.shape}")
ingr_map_df.head()

Shape: (7, 2)


Unnamed: 0,raw_ingredient,mapped_ingredient
0,raw_ingr,0 medium heads bibb or red leaf lettuce...
1,raw_words,0 13 1 6 2 3 3 ...
2,processed,0 medium heads bibb or red leaf lettuce...
3,len_proc,0 73 1 36 2 20 3 2...
4,replaced,0 lettuce 1 lettuce 2 let...


In [11]:
from data_prepro import IngredientPreprocessor

In [12]:
preproc = IngredientPreprocessor()

In [13]:
raw_ingredients = ["2 Cups All-Purpose Flour", "Fresh Garlic", "Olive Oil", "Ground black pepper", "Diced Tomatoes"]
cleaned = [preproc.normalize_ingredient(x) for x in raw_ingredients]
cleaned

['cups all-purpose flour', 'garlic', 'olive oil', 'black pepper', 'tomatoes']

In [14]:
i = 0  # index d’une recette
raw_str = recipes_df.loc[i, "ingredients"]  # c’est une string de type liste
parsed_clean = preproc.parse_and_clean(raw_str)
parsed_clean

['mexican seasoning',
 'salt',
 'butter',
 'mixed spice',
 'honey',
 'winter squash',
 'olive oil']

In [20]:
subset = recipes_df.head(500).copy()
subset["clean_ingredients"] = subset["ingredients"].apply(preproc.parse_and_clean)
subset[["ingredients", "clean_ingredients"]].head()

Unnamed: 0,ingredients,clean_ingredients
0,"['winter squash', 'mexican seasoning', 'mixed ...","[mexican seasoning, salt, butter, mixed spice,..."
1,"['prepared pizza crust', 'sausage patty', 'egg...","[salt and pepper, prepared pizza crust, sausag..."
2,"['ground beef', 'yellow onions', 'diced tomato...","[salt, water, kidney beans, chili powder, yell..."
3,"['spreadable cheese with garlic and herbs', 'n...","[salt, pepper, tarragon, new potatoes, red win..."
4,"['tomato juice', 'apple cider vinegar', 'sugar...","[pepper, salt, dry mustard, tomato juice, appl..."


In [21]:
subset["ingredient_categories"] = subset["clean_ingredients"].apply(preproc.categorize)
subset["ingredient_categories"].head()

0    {'other': ['mexican seasoning', 'mixed spice',...
1    {'spices': ['salt and pepper'], 'other': ['pre...
2    {'spices': ['salt', 'cumin'], 'other': ['water...
3    {'spices': ['salt', 'pepper', 'red bell pepper...
4    {'spices': ['pepper', 'salt', 'cinnamon oil'],...
Name: ingredient_categories, dtype: object

In [22]:
tests = ["extra virgin olive oil", "large eggs", "fresh basil leaves", "granulated sugar", "unsalted butter"]
for t in tests:
    print(t, "->", preproc.normalize_ingredient(t))

extra virgin olive oil -> extra virgin olive oil
large eggs -> large eggs
fresh basil leaves -> basil leaves
granulated sugar -> granulated sugar
unsalted butter -> unsalted butter


In [23]:
from collections import Counter
all_clean = [ing for lst in subset["clean_ingredients"] for ing in lst]
Counter(all_clean).most_common(15)

[('salt', 178),
 ('butter', 111),
 ('onion', 90),
 ('eggs', 86),
 ('sugar', 80),
 ('water', 76),
 ('milk', 71),
 ('flour', 69),
 ('pepper', 67),
 ('black pepper', 51),
 ('olive oil', 47),
 ('brown sugar', 46),
 ('garlic cloves', 43),
 ('vanilla', 42),
 ('egg', 40)]

In [24]:
subset["clean_ingredients"].apply(len).describe()

count    500.00000
mean       9.09800
std        4.03447
min        2.00000
25%        6.00000
50%        9.00000
75%       11.25000
max       23.00000
Name: clean_ingredients, dtype: float64

In [26]:
import os
ingr_pkl_path = os.path.join(dataset_path, "ingr_map.pkl")
preproc = IngredientPreprocessor(ingr_pkl_path)

INFO:data_prepro:Ingredient map loaded successfully.


In [27]:
preproc = IngredientPreprocessor(os.path.join(dataset_path, "ingr_map.pkl"))
preproc.normalize_ingredient("Extra Virgin Olive Oil")

INFO:data_prepro:Ingredient map loaded successfully.


'olive oil'

In [28]:
raw_ingredients = ["2 Cups All-Purpose Flour", "Fresh Garlic", "Olive Oil", "Ground black pepper", "Diced Tomatoes"]
cleaned = [preproc.normalize_ingredient(x) for x in raw_ingredients]
cleaned

['cups all-purpose flour',
 'fresh garlic',
 'olive oil',
 'ground black pepper',
 'diced tomato']

In [29]:
tests = ["extra virgin olive oil", "large eggs", "fresh basil leaves", "granulated sugar", "unsalted butter"]
for t in tests:
    print(t, "->", preproc.normalize_ingredient(t))

extra virgin olive oil -> olive oil
large eggs -> large eggs
fresh basil leaves -> fresh basil leaf
granulated sugar -> granulated sugar
unsalted butter -> unsalted butter
