

This notebook integrates a set of classes and functions for working with recipes and ingredients.


Necessary Imports

In [1]:
# * -----------------------------Necessary Imports ----------------------------- #
import pandas as pd
import numpy as np
from utils import DataLoader, filter_recipes_by_ingredients, detect_uncommon_ingredients, filter_uncommon_ingredients

### Create Data Frames for each csv files.

The DataLoader instance loads four different DataFrames from csv files:

- `df_recipe`: contains information about different recipe IDs, recipe names, and other recipe details.
- `df_ingredients`: contains information about ingredient names, synonyms, and entity IDs.
- `df_compound_ingredients`: contains information about compound ingredient names, synonyms, and constituent ingredients.
- `df_relation_recipe_ingredients`: contains information about the relationship between recipe IDs and ingredient names, as well as the original ingredient names and entity IDs.

In [2]:
df_recipe = DataLoader("database/01_Recipe_Details.csv","csv").load_data()
df_ingredients = DataLoader("database/02_Ingredients.csv","csv").load_data()
df_compound_ingredients = DataLoader("database/03_Compound_Ingredients.csv", "csv").load_data()
df_relation_recipe_ingredients = DataLoader("database/04_Recipe-Ingredients_Aliases.csv", "csv").load_data()

#Renaming the Colums 
df_recipe = df_recipe.rename(columns={"Recipe ID": "Recipe_ID"})
df_ingredients = df_ingredients.rename(columns={"Aliased Ingredient Name": "Ingredient_Name", "Ingredient Synonyms": "Synonyms", "Entity ID": "Entity_ID"})
df_compound_ingredients = df_compound_ingredients.rename(columns={"entity id": "Entity_ID","Compound Ingredient Name": "Compound_Name","Compound Ingredient Synonyms": "Compound_Synonyms","Contituent Ingredients": "Contituent"})
df_relation_recipe_ingredients = df_relation_recipe_ingredients.rename(columns={"Recipe ID": "Recipe_ID","Original Ingredient Name": "Original_Name", "Aliased Ingredient Name": "Ingredient_Name", "Entity ID": "Entity_ID" })
#filas_con_hashtag = df_ingredients[df_ingredients['Synonyms'].str.startswith('#')]


### General Information
1. Total de datos por paises
2. 

### Cleaning Data
Before we can start working with the data, we need to clean it up. 
- First, we'll rename some of the columns to make them easier to work with

In [3]:
def add_ingredients_to_recipe(df_recipe, df_relation_recipe_ingredients):
    # Agrupar los ingredientes por receta
    ingredientes_por_receta = df_relation_recipe_ingredients.groupby("Recipe_ID")["Ingredient_Name"].apply(set)
    # Crear una nueva columna en el dataframe de recetas con los ingredientes
    df_recipe["Ingredient_Name"] = df_recipe["Recipe_ID"].apply(lambda x: ingredientes_por_receta.get(x, set()))
    return df_recipe

df_recipe_analysis= add_ingredients_to_recipe(df_recipe,df_relation_recipe_ingredients)
#pd.set_option('display.max_rows', None)
df_recipe_analysis = df_recipe_analysis.drop('Source', axis=1)

df_recipe_analysis

Unnamed: 0,Recipe_ID,Title,Cuisine,Ingredient_Name
0,1,5 spice vegetable fried rice,Indian Subcontinent,"{capsicum , sunflower , soy sauce , pepper bell }"
1,2,aachar aaloo,Indian Subcontinent,"{nigella seed , potato , sunflower , cumin , b..."
2,3,aadu lassan keri nu athanu,Indian Subcontinent,"{asafoetida , sunflower , sesame , fenugreek ,..."
3,4,aaloo kofta,Indian Subcontinent,"{butter , potato , sunflower , coriander , cum..."
4,5,aaloo tamatar subzi,Indian Subcontinent,"{sunflower , curry leaf , lemon }"
...,...,...,...,...
45767,45768,Strawberries & Cream Frappé,USA,"{strawberry , half half , ice , syrup }"
45768,45769,Chocolate Whiskey Bundt Cake,USA,"{butter , vanilla , cream , coffee , flour , s..."
45769,45770,"Wild Rice with Butternut Squash, Leeks, and Corn",USA,"{butter , olive , butternut squash , rice wild..."
45770,45771,Fruit Crumble,USA,"{ice cream , almond , butter , flour , sugar ,..."


- Delete posible null values

In [4]:
df_recipe_analysis.isna().sum()

# Seleccionamos todas las filas que no contienen la cadena 'set()'
df_recipe_analysis = df_recipe_analysis.loc[~df_recipe_analysis.astype(str).apply(lambda x: x.str.contains('set()')).any(axis=1)]
df_recipe_analysis

  df_recipe_analysis = df_recipe_analysis.loc[~df_recipe_analysis.astype(str).apply(lambda x: x.str.contains('set()')).any(axis=1)]


Unnamed: 0,Recipe_ID,Title,Cuisine,Ingredient_Name
0,1,5 spice vegetable fried rice,Indian Subcontinent,"{capsicum , sunflower , soy sauce , pepper bell }"
1,2,aachar aaloo,Indian Subcontinent,"{nigella seed , potato , sunflower , cumin , b..."
2,3,aadu lassan keri nu athanu,Indian Subcontinent,"{asafoetida , sunflower , sesame , fenugreek ,..."
3,4,aaloo kofta,Indian Subcontinent,"{butter , potato , sunflower , coriander , cum..."
4,5,aaloo tamatar subzi,Indian Subcontinent,"{sunflower , curry leaf , lemon }"
...,...,...,...,...
45767,45768,Strawberries & Cream Frappé,USA,"{strawberry , half half , ice , syrup }"
45768,45769,Chocolate Whiskey Bundt Cake,USA,"{butter , vanilla , cream , coffee , flour , s..."
45769,45770,"Wild Rice with Butternut Squash, Leeks, and Corn",USA,"{butter , olive , butternut squash , rice wild..."
45770,45771,Fruit Crumble,USA,"{ice cream , almond , butter , flour , sugar ,..."


In [5]:
# Divide la columna de ingredientes en filas separadas
df_recipe_analysis_explode = df_recipe_analysis.explode('Ingredient_Name')
df_recipe_analysis_explode
# Agrupar por la cantidad de ingredientes por cosina
#df_recipe_analysis_explode.groupby(df_recipe_analysis_explode.Cuisine).Ingredient_Name.count()

Unnamed: 0,Recipe_ID,Title,Cuisine,Ingredient_Name
0,1,5 spice vegetable fried rice,Indian Subcontinent,capsicum
0,1,5 spice vegetable fried rice,Indian Subcontinent,sunflower
0,1,5 spice vegetable fried rice,Indian Subcontinent,soy sauce
0,1,5 spice vegetable fried rice,Indian Subcontinent,pepper bell
1,2,aachar aaloo,Indian Subcontinent,nigella seed
...,...,...,...,...
45771,45772,Enlightened Chicken Pot Pie,USA,chicken
45771,45772,Enlightened Chicken Pot Pie,USA,thyme
45771,45772,Enlightened Chicken Pot Pie,USA,salt
45771,45772,Enlightened Chicken Pot Pie,USA,pepper bell


- Merge ingredients

In [6]:
# Definir la función de fusión de ingredientes
def merge_synonyms(df, synonyms):
    for syn in synonyms:
        preferred_name, alternate_names = syn[0], syn[1:]
        for alt in alternate_names:
            df['Ingredient_Name'] = df['Ingredient_Name'].str.replace(alt, preferred_name)
    return df


In [21]:
df_recipe_analysis_explode.loc[df_recipe_analysis_explode['Ingredient_Name'].str.contains('cheese'), 'Ingredient_Name'] = 'cheese'
df_recipe_analysis_explode.loc[df_recipe_analysis_explode['Ingredient_Name'].str.contains('zucchini'), 'Ingredient_Name'] = 'zucchini'

- Delete most common ingredients

In [22]:
# calculate the top 5 ingredients per cuisine
top_5_per_cuisine = df_recipe_analysis_explode.groupby('Cuisine')['Ingredient_Name'].apply(lambda x: x.value_counts().head(10))

df_top_5 = top_5_per_cuisine.reset_index(name='Frequency')

df_top_5 = df_top_5.rename(columns={"level_1": "Ingredient"})
total_ingredients_by_Cuisine = df_recipe_analysis.groupby('Cuisine')['Cuisine'].count()

total_ingredients_by_Cuisine = total_ingredients_by_Cuisine.reset_index(name='Frequency')

merged_df = pd.merge(total_ingredients_by_Cuisine, df_top_5, on='Cuisine')
merged_df = merged_df.rename(columns={"Frequency_x": "Total of Recipe", "Frequency_y": "Frequency" })

merged_df['Percentage'] = (merged_df['Frequency'].div(merged_df['Total of Recipe']).mul(100))
merged_df

merged_df.groupby('Ingredient')['Cuisine'].count()

Ingredient
allspice           1
almond             2
apple              1
baking powder      4
baking soda        2
                  ..
vegetable oil     15
vinegar           10
wine white         2
yeast              2
yogurt             2
Name: Cuisine, Length: 69, dtype: int64

In [6]:
#Eliminar los ingredientes recurrentes de las recetas por paises 
#salt presente como recurrente en 20 paises
df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('salt')).any(axis=1)]
#sugar presente en 15
df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('sugar')).any(axis=1)]
#garlic present in 17
df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('garlic')).any(axis=1)]

df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('pepper')).any(axis=1)]

df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('water')).any(axis=1)]

df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('onion')).any(axis=1)]

df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('olive')).any(axis=1)]

df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('egg')).any(axis=1)]

df_recipe_analysis_explode = df_recipe_analysis_explode.loc[~df_recipe_analysis_explode.astype(str).apply(lambda x: x.str.contains('butter')).any(axis=1)]

df_recipe_analysis_explode

Unnamed: 0,Recipe_ID,Title,Cuisine,Ingredient_Name
0,1,5 spice vegetable fried rice,Indian Subcontinent,capsicum
0,1,5 spice vegetable fried rice,Indian Subcontinent,sunflower
0,1,5 spice vegetable fried rice,Indian Subcontinent,soy sauce
1,2,aachar aaloo,Indian Subcontinent,nigella seed
1,2,aachar aaloo,Indian Subcontinent,potato
...,...,...,...,...
45771,45772,Enlightened Chicken Pot Pie,USA,worcestershire sauce
45771,45772,Enlightened Chicken Pot Pie,USA,milk
45771,45772,Enlightened Chicken Pot Pie,USA,chicken
45771,45772,Enlightened Chicken Pot Pie,USA,thyme


- Delete recipes with one ingredient

In [8]:
# find_similar_recipes(df_recipe_analysis)
def find_recipes_with_n_ingredients(df, n):
    df_grouped = df.groupby(['Recipe_ID', 'Cuisine']).agg({'Ingredient_Name': list}).reset_index()
    df_grouped.rename(columns={'Ingredient_Name': 'Ingredient_Name'}, inplace=True)
    
    n_ingredient_recipes = []
    for i in range(len(df_grouped)):
        ingredient_count = len(df_grouped.iloc[i]['Ingredient_Name'])
        if ingredient_count == n:
            recipe_name = df_grouped.iloc[i]['Recipe_ID']
            cuisine = df_grouped.iloc[i]['Cuisine']
            n_ingredient_recipes.append(f"{recipe_name}")
    
    return n_ingredient_recipes

In [9]:
one_ingredient_recipe = find_recipes_with_n_ingredients(df_recipe_analysis_explode,1)
df_recipe_analysis_explode = filter_recipes_by_ingredients(df_recipe_analysis_explode, one_ingredient_recipe, 'Recipe_ID')
df_recipe_analysis_explode

Unnamed: 0,Recipe_ID,Title,Cuisine,Ingredient_Name
0,1,5 spice vegetable fried rice,Indian Subcontinent,capsicum
0,1,5 spice vegetable fried rice,Indian Subcontinent,sunflower
0,1,5 spice vegetable fried rice,Indian Subcontinent,soy sauce
1,2,aachar aaloo,Indian Subcontinent,nigella seed
1,2,aachar aaloo,Indian Subcontinent,potato
...,...,...,...,...
44998,44999,Apple-Sausage Stuffing,USA,apple
44998,44999,Apple-Sausage Stuffing,USA,parsley
44998,44999,Apple-Sausage Stuffing,USA,celery
44998,44999,Apple-Sausage Stuffing,USA,cognac brandy


- Delete recipes with 2 ingredients

- Delete uncommon ingredients

In [11]:
#Deteccion de Ingredientes poco Comunes

uncommon_ingredients = detect_uncommon_ingredients(df_recipe_analysis_explode)
# uncommon_ingredients
# df_recipe_analysis_explode.loc[df_recipe_analysis_explode['Ingredient_Name'].isin(uncommon_ingredients)]


# df_recipe_analysis_explode[df_recipe_analysis_explode['Ingredient_Name'].str.contains('strawberry')]

In [12]:
#Deteccion de Ingredientes poco Comunes
df_uncommon_ingredients = filter_uncommon_ingredients(df_recipe_analysis_explode, uncommon_ingredients )


In [13]:
#Remove the uncommon ingredients
df_clean = filter_recipes_by_ingredients(df_recipe_analysis_explode, uncommon_ingredients, 'Ingredient_Name' )
df_clean 

Unnamed: 0,Recipe_ID,Title,Cuisine,Ingredient_Name
0,1,5 spice vegetable fried rice,Indian Subcontinent,capsicum
0,1,5 spice vegetable fried rice,Indian Subcontinent,sunflower
0,1,5 spice vegetable fried rice,Indian Subcontinent,soy sauce
1,2,aachar aaloo,Indian Subcontinent,nigella seed
1,2,aachar aaloo,Indian Subcontinent,potato
...,...,...,...,...
44998,44999,Apple-Sausage Stuffing,USA,apple
44998,44999,Apple-Sausage Stuffing,USA,parsley
44998,44999,Apple-Sausage Stuffing,USA,celery
44998,44999,Apple-Sausage Stuffing,USA,cognac brandy


## Graph analysis and visualization

In [20]:
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt


In [16]:
# filtered_df = filter_low_similarity_pairs(df_recipe, 0.5)
# filtered_df

#### Graph Creation
El grafo se crea a partir de los recipe ID y los entity ID

In [21]:
G = nx.Graph()
# G.add_nodes_from(df_recipe_explode.Recipe_ID, title = df_recipe.Title ,cuisine = df_recipe.Cuisine ,bipartite = 0)
# G.add_nodes_from(df_ingredients.Entity_ID, category = df_ingredients.Category ,bipartite= 1)

G.add_nodes_from(df_clean.Cuisine ,bipartite = 'cuisine')
G.add_nodes_from(df_clean.Ingredient_Name,bipartite= 'ingredient')

# Agregar aristas al grafo
# for recipe_id, ingredient_id in zip(df_recipe_explode["Recipe_ID"], df_recipe_explode["Entity_ID"]):
#     G.add_edge(recipe_id, ingredient_id)
    
# for index, row in df_recipe_explode.iterrows():
#     recipe_id = row["Recipe_ID"]
#     ingredient_id = row["Entity_ID"]
#     G.add_edge(recipe_id, ingredient_id)

# # Obtener lista de aristas
# # edges = G.edges()

# # Imprimir cada arista
# # for edge in edges:
# #     print(edge)

# # Mostrar información detallada del grafo
# num_recipes = len(set(df_recipe["Recipe_ID"]))
# num_ingredients = len(set(df_ingredients["Entity_ID"]))
# print(f"El grafo contiene {num_recipes} recetas y {num_ingredients} ingredientes.")
# print(f"Número total de nodos: {G.number_of_nodes()}")
# print(f"Número total de aristas: {G.number_of_edges()}")

#Guardar Grafo
# nx.write_graphml(G, 'grafo_bipartito.graphml')

In [23]:
cuisine = {node for node, att in G.nodes(data = True) if att['bipartite']=='cuisine'}
ingredients = set(G) - cuisine
print(cuisine)
print(ingredients)

{'Misc.: Belgian', 'South America', 'DACH Countries', 'USA', 'Misc.: Central America', 'Thailand', 'Africa', 'Korea', 'France', 'Scandinavia', 'Australia & NZ', 'Middle East', 'Caribbean', 'Italy', 'Canada', 'Misc.: Dutch', 'South East Asia', 'China', 'Japan', 'Spain', 'Misc.: Portugal', 'Eastern Europe', 'Greece', 'British Isles', 'Mexico', 'Indian Subcontinent'}
{'yam ', 'dressing ranch ', 'caraway ', 'red chily ', 'arrowroot ', 'cake ', 'rum ', 'sauerkraut ', 'plantain french', 'bean cluster ', 'merlot ', 'peanut ', 'pastry ', 'rhubarb ', 'lady finger ', 'shallot ', 'tumeric ', 'frankfurter sausage ', 'coleslaw ', 'hen  guinea', 'string  bean', 'pie crust ', 'carrot ', 'pistachio ', 'green chile ', 'italian seasoning ', 'wasabi ', 'crab ', 'half half ', 'couscou ', 'blue  cheese', 'gruyere  cheese', 'chickpea ', 'molass ', 'bay leaf ', 'pasta ', 'prosciutto ', 'brandy cognac ', 'basmati ', 'coriander seed ', 'millet ', 'cooking spray ', 'soybean ', 'anise ', 'cinnamon ', 'cajun seas

In [25]:
# Agregar aristas entre ingredientes y cocinas que contienen ese ingrediente
for ingredient in ingredients:
    ingredient_data = df_clean[df_clean['Ingredient_Name'] == ingredient]
    cuisines = ingredient_data['Cuisine'].unique()
    for cuisine in cuisines:
        cuisine_data = ingredient_data[ingredient_data['Cuisine'] == cuisine]
        num_recipes = len(cuisine_data['Recipe_ID'].unique())
        G.add_edge(ingredient, cuisine, weight=num_recipes)


# Imprimir información del grafo
print(f"Número total de aristas: {G.number_of_edges()}")



Número total de aristas: 7015
[('Indian Subcontinent', 'yam '), ('Indian Subcontinent', 'caraway '), ('Indian Subcontinent', 'red chily '), ('Indian Subcontinent', 'rum '), ('Indian Subcontinent', 'plantain french'), ('Indian Subcontinent', 'bean cluster '), ('Indian Subcontinent', 'peanut '), ('Indian Subcontinent', 'pastry '), ('Indian Subcontinent', 'rhubarb '), ('Indian Subcontinent', 'shallot '), ('Indian Subcontinent', 'tumeric '), ('Indian Subcontinent', 'hen  guinea'), ('Indian Subcontinent', 'string  bean'), ('Indian Subcontinent', 'pie crust '), ('Indian Subcontinent', 'carrot '), ('Indian Subcontinent', 'pistachio '), ('Indian Subcontinent', 'green chile '), ('Indian Subcontinent', 'italian seasoning '), ('Indian Subcontinent', 'crab '), ('Indian Subcontinent', 'half half '), ('Indian Subcontinent', 'couscou '), ('Indian Subcontinent', 'chickpea '), ('Indian Subcontinent', 'molass '), ('Indian Subcontinent', 'bay leaf '), ('Indian Subcontinent', 'pasta '), ('Indian Subcontin

In [27]:
#Guardar Grafo
nx.write_graphml(G, 'grafo_bipartito.graphml')

#### Graph visualization in networkx

In [None]:
from pyvis.network import Network

In [None]:
def draw_network(
    nodes: list,
    df: pd.DataFrame,
    minium_weight: int = 0,
    repulsion: int = 100,
    spring_length=200,
    buttons=False,
):

    net = Network("500px", "500px", notebook=True)
    net.add_nodes(nodes)

    # add edges
    for node, weights in df.iterrows():
        edges = get_edges(node, weights, nodes, minium_weight)
        net.add_edges(edges)

    # change node distance and spring length
    net.repulsion(repulsion, spring_length=spring_length)

    # Tweek configuration UI
    net.show_buttons(filter_=buttons)
    return net

In [None]:
def get_edges(node: str, weights: list, all_nodes: list, minium_weight: int):

    nodes = all_nodes.copy()

    # Remove target node
    nodes.remove(node)

    # Create a list of edges with weights
    edges = [(node, connection, weight) for connection, weight in zip(nodes, weights)]

    # Get only edges with weights greater than the minimum weight
    edges = [edge for edge in edges if edge[2] >= minium_weight]

    return edges

In [None]:
net = draw_network(df_clean["Cuisine"], df_clean, minium_weight=9, repulsion=100, spring_length=500)
net.show("match.html")



AttributeError: 'Series' object has no attribute 'remove'

In [None]:

# Dibujar el grafo
# pos = nx.bipartite_layout(H, set(df_recipe["Recipe_ID"]))
# nx.draw(H, pos, with_labels=True)
# plt.show()

# Visualizacion de un grafo bipartito
nx.draw_networkx(G, pos= nx.bipartite_layout(G, df_recipe_explode["Recipe_ID"], align = 'horizontal'))

#### Creation projection of the graph

In [None]:
Gs = bipartite.projected_graph(G)