# Data Loader

This notebook integrates a set of classes and functions for working with recipes and ingredients.


Necessary Imports

In [1]:
# * -----------------------------Necessary Imports ----------------------------- #
import json
import pandas as pd

In [2]:

# * ----------------------------- CLASS DATALOADER ----------------------------- #
class DataLoader:
    """Clase genérica para procesar información de un archivo csv o json."""

    def __init__(self, file_path, file_type="json"):
        self._file_path = file_path
        self._file_type = file_type

    def load_data(self):
        if self._file_type == "json":
            with open(self._file_path, "r") as f:
                data = json.load(f)
            df = pd.DataFrame(data)
        elif self._file_type == "csv":
            df = pd.read_csv(self._file_path)
        else:
            raise ValueError(f"Unsupported file type: {self._file_type}")
        
        return df

### Create Data Frames for each csv files.

The DataLoader instance loads four different DataFrames from csv files:

- `df_recipe`: contains information about different recipe IDs, recipe names, and other recipe details.
- `df_ingredients`: contains information about ingredient names, synonyms, and entity IDs.
- `df_compound_ingredients`: contains information about compound ingredient names, synonyms, and constituent ingredients.
- `df_relation_recipe_ingredients`: contains information about the relationship between recipe IDs and ingredient names, as well as the original ingredient names and entity IDs.

In [3]:
df_recipe = DataLoader("database/01_Recipe_Details.csv","csv").load_data()
df_ingredients = DataLoader("database/02_Ingredients.csv","csv").load_data()
df_compound_ingredients = DataLoader("database/03_Compound_Ingredients.csv", "csv").load_data()
df_relation_recipe_ingredients = DataLoader("database/04_Recipe-Ingredients_Aliases.csv", "csv").load_data()

#Renaming the Colums 
df_recipe = df_recipe.rename(columns={"Recipe ID": "Recipe_ID"})
df_ingredients = df_ingredients.rename(columns={"Aliased Ingredient Name": "Ingredient_Name", "Ingredient Synonyms": "Synonyms", "Entity ID": "Entity_ID"})
df_compound_ingredients = df_compound_ingredients.rename(columns={"entity id": "Entity_ID","Compound Ingredient Name": "Compound_Name","Compound Ingredient Synonyms": "Compound_Synonyms","Contituent Ingredients": "Contituent"})
df_relation_recipe_ingredients = df_relation_recipe_ingredients.rename(columns={"Recipe ID": "Recipe_ID","Original Ingredient Name": "Original_Name", "Aliased Ingredient Name": "Ingredient_Name", "Entity ID": "Entity_ID" })


### Data Cleaning

Before we can start working with the data, we need to clean it up. First, we'll rename some of the columns to make them easier to work with

In [4]:
def add_ingredients_to_recipe(df_recipe, df_relation_recipe_ingredients):
    # Agrupar los ingredientes por receta
    ingredientes_por_receta = df_relation_recipe_ingredients.groupby("Recipe_ID")["Entity_ID"].apply(set)
    # Crear una nueva columna en el dataframe de recetas con los ingredientes
    df_recipe["Entity_ID"] = df_recipe["Recipe_ID"].apply(lambda x: ingredientes_por_receta.get(x, set()))
    return df_recipe

df_recipe= add_ingredients_to_recipe(df_recipe,df_relation_recipe_ingredients)
#pd.set_option('display.max_rows', None)
df_recipe


Unnamed: 0,Recipe_ID,Title,Source,Cuisine,Entity_ID
0,1,5 spice vegetable fried rice,TARLA_DALAL,Indian Subcontinent,"{426, 362, 291}"
1,2,aachar aaloo,TARLA_DALAL,Indian Subcontinent,"{258, 392, 362, 426, 332, 2001, 61, 373, 317}"
2,3,aadu lassan keri nu athanu,TARLA_DALAL,Indian Subcontinent,"{992, 258, 426, 299, 2001, 341, 376, 317}"
3,4,aaloo kofta,TARLA_DALAL,Indian Subcontinent,"{992, 932, 327, 330, 331, 332, 426, 364, 212, ..."
4,5,aaloo tamatar subzi,TARLA_DALAL,Indian Subcontinent,"{240, 385, 426}"
...,...,...,...,...,...
45767,45768,Strawberries & Cream Frappé,EPICURIOUS,USA,"{2017, 794, 234, 803}"
45768,45769,Chocolate Whiskey Bundt Cake,EPICURIOUS,USA,"{0, 996, 2056, 778, 780, 781, 46, 815, 245, 25..."
45769,45770,"Wild Rice with Butternut Squash, Leeks, and Corn",EPICURIOUS,USA,"{778, 526, 338, 307, 56, 345, 60, 670}"
45770,45771,Fruit Crumble,EPICURIOUS,USA,"{2056, 201, 778, 781, 281, 763, 60}"


## Pre-processing  
- dependiento de los analisis puede ser necesario eliminar ciertos datos 
  - por ejemplo los ingredientes como azucar, sal, aceite pueden no ser relevantes 
  - en el caso de analizar que ingredientes son mas utilizados por cultura puede ser necesario eliminar los anteriores

In [5]:
#df_recipe[df_recipe.Source == 'TARLA_DALAL']
#df_recipe
# df_ingredients

# Divide la columna de ingredientes en filas separadas
df_recipe_explode = df_recipe.explode('Entity_ID')
df_recipe



Unnamed: 0,Recipe_ID,Title,Source,Cuisine,Entity_ID
0,1,5 spice vegetable fried rice,TARLA_DALAL,Indian Subcontinent,"{426, 362, 291}"
1,2,aachar aaloo,TARLA_DALAL,Indian Subcontinent,"{258, 392, 362, 426, 332, 2001, 61, 373, 317}"
2,3,aadu lassan keri nu athanu,TARLA_DALAL,Indian Subcontinent,"{992, 258, 426, 299, 2001, 341, 376, 317}"
3,4,aaloo kofta,TARLA_DALAL,Indian Subcontinent,"{992, 932, 327, 330, 331, 332, 426, 364, 212, ..."
4,5,aaloo tamatar subzi,TARLA_DALAL,Indian Subcontinent,"{240, 385, 426}"
...,...,...,...,...,...
45767,45768,Strawberries & Cream Frappé,EPICURIOUS,USA,"{2017, 794, 234, 803}"
45768,45769,Chocolate Whiskey Bundt Cake,EPICURIOUS,USA,"{0, 996, 2056, 778, 780, 781, 46, 815, 245, 25..."
45769,45770,"Wild Rice with Butternut Squash, Leeks, and Corn",EPICURIOUS,USA,"{778, 526, 338, 307, 56, 345, 60, 670}"
45770,45771,Fruit Crumble,EPICURIOUS,USA,"{2056, 201, 778, 781, 281, 763, 60}"


## Graph analysis and visualization

In [6]:
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt


In [13]:

def jaccard_similarity(set1, set2):
    """
    Calcula la similitud Jaccard entre dos conjuntos de elementos.

    Args:
    - set1: Conjunto 1.
    - set2: Conjunto 2.

    Returns:
    - La similitud Jaccard entre set1 y set2.
    """
    intersection_size = len(set1.intersection(set2))
    union_size = len(set1.union(set2))
    return intersection_size / union_size

def filter_low_similarity_pairs(df, threshold):
    """
    Filtra los pares de recetas con una similitud Jaccard inferior a un umbral específico.

    Args:
    - df: Dataset que contiene las recetas y sus ingredientes.
    - threshold: Umbral de similitud Jaccard.

    Returns:
    - El dataset que contiene solo los pares de recetas con una similitud Jaccard superior al umbral.
    """
    recipe_ingredients = df.groupby("Recipe_ID")["Entity_ID"]
    recipe_pairs = [(i, j) for i in recipe_ingredients.index for j in recipe_ingredients.index if i < j]
    similarities = [(i, j, jaccard_similarity(recipe_ingredients[i], recipe_ingredients[j])) for i, j in recipe_pairs]
    similarities_df = pd.DataFrame(similarities, columns=["recipe1", "recipe2", "similarity"])
    filtered_df = similarities_df[similarities_df["similarity"] > threshold]
    return filtered_df

In [16]:
filtered_df = filter_low_similarity_pairs(df_recipe, 0.5)
filtered_df

AttributeError: 'SeriesGroupBy' object has no attribute 'index'

#### Graph Creation
El grafo se crea a partir de los recipe ID y los entity ID

In [None]:
G = nx.Graph()
G.add_nodes_from(df_recipe_explode.Recipe_ID, title = df_recipe.Title ,cuisine = df_recipe.Cuisine ,bipartite = 0)
G.add_nodes_from(df_ingredients.Entity_ID, category = df_ingredients.Category ,bipartite= 1)

# Agregar aristas al grafo
for recipe_id, ingredient_id in zip(df_recipe["Recipe_ID"], df_recipe["Entity_ID"]):
    G.add_edge(recipe_id, ingredient_id)
    
# Mostrar información detallada del grafo
num_recipes = len(set(df_recipe["Recipe_ID"]))
num_ingredients = len(set(df_ingredients["Entity_ID"]))
print(f"El grafo contiene {num_recipes} recetas y {num_ingredients} ingredientes.")
print(f"Número total de nodos: {G.number_of_nodes()}")
print(f"Número total de aristas: {G.number_of_edges()}")


#### Graph visualization in networkx

In [None]:
# Dibujar el grafo
pos = nx.bipartite_layout(H, set(df_recipe["Recipe_ID"]))
nx.draw(H, pos, with_labels=True)
plt.show()