# Preparing environment

In [1]:
#!pip3 install ipython-sql
#!pip install fuzzywuzzyimport requests
#!pip install python-Levenshtein

In [2]:
import requests
import json
import pandas as pd
import psycopg2
from rapidfuzz import process, fuzz
import numpy as np

# Extract

**API DOC**
https://spoonacular.com/application/frontend/downloads/spoonacular-api-slides.pdf

**API Link** https://spoonacular.com/food-api

In [3]:
def extract_data (api_url, num_recipes=100):
  params = { 'number': num_recipes}
  try:
    response = requests.get(api_url, params=params)
    response.raise_for_status() # Raise an exception for non-2xx status codes
    try:
        data = response.json()
        return data
    except json.JSONDecodeError as e:
            print(f"Error parsing JSON data: {e}")
            return None
  except requests.exceptions.RequestException as e:
        print(f"Error extracting data from API: {e}")
        return None

In [None]:
API_KEY = '9fe94823f081434989282d1622cfbc31'
API_URL = 'https://api.spoonacular.com/recipes/random?apiKey=' + API_KEY
data = extract_data(API_URL)
if data:
    # Process the extracted data
    print(data)
else:
    print("An error occurred while fetching data.")

# Transform

## Json to Pandas
you can import any json data into pandas, the keys will become columns names

In [5]:
# here we imported the data as it is
# because our json file has a main head called reciepes, the dataframe here has only one column
# {key:[value]}
#dfj = pd.DataFrame(data)
#dfj.head()

In [6]:
df = pd.DataFrame(data['recipes']) # [{},{},{}]
df.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,lowFodmap,weightWatcherSmartPoints,...,cuisines,dishTypes,diets,occasions,instructions,analyzedInstructions,originalId,spoonacularScore,spoonacularSourceUrl,license
0,False,False,True,False,False,False,False,False,False,12,...,[],"[lunch, main course, main dish, dinner]","[gluten free, primal]",[],<ol><li>Coat chicken with a mixture of 6 tsp. ...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,81.589928,https://spoonacular.com/alouette-chicken-papri...,
1,True,True,True,True,True,False,False,False,False,4,...,[],"[antipasti, starter, snack, appetizer, antipas...","[gluten free, dairy free, paleolithic, lacto o...",[],Heat oil in a large nonstick skillet over medi...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,94.78595,https://spoonacular.com/baby-bok-choy-stir-fry...,CC BY 3.0
2,False,False,False,False,False,False,False,False,False,35,...,[],[dessert],[],[],<ol><li>Crumble cookies and save 1/3 for toppi...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,0.366684,https://spoonacular.com/oreo-cake-654018,CC BY 3.0
3,True,False,False,False,False,False,False,False,False,10,...,[],[dessert],[lacto ovo vegetarian],[],<ol><li>Beat heavy cream until medium peaks fo...,"[{'name': '', 'steps': [{'number': 1, 'step': ...",,13.017535,https://spoonacular.com/oreo-cookies-cream-no-...,CC BY 3.0
4,False,False,True,False,False,False,False,False,False,18,...,[],"[lunch, main course, morning meal, brunch, mai...",[gluten free],[],"<ol><li>In a large mixing bowl add six eggs, s...","[{'name': '', 'steps': [{'number': 1, 'step': ...",,76.204941,https://spoonacular.com/frittata-643857,


In [7]:
print(len(df))  # Check the number of rows in the final DataFrame

100


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   vegetarian                100 non-null    bool   
 1   vegan                     100 non-null    bool   
 2   glutenFree                100 non-null    bool   
 3   dairyFree                 100 non-null    bool   
 4   veryHealthy               100 non-null    bool   
 5   cheap                     100 non-null    bool   
 6   veryPopular               100 non-null    bool   
 7   sustainable               100 non-null    bool   
 8   lowFodmap                 100 non-null    bool   
 9   weightWatcherSmartPoints  100 non-null    int64  
 10  gaps                      100 non-null    object 
 11  preparationMinutes        1 non-null      float64
 12  cookingMinutes            1 non-null      float64
 13  aggregateLikes            100 non-null    int64  
 14  healthScore

In [9]:
df_test=df.copy()

## Defining a Function that extracts keys from dictionaries in a dataframe column

In [10]:

def column_list_dict_to_set_keys(input):
  """
  Extracting a set containing the unique keys of lists of dictionaries contained in the rows of a dataframe column

  Args:
      input: a dataframe column containing a list of dictionaries in each row.

  Returns:
      a set containing unique keys.
  """
  #the input should be a column of a dataframe containing a list of dictionaries in each row
  dataframe_column = input

  # Create an empty set to store all unique keys (efficient for uniqueness)
  all_keys = set()

  # Iterate through each row (list of dictionaries) in the DataFrame
  for list_of_dicts_per_row in dataframe_column:
    
    # Ensure the row is a list
    if not isinstance(list_of_dicts_per_row, list):
      #print(f"Skipping non-list row: {list_of_dicts_per_row}")
      continue  # Skip to the next row if not a list

    # Extract keys from dictionaries in the current row
    for each_dict in list_of_dicts_per_row:
        # Check if the element is a dictionary 
        if isinstance(each_dict, dict):
          all_keys.update(each_dict.keys())  # Efficiently add keys to the set
  return all_keys
    

## Dealing with recipes

In [11]:
# Preparing the needed columns for recipes dataframe
column_names = list(df_test.columns)
print(column_names)
print('\n')

# List of columns to be removed
columns_to_remove = [
    'weightWatcherSmartPoints', 'gaps', 'preparationMinutes', 'cookingMinutes',
    'aggregateLikes', 'healthScore', 'creditsText', 'extendedIngredients', 'id',
    'sourceName', 'imageType', 'image', 'author', 'instructions',
    'analyzedInstructions', 'originalId', 'spoonacularScore', 'spoonacularSourceUrl',
    'diets', 'occasions', 'dishTypes', 'cuisines'
]

# Remove columns from column_names if they exist in df_test.columns
for column in columns_to_remove:
    if column in df_test.columns:
        column_names.remove(column)

print(column_names)


['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy', 'cheap', 'veryPopular', 'sustainable', 'lowFodmap', 'weightWatcherSmartPoints', 'gaps', 'preparationMinutes', 'cookingMinutes', 'aggregateLikes', 'healthScore', 'creditsText', 'sourceName', 'pricePerServing', 'extendedIngredients', 'id', 'title', 'readyInMinutes', 'servings', 'sourceUrl', 'image', 'imageType', 'summary', 'cuisines', 'dishTypes', 'diets', 'occasions', 'instructions', 'analyzedInstructions', 'originalId', 'spoonacularScore', 'spoonacularSourceUrl', 'license']


['vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy', 'cheap', 'veryPopular', 'sustainable', 'lowFodmap', 'pricePerServing', 'title', 'readyInMinutes', 'servings', 'sourceUrl', 'summary', 'license']


In [12]:
# Constructing recipe dataframe
dfrecipes = pd.DataFrame(df_test, columns=column_names)
dfrecipes.head()

Unnamed: 0,vegetarian,vegan,glutenFree,dairyFree,veryHealthy,cheap,veryPopular,sustainable,lowFodmap,pricePerServing,title,readyInMinutes,servings,sourceUrl,summary,license
0,False,False,True,False,False,False,False,False,False,329.59,Alouette Chicken Paprika,45,4,https://www.foodista.com/recipe/62BLCZVT/aloue...,Alouette Chicken Paprika takes around <b>45 mi...,
1,True,True,True,True,True,False,False,False,False,109.24,Baby Bok Choy Stir Fry,45,2,https://www.foodista.com/recipe/4KHXC282/baby-...,Baby Bok Choy Stir Fry is a hor d'oeuvre that ...,CC BY 3.0
2,False,False,False,False,False,False,False,False,False,238.92,Oreo Cake,45,4,https://www.foodista.com/recipe/MQXQDZWD/oreo-...,Oreo Cake might be a good recipe to expand you...,CC BY 3.0
3,True,False,False,False,False,False,False,False,False,48.24,Oreo Cookies & Cream No-Bake Cheesecake,45,20,http://www.foodista.com/recipe/WWFJTZ7N/oreo-c...,Oreo Cookies & Cream No-Bake Cheesecake requir...,CC BY 3.0
4,False,False,True,False,False,False,False,False,False,270.59,Frittata,45,2,https://www.foodista.com/recipe/SNFFG2VB/frittata,Frittata requires about <b>45 minutes</b> from...,


In [13]:
print(len(dfrecipes))  # Check the number of rows in the final DataFrame

100


In [14]:
# Generate successive numbers for the 'id_recipe' column
dfrecipes['id_recipe'] = range(1, len(dfrecipes) + 1)

In [15]:
columns_to_rename={
    'vegetarian': 'is_vegetarian', 'vegan': 'is_vegan',
    'glutenFree': 'is_glutenFree', 'dairyFree': 'is_dairyFree',
    'veryHealthy': 'is_healthy', 'cheap': 'is_cheap',
    'veryPopular': 'is_Popular', 'sustainable': 'is_sustainable',
    'lowFodmap': 'is_lowFodmap', 'pricePerServing': 'price_per_serving', 
    'readyInMinutes': 'ready_min', 'sourceUrl': 'source_url', 
    'title': 'recipe_title'}
dfrecipes = dfrecipes.rename(columns=columns_to_rename)

In [16]:
# Find duplicates
duplicates = dfrecipes[dfrecipes.duplicated(subset=['recipe_title'])]
duplicates

Unnamed: 0,is_vegetarian,is_vegan,is_glutenFree,is_dairyFree,is_healthy,is_cheap,is_Popular,is_sustainable,is_lowFodmap,price_per_serving,recipe_title,ready_min,servings,source_url,summary,license,id_recipe


In [17]:
dfrecipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   is_vegetarian      100 non-null    bool   
 1   is_vegan           100 non-null    bool   
 2   is_glutenFree      100 non-null    bool   
 3   is_dairyFree       100 non-null    bool   
 4   is_healthy         100 non-null    bool   
 5   is_cheap           100 non-null    bool   
 6   is_Popular         100 non-null    bool   
 7   is_sustainable     100 non-null    bool   
 8   is_lowFodmap       100 non-null    bool   
 9   price_per_serving  100 non-null    float64
 10  recipe_title       100 non-null    object 
 11  ready_min          100 non-null    int64  
 12  servings           100 non-null    int64  
 13  source_url         100 non-null    object 
 14  summary            100 non-null    object 
 15  license            57 non-null     object 
 16  id_recipe          100 non-

## Dealing with ingredients

In [18]:
# Extract the column that contains all the ingredients
dfAllIng = pd.DataFrame(df_test, columns=['extendedIngredients'])
dfAllIng.head(10)

Unnamed: 0,extendedIngredients
0,"[{'id': 1001, 'aisle': 'Milk, Eggs, Other Dair..."
1,"[{'id': 11116, 'aisle': 'Produce', 'image': 'b..."
2,"[{'id': 1001, 'aisle': 'Milk, Eggs, Other Dair..."
3,"[{'id': 1017, 'aisle': 'Cheese', 'image': 'cre..."
4,"[{'id': 10211821, 'aisle': 'Produce', 'image':..."
5,"[{'id': 99242, 'aisle': 'Produce', 'image': 'a..."
6,"[{'id': 18369, 'aisle': 'Baking', 'image': 'wh..."
7,"[{'id': 11821, 'aisle': 'Produce', 'image': 'r..."
8,"[{'id': 18334, 'aisle': 'Refrigerated', 'image..."
9,"[{'id': 11011, 'aisle': 'Produce', 'image': 'a..."


In [19]:
print(len(dfAllIng))  # Check the number of rows

100


In [20]:
# Preparing the needed columns for ingredients dataframe
all_keys_ing = column_list_dict_to_set_keys(dfAllIng['extendedIngredients'])
print(all_keys_ing)
print('\n')

# List of keys to be removed
keys_to_remove = ["originalName", "meta", "image", "original", "amount", "unit", "id"]

# Remove keys from all_keys_ing if they exist in dfAllIng['extendedIngredients']
for key in keys_to_remove:
    if key in all_keys_ing:
        all_keys_ing.remove(key)

print(all_keys_ing)


{'meta', 'unit', 'originalName', 'measures', 'original', 'image', 'consistency', 'nameClean', 'id', 'amount', 'aisle', 'name'}


{'measures', 'consistency', 'nameClean', 'aisle', 'name'}


In [21]:
#obtaining a copy from ing_keys because we will need it later when working with measures
ing_keys = all_keys_ing.copy()

#removing measures element
ing_keys.remove("measures")

# Convert the set of keys to a regular list
ing_keys_list = list(ing_keys)  

# Create the DataFrame with the collected keys as column names
dfIng = pd.DataFrame(columns=ing_keys_list) 
dfIng.head()

Unnamed: 0,consistency,nameClean,aisle,name


In [22]:
#all_ingredients variable recives a list of all dictionaries in the extendedIngredients column
#note that a dataframe row is a dictionary
all_ingredients = df_test['extendedIngredients'].apply(lambda x: [d for d in x if isinstance(d, dict)]).sum()

#Constructing the ingredients dataframe
dfIng = pd.DataFrame(all_ingredients, columns=ing_keys_list)
dfIng.head()


Unnamed: 0,consistency,nameClean,aisle,name
0,SOLID,butter,"Milk, Eggs, Other Dairy",butter
1,SOLID,chicken breast,Meat,chicken breasts halves
2,SOLID,garlic powder,Spices and Seasonings,garlic powder
3,SOLID,garlic herb spreadable cheese,Cheese,alouette garlic & herbs spreadable cheese
4,LIQUID,milk,"Milk, Eggs, Other Dairy",milk


In [23]:
print(len(dfIng))  # Check the number of rows in the final DataFrame

1028


In [24]:
dfIng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   consistency  1028 non-null   object
 1   nameClean    1020 non-null   object
 2   aisle        1028 non-null   object
 3   name         1028 non-null   object
dtypes: object(4)
memory usage: 32.2+ KB


In [25]:
#We need the nameClean column elements to be accurate so we should deal whith its Null elements
none_rows_ing = dfIng[dfIng['nameClean'].isna()]
none_rows_ing

Unnamed: 0,consistency,nameClean,aisle,name
41,SOLID,,?,saki
42,SOLID,,?,saki
164,SOLID,,?,frangelico
165,SOLID,,?,frangelico
372,SOLID,,?,parmesean
373,SOLID,,?,parmesean
668,SOLID,,?,ground
810,SOLID,,?,ground


In [26]:
# filling the missing values in the nameClean column with the values from the originalName column.
dfIng['nameClean'] = dfIng['nameClean'].fillna(dfIng['name'])

In [27]:
# check if the nameClean Null values are well filled 
none_rows_ing = dfIng[dfIng['nameClean'].isna()]
none_rows_ing

Unnamed: 0,consistency,nameClean,aisle,name


In [28]:
# Now after I successfully filled the nameClean null values i will drop originalName column
dfIng.drop('name', axis=1, inplace=True)

In [29]:
dfIng.head()

Unnamed: 0,consistency,nameClean,aisle
0,SOLID,butter,"Milk, Eggs, Other Dairy"
1,SOLID,chicken breast,Meat
2,SOLID,garlic powder,Spices and Seasonings
3,SOLID,garlic herb spreadable cheese,Cheese
4,LIQUID,milk,"Milk, Eggs, Other Dairy"


In [30]:
#rename columns
dfIng = dfIng.rename(columns={'nameClean': 'ing_name'})
dfIng.head()

Unnamed: 0,consistency,ing_name,aisle
0,SOLID,butter,"Milk, Eggs, Other Dairy"
1,SOLID,chicken breast,Meat
2,SOLID,garlic powder,Spices and Seasonings
3,SOLID,garlic herb spreadable cheese,Cheese
4,LIQUID,milk,"Milk, Eggs, Other Dairy"


In [31]:
dfIng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   consistency  1028 non-null   object
 1   ing_name     1028 non-null   object
 2   aisle        1028 non-null   object
dtypes: object(3)
memory usage: 24.2+ KB


In [32]:
# Remove duplicates
dfIng = dfIng.drop_duplicates()
dfIng.reset_index(drop=True, inplace=True)

# Display the resulting DataFrame
dfIng.head()

Unnamed: 0,consistency,ing_name,aisle
0,SOLID,butter,"Milk, Eggs, Other Dairy"
1,SOLID,chicken breast,Meat
2,SOLID,garlic powder,Spices and Seasonings
3,SOLID,garlic herb spreadable cheese,Cheese
4,LIQUID,milk,"Milk, Eggs, Other Dairy"


In [33]:
print(len(dfIng))  # Check the number of rows in the final DataFrame

380


In [34]:
# Generate successive numbers for the 'id_ingredient' column
dfIng['id_ingredient'] = range(1, len(dfIng) + 1)

# here we will not drop the id column as we will use it later to deal with ing_name consistency

In [35]:
dfIng.head()

Unnamed: 0,consistency,ing_name,aisle,id_ingredient
0,SOLID,butter,"Milk, Eggs, Other Dairy",1
1,SOLID,chicken breast,Meat,2
2,SOLID,garlic powder,Spices and Seasonings,3
3,SOLID,garlic herb spreadable cheese,Cheese,4
4,LIQUID,milk,"Milk, Eggs, Other Dairy",5


## Dealing with measures (reference_ing)

### extracting needed columns

In [36]:
# Extract the column that contains all the ingredients measures along with the recipes titles
dfrecipeIng = pd.DataFrame(df_test, columns=['extendedIngredients', 'title'])
dfrecipeIng.head(10)

Unnamed: 0,extendedIngredients,title
0,"[{'id': 1001, 'aisle': 'Milk, Eggs, Other Dair...",Alouette Chicken Paprika
1,"[{'id': 11116, 'aisle': 'Produce', 'image': 'b...",Baby Bok Choy Stir Fry
2,"[{'id': 1001, 'aisle': 'Milk, Eggs, Other Dair...",Oreo Cake
3,"[{'id': 1017, 'aisle': 'Cheese', 'image': 'cre...",Oreo Cookies & Cream No-Bake Cheesecake
4,"[{'id': 10211821, 'aisle': 'Produce', 'image':...",Frittata
5,"[{'id': 99242, 'aisle': 'Produce', 'image': 'a...",Eggplant & Artichoke Heart Galettes
6,"[{'id': 18369, 'aisle': 'Baking', 'image': 'wh...",Dark Shadows Baileys Chocolate Cheesecake Brow...
7,"[{'id': 11821, 'aisle': 'Produce', 'image': 'r...",Easy Slow Cooker Artichoke Garlic Chicken
8,"[{'id': 18334, 'aisle': 'Refrigerated', 'image...",Caramelised Onion and Mushroom Quiche
9,"[{'id': 11011, 'aisle': 'Produce', 'image': 'a...",Asparagus Eggs Benedict


In [37]:
dfrecipeIng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   extendedIngredients  100 non-null    object
 1   title                100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


### obtaining a new dataframe that contains measures of ingredients in each recipe

In [38]:
# Preparing the needed columns for measures dataframe
print(all_keys_ing)
print('\n')

# Initialize measure_keys as a copy of all_keys_ing
measure_keys = all_keys_ing.copy()

# List of keys to be removed
keys_to_remove = ["aisle", "consistency"]

# Remove keys from measure_keys if they exist in all_keys_ing
for key in keys_to_remove:
    if key in measure_keys:
        measure_keys.remove(key)

print(measure_keys)

# Convert the set of keys to a regular list
measure_keys_list = list(measure_keys)

# Add 'title' column
measure_keys_list.append('title')

print(measure_keys_list)


{'measures', 'consistency', 'nameClean', 'aisle', 'name'}


{'nameClean', 'name', 'measures'}
['nameClean', 'name', 'measures', 'title']


In [39]:
# Create the DataFrame with the collected keys as column names
dfmeasures = pd.DataFrame(columns=measure_keys_list)
dfmeasures.head()

Unnamed: 0,nameClean,name,measures,title


In [40]:
#first_two_rows = dfrecipeIng.iloc[:2]
#first_two_rows.head()

In [41]:

# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame
for i, row_series in dfrecipeIng.iterrows():
    # Access the recipe title for the current row 
    title = row_series['title']  # title is a string
    
    # Access the 'extendedIngredients' column for the current row
    recipe_ingredients_list = row_series['extendedIngredients']  # recipe_ingredients_list is a list
    
    if recipe_ingredients_list: # if recipe_ingredients_list is not empty
        # Iterate through each dictionary in the list
        for each_dict in recipe_ingredients_list:
            if each_dict and isinstance(each_dict, dict):  # Check if each_dict is non-empty and is a dictionary
                # Get measures dict 
                measures_dict = each_dict.get('measures', {})
                # Transform measures dict to a list
                measures_list = [measures_dict]       
            else:
                measures_list = None
            
            # Create a dictionary for the row data
            row_data = {
                'nameClean': each_dict.get('nameClean', None),
                'measures': measures_list,
                'name': each_dict.get('name'),
                'title': title
            }
            
            # Append the row data to the list
            rows_data.append(row_data)

# Create the DataFrame from the list of row data
dfmeasures = pd.DataFrame(rows_data, columns=measure_keys_list)


In [42]:
dfmeasures.head()

Unnamed: 0,nameClean,name,measures,title
0,butter,butter,"[{'us': {'amount': 1.0, 'unitShort': 'Tbsp', '...",Alouette Chicken Paprika
1,chicken breast,chicken breasts halves,"[{'us': {'amount': 4.0, 'unitShort': '', 'unit...",Alouette Chicken Paprika
2,garlic powder,garlic powder,"[{'us': {'amount': 2.0, 'unitShort': 'tsps', '...",Alouette Chicken Paprika
3,garlic herb spreadable cheese,alouette garlic & herbs spreadable cheese,"[{'us': {'amount': 6.5, 'unitShort': 'oz', 'un...",Alouette Chicken Paprika
4,milk,milk,"[{'us': {'amount': 1.0, 'unitShort': 'Tbsp', '...",Alouette Chicken Paprika


In [43]:
print(len(dfmeasures))  # Check the number of rows (should be equal to the number of ingredients)

1028


In [44]:
dfmeasures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   nameClean  1020 non-null   object
 1   name       1028 non-null   object
 2   measures   1028 non-null   object
 3   title      1028 non-null   object
dtypes: object(4)
memory usage: 32.2+ KB


In [45]:
#We need the nameClean column elements to be accurate so we should deal whith its Null elements
none_rows_ing = dfmeasures[dfmeasures['nameClean'].isna()]
none_rows_ing

Unnamed: 0,nameClean,name,measures,title
41,,saki,"[{'us': {'amount': 1.0, 'unitShort': 'oz', 'un...",Frittata
42,,saki,"[{'us': {'amount': 1.0, 'unitShort': 'oz', 'un...",Frittata
164,,frangelico,"[{'us': {'amount': 1.0, 'unitShort': 'tsp', 'u...",Naturally Sweet Apple Turnovers
165,,frangelico,"[{'us': {'amount': 1.0, 'unitShort': 'tsp', 'u...",Naturally Sweet Apple Turnovers
372,,parmesean,"[{'us': {'amount': 1.0, 'unitShort': 'cup', 'u...",Best Potato Cheese Soup in a bread bowl
373,,parmesean,"[{'us': {'amount': 1.0, 'unitShort': 'cup', 'u...",Best Potato Cheese Soup in a bread bowl
668,,ground,"[{'us': {'amount': 0.25, 'unitShort': 'tsps', ...",Zesty Green Pea and Jalapeño Pesto Pasta
810,,ground,"[{'us': {'amount': 1.0, 'unitShort': 'cloves',...",Triple Chocolate Pumpkin Pie


In [46]:
# filling the missing values in the nameClean column with the values from the originalName column.
dfmeasures['nameClean'] = dfmeasures['nameClean'].fillna(dfmeasures['name'])

In [47]:
# check if the nameClean Null values are well filled 
none_rows_m = dfmeasures[dfmeasures['nameClean'].isna()]
none_rows_m

Unnamed: 0,nameClean,name,measures,title


In [48]:
# Now after I successfully filled the nameClean null values i will drop originalName column
dfmeasures.drop('name', axis=1, inplace=True)

In [49]:
dfmeasures= dfmeasures.rename(columns={'nameClean': 'ing_name'})

In [50]:
dfmeasures.head()

Unnamed: 0,ing_name,measures,title
0,butter,"[{'us': {'amount': 1.0, 'unitShort': 'Tbsp', '...",Alouette Chicken Paprika
1,chicken breast,"[{'us': {'amount': 4.0, 'unitShort': '', 'unit...",Alouette Chicken Paprika
2,garlic powder,"[{'us': {'amount': 2.0, 'unitShort': 'tsps', '...",Alouette Chicken Paprika
3,garlic herb spreadable cheese,"[{'us': {'amount': 6.5, 'unitShort': 'oz', 'un...",Alouette Chicken Paprika
4,milk,"[{'us': {'amount': 1.0, 'unitShort': 'Tbsp', '...",Alouette Chicken Paprika


In [51]:
dfmeasures.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ing_name  1028 non-null   object
 1   measures  1028 non-null   object
 2   title     1028 non-null   object
dtypes: object(3)
memory usage: 24.2+ KB


### transforming the measures column

In [52]:

def extract_measure(measures_dict, key_name):
  """
  Extracts a string value representing the measure from the given dictionary using a given name.
  Handles cases where the key name might be different.

  Args:
      measures_dict: A dictionary containing the measure information.
      key_name: The key name to look for.

  Returns:
      A string representing the measure in the format "amount unitShort".
  """
  if key_name in measures_dict:
    return f"{measures_dict[key_name]['amount']} {measures_dict[key_name]['unitShort']}"
  else:
    # Handle cases where the key might be different
    for key in measures_dict:
      if isinstance(measures_dict[key], dict):
        return extract_measure(measures_dict[key], key)
    # If no matching key is found, return an empty string
    return None

#dfmeasures1 = dfmeasures.copy()
#Create two new columns with extracted measures
dfmeasures['measure_1'] = dfmeasures['measures'].apply(lambda x: extract_measure(x[0], "us"))
dfmeasures['measure_2'] = dfmeasures['measures'].apply(lambda x: extract_measure(x[0], "metric"))

# drop the original column
dfmeasures = dfmeasures.drop('measures', axis=1)


In [53]:
dfmeasures.head(10)

Unnamed: 0,ing_name,title,measure_1,measure_2
0,butter,Alouette Chicken Paprika,1.0 Tbsp,1.0 Tbsp
1,chicken breast,Alouette Chicken Paprika,4.0,4.0
2,garlic powder,Alouette Chicken Paprika,2.0 tsps,2.0 tsps
3,garlic herb spreadable cheese,Alouette Chicken Paprika,6.5 oz,184.272 g
4,milk,Alouette Chicken Paprika,1.0 Tbsp,1.0 Tbsp
5,paprika,Alouette Chicken Paprika,8.0 tsps,8.0 tsps
6,bok choy,Baby Bok Choy Stir Fry,1.0 lb,453.592 g
7,shallot,Baby Bok Choy Stir Fry,2.0 large,2.0 large
8,garlic,Baby Bok Choy Stir Fry,2.0 cloves,2.0 cloves
9,coconut oil,Baby Bok Choy Stir Fry,0.5 Tbsps,0.5 Tbsps


In [54]:
def combine_measures(row):
  """
  Combines values from 'measure_1' and 'measure_2' columns into a single string.

  Args:
      row: A pandas Series representing a row of the DataFrame.

  Returns:
      A string containing the combined measure value(s).
  """
  measure_1 = row['measure_1']
  measure_2 = row['measure_2']

  if measure_1 == measure_2:
    return measure_1  # Same values, return one
  else:
    return f"{measure_1} / {measure_2}"  # Different values, concatenate with "/"

#dfmeasures1 = dfmeasures.copy()
# Apply the function to create a new 'measure' column
dfmeasures['measure'] = dfmeasures.apply(combine_measures, axis=1)

# Drop 'measure_1' and 'measure_2' columns 
dfmeasures = dfmeasures.drop(['measure_1', 'measure_2'], axis=1)


In [55]:
dfmeasures.head()

Unnamed: 0,ing_name,title,measure
0,butter,Alouette Chicken Paprika,1.0 Tbsp
1,chicken breast,Alouette Chicken Paprika,4.0
2,garlic powder,Alouette Chicken Paprika,2.0 tsps
3,garlic herb spreadable cheese,Alouette Chicken Paprika,6.5 oz / 184.272 g
4,milk,Alouette Chicken Paprika,1.0 Tbsp


### dealing with id_recipe in dfmeasures¶

In [56]:
# Create a mapping dictionary from dfrecipes, 
recipe_mapping = dfrecipes.set_index('recipe_title')['id_recipe'].to_dict()
#print(recipe_mapping)

In [57]:
# Use the map function to fill the new column
def map_with_none(recipe_title, recipe_mapping):
    """Maps recipe names to id_recipe, handling missing values."""
    if pd.isna(recipe_title):
        return None  # Return None for missing titles
    return recipe_mapping.get(recipe_title, None)  # Use get() to avoid KeyError for missing keys

# Apply the custom map function
dfmeasures['id_recipe'] = dfmeasures['title'].apply(map_with_none, args=(recipe_mapping,))

# Convert id_ingredient to integer type where applicable, keeping None values
dfmeasures['id_recipe'] = dfmeasures['id_recipe'].astype('Int64')


In [58]:
# Display the result
dfmeasures.head()

Unnamed: 0,ing_name,title,measure,id_recipe
0,butter,Alouette Chicken Paprika,1.0 Tbsp,1
1,chicken breast,Alouette Chicken Paprika,4.0,1
2,garlic powder,Alouette Chicken Paprika,2.0 tsps,1
3,garlic herb spreadable cheese,Alouette Chicken Paprika,6.5 oz / 184.272 g,1
4,milk,Alouette Chicken Paprika,1.0 Tbsp,1


### dealing with id_ingredient in dfmeasures

In [59]:
# Create a mapping dictionary from dfIng, 
ing_mapping = dfIng.set_index('ing_name')['id_ingredient'].to_dict()
#print(ing_mapping)

In [60]:
# Use the map function to fill the new column
def map_with_none(ing_name, ing_mapping):
    """Maps ingredient names to id_ingredient, handling missing values."""
    if pd.isna(ing_name):
        return None  # Return None for missing titles
    return ing_mapping.get(ing_name, None)  # Use get() to avoid KeyError for missing keys
    
# Apply the custom map function
dfmeasures['id_ingredient'] = dfmeasures['ing_name'].apply(map_with_none, args=(ing_mapping,))

# Convert id_ingredient to integer type where applicable, keeping None values
dfmeasures['id_ingredient'] = dfmeasures['id_ingredient'].astype('Int64')

In [61]:
# Display the result
dfmeasures.head(10)

Unnamed: 0,ing_name,title,measure,id_recipe,id_ingredient
0,butter,Alouette Chicken Paprika,1.0 Tbsp,1,1
1,chicken breast,Alouette Chicken Paprika,4.0,1,2
2,garlic powder,Alouette Chicken Paprika,2.0 tsps,1,3
3,garlic herb spreadable cheese,Alouette Chicken Paprika,6.5 oz / 184.272 g,1,4
4,milk,Alouette Chicken Paprika,1.0 Tbsp,1,5
5,paprika,Alouette Chicken Paprika,8.0 tsps,1,6
6,bok choy,Baby Bok Choy Stir Fry,1.0 lb / 453.592 g,2,7
7,shallot,Baby Bok Choy Stir Fry,2.0 large,2,8
8,garlic,Baby Bok Choy Stir Fry,2.0 cloves,2,9
9,coconut oil,Baby Bok Choy Stir Fry,0.5 Tbsps,2,10


In [62]:
dfreference_ing = dfmeasures.drop(['ing_name', 'title'], axis=1)

In [63]:
dfreference_ing.head(10)

Unnamed: 0,measure,id_recipe,id_ingredient
0,1.0 Tbsp,1,1
1,4.0,1,2
2,2.0 tsps,1,3
3,6.5 oz / 184.272 g,1,4
4,1.0 Tbsp,1,5
5,8.0 tsps,1,6
6,1.0 lb / 453.592 g,2,7
7,2.0 large,2,8
8,2.0 cloves,2,9
9,0.5 Tbsps,2,10


## Dealing with instructions and steps

### extracting needed columns

In [64]:
# Extract the column that contains all the instructions along with the recipes titles
dfAllIns = pd.DataFrame(df_test, columns=['analyzedInstructions', 'title'])
dfAllIns.head()

Unnamed: 0,analyzedInstructions,title
0,"[{'name': '', 'steps': [{'number': 1, 'step': ...",Alouette Chicken Paprika
1,"[{'name': '', 'steps': [{'number': 1, 'step': ...",Baby Bok Choy Stir Fry
2,"[{'name': '', 'steps': [{'number': 1, 'step': ...",Oreo Cake
3,"[{'name': '', 'steps': [{'number': 1, 'step': ...",Oreo Cookies & Cream No-Bake Cheesecake
4,"[{'name': '', 'steps': [{'number': 1, 'step': ...",Frittata


In [65]:
print(len(dfAllIns))

100


In [66]:
dfAllIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   analyzedInstructions  100 non-null    object
 1   title                 100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


### obtaining a new dataframe that contains steps of each recipe

In [67]:
# preparing the needed columns for steps dataframe
all_keys_ins = column_list_dict_to_set_keys(dfAllIns['analyzedInstructions'])
print(all_keys_ins)
print('\n')

# List of keys to be removed
keys_to_remove = ['name']

# Remove keys if they exist in all_keys_ins set
for key in keys_to_remove:
    if key in all_keys_ins:
        all_keys_ins.remove(key)

print(all_keys_ins)

# Convert the set of keys to a list and append 'title'
ins_keys_list = list(all_keys_ins)
ins_keys_list.append('title')

{'name', 'steps'}


{'steps'}


In [68]:
dfsteps = pd.DataFrame(columns=ins_keys_list) 
dfsteps.head()

Unnamed: 0,steps,title


In [69]:
#first_two_rows = dfAllIns.iloc[:2]
#first_two_rows.head()

In [70]:
# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame
for i, row_series in dfAllIns.iterrows():
    # Access the recipe title for the current row 
    title = row_series['title']  # title is a string
    
    # Access the 'analyzedInstructions' column for the current row
    instructions_list = row_series['analyzedInstructions']  # instructions_list is a list containing one dictionary
    
    if instructions_list and isinstance(instructions_list[0], dict):  # Check if instructions_list is non-empty and its first element is a dictionary
        # Get steps list
        steps_list = instructions_list[0].get('steps', [])
    else:
        steps_list = None
    
    # Create a dictionary for the row data
    row_data = {
        'steps': steps_list,
        'title': title
    }
    
    # Append the row data to the list
    rows_data.append(row_data)

# Create the DataFrame from the list of row data
dfsteps = pd.DataFrame(rows_data, columns=ins_keys_list)


In [71]:
dfsteps.head()

Unnamed: 0,steps,title
0,"[{'number': 1, 'step': 'Coat chicken with a mi...",Alouette Chicken Paprika
1,"[{'number': 1, 'step': 'Heat oil in a large no...",Baby Bok Choy Stir Fry
2,"[{'number': 1, 'step': 'Crumble cookies and sa...",Oreo Cake
3,"[{'number': 1, 'step': 'Beat heavy cream until...",Oreo Cookies & Cream No-Bake Cheesecake
4,"[{'number': 1, 'step': 'In a large mixing bowl...",Frittata


In [72]:
print(len(dfsteps))

100


### transforming the steps column

In [73]:
# preparing the needed columns for steps dataframe
all_keys_steps = column_list_dict_to_set_keys(dfsteps['steps'])
print(all_keys_steps)
print('\n')

# List of keys to be removed
keys_to_remove = ['ingredients']

# Remove keys if they exist in all_keys_steps set
for key in keys_to_remove:
    if key in all_keys_steps:
        all_keys_steps.remove(key)

# Convert the set of keys to a list and append 'title'
keys_steps_list = list(all_keys_steps)
keys_steps_list.append('title')

print(keys_steps_list)


{'equipment', 'step', 'length', 'number', 'ingredients'}


['equipment', 'step', 'length', 'number', 'title']


In [74]:
# Create the DataFrame with the collected keys as column names
dfstep = pd.DataFrame(columns=keys_steps_list) 
dfstep.head()


Unnamed: 0,equipment,step,length,number,title


In [75]:
# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame dfsteps
for i, row_series in dfsteps.iterrows():
    # Access the recipe title for the current row 
    title = row_series['title']  # title is a string
    
    # Access the 'steps' column for the current row
    steps_list = row_series['steps']
    
    if steps_list is not None:
        # Iterate through each dictionary in the steps_list
        for each_dict in steps_list:
            if each_dict and isinstance(each_dict, dict):  # Check if each_dict is non-empty and is a dictionary
                # Extract relevant information from each_dict
                number = each_dict.get('number', None)
                step = each_dict.get('step', None)
                time = each_dict.get('length', {}).get('number', None)
                unit = each_dict.get('length', {}).get('unit', None)
                
                # Calculate length based on time and unit
                if time is None or unit is None:
                    length = None
                else:
                    length = str(time) + " " + unit
                
                equipment_list = each_dict.get('equipment', [])  # Get equipment list or empty list if 'equipment' is missing
                
                # Handle NaN or None values in equipment_list
                if isinstance(equipment_list, list):
                    equipment_list = [e if pd.notna(e) else None for e in equipment_list]
                
                # Create a dictionary for the row data
                row_data = {
                    'length': length,
                    'number': number,
                    'step': step,
                    'equipment': equipment_list,
                    'title': title
                }
                
                # Append the row data to the list
                rows_data.append(row_data)
    else:
        # If steps_list is None, create a row with None values
        row_data = {
            'length': None,
            'number': None,
            'step': None,
            'equipment': None,
            'title': title
        }
        
        # Append the row data to the list
        rows_data.append(row_data)

# Create the DataFrame from the list of row data
dfstep = pd.DataFrame(rows_data, columns=keys_steps_list)


In [76]:
dfstep.head()

Unnamed: 0,equipment,step,length,number,title
0,[],Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika
1,"[{'id': 404645, 'name': 'frying pan', 'localiz...","Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika
2,[],"Combine milk, Alouette",,3.0,Alouette Chicken Paprika
3,"[{'id': 405907, 'name': 'mixing bowl', 'locali...",Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika
4,[],Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika


In [77]:
print(len(dfstep))

630


In [78]:
dfstep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630 entries, 0 to 629
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   equipment  629 non-null    object 
 1   step       629 non-null    object 
 2   length     156 non-null    object 
 3   number     629 non-null    float64
 4   title      630 non-null    object 
dtypes: float64(1), object(4)
memory usage: 24.7+ KB


In [79]:
# checking if there's a Null values in step column
none_rows_step = dfstep[dfstep['step'].isna()]
none_rows_step

Unnamed: 0,equipment,step,length,number,title
597,,,,,Strawberry Basil Sorbet (no Ice Cream Maker Ne...


In [80]:
# Drop rows with Null values in step column
dfstep.dropna(subset=['step'], inplace=True)

In [81]:
none_rows_step = dfstep[dfstep['step'].isna()]
none_rows_step

Unnamed: 0,equipment,step,length,number,title


In [82]:
dfstep.head()

Unnamed: 0,equipment,step,length,number,title
0,[],Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika
1,"[{'id': 404645, 'name': 'frying pan', 'localiz...","Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika
2,[],"Combine milk, Alouette",,3.0,Alouette Chicken Paprika
3,"[{'id': 405907, 'name': 'mixing bowl', 'locali...",Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika
4,[],Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika


### transforming equipment column

In [83]:
dfstep.head()

Unnamed: 0,equipment,step,length,number,title
0,[],Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika
1,"[{'id': 404645, 'name': 'frying pan', 'localiz...","Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika
2,[],"Combine milk, Alouette",,3.0,Alouette Chicken Paprika
3,"[{'id': 405907, 'name': 'mixing bowl', 'locali...",Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika
4,[],Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika


In [84]:
# Create the DataFrame with the collected keys as column names
dfstepclean = pd.DataFrame(columns=keys_steps_list) 
dfstepclean.head()

Unnamed: 0,equipment,step,length,number,title


In [85]:
# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame dfstep
for i, row_series in dfstep.iterrows():
    # Access the recipe title for the current row 
    title = row_series['title']
    # Access the step length for the current row 
    length = row_series['length']
    # Access the recipe step for the current row 
    step = row_series['step']
    # Access the step number for the current row 
    number = row_series['number']
    # Access the 'equipment' column for the current row
    equipments_list = row_series['equipment']  #
    
    # Initialize variables to store processed data
    equipments_name_list = []
    
    # Process equipments_list
    if equipments_list and isinstance(equipments_list, list):
        for each_dict in equipments_list:
            if each_dict and isinstance(each_dict, dict):  # Check if each_dict is non-empty and is a dictionary
                # Get a list containing equipments names
                equipment_name = each_dict.get('name', None)
                if equipment_name is not None:  # Ensure equipment_name is not None
                    equipments_name_list.append(equipment_name)
    else:
        equipments_name_list = None  # Handle case where equipments_list is None or not a list
    
    # Create a dictionary for the row data
    row_data = {
        'length': length,
        'number': number,
        'step': step,
        'equipment': equipments_name_list,
        'title': title
    }
    
    # Append the row data to the list
    rows_data.append(row_data)

# Create the DataFrame from the list of row data
dfstepclean = pd.DataFrame(rows_data, columns=keys_steps_list)


In [86]:
dfstepclean.head()

Unnamed: 0,equipment,step,length,number,title
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika
1,[frying pan],"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika
3,[mixing bowl],Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika


In [87]:
dfstepclean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   equipment  330 non-null    object 
 1   step       629 non-null    object 
 2   length     156 non-null    object 
 3   number     629 non-null    float64
 4   title      629 non-null    object 
dtypes: float64(1), object(4)
memory usage: 24.7+ KB


### generating id_step

In [88]:
# Generate successive numbers for the 'id_ingredient' column
dfstepclean['id_step'] = range(1, len(dfstepclean) + 1)

In [89]:
dfstepclean.head()

Unnamed: 0,equipment,step,length,number,title,id_step
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1
1,[frying pan],"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3
3,[mixing bowl],Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5


### dealing with dfstep_final

In [90]:
dfstep_final=dfstepclean.copy()
dfstep_final.head()

Unnamed: 0,equipment,step,length,number,title,id_step
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1
1,[frying pan],"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3
3,[mixing bowl],Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5


In [91]:
# Apply the custom map function
dfstep_final['id_recipe'] = dfstep_final['title'].apply(map_with_none, args=(recipe_mapping,))

# Convert id_recipe to integer type where applicable, keeping None values
dfstep_final['id_recipe'] = dfstep_final['id_recipe'].astype('Int64')

In [92]:
dfstep_final.head()

Unnamed: 0,equipment,step,length,number,title,id_step,id_recipe
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1,1
1,[frying pan],"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2,1
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3,1
3,[mixing bowl],Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4,1
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5,1


In [93]:
# Drop a column by label (column name)
dfstep_final.drop(columns=['equipment', 'title'], inplace=True)


In [94]:
dfstep_final.head()

Unnamed: 0,step,length,number,id_step,id_recipe
0,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,1,1
1,"Remove chicken from skillet, reserving liquid.",,2.0,2,1
2,"Combine milk, Alouette",,3.0,3,1
3,Spreadable Cheese and remaining paprika in a s...,,4.0,4,1
4,Pour Alouette Garlic & Herbs,,5.0,5,1


### Creating the instructions dataframe

In [95]:
# Assign a unique number to each unique title
dfstep_final['instruction_id'] = pd.factorize(dfstep_final['id_recipe'])[0] + 1
dfstep_final.head(30)

Unnamed: 0,step,length,number,id_step,id_recipe,instruction_id
0,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,1,1,1
1,"Remove chicken from skillet, reserving liquid.",,2.0,2,1,1
2,"Combine milk, Alouette",,3.0,3,1,1
3,Spreadable Cheese and remaining paprika in a s...,,4.0,4,1,1
4,Pour Alouette Garlic & Herbs,,5.0,5,1,1
5,"Spreadable Cheese mixture into skillet, stirri...",,6.0,6,1,1
6,Heat oil in a large nonstick skillet over medi...,,1.0,7,2,2
7,Add shallots and cook 3 4 minutes to soften.,4 minutes,2.0,8,2,2
8,"Add bok choy, leaving leafy pieces aside. Cook...",4 minutes,3.0,9,2,2
9,Add remaining ingredients except almonds. Top ...,,4.0,10,2,2


In [96]:
dfIns = dfstep_final[['instruction_id', 'id_recipe']].copy()
dfIns.head()

Unnamed: 0,instruction_id,id_recipe
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1


In [97]:
dfIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   instruction_id  629 non-null    int64
 1   id_recipe       629 non-null    Int64
dtypes: Int64(1), int64(1)
memory usage: 10.6 KB


In [98]:
dfIns.drop_duplicates(inplace=True)
dfIns.reset_index(drop=True, inplace=True)
dfIns.head()

Unnamed: 0,instruction_id,id_recipe
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5


In [99]:
dfstep_final.drop(columns=['id_recipe'], inplace=True)

In [100]:
dfstep_final.head()

Unnamed: 0,step,length,number,id_step,instruction_id
0,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,1,1
1,"Remove chicken from skillet, reserving liquid.",,2.0,2,1
2,"Combine milk, Alouette",,3.0,3,1
3,Spreadable Cheese and remaining paprika in a s...,,4.0,4,1
4,Pour Alouette Garlic & Herbs,,5.0,5,1


## Dealing with equipments

In [101]:
dfstep.head()

Unnamed: 0,equipment,step,length,number,title
0,[],Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika
1,"[{'id': 404645, 'name': 'frying pan', 'localiz...","Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika
2,[],"Combine milk, Alouette",,3.0,Alouette Chicken Paprika
3,"[{'id': 405907, 'name': 'mixing bowl', 'locali...",Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika
4,[],Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika


In [102]:
# preparing the needed columns for equipments dataframe
all_keys_equip = column_list_dict_to_set_keys(dfstep['equipment'])
print(all_keys_equip)
print('\n')

# List of keys to be removed
keys_to_remove = ['image', 'localizedName','id', 'temperature']

# Remove keys if they exist in all_keys_equip set
for key in keys_to_remove:
    if key in all_keys_equip:
        all_keys_equip.remove(key)

# Convert the set of keys to a list
keys_equip_list = list(all_keys_equip)

print(keys_equip_list)


{'temperature', 'image', 'id', 'localizedName', 'name'}


['name']


In [103]:
# Create the DataFrame with the collected keys as column names
dfequip = pd.DataFrame(columns=keys_equip_list) 
dfequip.head()

Unnamed: 0,name


In [104]:
# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame
for i, row_series in dfstep.iterrows():
    equip_list = row_series['equipment']
    
    if equip_list:
        for each_dict in equip_list:
            # Check if the element is a dictionary 
            if each_dict and isinstance(each_dict, dict):
                name = each_dict.get('name', None)
                
                # Create a dictionary for the row data
                row_data = {
                    'name': name
                }
                
                # Append the row data to the list
                rows_data.append(row_data)

# Create the DataFrame from the list of row data
dfequip = pd.DataFrame(rows_data, columns=keys_equip_list)


In [105]:
dfequip.head()

Unnamed: 0,name
0,frying pan
1,mixing bowl
2,frying pan
3,frying pan
4,cake form


In [106]:
print(len(dfequip))  # Check the number of rows in the final DataFrame after removing duplicates

495


In [107]:
dfequip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    495 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB


In [108]:
# Remove duplicates
dfequip = dfequip.drop_duplicates()


# Reset the index 
dfequip = dfequip.reset_index(drop=True)

# Display the resulting DataFrame
dfequip.head()

Unnamed: 0,name
0,frying pan
1,mixing bowl
2,cake form
3,stand mixer
4,bowl


In [109]:
print(len(dfequip))  # Check the number of rows in the final DataFrame after removing duplicates

55


In [110]:
# Generate successive numbers for the 'id_equipment' column
dfequip['id_equipment'] = range(1, len(dfequip) + 1)

In [111]:
dfequip.head()

Unnamed: 0,name,id_equipment
0,frying pan,1
1,mixing bowl,2
2,cake form,3
3,stand mixer,4
4,bowl,5


In [112]:
dfequip = dfequip.rename(columns={'name': 'equip_name'})

In [113]:
dfequip.head()

Unnamed: 0,equip_name,id_equipment
0,frying pan,1
1,mixing bowl,2
2,cake form,3
3,stand mixer,4
4,bowl,5


In [114]:
dfequip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   equip_name    55 non-null     object
 1   id_equipment  55 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1008.0+ bytes


## Dealing with reference_equip

### Explode the dfstepclean table because equipment is a list of equipments

In [115]:
dfstepclean.head()

Unnamed: 0,equipment,step,length,number,title,id_step
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1
1,[frying pan],"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3
3,[mixing bowl],Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5


In [116]:
#Explode the ingredients column
dfstepclean_exploded = dfstepclean.explode(['equipment',])

#Rename the exploded column to ing_name
dfstepclean_exploded = dfstepclean_exploded.rename(columns={'equipment': 'equip_name'})

# Reset the index 
dfstepclean_exploded = dfstepclean_exploded.reset_index(drop=True)

dfstepclean_exploded.head()

Unnamed: 0,equip_name,step,length,number,title,id_step
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1
1,frying pan,"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3
3,mixing bowl,Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5


### dealing with id_equipment in dfstepclean_exploded

In [117]:
dfstepclean_exploded.head()

Unnamed: 0,equip_name,step,length,number,title,id_step
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1
1,frying pan,"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3
3,mixing bowl,Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5


In [118]:
dfequip.head()

Unnamed: 0,equip_name,id_equipment
0,frying pan,1
1,mixing bowl,2
2,cake form,3
3,stand mixer,4
4,bowl,5


In [119]:
# Create a mapping dictionary from dfequip, 
equip_mapping = dfequip.set_index('equip_name')['id_equipment'].to_dict()
#print(equip_mapping)

In [120]:
# Use the map function to fill the new column
def map_with_none(equip_name, equip_mapping):
    """Maps ingredient names to id_ingredient, handling missing values."""
    if pd.isna(equip_name):
        return None  # Return None for missing equipments names
    return equip_mapping.get(equip_name, None)  # Use get() to avoid KeyError for missing keys

# Apply the custom map function
dfstepclean_exploded['id_equipment'] = dfstepclean_exploded['equip_name'].apply(map_with_none, args=(equip_mapping,))

# Convert id_equipment to integer type where applicable, keeping None values
dfstepclean_exploded['id_equipment'] = dfstepclean_exploded['id_equipment'].astype('Int64')


In [121]:
# Display the result
dfstepclean_exploded.head()

Unnamed: 0,equip_name,step,length,number,title,id_step,id_equipment
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1,
1,frying pan,"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2,1.0
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3,
3,mixing bowl,Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4,2.0
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5,


### dealing with id_recipe in dfstepclean_exploded

In [122]:
# Apply the custom map function
dfstepclean_exploded['id_recipe'] = dfstepclean_exploded['title'].apply(map_with_none, args=(recipe_mapping,))

# Convert id_ingredient to integer type where applicable, keeping None values
dfstepclean_exploded['id_recipe'] = dfstepclean_exploded['id_recipe'].astype('Int64')

In [123]:
# Display the result
dfstepclean_exploded.head()

Unnamed: 0,equip_name,step,length,number,title,id_step,id_equipment,id_recipe
0,,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,Alouette Chicken Paprika,1,,1
1,frying pan,"Remove chicken from skillet, reserving liquid.",,2.0,Alouette Chicken Paprika,2,1.0,1
2,,"Combine milk, Alouette",,3.0,Alouette Chicken Paprika,3,,1
3,mixing bowl,Spreadable Cheese and remaining paprika in a s...,,4.0,Alouette Chicken Paprika,4,2.0,1
4,,Pour Alouette Garlic & Herbs,,5.0,Alouette Chicken Paprika,5,,1


### creating the dfreference_equip dataframe

In [124]:
dfreference_equip = dfstepclean_exploded[['id_recipe', 'id_step', 'id_equipment']]
#del dfstepclean_exploded
dfreference_equip

Unnamed: 0,id_recipe,id_step,id_equipment
0,1,1,
1,1,2,1
2,1,3,
3,1,4,2
4,1,5,
...,...,...,...
789,99,626,36
790,99,626,13
791,99,627,
792,100,628,13


## Dealing with dish types

### extracting needed columns

In [125]:
# Constructing dish type dataframe
dfdish = pd.DataFrame(df_test, columns=['dishTypes', 'title'])
dfdish.head()

Unnamed: 0,dishTypes,title
0,"[lunch, main course, main dish, dinner]",Alouette Chicken Paprika
1,"[antipasti, starter, snack, appetizer, antipas...",Baby Bok Choy Stir Fry
2,[dessert],Oreo Cake
3,[dessert],Oreo Cookies & Cream No-Bake Cheesecake
4,"[lunch, main course, morning meal, brunch, mai...",Frittata


In [126]:
dfdish_types = dfdish.explode('dishTypes').reset_index(drop=True)
dfdish_types.head()

Unnamed: 0,dishTypes,title
0,lunch,Alouette Chicken Paprika
1,main course,Alouette Chicken Paprika
2,main dish,Alouette Chicken Paprika
3,dinner,Alouette Chicken Paprika
4,antipasti,Baby Bok Choy Stir Fry


In [127]:
print(len(dfdish_types))  # Check the number of rows in the final DataFrame after removing duplicates

381


### dfdish_type dataframe

In [128]:
dfdish_type=dfdish_types.copy()

# Delete title column 
dfdish_type.drop(columns=['title'], inplace=True)

# Remove duplicates
dfdish_type = dfdish_type.drop_duplicates()

# Rename dishTypes column
dfdish_type = dfdish_type.rename(columns={'dishTypes': 'dish_type'})

#Remove NaN values
dfdish_type.dropna(inplace=True)

# Reset the index 
dfdish_type = dfdish_type.reset_index(drop=True)

dfdish_type.head()

Unnamed: 0,dish_type
0,lunch
1,main course
2,main dish
3,dinner
4,antipasti


In [129]:
print(len(dfdish_type))  # Check the number of rows in the final DataFrame after removing duplicates

23


In [130]:
# Generate successive numbers for the 'id_dish_type' column
dfdish_type['id_dish_type'] = range(1, len(dfdish_type) + 1)

In [131]:
dfdish_type.head()

Unnamed: 0,dish_type,id_dish_type
0,lunch,1
1,main course,2
2,main dish,3
3,dinner,4
4,antipasti,5


In [132]:
dfdish_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   dish_type     23 non-null     object
 1   id_dish_type  23 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 496.0+ bytes


### dfis_a dataframe

In [133]:
dfdish_types.head()

Unnamed: 0,dishTypes,title
0,lunch,Alouette Chicken Paprika
1,main course,Alouette Chicken Paprika
2,main dish,Alouette Chicken Paprika
3,dinner,Alouette Chicken Paprika
4,antipasti,Baby Bok Choy Stir Fry


In [134]:
# Create a mapping dictionary from dfdish_type, 
dish_mapping = dfdish_type.set_index('dish_type')['id_dish_type'].to_dict()
#print(dish_mapping)

In [135]:
# Use the map function to fill the new column
def map_with_none(dish_type, dish_mapping):
    """Maps dish types names to id_dish_type, handling missing values."""
    if pd.isna(dish_type):
        return None  # Return None for missing dish types
    return dish_mapping.get(dish_type, None)  # Use get() to avoid KeyError for missing keys

# Apply the custom map function
dfdish_types['id_dish_type'] = dfdish_types['dishTypes'].apply(map_with_none, args=(dish_mapping,))

# Convert id_dish_type to integer type where applicable, keeping None values
dfdish_types['id_dish_type'] = dfdish_types['id_dish_type'].astype('Int64')


In [136]:
# Display the result
dfdish_types.head()

Unnamed: 0,dishTypes,title,id_dish_type
0,lunch,Alouette Chicken Paprika,1
1,main course,Alouette Chicken Paprika,2
2,main dish,Alouette Chicken Paprika,3
3,dinner,Alouette Chicken Paprika,4
4,antipasti,Baby Bok Choy Stir Fry,5


In [137]:
# Delete title column 
dfdish_types.drop(columns=['dishTypes'], inplace=True)

In [138]:
dfdish_types.head()

Unnamed: 0,title,id_dish_type
0,Alouette Chicken Paprika,1
1,Alouette Chicken Paprika,2
2,Alouette Chicken Paprika,3
3,Alouette Chicken Paprika,4
4,Baby Bok Choy Stir Fry,5


In [139]:
# Apply the custom map function
dfdish_types['id_recipe'] = dfdish_types['title'].apply(map_with_none, args=(recipe_mapping,))

# Convert id_recipe to integer type where applicable, keeping None values
dfdish_types['id_recipe'] = dfdish_types['id_recipe'].astype('Int64')

In [140]:
dfdish_types.head()

Unnamed: 0,title,id_dish_type,id_recipe
0,Alouette Chicken Paprika,1,1
1,Alouette Chicken Paprika,2,1
2,Alouette Chicken Paprika,3,1
3,Alouette Chicken Paprika,4,1
4,Baby Bok Choy Stir Fry,5,2


In [141]:
dfdish_types.drop(columns=['title'], inplace=True)

In [142]:
dfis_a=dfdish_types.copy()
# del dfdish_types

In [143]:
dfis_a.head()

Unnamed: 0,id_dish_type,id_recipe
0,1,1
1,2,1
2,3,1
3,4,1
4,5,2


## Dealing with cuisines

### extracting needed columns

In [144]:
# Constructing cuisines dataframe
dfcuisines = pd.DataFrame(df_test, columns=['cuisines', 'title'])
dfcuisines.head()

Unnamed: 0,cuisines,title
0,[],Alouette Chicken Paprika
1,[],Baby Bok Choy Stir Fry
2,[],Oreo Cake
3,[],Oreo Cookies & Cream No-Bake Cheesecake
4,[],Frittata


In [145]:
dfcuisines = dfcuisines.explode('cuisines').reset_index(drop=True)
dfcuisines.head()

Unnamed: 0,cuisines,title
0,,Alouette Chicken Paprika
1,,Baby Bok Choy Stir Fry
2,,Oreo Cake
3,,Oreo Cookies & Cream No-Bake Cheesecake
4,,Frittata


In [146]:
print(len(dfcuisines))

131


### dfcuisine dataframe

In [147]:
dfcuisine=dfcuisines.copy()

# Delete title column 
dfcuisine.drop(columns=['title'], inplace=True)

# Remove duplicates
dfcuisine = dfcuisine.drop_duplicates()

# Rename cuisines column
dfcuisine = dfcuisine.rename(columns={'cuisines': 'recipe_cuisine'})

#Remove NaN values
dfcuisine.dropna(inplace=True)

# Reset the index 
dfcuisine = dfcuisine.reset_index(drop=True)

dfcuisine.head()

Unnamed: 0,recipe_cuisine
0,American
1,Mediterranean
2,French
3,European
4,Indian


In [148]:
print(len(dfcuisine))

16


In [149]:
# Generate successive numbers for the 'id_cuisine' column
dfcuisine['id_cuisine'] = range(1, len(dfcuisine) + 1)

In [150]:
dfcuisine.head()

Unnamed: 0,recipe_cuisine,id_cuisine
0,American,1
1,Mediterranean,2
2,French,3
3,European,4
4,Indian,5


In [151]:
dfcuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   recipe_cuisine  16 non-null     object
 1   id_cuisine      16 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 384.0+ bytes


### dfbelongs dataframe

In [152]:
dfcuisines.head()

Unnamed: 0,cuisines,title
0,,Alouette Chicken Paprika
1,,Baby Bok Choy Stir Fry
2,,Oreo Cake
3,,Oreo Cookies & Cream No-Bake Cheesecake
4,,Frittata


In [153]:
# Create a mapping dictionary from dfcuisine, 
cuisine_mapping = dfcuisine.set_index('recipe_cuisine')['id_cuisine'].to_dict()
#print(cuisine_mapping)

In [154]:
# Use the map function to fill the new column
def map_with_none(recipe_cuisine, cuisine_mapping):
    """Maps recipes cuisines to id_cuisine, handling missing values."""
    if pd.isna(recipe_cuisine):
        return None  # Return None for missing cuisines
    return cuisine_mapping.get(recipe_cuisine, None)  # Use get() to avoid KeyError for missing keys

# Apply the custom map function
dfcuisines['id_cuisine'] = dfcuisines['cuisines'].apply(map_with_none, args=(cuisine_mapping,))

# Convert id_dish_type to integer type where applicable, keeping None values
dfcuisines['id_cuisine'] = dfcuisines['id_cuisine'].astype('Int64')


In [155]:
dfcuisines.head()

Unnamed: 0,cuisines,title,id_cuisine
0,,Alouette Chicken Paprika,
1,,Baby Bok Choy Stir Fry,
2,,Oreo Cake,
3,,Oreo Cookies & Cream No-Bake Cheesecake,
4,,Frittata,


In [156]:
# Apply the custom map function
dfcuisines['id_recipe'] = dfcuisines['title'].apply(map_with_none, args=(recipe_mapping,))

# Convert id_recipe to integer type where applicable, keeping None values
dfcuisines['id_recipe'] = dfcuisines['id_recipe'].astype('Int64')

In [157]:
dfcuisines.head()

Unnamed: 0,cuisines,title,id_cuisine,id_recipe
0,,Alouette Chicken Paprika,,1
1,,Baby Bok Choy Stir Fry,,2
2,,Oreo Cake,,3
3,,Oreo Cookies & Cream No-Bake Cheesecake,,4
4,,Frittata,,5


In [158]:
dfcuisines.drop(columns=['cuisines','title'], inplace=True)

In [159]:
dfbelongs=dfcuisines.copy()
#del dfcuisines

## Transform Recap

### recipe table
(id_recipe INT, recipe_title VARCHAR(50), ready_min INT, summary VARCHAR(2000), servings INT, is_cheap LOGICAL, price_per_serving DOUBLE, is_vegetarian LOGICAL, is_vegan LOGICAL, is_glutenFree LOGICAL, is_dairyFree LOGICAL, is_healthy LOGICAL, is_sustainable LOGICAL, is_lowFodmap LOGICAL, is_Popular LOGICAL, license VARCHAR(20), source_url VARCHAR(100));

In [160]:
dfrecipes.head()

Unnamed: 0,is_vegetarian,is_vegan,is_glutenFree,is_dairyFree,is_healthy,is_cheap,is_Popular,is_sustainable,is_lowFodmap,price_per_serving,recipe_title,ready_min,servings,source_url,summary,license,id_recipe
0,False,False,True,False,False,False,False,False,False,329.59,Alouette Chicken Paprika,45,4,https://www.foodista.com/recipe/62BLCZVT/aloue...,Alouette Chicken Paprika takes around <b>45 mi...,,1
1,True,True,True,True,True,False,False,False,False,109.24,Baby Bok Choy Stir Fry,45,2,https://www.foodista.com/recipe/4KHXC282/baby-...,Baby Bok Choy Stir Fry is a hor d'oeuvre that ...,CC BY 3.0,2
2,False,False,False,False,False,False,False,False,False,238.92,Oreo Cake,45,4,https://www.foodista.com/recipe/MQXQDZWD/oreo-...,Oreo Cake might be a good recipe to expand you...,CC BY 3.0,3
3,True,False,False,False,False,False,False,False,False,48.24,Oreo Cookies & Cream No-Bake Cheesecake,45,20,http://www.foodista.com/recipe/WWFJTZ7N/oreo-c...,Oreo Cookies & Cream No-Bake Cheesecake requir...,CC BY 3.0,4
4,False,False,True,False,False,False,False,False,False,270.59,Frittata,45,2,https://www.foodista.com/recipe/SNFFG2VB/frittata,Frittata requires about <b>45 minutes</b> from...,,5


In [161]:
dfrecipes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   is_vegetarian      100 non-null    bool   
 1   is_vegan           100 non-null    bool   
 2   is_glutenFree      100 non-null    bool   
 3   is_dairyFree       100 non-null    bool   
 4   is_healthy         100 non-null    bool   
 5   is_cheap           100 non-null    bool   
 6   is_Popular         100 non-null    bool   
 7   is_sustainable     100 non-null    bool   
 8   is_lowFodmap       100 non-null    bool   
 9   price_per_serving  100 non-null    float64
 10  recipe_title       100 non-null    object 
 11  ready_min          100 non-null    int64  
 12  servings           100 non-null    int64  
 13  source_url         100 non-null    object 
 14  summary            100 non-null    object 
 15  license            57 non-null     object 
 16  id_recipe          100 non-

### Ingredients table
(id_ingredient INT, ing_name VARCHAR(50), consistency VARCHAR(20), aisle VARCHAR(20));

In [162]:
dfIng.head()

Unnamed: 0,consistency,ing_name,aisle,id_ingredient
0,SOLID,butter,"Milk, Eggs, Other Dairy",1
1,SOLID,chicken breast,Meat,2
2,SOLID,garlic powder,Spices and Seasonings,3
3,SOLID,garlic herb spreadable cheese,Cheese,4
4,LIQUID,milk,"Milk, Eggs, Other Dairy",5


In [163]:
dfIng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   consistency    380 non-null    object
 1   ing_name       380 non-null    object
 2   aisle          380 non-null    object
 3   id_ingredient  380 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 12.0+ KB


### reference_ing table
(#id_recipe, #id_ingredient, measure VARCHAR(50));

In [164]:
dfreference_ing.head()

Unnamed: 0,measure,id_recipe,id_ingredient
0,1.0 Tbsp,1,1
1,4.0,1,2
2,2.0 tsps,1,3
3,6.5 oz / 184.272 g,1,4
4,1.0 Tbsp,1,5


In [165]:
dfreference_ing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028 entries, 0 to 1027
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   measure        1028 non-null   object
 1   id_recipe      1028 non-null   Int64 
 2   id_ingredient  1028 non-null   Int64 
dtypes: Int64(2), object(1)
memory usage: 26.2+ KB


### Equipment table
(id_equipment INT, equip_name VARCHAR(50));

In [166]:
dfequip.head()

Unnamed: 0,equip_name,id_equipment
0,frying pan,1
1,mixing bowl,2
2,cake form,3
3,stand mixer,4
4,bowl,5


In [167]:
dfequip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   equip_name    55 non-null     object
 1   id_equipment  55 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 1008.0+ bytes


### Instructions table
Instruction = (id_instruction INT, #id_recipe);

In [168]:
dfIns.head()

Unnamed: 0,instruction_id,id_recipe
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5


In [169]:
dfIns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   instruction_id  99 non-null     int64
 1   id_recipe       99 non-null     Int64
dtypes: Int64(1), int64(1)
memory usage: 1.8 KB


### steps table
(id_step INT, step VARCHAR(8000), number INT, length VARCHAR(50), #id_instruction);

In [170]:
dfstep_final.head()

Unnamed: 0,step,length,number,id_step,instruction_id
0,Coat chicken with a mixture of 6 tsp. of papri...,20 minutes,1.0,1,1
1,"Remove chicken from skillet, reserving liquid.",,2.0,2,1
2,"Combine milk, Alouette",,3.0,3,1
3,Spreadable Cheese and remaining paprika in a s...,,4.0,4,1
4,Pour Alouette Garlic & Herbs,,5.0,5,1


In [171]:
dfstep_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            629 non-null    object 
 1   length          156 non-null    object 
 2   number          629 non-null    float64
 3   id_step         629 non-null    int64  
 4   instruction_id  629 non-null    int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 24.7+ KB


### reference_equip table
(#id_recipe, #id_step, #id_equipment);

In [172]:
dfreference_equip.head()

Unnamed: 0,id_recipe,id_step,id_equipment
0,1,1,
1,1,2,1.0
2,1,3,
3,1,4,2.0
4,1,5,


In [173]:
dfreference_equip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 794 entries, 0 to 793
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   id_recipe     794 non-null    Int64
 1   id_step       794 non-null    int64
 2   id_equipment  495 non-null    Int64
dtypes: Int64(2), int64(1)
memory usage: 20.3 KB


### dish_type table
(id_dish_type INT, dish_type VARCHAR(50))

In [174]:
dfdish_type.head()

Unnamed: 0,dish_type,id_dish_type
0,lunch,1
1,main course,2
2,main dish,3
3,dinner,4
4,antipasti,5


In [175]:
dfdish_type.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   dish_type     23 non-null     object
 1   id_dish_type  23 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 496.0+ bytes


### is_a  table
(#id_recipe, #id_dish_type)

In [176]:
dfis_a.head()

Unnamed: 0,id_dish_type,id_recipe
0,1,1
1,2,1
2,3,1
3,4,1
4,5,2


In [177]:
dfis_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381 entries, 0 to 380
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   id_dish_type  377 non-null    Int64
 1   id_recipe     381 non-null    Int64
dtypes: Int64(2)
memory usage: 6.8 KB


### cuisine table
(id_cuisine INT, recipe_cuisine VARCHAR(50));

In [178]:
dfcuisine.head()

Unnamed: 0,recipe_cuisine,id_cuisine
0,American,1
1,Mediterranean,2
2,French,3
3,European,4
4,Indian,5


In [179]:
dfcuisine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   recipe_cuisine  16 non-null     object
 1   id_cuisine      16 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 384.0+ bytes


### belongs table
(#id_recipe, #id_cuisine);

In [180]:
dfbelongs.head()

Unnamed: 0,id_cuisine,id_recipe
0,,1
1,,2
2,,3
3,,4
4,,5


In [181]:
dfbelongs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131 entries, 0 to 130
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   id_cuisine  63 non-null     Int64
 1   id_recipe   131 non-null    Int64
dtypes: Int64(2)
memory usage: 2.4 KB


# load

In [182]:
DB_HOST='10.0.2.15'
DB_PORT= 5432
DB_NAME='recipe_etl'
DB_USER='maryem'
DB_PASSWORD='HelloWorld'
try:
    conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD
        )
    print("Connected to "+DB_NAME)
except psycopg2.Error as e:
    print("Error: Could not make connection to the database "+DB_NAME)
    print(e)

Connected to recipe_etl


In [183]:
#db_connection.autocommit=True

In [184]:
# Cursor helps you to execute queries on your database
try:
    cur = conn.cursor()
    print("cursor got to the database "+DB_NAME)
    print("Cursor description:", cur.description)
except:
    print("Error: Could not get cursor to the database "+DB_NAME)
    print(e)

cursor got to the database recipe_etl
Cursor description: None


In [185]:

try:
    # Iterate over DataFrame rows and insert values into the Recipe table
    for index, row in dfrecipes.iterrows():
        cur.execute("""
            INSERT INTO Recipe (id_recipe, recipe_title, ready_min, summary, servings, is_cheap, price_per_serving, 
            is_vegetarian, is_vegan, is_glutenFree, is_dairyFree, is_healthy, is_sustainable, is_lowFodmap, 
            is_Popular, license, source_url)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) RETURNING id_recipe;""", (
            int(row['id_recipe']),
            row['recipe_title'],
            int(row['ready_min']),
            row['summary'],
            int(row['servings']),
            bool(row['is_cheap']),
            float(row['price_per_serving']),
            bool(row['is_vegetarian']),
            bool(row['is_vegan']),
            bool(row['is_glutenFree']),
            bool(row['is_dairyFree']),
            bool(row['is_healthy']),
            bool(row['is_sustainable']),
            bool(row['is_lowFodmap']),
            bool(row['is_Popular']),
            row['license'],
            row['source_url'],
        ))
        
        # Get the generated id_recipe
        recipe_id = cur.fetchone()[0]
        #print(f"Inserted recipe_id: {recipe_id}")  # Debugging line

        if recipe_id is None:
            raise ValueError("id_recipe is None, insertion into Recipe table might have failed")
        
    # Commit the transaction
    conn.commit()
    print("Values inserted successfully")
except psycopg2.Error as e:
    # Print error message if something goes wrong
    print("Error in inserting values")
    print(e)
except ValueError as ve:
    # Print error message if the id_recipe is None
    print("Value error")
    print(ve)


Values inserted successfully


In [186]:
try:
    for index, row in dfIns.iterrows():
        cur.execute("""
            INSERT INTO Instruction (id_instruction, id_recipe)
            VALUES (%s, %s) RETURNING id_instruction;""", (
            int(row['instruction_id']),
            int(row['id_recipe'])
        ))
        
        instruction_id = cur.fetchone()[0]

        if instruction_id is None:
            raise ValueError("id_instruction is None, insertion into Instruction table might have failed")
        
    conn.commit()
    print("Values inserted into Instruction successfully")
except psycopg2.Error as e:
    print("Error in inserting values into Instruction")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into Instruction successfully


In [187]:
try:
    for index, row in dfIng.iterrows():
        cur.execute("""
            INSERT INTO Ingredient (id_ingredient, ing_name, consistency, aisle)
            VALUES (%s, %s, %s, %s) RETURNING id_ingredient;""", (
            int(row['id_ingredient']),
            row['ing_name'],
            row['consistency'],
            row['aisle']
        ))
        
        ingredient_id = cur.fetchone()[0]

        if ingredient_id is None:
            raise ValueError("id_ingredient is None, insertion into Ingredient table might have failed")
        
    conn.commit()
    print("Values inserted into Ingredient successfully")
except psycopg2.Error as e:
    print("Error in inserting values into Ingredient")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into Ingredient successfully


In [188]:
try:
    for index, row in dfstep_final.iterrows():
        cur.execute("""
            INSERT INTO Step (id_step, step, number, length, id_instruction)
            VALUES (%s, %s, %s, %s, %s) RETURNING id_step;""", (
            int(row['id_step']),
            row['step'],
            int(row['number']),
            row['length'],
            int(row['instruction_id'])
        ))
        
        step_id = cur.fetchone()[0]

        if step_id is None:
            raise ValueError("id_step is None, insertion into Step table might have failed")
        
    conn.commit()
    print("Values inserted into Step successfully")
except psycopg2.Error as e:
    print("Error in inserting values into Step")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into Step successfully


In [189]:
try:
    for index, row in dfequip.iterrows():
        cur.execute("""
            INSERT INTO Equipment (id_equipment, equip_name)
            VALUES (%s, %s) RETURNING id_equipment;""", (
            int(row['id_equipment']),
            row['equip_name']
        ))
        
        equipment_id = cur.fetchone()[0]

        if equipment_id is None:
            raise ValueError("id_equipment is None, insertion into Equipment table might have failed")
        
    conn.commit()
    print("Values inserted into Equipment successfully")
except psycopg2.Error as e:
    print("Error in inserting values into Equipment")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into Equipment successfully


In [190]:
try:
    for index, row in dfdish_type.iterrows():
        cur.execute("""
            INSERT INTO dish (id_dish, dish_type)
            VALUES (%s, %s) RETURNING id_dish;""", (
            int(row['id_dish_type']),
            row['dish_type']
        ))
        
        dish_id = cur.fetchone()[0]

        if dish_id is None:
            raise ValueError("id_dish is None, insertion into dish table might have failed")
        
    conn.commit()
    print("Values inserted into dish successfully")
except psycopg2.Error as e:
    print("Error in inserting values into dish")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into dish successfully


In [191]:
try:
    for index, row in dfcuisine.iterrows():
        cur.execute("""
            INSERT INTO Cuisine (id_cuisine, recipe_cuisine)
            VALUES (%s, %s) RETURNING id_cuisine;""", (
            int(row['id_cuisine']),
            row['recipe_cuisine']
        ))
        
        cuisine_id = cur.fetchone()[0]

        if cuisine_id is None:
            raise ValueError("id_cuisine is None, insertion into Cuisine table might have failed")
        
    conn.commit()
    print("Values inserted into Cuisine successfully")
except psycopg2.Error as e:
    print("Error in inserting values into Cuisine")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into Cuisine successfully


In [192]:
# Example for reference_ing
try:
    for index, row in dfreference_ing.iterrows():
        cur.execute("""
            INSERT INTO reference_ing (id_recipe, id_ingredient, measure)
            VALUES (%s, %s, %s) RETURNING id_recipe;""", (
            int(row['id_recipe']),
            int(row['id_ingredient']),
            row['measure']
        ))
        
        reference_ing_id = cur.fetchone()[0]

        if reference_ing_id is None:
            raise ValueError("id_recipe in reference_ing is None, insertion might have failed")
        
    conn.commit()
    print("Values inserted into reference_ing successfully")
except psycopg2.Error as e:
    print("Error in inserting values into reference_ing")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)

Values inserted into reference_ing successfully


In [193]:
# Ensure that NA values are treated as numpy NaN
dfreference_equip = dfreference_equip.replace({pd.NA: np.nan})

try:
    for index, row in dfreference_equip.iterrows():
        # Handle NA values in 'id_equipment'
        id_equipment = None if pd.isna(row['id_equipment']) else int(row['id_equipment'])
        
        cur.execute("""
            INSERT INTO reference_equip (id_recipe, id_step, id_equipment)
            VALUES (%s, %s, %s) RETURNING id_recipe;""", (
            int(row['id_recipe']),
            int(row['id_step']),
            id_equipment
        ))
        
        reference_equip_id = cur.fetchone()[0]

        if reference_equip_id is None:
            raise ValueError("id_recipe in reference_equip is None, insertion might have failed")
        
    conn.commit()
    print("Values inserted into reference_equip successfully")
except psycopg2.Error as e:
    print("Error in inserting values into reference_equip")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into reference_equip successfully


In [194]:
# Ensure that NA values are treated as numpy NaN
dfis_a = dfis_a.replace({pd.NA: np.nan})

try:
    for index, row in dfis_a.iterrows():
        # Handle NA values in 'id_equipment'
        id_dish_type = None if pd.isna(row['id_dish_type']) else int(row['id_dish_type'])

        cur.execute("""
            INSERT INTO is_a (id_recipe, id_dish)
            VALUES (%s, %s) RETURNING id_recipe;""", (
            int(row['id_recipe']),
            id_dish_type
        ))
        
        is_a_id = cur.fetchone()[0]

        if is_a_id is None:
            raise ValueError("id_recipe in is_a is None, insertion might have failed")
        
    conn.commit()
    print("Values inserted into is_a successfully")
except psycopg2.Error as e:
    print("Error in inserting values into is_a")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into is_a successfully


In [195]:
# Ensure that NA values are treated as numpy NaN
dfbelongs = dfbelongs.replace({pd.NA: np.nan})

try:
    for index, row in dfbelongs.iterrows():
         # Handle NA values in 'id_equipment'
        id_cuisine = None if pd.isna(row['id_cuisine']) else int(row['id_cuisine'])
        
        cur.execute("""
            INSERT INTO belongs (id_recipe, id_cuisine)
            VALUES (%s, %s) RETURNING id_recipe;""", (
            int(row['id_recipe']),
            id_cuisine)
        )
        
        belongs_id = cur.fetchone()[0]

        if belongs_id is None:
            raise ValueError("id_recipe in belongs is None, insertion might have failed")
        
    conn.commit()
    print("Values inserted into belongs successfully")
except psycopg2.Error as e:
    print("Error in inserting values into belongs")
    print(e)
except ValueError as ve:
    print("Value error")
    print(ve)


Values inserted into belongs successfully
