# Preparing environment

In [None]:
import requests
import json
import pandas as pd
import psycopg2
import numpy as np
import logging

# Set up logging

In [None]:
class LevelRangeFilter(logging.Filter):
    """
    Logging filter that lets a record through only when its level number
    lies inside a closed range.

    Attributes:
        min_level (int): Lowest level number accepted (inclusive).
        max_level (int): Highest level number accepted (inclusive).
    """
    def __init__(self, min_level, max_level):
        """
        Remember the inclusive bounds that ``filter`` compares against.

        Args:
            min_level (int): Lowest level number accepted.
            max_level (int): Highest level number accepted.
        """
        super().__init__()
        self.min_level, self.max_level = min_level, max_level

    def filter(self, record):
        """
        Decide whether ``record`` may be emitted.

        Args:
            record (LogRecord): The record whose ``levelno`` is inspected.

        Returns:
            bool: True when min_level <= record.levelno <= max_level, else False.
        """
        level = record.levelno
        # Reject anything below the lower bound first, then test the upper one
        if level < self.min_level:
            return False
        return level <= self.max_level


# Create a logger for this module
logger = logging.getLogger(__name__)

# DEBUG on the logger itself lets every record through; the per-handler
# filters below decide which destination each record reaches.
logger.setLevel(logging.DEBUG)

# Build and attach the handlers only when the logger has none yet.
# logging.FileHandler opens its file as soon as it is constructed, so creating
# the handlers before this check would leave unused, never-closed file handles
# behind every time the cell is executed again.
if not logger.handlers:
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # Destinations
    success_handler = logging.FileHandler('etl_success.log')
    debug_handler = logging.FileHandler('etl_debug.log')
    error_handler = logging.FileHandler('etl_errors.log')
    console_handler = logging.StreamHandler()  # writes to the console (stderr by default)

    # Inclusive level range accepted by each destination
    success_filter = LevelRangeFilter(logging.INFO, logging.WARNING)
    error_filter = LevelRangeFilter(logging.WARNING, logging.CRITICAL)
    debug_filter = LevelRangeFilter(logging.DEBUG, logging.CRITICAL)
    console_filter = LevelRangeFilter(logging.INFO, logging.CRITICAL)

    # Configure and attach in a fixed order: success, debug, error, console
    for handler, level_filter in (
        (success_handler, success_filter),
        (debug_handler, debug_filter),
        (error_handler, error_filter),
        (console_handler, console_filter),
    ):
        handler.setFormatter(formatter)
        handler.addFilter(level_filter)
        logger.addHandler(handler)

In [None]:
# Example usage: emit one INFO record to confirm that logging works.
# INFO lies inside the level ranges given to the console, success and debug filters.
logger.info('Logging is set up and ETL pipeline is going to be executed')
# Uncomment the lines below to exercise the remaining levels:
#logger.debug('This is a debug message')
#logger.warning('This is a warning message')
#logger.error('This is an error message')
#logger.critical('This is a critical message')

# Extract

**API DOC**
https://spoonacular.com/application/frontend/downloads/spoonacular-api-slides.pdf

**API Link** https://spoonacular.com/food-api

In [None]:
def extract_data(api_url, num_recipes=100, timeout=30):
    """
    Extract data from the given API URL.

    Parameters:
    api_url (str): The URL of the API to extract data from.
    num_recipes (int): The number of recipes to request from the API. Default is 100.
    timeout (float or tuple): Seconds to wait for the server before giving up,
        passed straight to requests.get. Default is 30. Without a timeout a
        stalled connection would block the pipeline indefinitely.

    Returns:
    dict or None: The data extracted from the API in JSON format, or None if
    the request failed, returned a non-2xx status, or the body was not valid JSON.
    """
    # Parameters to be sent with the API request
    params = {'number': num_recipes}

    try:
        # Send a GET request to the API with the specified parameters
        response = requests.get(api_url, params=params, timeout=timeout)
        # Raise an exception for non-2xx status codes
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts and HTTP error statuses
        logger.error(f"Error extracting data from API: {e}")
        return None

    try:
        # Parse the response content as JSON
        return response.json()
    except ValueError as e:
        # ValueError is the common base of the JSON decode errors that
        # response.json() can raise, including json.JSONDecodeError
        logger.error(f"Error parsing JSON data: {e}")
        return None

In [None]:
import os

# API key used to authenticate with the Spoonacular API.
# The SPOONACULAR_API_KEY environment variable takes precedence so the
# credential can be supplied without editing the code.
# NOTE(review): the literal fallback keeps existing runs working, but a key
# committed to source should be rotated and this default removed.
API_KEY = os.environ.get('SPOONACULAR_API_KEY') or '9fe94823f081434989282d1622cfbc31'

# The API_URL is composed of the base URL for the random recipes endpoint and
# the API key as a query parameter
API_URL = 'https://api.spoonacular.com/recipes/random?apiKey=' + API_KEY

# extract_data sends a GET request to API_URL and returns the parsed response,
# or None when the request or the JSON parsing failed
data = extract_data(API_URL)

# Check if data was successfully extracted
if data:
    logger.debug(data)  # Log the extracted data at the debug level
    logger.info("Data extracted successfully.")  # Log a success message at the info level
else:
    # Log an error message if data extraction failed
    logger.error("An error occurred while fetching data.")

# Transform

## Json to Pandas
You can load JSON data directly into pandas: the keys of each record become the column names.

In [None]:
import io

# Accept the payload only when it is a dictionary whose 'recipes' entry is a list.
# The isinstance check on data also covers a failed extraction (data is None),
# where a plain "'recipes' in data" test would raise TypeError.
recipes = data.get('recipes') if isinstance(data, dict) else None

if isinstance(recipes, list):
    # Convert the list of recipe dictionaries into a pandas DataFrame
    df = pd.DataFrame(recipes)

    # Log the first few rows of the DataFrame to verify its structure
    logger.debug(df.head())

    # Log the number of recipes extracted
    logger.info(f"Extracted {len(df)} recipes.")

    # Log column types and non-null counts. DataFrame.info() prints to stdout
    # and returns None, so render it into a buffer and log the text instead.
    info_buffer = io.StringIO()
    df.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
else:
    # Log an error if the 'recipes' key is missing or does not contain a list
    logger.error("'recipes' key is missing or does not contain a valid list in the data dictionary.")

## Dealing with recipes

In [None]:
import io


def _info_text(frame):
    """Return DataFrame.info() output as a string (info() itself prints and returns None)."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Columns of the raw recipes DataFrame that are kept for the recipes table
dfrecipes_COLUMNS = [
    'vegetarian', 'vegan', 'glutenFree', 'dairyFree', 'veryHealthy',
    'cheap', 'veryPopular', 'sustainable', 'lowFodmap', 'pricePerServing',
    'title', 'readyInMinutes', 'servings', 'sourceUrl', 'summary', 'license'
]
# Construct the dataframe
try:
    dfrecipes = pd.DataFrame(df, columns=dfrecipes_COLUMNS)
    logger.info(f"Constructed DataFrame with {len(dfrecipes)} recipes.")
    logger.debug(dfrecipes.head())
    logger.debug(_info_text(dfrecipes))
except Exception:
    logger.error("Failed to construct DataFrame:", exc_info=True)


# Find duplicates based on the 'title' column
try:
    duplicates = dfrecipes[dfrecipes.duplicated(subset=['title'])]
    logger.debug(duplicates)
    if not duplicates.empty:
        # Remove duplicates (the first occurrence of each title is kept)
        logger.warning(f"Found {len(duplicates)} duplicate recipes. Removing duplicates.")
        dfrecipes = dfrecipes.drop_duplicates(subset=['title'])
        dfrecipes.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfrecipes)} recipes after removing duplicates.")
    else:
        logger.info("No duplicates found")
except Exception:
    logger.error("Failed to handle duplicates", exc_info=True)


# Rename columns for clarity and consistency
COLUMNS_TO_RENAME_MAP = {
    'vegetarian': 'is_vegetarian', 'vegan': 'is_vegan',
    'glutenFree': 'is_glutenFree', 'dairyFree': 'is_dairyFree',
    'veryHealthy': 'is_healthy', 'cheap': 'is_cheap',
    'veryPopular': 'is_Popular', 'sustainable': 'is_sustainable',
    'lowFodmap': 'is_lowFodmap', 'pricePerServing': 'price_per_serving',
    'readyInMinutes': 'ready_min', 'sourceUrl': 'source_url',
    'title': 'recipe_title'}
try:
    dfrecipes = dfrecipes.rename(columns=COLUMNS_TO_RENAME_MAP)
    logger.info("Renamed columns.")
except Exception:
    logger.error("Failed to rename columns", exc_info=True)

# Generate successive numbers (starting at 1) for the 'id_recipe' column
try:
    dfrecipes['id_recipe'] = range(1, len(dfrecipes) + 1)
    logger.info("Added 'id_recipe'.")
    logger.debug(dfrecipes.head())
    logger.debug(_info_text(dfrecipes))
except Exception:
    logger.error("Failed to add 'id_recipe' column", exc_info=True)



## Dealing with ingredients

In [None]:
# Keys of each ingredient dictionary that are kept for the ingredients DataFrame
dfIng_COLUMNS = [
    'consistency',
    'nameClean',
    'aisle',
    'name',
]


# Define a function that keeps only the dictionaries found in a list
def extract_dicts(lst):
    """
    Return a list with the elements of ``lst`` that are dictionaries.

    A value that cannot be iterated (for example None or NaN, which is what a
    missing cell looks like) yields an empty list instead of raising TypeError.
    """
    try:
        items = iter(lst)
    except TypeError:
        return []
    return [d for d in items if isinstance(d, dict)]
import io


def _info_text(frame):
    """Return DataFrame.info() output as a string (info() itself prints and returns None)."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


def _drop_duplicate_ingredients(frame):
    """Log and drop fully duplicated rows of ``frame``; return the resulting DataFrame."""
    duplicates = frame[frame.duplicated()]
    logger.debug(duplicates)
    if duplicates.empty:
        logger.info("No duplicates found.")
        return frame
    logger.warning(f"Found {len(duplicates)} duplicate ingredients. Removing duplicates.")
    frame = frame.drop_duplicates().reset_index(drop=True)
    logger.info(f"DataFrame now contains {len(frame)} ingredients after removing duplicates.")
    return frame


# Flatten the per-recipe ingredient lists into one list of ingredient dictionaries.
# A nested comprehension is linear in the number of ingredients and yields []
# when there are no recipes (summing lists is quadratic and returns 0 for an
# empty Series).
try:
    all_ingredients = [
        ingredient
        for recipe_ingredients in df['extendedIngredients']
        for ingredient in extract_dicts(recipe_ingredients)
    ]
    logger.info(f"Extracted {len(all_ingredients)} ingredients.")
except Exception:
    logger.error("Failed to extract ingredient dictionaries", exc_info=True)
    all_ingredients = []

# Construct the ingredients DataFrame
try:
    dfIng = pd.DataFrame(all_ingredients, columns=dfIng_COLUMNS)
    logger.info(f"Constructed ingredients DataFrame with {len(dfIng)} ingredients.")
    logger.debug(dfIng.head())
    logger.debug(_info_text(dfIng))
except Exception:
    logger.error("Failed to construct ingredients DataFrame", exc_info=True)
    dfIng = pd.DataFrame(columns=dfIng_COLUMNS)


# Remove rows that are duplicated across all columns
try:
    dfIng = _drop_duplicate_ingredients(dfIng)
except Exception:
    logger.error("Failed to handle duplicates", exc_info=True)

# Handle missing values in the nameClean column
try:
    none_rows = dfIng[dfIng['nameClean'].isna()]
    logger.debug(none_rows)
    if not none_rows.empty:
        logger.warning(f"Found {len(none_rows)} null nameClean. Filling with name.")
        dfIng['nameClean'] = dfIng['nameClean'].fillna(dfIng['name'])
        logger.info("Filled missing 'nameClean' values.")
    else:
        logger.info("No null 'nameClean' found.")
    logger.debug(_info_text(dfIng))
except Exception:
    logger.error("Failed to handle null values in 'nameClean'", exc_info=True)

# Drop the 'name' column after filling 'nameClean'
try:
    dfIng.drop('name', axis=1, inplace=True)
    logger.info("Dropped 'name' column.")
    logger.debug(_info_text(dfIng))
except Exception:
    logger.error("Failed to drop 'name' column", exc_info=True)

# Rows that differed only in 'name' may now be identical, so de-duplicate again
try:
    dfIng = _drop_duplicate_ingredients(dfIng)
except Exception:
    logger.error("Failed to handle duplicates", exc_info=True)

# Rename 'nameClean' to 'ing_name'
try:
    dfIng = dfIng.rename(columns={'nameClean': 'ing_name'})
    logger.info("Renamed 'nameClean' to 'ing_name'.")
    logger.debug(_info_text(dfIng))
except Exception:
    logger.error("Failed to rename columns", exc_info=True)

# Generate 'id_ingredient' column (successive numbers starting at 1)
try:
    dfIng['id_ingredient'] = range(1, len(dfIng) + 1)
    logger.info("Added 'id_ingredient'.")
    logger.debug(dfIng.head())
    logger.debug(_info_text(dfIng))
except Exception:
    logger.error("Failed to add 'id_ingredient' column", exc_info=True)


## Dealing with measures (reference_ing)

In [None]:
import io


def _info_text(frame):
    """Return DataFrame.info() output as a string (info() itself prints and returns None)."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Extract the column that contains all the ingredients measures along with the recipes titles
try:
    dfrecipeIng = pd.DataFrame(df, columns=['extendedIngredients', 'title'])
    logger.info(f"Extracted ingredients and titles for {len(dfrecipeIng)} recipes.")
    logger.debug(dfrecipeIng.head())
    logger.debug(_info_text(dfrecipeIng))
except Exception:
    logger.error("Failed to extract ingredients and titles", exc_info=True)

#----------------------------------------------------------------------
# Obtaining a new dataframe that contains measures of ingredients in each recipe

# Define the required columns for the measures DataFrame
dfmeasures_COLUMNS = ['nameClean', 'name', 'measures', 'title']

# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame
for i, row_series in dfrecipeIng.iterrows():
    title = None  # bound up front so the error message below can always use it
    try:
        # Access the recipe title for the current row
        title = row_series['title']

        # Access the 'extendedIngredients' column for the current row
        recipe_ingredients_list = row_series['extendedIngredients']

        # Guard clause: skip the recipe unless it has a non-empty list of ingredients
        if not (recipe_ingredients_list and isinstance(recipe_ingredients_list, list)):
            continue

        for each_dict in recipe_ingredients_list:
            # Skip entries that are not non-empty dictionaries. Calling .get on
            # them would raise and abandon the remaining ingredients of the recipe.
            if not (each_dict and isinstance(each_dict, dict)):
                continue

            rows_data.append({
                'nameClean': each_dict.get('nameClean', None),
                # Always a one-element list holding a dictionary, so the later
                # measure extraction can index it safely
                'measures': [each_dict.get('measures') or {}],
                'name': each_dict.get('name'),
                'title': title
            })
    except Exception:
        logger.error(f"Error processing row {i} for recipe '{title}'", exc_info=True)

# Create the DataFrame from the list of row_data
try:
    dfmeasures = pd.DataFrame(rows_data, columns=dfmeasures_COLUMNS)
    logger.info(f"Constructed measures DataFrame with {len(dfmeasures)} rows.")
    logger.debug(dfmeasures.head())
    logger.debug(_info_text(dfmeasures))
except Exception:
    logger.error("Failed to construct measures DataFrame", exc_info=True)

# Handle missing values in the nameClean column
try:
    none_rows = dfmeasures[dfmeasures['nameClean'].isna()]
    logger.debug(none_rows)
    if not none_rows.empty:
        logger.warning(f"Found {len(none_rows)} null nameClean. Filling with name.")
        dfmeasures['nameClean'] = dfmeasures['nameClean'].fillna(dfmeasures['name'])
        logger.info("Filled missing 'nameClean' values.")
    else:
        logger.info("No null 'nameClean' found.")
except Exception:
    logger.error("Failed to handle null values in 'nameClean'", exc_info=True)

# Drop the 'name' column after filling 'nameClean'
try:
    dfmeasures.drop('name', axis=1, inplace=True)
    logger.info("Dropped 'name' column.")
except Exception:
    logger.error("Failed to drop 'name' column", exc_info=True)

# Rename 'nameClean' to 'ing_name'
try:
    dfmeasures = dfmeasures.rename(columns={'nameClean': 'ing_name'})
    logger.info("Renamed 'nameClean' to 'ing_name'.")
    logger.debug(_info_text(dfmeasures))
except Exception:
    logger.error("Failed to rename columns", exc_info=True)

#----------------------------------------------------------------------
#transforming the measures column

# Function to extract measure information from measures dict
def extract_measure(measures_dict, key_name):
    """
    Format the measure stored under ``key_name`` as "amount unitShort".

    The key is looked up at the top level first. If it is not usable there,
    nested dictionaries are searched (depth first) for the same key name.

    Args:
        measures_dict: A dictionary containing the measure information, e.g.
            {"us": {"amount": 1.0, "unitShort": "cup"}, "metric": {...}}.
        key_name: The key name to look for (e.g. "us" or "metric").

    Returns:
        A string in the format "amount unitShort", or None when
        ``measures_dict`` is not a dictionary or no entry named ``key_name``
        with both 'amount' and 'unitShort' can be found.
    """
    if not isinstance(measures_dict, dict):
        return None

    entry = measures_dict.get(key_name)
    if isinstance(entry, dict) and 'amount' in entry and 'unitShort' in entry:
        return f"{entry['amount']} {entry['unitShort']}"

    # The key may live one or more levels down; keep searching for the SAME
    # key name (searching for the nested dictionary's own key can never match).
    for value in measures_dict.values():
        if isinstance(value, dict):
            found = extract_measure(value, key_name)
            if found is not None:
                return found

    # No matching key was found
    return None

import io


def _measure_from_cell(cell, system):
    """Format the ``system`` measure of one 'measures' cell, or None when the cell is unusable."""
    # A cell is expected to be a one-element list holding the measures dictionary.
    # Anything else (None, NaN, an empty list) yields None instead of raising,
    # which would otherwise abort the creation of the whole column.
    if not (isinstance(cell, list) and cell):
        return None
    measures = cell[0]
    if not isinstance(measures, dict):
        return None
    return extract_measure(measures, system)


# Create two new columns with extracted measures
try:
    dfmeasures['measure_1'] = dfmeasures['measures'].apply(_measure_from_cell, args=("us",))
    dfmeasures['measure_2'] = dfmeasures['measures'].apply(_measure_from_cell, args=("metric",))
    logger.info("Created columns 'measure_1' and 'measure_2'.")
except Exception:
    logger.error("Failed to create 'measure_1' and 'measure_2' columns", exc_info=True)

# Drop the original 'measures' column
try:
    dfmeasures.drop('measures', axis=1, inplace=True)
    logger.info("Dropped 'measures' column.")
    logger.debug(dfmeasures.head())
    # DataFrame.info() prints to stdout and returns None, so render it into a
    # buffer and log the text instead
    info_buffer = io.StringIO()
    dfmeasures.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
except Exception:
    logger.error("Failed to drop 'measures' column", exc_info=True)

# Function to combine measures from 'measure_1' and 'measure_2'
def combine_measures(row):
    """
    Combine the 'measure_1' and 'measure_2' values of a row into one string.

    Args:
        row: A pandas Series (or any mapping) with 'measure_1' and 'measure_2'.

    Returns:
        The single measure when both are equal or only one is present,
        "measure_1 / measure_2" when they differ, or None when both are missing.
    """
    def _is_missing(value):
        # None or NaN; NaN is the only value that is not equal to itself
        return value is None or value != value

    measure_1 = row['measure_1']
    measure_2 = row['measure_2']

    # Never render a missing side as the text "None" or "nan"
    if _is_missing(measure_1):
        return None if _is_missing(measure_2) else measure_2
    if _is_missing(measure_2) or measure_1 == measure_2:
        return measure_1  # Same value (or only one available), return one

    return f"{measure_1} / {measure_2}"  # Different values are concatenated with "/"

import io

# Apply the function to create a new 'measure' column
try:
    dfmeasures['measure'] = dfmeasures.apply(combine_measures, axis=1)
    logger.info("Created 'measure' column by combining 'measure_1' and 'measure_2'.")
except Exception:
    logger.error("Failed to create 'measure' column", exc_info=True)

# Drop 'measure_1' and 'measure_2' columns
try:
    dfmeasures.drop(['measure_1', 'measure_2'], axis=1, inplace=True)
    logger.info("Dropped 'measure_1' and 'measure_2' columns.")
    logger.debug(dfmeasures.head())
    # DataFrame.info() prints to stdout and returns None, so render it into a
    # buffer and log the text instead
    info_buffer = io.StringIO()
    dfmeasures.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
except Exception:
    logger.error("Failed to drop 'measure_1' and 'measure_2' columns", exc_info=True)

#----------------------------------------------------------------------
# dealing with id_recipe in dfmeasures

# Create a mapping dictionary {recipe_title: id_recipe} from dfrecipes
try:
    RECIPE_MAPPING = dfrecipes.set_index('recipe_title')['id_recipe'].to_dict()
    logger.info("Created 'RECIPE_MAPPING'.")
except Exception:
    logger.error("Failed to create 'RECIPE_MAPPING'", exc_info=True)

# Look up the id_recipe that belongs to a recipe title
def map_with_none(recipe_title, RECIPE_MAPPING):
    """Return the id_recipe mapped to ``recipe_title``; None for a missing or unmapped title."""
    # pd.isna catches None/NaN titles; dict.get gives None for unknown titles
    return None if pd.isna(recipe_title) else RECIPE_MAPPING.get(recipe_title, None)

import io

try:
    # Apply the mapping function to create 'id_recipe' column
    dfmeasures['id_recipe'] = dfmeasures['title'].apply(map_with_none, args=(RECIPE_MAPPING,))
    logger.info("Added 'id_recipe' column based on 'RECIPE_MAPPING'.")
    # Convert id_recipe to the nullable integer type, keeping missing values
    dfmeasures['id_recipe'] = dfmeasures['id_recipe'].astype('Int64')
    logger.info("Converted 'id_recipe' column to Integer type.")
    logger.debug(dfmeasures.head())
    # DataFrame.info() prints to stdout and returns None, so render it into a
    # buffer and log the text instead
    info_buffer = io.StringIO()
    dfmeasures.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
except Exception:
    logger.error("Failed to add or convert 'id_recipe' column", exc_info=True)

#----------------------------------------------------------------------
# dealing with id_ingredient in dfmeasures

# Create a mapping dictionary {ing_name: id_ingredient} from dfIng
try:
    ING_MAPPING = dfIng.set_index('ing_name')['id_ingredient'].to_dict()
    logger.info("Created 'ING_MAPPING'.")
except Exception:
    logger.error("Failed to create 'ING_MAPPING'", exc_info=True)

# Look up the id_ingredient that belongs to an ingredient name
def map_with_none(ing_name, ING_MAPPING):
    """Return the id_ingredient mapped to ``ing_name``; None for a missing or unmapped name."""
    if not pd.isna(ing_name):
        # dict.get returns None for names absent from the mapping
        return ING_MAPPING.get(ing_name, None)
    # Missing (None/NaN) names have no id
    return None
    
import io


def _info_text(frame):
    """Return DataFrame.info() output as a string (info() itself prints and returns None)."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


try:
    # Apply the mapping function to create 'id_ingredient' column
    dfmeasures['id_ingredient'] = dfmeasures['ing_name'].apply(map_with_none, args=(ING_MAPPING,))
    logger.info("Added 'id_ingredient' column based on 'ING_MAPPING'.")
    # Convert id_ingredient to the nullable integer type, keeping missing values
    dfmeasures['id_ingredient'] = dfmeasures['id_ingredient'].astype('Int64')
    logger.info("Converted 'id_ingredient' column to Integer type.")
    logger.debug(dfmeasures.head())
    logger.debug(_info_text(dfmeasures))
except Exception:
    logger.error("Failed to add or convert 'id_ingredient' column", exc_info=True)

#----------------------------------------------------------------------
# Drop 'ing_name' and 'title' columns to finalize the reference DataFrame
try:
    dfreference_ing = dfmeasures.drop(['ing_name', 'title'], axis=1)
    logger.info("Dropped 'ing_name' and 'title' columns.")
    logger.debug(dfreference_ing.head())
    logger.debug(_info_text(dfreference_ing))
except Exception:
    logger.error("Failed to finalize reference DataFrame by dropping 'ing_name' and 'title' columns", exc_info=True)


## Dealing with steps

In [None]:
import io

# Extract the column that contains all the instructions along with the recipes titles
try:
    dfAllIns = pd.DataFrame(df, columns=['analyzedInstructions', 'title'])
    logger.info(f"Extracted instructions for {len(dfAllIns)} recipes.")
    logger.debug(dfAllIns.head())
    # DataFrame.info() prints to stdout and returns None, so render it into a
    # buffer and log the text instead
    info_buffer = io.StringIO()
    dfAllIns.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
except Exception:
    logger.error("Failed to extract instructions and titles", exc_info=True)

import io

#---------------------------------------------------------
# obtaining a new dataframe that contains steps of each recipe


# Define the required columns for the steps DataFrame
dfsteps_COLUMNS = ['steps', 'title']

# Initialize an empty list to collect rows of data
rows_data = []

# Collect the list of steps of every recipe, one output row per recipe
for i, row_series in dfAllIns.iterrows():
    title = None  # bound up front so the error message below can always use it
    try:
        # Access the recipe title for the current row
        title = row_series['title']

        # Access the 'analyzedInstructions' column for the current row
        instructions_list = row_series['analyzedInstructions']

        # Only the first element of the list is read. A cell that is not a
        # non-empty list starting with a dictionary (e.g. NaN for a missing
        # value) gets steps=None instead of raising and losing the recipe.
        if (
            isinstance(instructions_list, list)
            and instructions_list
            and isinstance(instructions_list[0], dict)
        ):
            steps_list = instructions_list[0].get('steps', [])
        else:
            steps_list = None

        # Append the row data to the list
        rows_data.append({
            'steps': steps_list,
            'title': title
        })
    except Exception:
        logger.error(f"Error processing row {i} for recipe '{title}'", exc_info=True)

# Create the DataFrame from the list of row data
try:
    dfsteps = pd.DataFrame(rows_data, columns=dfsteps_COLUMNS)
    logger.info(f"Constructed steps DataFrame for {len(dfsteps)} recipes.")
    logger.debug(dfsteps.head())
    # DataFrame.info() prints to stdout and returns None, so render it into a
    # buffer and log the text instead
    info_buffer = io.StringIO()
    dfsteps.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
except Exception:
    logger.error("Failed to construct steps DataFrame", exc_info=True)


import io


def _info_text(frame):
    """Return DataFrame.info() output as a string (info() itself prints and returns None)."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


#---------------------------------------------------------
# transforming the steps column

# Define the required columns for the step DataFrame
dfsteps_COLUMNS = ['equipment', 'step', 'length', 'number', 'title']

# Columns that identify a step. 'equipment' is left out because it holds lists,
# which pandas cannot hash when it looks for duplicated rows.
_STEP_KEY_COLUMNS = ["step", "length", "number", "title"]

# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame dfsteps
for i, row_series in dfsteps.iterrows():
    title = None  # bound up front so the error message below can always use it
    try:
        # Access the recipe title for the current row
        title = row_series['title']

        # Access the 'steps' column for the current row
        steps_list = row_series['steps']

        if steps_list is None:
            # Recipe without steps: keep one placeholder row with None values
            rows_data.append({
                'length': None,
                'number': None,
                'step': None,
                'equipment': None,
                'title': title
            })
            continue

        # One output row per step dictionary
        for each_dict in steps_list:
            if not (each_dict and isinstance(each_dict, dict)):
                continue  # ignore empty or non-dictionary entries

            number = each_dict.get('number', None)
            step = each_dict.get('step', None)

            # 'length' may be absent or None; treat both as "no duration"
            # rather than failing on None.get(...)
            length_info = each_dict.get('length')
            if not isinstance(length_info, dict):
                length_info = {}
            time = length_info.get('number', None)
            unit = length_info.get('unit', None)
            length = None if time is None or unit is None else f"{time} {unit}"

            # Get equipment list or empty list if 'equipment' is missing
            equipment_list = each_dict.get('equipment', [])

            # Handle NaN or None values in equipment_list
            if isinstance(equipment_list, list):
                equipment_list = [e if pd.notna(e) else None for e in equipment_list]

            rows_data.append({
                'length': length,
                'number': number,
                'step': step,
                'equipment': equipment_list,
                'title': title
            })
    except Exception:
        logger.error(f"Error processing row {i} for recipe '{title}'", exc_info=True)

# Create the DataFrame from the list of row data
try:
    dfstep = pd.DataFrame(rows_data, columns=dfsteps_COLUMNS)
    logger.info(f"Constructed steps DataFrame with {len(dfstep)} steps.")
    logger.debug(dfstep.head())
    logger.debug(_info_text(dfstep))
except Exception:
    logger.error("Failed to construct steps DataFrame", exc_info=True)

# Find duplicates
try:
    duplicates = dfstep[dfstep.duplicated(subset=_STEP_KEY_COLUMNS)]
    logger.debug(duplicates)

    # Remove duplicates
    if not duplicates.empty:
        logger.warning(f"Found {len(duplicates)} duplicate steps. Removing duplicates.")
        # Use the same subset as the detection above: without it pandas would
        # try to hash the list-valued 'equipment' column and raise TypeError
        dfstep = dfstep.drop_duplicates(subset=_STEP_KEY_COLUMNS)
        dfstep.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfstep)} steps after removing duplicates.")
    else:
        logger.info("No duplicates found")
except Exception:
    logger.error("Failed to find or remove duplicates", exc_info=True)

# Check whether there are null values in the step column
try:
    none_rows = dfstep[dfstep['step'].isna()]
    logger.debug(none_rows)

    # Delete the null steps in the dfstep DataFrame
    if len(none_rows) > 0:
        logger.warning(f"Found {len(none_rows)} null steps. Deleting null steps.")
        dfstep.dropna(subset=['step'], inplace=True)
        dfstep.reset_index(drop=True, inplace=True)
        logger.info("Null steps deleted")
        logger.info(f"DataFrame now contains {len(dfstep)} steps after removing null values.")
    else:
        logger.info("No null steps found")
except Exception:
    logger.error("Failed to handle null steps", exc_info=True)

logger.debug(dfstep.head())
logger.debug(_info_text(dfstep))
    
import io

#---------------------------------------------------------
#transforming equipment column

# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame dfstep
for i, row_series in dfstep.iterrows():
    title = None  # bound up front so the error message below can always use it
    try:
        # Access the recipe title for the current row
        title = row_series['title']
        # Access the 'equipment' column for the current row
        equipments_list = row_series['equipment']

        if equipments_list and isinstance(equipments_list, list):
            # Keep only the names of the equipment dictionaries that have one
            equipments_name_list = [
                item['name']
                for item in equipments_list
                if item and isinstance(item, dict) and item.get('name') is not None
            ]
        else:
            # Empty list, None, or not a list at all
            equipments_name_list = None

        # Same row as before, with 'equipment' reduced to a list of names
        rows_data.append({
            'length': row_series['length'],
            'number': row_series['number'],
            'step': row_series['step'],
            'equipment': equipments_name_list,
            'title': title
        })
    except Exception:
        logger.error(f"Error processing row {i} for recipe '{title}'", exc_info=True)

# Create the DataFrame from the list of row data
try:
    dfstepclean = pd.DataFrame(rows_data, columns=dfsteps_COLUMNS)
    logger.info(f"Constructed dfstepclean DataFrame with {len(dfstepclean)} steps and contains cleaned lists of equipments")
    logger.debug(dfstepclean.head())
    # DataFrame.info() prints to stdout and returns None, so render it into a
    # buffer and log the text instead
    info_buffer = io.StringIO()
    dfstepclean.info(buf=info_buffer)
    logger.debug(info_buffer.getvalue())
except Exception:
    logger.error("Failed to construct dfstepclean DataFrame", exc_info=True)

import io


def _info_text(frame):
    """Return DataFrame.info() output as a string (info() itself prints and returns None)."""
    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


#---------------------------------------------------------
#dealing with dfstep_final

try:
    # Generate id_step (successive numbers starting at 1)
    dfstepclean['id_step'] = range(1, len(dfstepclean) + 1)
    logger.info("Added 'id_step'.")
    logger.debug(_info_text(dfstepclean))

    # Work on a copy so dfstepclean keeps its 'equipment' and 'title' columns
    dfstep_final = dfstepclean.copy()
    logger.debug(_info_text(dfstep_final))

    # Map every recipe title to its id_recipe (None when the title is unknown)
    dfstep_final['id_recipe'] = dfstep_final['title'].apply(map_with_none, args=(RECIPE_MAPPING,))
    logger.info("'id_recipe' column added according to the 'RECIPE_MAPPING'")

    # Convert id_recipe to the nullable integer type, keeping missing values
    dfstep_final['id_recipe'] = dfstep_final['id_recipe'].astype('Int64')
    logger.info("'id_recipe' column converted to Integer")

    # Drop columns by label (column name)
    dfstep_final.drop(columns=['equipment', 'title'], inplace=True)
    logger.info("'equipment' and 'title' columns deleted")
    logger.debug(dfstep_final.head())
    logger.debug(_info_text(dfstep_final))
except Exception:
    logger.error("Failed to finalize dfstep_final DataFrame", exc_info=True)


## Dealing with instructions

In [None]:
def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


try:
    # Assign one number per distinct id_recipe (numbering starts at 1).
    # NOTE(review): pd.factorize codes missing values as -1, so steps whose
    # id_recipe is <NA> end up with instruction_id 0 - confirm this is intended.
    dfstep_final['instruction_id'] = pd.factorize(dfstep_final['id_recipe'])[0] + 1
    logger.info("Added 'instruction_id'.")
    logger.debug(dfstep_final.head(30))
    logger.debug(_frame_info_text(dfstep_final))
except Exception:
    logger.error("Failed to add 'instruction_id'.", exc_info=True)


try:
    # Create the instructions dataframe
    dfIns = dfstep_final[['instruction_id', 'id_recipe']].copy()
    logger.info("Constructed instructions dataframe")
    logger.debug(dfIns.head())
    logger.debug(_frame_info_text(dfIns))
except Exception:
    logger.error("Failed to construct instructions dataframe", exc_info=True)

try:
    # One row per step was copied above, so every recipe repeats once per step
    duplicates = dfIns[dfIns.duplicated()]
    logger.debug(duplicates)

    # Remove duplicates
    if not duplicates.empty:
        logger.warning(f"Found {len(duplicates)} duplicate instructions. Removing duplicates.")
        dfIns = dfIns.drop_duplicates()
        dfIns.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfIns)} instructions after removing duplicates.")
    else:
        logger.info("No duplicates found")
except Exception:
    logger.error("An unexpected error occurred while handling duplicates.", exc_info=True)

try:
    # The steps frame keeps only instruction_id; id_recipe now lives in dfIns
    dfstep_final.drop(columns=['id_recipe'], inplace=True)
    logger.info("'id_recipe' deleted from steps dataframe")
except Exception:
    logger.error("Failed to drop 'id_recipe' column.", exc_info=True)

logger.debug(dfIns.head())
logger.debug(_frame_info_text(dfIns))
logger.debug(_frame_info_text(dfstep_final))

## Dealing with equipments

In [None]:
def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Define the required columns for the equipments DataFrame
dfequip_COLUMNS = ['name']

# Initialize an empty list to collect rows of data
rows_data = []

# Iterate through each row of the original DataFrame
for i, row_series in dfstep.iterrows():
    try:
        equip_list = row_series['equipment']

        if equip_list and isinstance(equip_list, list):
            for each_dict in equip_list:
                # Only non-empty dictionaries can carry an equipment name
                if each_dict and isinstance(each_dict, dict):
                    name = each_dict.get('name', None)

                    # A nameless entry would become a null equipment row that
                    # later receives its own id; the step-cleaning loop skips
                    # such names too, so do the same here.
                    if name is None:
                        logger.warning(f"Equipment without a name found at row {i}. Skipping element.")
                        continue

                    # Create a dictionary for the row data
                    row_data = {
                        'name': name
                    }

                    # Append the row data to the list
                    rows_data.append(row_data)
                else:
                    logger.warning(f"Non-dictionary element found in equipment list at row {i}. Skipping element.")
        else:
            logger.warning(f"Equipment list is empty or not a list at row {i}. Skipping row.")

    except Exception:
        logger.error(f"Error processing row {i}", exc_info=True)

# Create the DataFrame from the list of row data
try:
    dfequip = pd.DataFrame(rows_data, columns=dfequip_COLUMNS)
    logger.info(f"Constructed equipments DataFrame with {len(dfequip)} equipments.")
    logger.debug(dfequip.head())
    logger.debug(_frame_info_text(dfequip))
except Exception:
    logger.error("Failed to construct equipments DataFrame.", exc_info=True)
    dfequip = pd.DataFrame(columns=dfequip_COLUMNS)  # Create an empty DataFrame as a fallback


def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


try:
    # The same equipment appears in many steps, so repeated names are expected
    duplicates = dfequip[dfequip.duplicated()]
    logger.debug(duplicates)

    # Remove duplicates
    if not duplicates.empty:
        logger.warning(f"Found {len(duplicates)} duplicate equipments. Removing duplicates.")
        dfequip = dfequip.drop_duplicates()
        dfequip.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfequip)} equipments after removing duplicates.")
    else:
        logger.info("No duplicates found")
except Exception:
    logger.error("An error occurred while finding or removing duplicates.", exc_info=True)



# Generate 'id_equipment' column (sequential, starting at 1)
try:
    dfequip['id_equipment'] = range(1, len(dfequip) + 1)
    logger.info("Added 'id_equipment'.")
    logger.debug(_frame_info_text(dfequip))
except Exception:
    logger.error("Failed to add 'id_equipment' column.", exc_info=True)

# Rename 'name' column to 'equip_name'
try:
    dfequip = dfequip.rename(columns={'name': 'equip_name'})
    logger.info("Renamed column 'name' to 'equip_name'.")
    logger.debug(dfequip.head())
    logger.debug(_frame_info_text(dfequip))
except KeyError:
    logger.error("Failed to rename 'name' column because it is missing.", exc_info=True)
except Exception:
    logger.error("An unexpected error occurred while renaming 'name' column.", exc_info=True)


## Dealing with reference_equip

In [None]:
# 'equipment' holds a list of names per step: expand dfstepclean to one row per listed equipment
try:
    dfstepclean_exploded = dfstepclean.explode(['equipment'])
    dfstepclean_exploded.reset_index(drop=True, inplace=True)
    logger.info("'equipment' column in step dataframe exploded.")
except Exception:
    logger.exception("Failed to explode 'equipment' column.")

# Give the exploded column the same name as in dfequip ('equip_name')
try:
    dfstepclean_exploded = dfstepclean_exploded.rename(columns={'equipment': 'equip_name'})
    logger.info("Renamed column 'equipment' to 'equip_name'.")
    logger.debug(dfstepclean_exploded.head())
except Exception:
    logger.exception("Failed to rename 'equipment' column")

# -----------------------------------------------------------
# dealing with id_equipment in dfstepclean_exploded
try:
    # Lookup table: equipment name -> id_equipment, taken from dfequip
    equip_ids_by_name = dfequip.set_index('equip_name')['id_equipment']
    EQUIP_MAPPING = equip_ids_by_name.to_dict()
    logger.info("'EQUIP_MAPPING' created")
except Exception:
    logger.exception("Failed to create 'EQUIP_MAPPING'.")

# Null-safe dictionary lookup used with Series.apply
def map_with_none(equip_name, EQUIP_MAPPING):
    """Return the id stored for *equip_name* in *EQUIP_MAPPING*.

    Yields None when the name is missing (None/NaN) or is not a key of the
    mapping, so no KeyError can be raised.
    """
    return None if pd.isna(equip_name) else EQUIP_MAPPING.get(equip_name)

def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Apply the custom map function to fill the new column
try:
    dfstepclean_exploded['id_equipment'] = dfstepclean_exploded['equip_name'].apply(map_with_none, args=(EQUIP_MAPPING,))
    logger.info("'id_equipment' column added according to the 'EQUIP_MAPPING'.")
except Exception:
    logger.error("Failed to add 'id_equipment' column.", exc_info=True)

# Convert id_equipment to the nullable integer dtype so missing ids stay <NA>
try:
    dfstepclean_exploded['id_equipment'] = dfstepclean_exploded['id_equipment'].astype('Int64')
    logger.info("'id_equipment' column converted to Integer.")
    logger.debug(dfstepclean_exploded.head())
    logger.debug(_frame_info_text(dfstepclean_exploded))
except Exception:
    logger.error("Failed to convert 'id_equipment' column to Integer.", exc_info=True)

# -----------------------------------------------------------
# dealing with id_recipe in dfstepclean_exploded

try:
    # Map every recipe title to its id_recipe
    dfstepclean_exploded['id_recipe'] = dfstepclean_exploded['title'].apply(map_with_none, args=(RECIPE_MAPPING,))
    logger.info("'id_recipe' column added according to the 'RECIPE_MAPPING'.")
except Exception:
    logger.error("Failed to add 'id_recipe' column.", exc_info=True)

try:
    # Convert id_recipe to the nullable integer dtype so missing ids stay <NA>
    dfstepclean_exploded['id_recipe'] = dfstepclean_exploded['id_recipe'].astype('Int64')
    logger.info("'id_recipe' column converted to Integer.")
    logger.debug(dfstepclean_exploded.head())
    logger.debug(_frame_info_text(dfstepclean_exploded))
except Exception:
    logger.error("Failed to convert 'id_recipe' column to Integer.", exc_info=True)

# -----------------------------------------------------------
# creating the dfreference_equip dataframe
try:
    dfreference_equip = dfstepclean_exploded[['id_recipe', 'id_step', 'id_equipment']]
    logger.info("Constructed reference_equip dataframe.")
    logger.debug(dfreference_equip.head())
    logger.debug(_frame_info_text(dfreference_equip))
except Exception:
    logger.error("Failed to create 'dfreference_equip' dataframe.", exc_info=True)

# Clean up dataframes that are no longer needed
try:
    del dfstepclean_exploded
    del dfstepclean
    logger.info("Deleted intermediate dataframes.")
except Exception:
    logger.error("One or more intermediate dataframes were not found during deletion.", exc_info=True)



## Dealing with dish types

In [None]:
def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Define the required columns for the dish DataFrame
dfdish_COLUMNS = ['dishTypes', 'title']
# Construct the DataFrame
try:
    dfdish = pd.DataFrame(df, columns=dfdish_COLUMNS)
    logger.info(f"Extracted dish types for {len(dfdish)} recipes.")
    logger.debug(dfdish.head())
    logger.debug(_frame_info_text(dfdish))
except Exception:
    logger.error("Failed to construct dfdish DataFrame.", exc_info=True)

# Explode the 'dishTypes' column: one row per (recipe, dish type) pair
try:
    dfALLdish_types = dfdish.explode('dishTypes').reset_index(drop=True)
    logger.info(f"'dishTypes' column exploded and there are {len(dfALLdish_types)} dish types.")
    logger.debug(dfALLdish_types.head())
    logger.debug(_frame_info_text(dfALLdish_types))
except Exception:
    logger.error("Failed to explode 'dishTypes' column.", exc_info=True)

# Copy the exploded DataFrame; dfALLdish_types itself is reused later for the is_a relation
dfdish_type = dfALLdish_types.copy()

# Delete 'title' column
try:
    dfdish_type.drop(columns=['title'], inplace=True)
    logger.info("Deleted 'title' column.")
except Exception:
    logger.error("Failed to delete 'title' column.", exc_info=True)

try:
    # Find duplicates
    duplicates = dfdish_type[dfdish_type.duplicated()]
    logger.debug(duplicates)

    # Remove duplicates
    if not duplicates.empty:
        logger.warning(f"Found {len(duplicates)} duplicate dish types. Removing duplicates.")
        dfdish_type = dfdish_type.drop_duplicates()
        dfdish_type.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfdish_type)} dish types after removing duplicates.")
    else:
        logger.info("No duplicates found.")
except Exception:
    logger.error("An error occurred while finding or removing duplicates.", exc_info=True)


try:
    # Find Null values
    none_rows = dfdish_type[dfdish_type['dishTypes'].isna()]
    logger.debug(none_rows)

    # Remove null values
    if not none_rows.empty:
        logger.warning(f"Found {len(none_rows)} null dish types. Removing null values.")
        dfdish_type.dropna(inplace=True)
        dfdish_type.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfdish_type)} dish types after removing null values.")
    else:
        logger.info("No null values found.")
except Exception:
    logger.error("An error occurred while finding or removing null values.", exc_info=True)

# Rename 'dishTypes' column to 'dish_type'
try:
    dfdish_type = dfdish_type.rename(columns={'dishTypes': 'dish_type'})
    logger.info("Renamed column 'dishTypes' to 'dish_type'.")
except Exception:
    logger.error("Failed to rename 'dishTypes' column.", exc_info=True)

# Generate 'id_dish_type' column (sequential, starting at 1)
try:
    dfdish_type['id_dish_type'] = range(1, len(dfdish_type) + 1)
    logger.info("Added 'id_dish_type' column.")
    logger.debug(dfdish_type.head())
    logger.debug(_frame_info_text(dfdish_type))
except Exception:
    logger.error("Failed to add 'id_dish_type' column.", exc_info=True)


## dfis_a dataframe

In [None]:
# Lookup table: dish type name -> id_dish_type, taken from dfdish_type
try:
    dish_ids_by_name = dfdish_type.set_index('dish_type')['id_dish_type']
    DISH_MAPPING = dish_ids_by_name.to_dict()
    logger.info("'DISH_MAPPING' created.")
except Exception:
    logger.exception("Failed to create DISH_MAPPING.")

# Null-safe dictionary lookup used with Series.apply
def map_with_none(dish_type, mapping_dict):
    """Return ``mapping_dict[dish_type]``, or None for a missing value or an unknown key."""
    is_missing = pd.isna(dish_type)
    if not is_missing:
        # dict.get already yields None for keys that are absent
        return mapping_dict.get(dish_type)
    return None

def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Apply the custom map function to 'dishTypes' column
try:
    dfALLdish_types['id_dish_type'] = dfALLdish_types['dishTypes'].apply(map_with_none, args=(DISH_MAPPING,))
    logger.info("'id_dish_type' column added according to the 'DISH_MAPPING'.")
except Exception:
    logger.error("Failed to map 'dishTypes' to 'id_dish_type'.", exc_info=True)

# Convert 'id_dish_type' to the nullable integer dtype so missing ids stay <NA>
try:
    dfALLdish_types['id_dish_type'] = dfALLdish_types['id_dish_type'].astype('Int64')
    logger.info("'id_dish_type' column converted to Integer.")
except Exception:
    logger.error("Failed to convert 'id_dish_type' to Integer.", exc_info=True)

# Apply the custom map function to 'title' column for 'id_recipe'
try:
    dfALLdish_types['id_recipe'] = dfALLdish_types['title'].apply(map_with_none, args=(RECIPE_MAPPING,))
    logger.info("'id_recipe' column added according to the 'RECIPE_MAPPING'.")
except Exception:
    logger.error("Failed to map 'title' to 'id_recipe'.", exc_info=True)

# Convert 'id_recipe' to the nullable integer dtype so missing ids stay <NA>
try:
    dfALLdish_types['id_recipe'] = dfALLdish_types['id_recipe'].astype('Int64')
    logger.info("'id_recipe' column converted to Integer.")
except Exception:
    logger.error("Failed to convert 'id_recipe' to Integer.", exc_info=True)

# Log the DataFrame information
logger.debug(dfALLdish_types.head())
logger.debug(_frame_info_text(dfALLdish_types))

# Delete 'dishTypes' and 'title' columns: only the two id columns remain
try:
    dfALLdish_types.drop(columns=['dishTypes', 'title'], inplace=True)
    logger.info("Deleted columns 'dishTypes' and 'title'.")
except Exception:
    logger.error("Failed to delete columns 'dishTypes' or 'title'.", exc_info=True)

# Copy the DataFrame and delete the original
try:
    dfis_a = dfALLdish_types.copy()
    del dfALLdish_types
    logger.info("Copied dfALLdish_types to dfis_a and deleted dfALLdish_types.")
except Exception:
    logger.error("Failed to copy or delete dfALLdish_types.", exc_info=True)

# Log the final DataFrame information
logger.debug(dfis_a.head())
logger.debug(_frame_info_text(dfis_a))


## Dealing with cuisines

### extracting needed columns

In [None]:
def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Define the required columns for the cuisines DataFrame
dfcuisines_COLUMNS = ['cuisines', 'title']

# Construct the DataFrame
try:
    dfALLcuisines = pd.DataFrame(df, columns=dfcuisines_COLUMNS)
    logger.info(f"Extracted cuisines for {len(dfALLcuisines)} recipes.")
    logger.debug(dfALLcuisines.head())
    logger.debug(_frame_info_text(dfALLcuisines))
except Exception:
    logger.error("Failed to construct dfALLcuisines DataFrame.", exc_info=True)

# Explode the 'cuisines' column: one row per (recipe, cuisine) pair
try:
    dfALLcuisines = dfALLcuisines.explode('cuisines').reset_index(drop=True)
    logger.info(f"'cuisines' column exploded and contains {len(dfALLcuisines)} cuisines.")
    logger.debug(_frame_info_text(dfALLcuisines))
except Exception:
    logger.error("Failed to explode 'cuisines' column.", exc_info=True)

# Copy the exploded DataFrame; dfALLcuisines itself is reused later for the belongs relation
dfcuisine = dfALLcuisines.copy()

# Delete 'title' column
try:
    dfcuisine.drop(columns=['title'], inplace=True)
    logger.info("Deleted 'title' column.")
except Exception:
    logger.error("Failed to delete 'title' column.", exc_info=True)

try:
    # Find duplicates
    duplicates = dfcuisine[dfcuisine.duplicated()]
    logger.debug(duplicates)

    # Remove duplicates
    if not duplicates.empty:
        logger.warning(f"Found {len(duplicates)} duplicate cuisines. Removing duplicates.")
        dfcuisine = dfcuisine.drop_duplicates()
        dfcuisine.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfcuisine)} cuisines after removing duplicates.")
    else:
        logger.info("No duplicates found.")
except Exception:
    logger.error("An error occurred while finding or removing duplicates.", exc_info=True)

try:
    # Find Null values
    none_rows = dfcuisine[dfcuisine['cuisines'].isna()]
    logger.debug(none_rows)

    # Remove null values
    if not none_rows.empty:
        logger.warning(f"Found {len(none_rows)} null cuisine values. Removing null values.")
        dfcuisine.dropna(inplace=True)
        dfcuisine.reset_index(drop=True, inplace=True)
        logger.info(f"DataFrame now contains {len(dfcuisine)} cuisines after removing null values.")
    else:
        logger.info("No null values found.")
except Exception:
    logger.error("An error occurred while finding or removing null values.", exc_info=True)

# Rename 'cuisines' column to 'recipe_cuisine'
try:
    dfcuisine = dfcuisine.rename(columns={'cuisines': 'recipe_cuisine'})
    logger.info("Renamed column 'cuisines' to 'recipe_cuisine'.")
    logger.debug(_frame_info_text(dfcuisine))
except Exception:
    logger.error("Failed to rename 'cuisines' column.", exc_info=True)

# Generate 'id_cuisine' column (sequential, starting at 1)
try:
    dfcuisine['id_cuisine'] = range(1, len(dfcuisine) + 1)
    logger.info("Added 'id_cuisine' column.")
    logger.debug(dfcuisine.head())
    logger.debug(_frame_info_text(dfcuisine))
except Exception:
    logger.error("Failed to add 'id_cuisine' column.", exc_info=True)


### dfbelongs dataframe

In [None]:
# Lookup table: cuisine name -> id_cuisine, taken from dfcuisine
try:
    cuisine_ids_by_name = dfcuisine.set_index('recipe_cuisine')['id_cuisine']
    CUISINE_MAPPING = cuisine_ids_by_name.to_dict()
    logger.info("'CUISINE_MAPPING' created")
except Exception:
    logger.exception("Failed to create 'CUISINE_MAPPING'.")

# Null-safe dictionary lookup used with Series.apply
def map_with_none(recipe_cuisine, CUISINE_MAPPING):
    """Translate *recipe_cuisine* through *CUISINE_MAPPING*.

    The result is None both for a missing value (None/NaN) and for a value
    that is not a key of the mapping.
    """
    found = None
    if not pd.isna(recipe_cuisine):
        found = CUISINE_MAPPING.get(recipe_cuisine, None)
    return found

def _frame_info_text(frame):
    """Return the summary that ``frame.info()`` would print, as a string.

    ``DataFrame.info()`` writes to a buffer (stdout by default) and returns
    None, so its return value cannot be handed to the logger directly.
    """
    import io

    buffer = io.StringIO()
    frame.info(buf=buffer)
    return buffer.getvalue()


# Apply the custom map function to add 'id_cuisine' column
try:
    dfALLcuisines['id_cuisine'] = dfALLcuisines['cuisines'].apply(map_with_none, args=(CUISINE_MAPPING,))
    logger.info("'id_cuisine' column added according to the 'CUISINE_MAPPING'.")
except Exception:
    logger.error("Failed to map 'id_cuisine' to 'cuisines'.", exc_info=True)

# Convert id_cuisine to the nullable integer dtype so missing ids stay <NA>
try:
    dfALLcuisines['id_cuisine'] = dfALLcuisines['id_cuisine'].astype('Int64')
    logger.info("'id_cuisine' column converted to Integer ")
except Exception:
    logger.error("Failed to convert 'id_cuisine' to Integer.", exc_info=True)


# Apply the custom map function to add 'id_recipe' column
try:
    dfALLcuisines['id_recipe'] = dfALLcuisines['title'].apply(map_with_none, args=(RECIPE_MAPPING,))
    logger.info("'id_recipe' column added according to the 'RECIPE_MAPPING'.")
except Exception:
    logger.error("Failed to map 'id_recipe' to 'title'.", exc_info=True)

# Convert id_recipe to the nullable integer dtype so missing ids stay <NA>
try:
    dfALLcuisines['id_recipe'] = dfALLcuisines['id_recipe'].astype('Int64')
    logger.info("'id_recipe' column converted to Integer.")
except Exception:
    logger.error("Failed to convert 'id_recipe' to Integer.", exc_info=True)

# Log the DataFrame head and info
logger.debug(dfALLcuisines.head())
logger.debug(_frame_info_text(dfALLcuisines))

# Drop 'cuisines' and 'title' columns from dfALLcuisines: only the two id columns remain
try:
    dfALLcuisines.drop(columns=['cuisines', 'title'], inplace=True)
    logger.info("Deleted 'cuisines' and 'title' columns.")
except Exception:
    logger.error("Failed to delete 'cuisines' or 'title' columns.", exc_info=True)

# Copy dfALLcuisines to dfbelongs and delete dfALLcuisines
try:
    dfbelongs = dfALLcuisines.copy()
    logger.info("Copied dfALLcuisines to dfbelongs.")

    del dfALLcuisines
    logger.info("Deleted dfALLcuisines.")

    logger.debug(dfbelongs.head())
    logger.debug(_frame_info_text(dfbelongs))
except Exception:
    # The frame handled here is dfALLcuisines; no 'dfcuisines' frame exists
    logger.error("Failed to copy or delete dfALLcuisines.", exc_info=True)


## Transform Recap

In [None]:
# Marks the start of the recap section in the log
logger.info("Displaying transformation recap")

### recipe table
(id_recipe INT, recipe_title VARCHAR(50), ready_min INT, summary VARCHAR(2000), servings INT, is_cheap LOGICAL, price_per_serving DOUBLE, is_vegetarian LOGICAL, is_vegan LOGICAL, is_glutenFree LOGICAL, is_dairyFree LOGICAL, is_healthy LOGICAL, is_sustainable LOGICAL, is_lowFodmap LOGICAL, is_Popular LOGICAL, license VARCHAR(20), source_url VARCHAR(100));

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfrecipes.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### Ingredients table
(id_ingredient INT, ing_name VARCHAR(50), consistency VARCHAR(20), aisle VARCHAR(20));

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfIng.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### reference_ing table
(#id_recipe, #id_ingredient, measure VARCHAR(50));

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfreference_ing.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### Equipment table
(id_equipment INT, equip_name VARCHAR(50));

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfequip.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### Instructions table
(id_instruction INT, #id_recipe);

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfIns.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### steps table
(id_step INT, step VARCHAR(8000), number INT, length VARCHAR(50), #id_instruction);

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfstep_final.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### reference_equip table
(#id_recipe, #id_step, #id_equipment);

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfreference_equip.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### dish_type table
(id_dish_type INT, dish_type VARCHAR(50))

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfdish_type.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### is_a  table
(#id_recipe, #id_dish_type)

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfis_a.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### cuisine table
(id_cuisine INT, recipe_cuisine VARCHAR(50));

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfcuisine.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

### belongs table
(#id_recipe, #id_cuisine);

In [None]:
import io

# DataFrame.info() prints to its buffer and returns None, so capture the text for the log
_info_buffer = io.StringIO()
dfbelongs.info(buf=_info_buffer)
logger.debug(_info_buffer.getvalue())

# Load

In [None]:
import os

# Database configuration.
# Every setting can be overridden with an environment variable of the same
# name; the literals are only fallbacks that keep the previous behaviour.
DB_HOST = os.environ.get('DB_HOST', '10.0.2.15')  # Database host IP address
DB_PORT = int(os.environ.get('DB_PORT', 5432))  # Database port number
DB_NAME = os.environ.get('DB_NAME', 'recipe_etl')  # Name of the database
DB_USER = os.environ.get('DB_USER', 'maryem')  # Database user name
# SECURITY: supply DB_PASSWORD through the environment; the hard-coded fallback
# should be removed once every environment that runs this notebook sets it.
DB_PASSWORD = os.environ.get('DB_PASSWORD', 'HelloWorld')  # Database user password

try:
    # Establish connection to the database
    conn = psycopg2.connect(
            host=DB_HOST,
            port=DB_PORT,
            dbname=DB_NAME,
            user=DB_USER,
            password=DB_PASSWORD
        )
    logger.info(f"Connected to {DB_NAME}")
except psycopg2.Error as e:
    logger.error(f"Error: Could not make connection to the database {DB_NAME}")
    logger.error(e)
    # The load cell checks `conn` before doing any work
    conn = None

In [None]:
# Proceed only if the connection was successful
if conn:
    try:
        try:
            # Obtain a cursor to execute queries
            cur = conn.cursor()
            logger.info(f"Cursor obtained for the database {DB_NAME}")
        except psycopg2.Error as e:
            logger.error(f"Error: Could not get cursor for the database {DB_NAME}")
            logger.error(e)
            cur = None

        if cur:
            def execute_insert_and_check(query, values, table_name):
                """Run one INSERT, commit it, then verify the table is not empty.

                Args:
                    query: Parameterised INSERT statement (``%s`` placeholders).
                    values: Tuple of values bound to the placeholders.
                    table_name: Target table; used in the row-count check and
                        in log messages. Only literal names written in this
                        cell are passed, as it is interpolated into SQL.

                Raises:
                    psycopg2.Error: If the insert or the count query fails; the
                        transaction is rolled back before re-raising.
                    ValueError: If the table is empty after the insert.
                """
                try:
                    cur.execute(query, values)
                    conn.commit()

                    # NOTE(review): this counts the whole table after every
                    # single row; consider checking once per table instead.
                    cur.execute(f"SELECT COUNT(*) FROM {table_name}")
                    row_count = cur.fetchone()[0]

                    if row_count == 0:
                        logger.error(f"The {table_name} table is empty after insertion attempts.")
                        raise ValueError(f"The {table_name} table is empty after insertion attempts.")
                    else:
                        logger.info(f"Values inserted into {table_name} successfully")
                except psycopg2.Error as e:
                    logger.error(f"Error in inserting values into {table_name}: {e}")
                    conn.rollback()
                    raise
                except ValueError as ve:
                    logger.error(ve)
                    raise

            # Each table is loaded in its own try so one failing table does not
            # prevent the next ones from being attempted; the outer finally
            # guarantees the cursor is closed whatever happens.
            try:
                # Insert into Recipe table
                try:
                    for index, row in dfrecipes.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Recipe (id_recipe, recipe_title, ready_min, summary, servings, is_cheap, price_per_serving, 
                            is_vegetarian, is_vegan, is_glutenFree, is_dairyFree, is_healthy, is_sustainable, is_lowFodmap, 
                            is_Popular, license, source_url)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);""", (
                            int(row['id_recipe']),
                            row['recipe_title'],
                            int(row['ready_min']),
                            row['summary'],
                            int(row['servings']),
                            bool(row['is_cheap']),
                            float(row['price_per_serving']),
                            bool(row['is_vegetarian']),
                            bool(row['is_vegan']),
                            bool(row['is_glutenFree']),
                            bool(row['is_dairyFree']),
                            bool(row['is_healthy']),
                            bool(row['is_sustainable']),
                            bool(row['is_lowFodmap']),
                            bool(row['is_Popular']),
                            row['license'],
                            row['source_url'],
                        ), 'Recipe')
                except Exception:
                    logger.error("Failed to insert into Recipe table", exc_info=True)

                # Insert into Instruction table
                try:
                    for index, row in dfIns.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Instruction (id_instruction, id_recipe)
                            VALUES (%s, %s);""", (
                            int(row['instruction_id']),
                            int(row['id_recipe'])
                        ), 'Instruction')
                except Exception:
                    logger.error("Failed to insert into Instruction table", exc_info=True)

                # Insert into Ingredient table
                try:
                    for index, row in dfIng.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Ingredient (id_ingredient, ing_name, consistency, aisle)
                            VALUES (%s, %s, %s, %s);""", (
                            int(row['id_ingredient']),
                            row['ing_name'],
                            row['consistency'],
                            row['aisle']
                        ), 'Ingredient')
                except Exception:
                    logger.error("Failed to insert into Ingredient table", exc_info=True)

                # Insert into Step table
                try:
                    for index, row in dfstep_final.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Step (id_step, step, number, length, id_instruction)
                            VALUES (%s, %s, %s, %s, %s);""", (
                            int(row['id_step']),
                            row['step'],
                            int(row['number']),
                            row['length'],
                            int(row['instruction_id'])
                        ), 'Step')
                except Exception:
                    logger.error("Failed to insert into Step table", exc_info=True)

                # Insert into Equipment table
                try:
                    for index, row in dfequip.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Equipment (id_equipment, equip_name)
                            VALUES (%s, %s);""", (
                            int(row['id_equipment']),
                            row['equip_name']
                        ), 'Equipment')
                except Exception:
                    logger.error("Failed to insert into Equipment table", exc_info=True)

                # Insert into Dish table
                try:
                    for index, row in dfdish_type.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Dish (id_dish_type, dish_type)
                            VALUES (%s, %s);""", (
                            int(row['id_dish_type']),
                            row['dish_type']
                        ), 'Dish')
                except Exception:
                    logger.error("Failed to insert into Dish table", exc_info=True)

                # Insert into Cuisine table
                try:
                    for index, row in dfcuisine.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO Cuisine (id_cuisine, recipe_cuisine)
                            VALUES (%s, %s);""", (
                            int(row['id_cuisine']),
                            row['recipe_cuisine']
                        ), 'Cuisine')
                except Exception:
                    logger.error("Failed to insert into Cuisine table", exc_info=True)

                # Insert into reference_ing table
                try:
                    for index, row in dfreference_ing.iterrows():
                        execute_insert_and_check("""
                            INSERT INTO reference_ing (id_recipe, id_ingredient, measure)
                            VALUES (%s, %s, %s);""", (
                            int(row['id_recipe']),
                            int(row['id_ingredient']),
                            row['measure']
                        ), 'reference_ing')
                except Exception:
                    logger.error("Failed to insert into reference_ing table", exc_info=True)

                # Insert into reference_equip table
                try:
                    # Kept inside the try so a failure here is logged like any
                    # other load error instead of aborting the whole cell
                    dfreference_equip = dfreference_equip.replace({pd.NA: np.nan})
                    for index, row in dfreference_equip.iterrows():
                        # A step without equipment is stored with a NULL id_equipment
                        id_equipment = None if pd.isna(row['id_equipment']) else int(row['id_equipment'])
                        execute_insert_and_check("""
                            INSERT INTO reference_equip (id_recipe, id_step, id_equipment)
                            VALUES (%s, %s, %s);""", (
                            int(row['id_recipe']),
                            int(row['id_step']),
                            id_equipment
                        ), 'reference_equip')
                except Exception:
                    logger.error("Failed to insert into reference_equip table", exc_info=True)

                # Insert into is_a table
                try:
                    dfis_a = dfis_a.replace({pd.NA: np.nan})
                    for index, row in dfis_a.iterrows():
                        # A recipe without dish type is stored with a NULL id_dish_type
                        id_dish_type = None if pd.isna(row['id_dish_type']) else int(row['id_dish_type'])
                        execute_insert_and_check("""
                            INSERT INTO is_a (id_recipe, id_dish_type)
                            VALUES (%s, %s);""", (
                            int(row['id_recipe']),
                            id_dish_type
                        ), 'is_a')
                except Exception:
                    logger.error("Failed to insert into is_a table", exc_info=True)

                # Insert into belongs table
                try:
                    dfbelongs = dfbelongs.replace({pd.NA: np.nan})
                    for index, row in dfbelongs.iterrows():
                        # A recipe without cuisine is stored with a NULL id_cuisine
                        id_cuisine = None if pd.isna(row['id_cuisine']) else int(row['id_cuisine'])
                        execute_insert_and_check("""
                            INSERT INTO belongs (id_recipe, id_cuisine)
                            VALUES (%s, %s);""", (
                            int(row['id_recipe']),
                            id_cuisine
                        ), 'belongs')
                except Exception:
                    logger.error("Failed to insert into belongs table", exc_info=True)
            finally:
                cur.close()
        else:
            logger.error("Cursor could not be obtained. Exiting the program.")
    finally:
        # Always release the connection, even if something unexpected escaped above
        conn.close()
else:
    logger.error("Connection to the database could not be established. Exiting the program.")
