# Convert current data to JSON

Import all the required libraries.

In [1]:
import re
import json
import os
from typing import Any, cast

from bs4 import BeautifulSoup, Tag
from markdownify import MarkdownConverter

In [2]:
# General configuration shared by the cells below.
# base_path: directory holding the raw per-category JSON files; the
#   processing loop reads each one as "{base_path}/{name}.json".
# out_path: directory where the combined all_categories.json is written.
# bs_parser: parser name passed to BeautifulSoup when parsing recipe HTML.
base_path = "data/raw"
out_path = "data/out"
bs_parser = "html.parser"

# Base names (without the ".json" extension) of the category files to
# process; the processing loop iterates over them in this order.
files_name = [
    "bebidas",
    "botanas",
    "carnes-y-aves",
    "comida-para-ninos",
    "desayunos",
    "ensaladas",
    "faciles",
    "guarniciones-de-exito",
    "guarniciones",
    "navidad",
    "panes",
    "pastas",
    "pescados-y-mariscos",
    "platos-fuertes",
    "postres-para-ninos",
    "postres",
    "recetas-a-la-parrilla",
    "salsas",
    "saludables",
    "sopas",
]

## Helper functions

To convert the raw HTML into structured JSON and clean all the data, I have created some helper functions that clean the text and extract each recipe's general information, ingredients, and preparation steps.

In [3]:
def clean(text: Any) -> str:
    """Convert ``text`` to a string and normalise its whitespace.

    Args:
        text: ``None``, a BeautifulSoup ``Tag``, a string, or any other
            object (which is converted with ``str()``).

    Returns:
        The cleaned text: ``""`` for ``None``; otherwise the text with
        special space characters normalised, runs of blank lines collapsed
        to a single blank line, trailing spaces before newlines removed,
        and surrounding whitespace stripped.
    """
    if text is None:
        return ""
    if isinstance(text, Tag):
        text = text.get_text()
    cleaned: str = text if isinstance(text, str) else str(text)

    # The special characters are written as escapes on purpose: a literal
    # non-breaking space is indistinguishable from a normal space in an
    # editor and is easily lost when the source is copied or reformatted.
    cleaned = (
        cleaned.replace("\u00a0", " ")  # non-breaking space -> normal space
        .replace("\u200b", "")          # zero-width space -> removed
        .replace("\u200a", " ")         # hair space -> normal space
    )
    # Collapse any run of blank / whitespace-only lines into one blank line.
    cleaned = re.sub(r"(\n\s*)+\n", "\n\n", cleaned)
    # Drop trailing spaces at the end of each line.
    cleaned = re.sub(r" +\n", "\n", cleaned)
    return cleaned.strip()

In [4]:
def get_general_info(soup, link) -> dict:
    """Collect the source link plus the time and difficulty labels of a recipe.

    Args:
        soup: parsed recipe page; only its ``select_one`` method is used.
        link: URL of the recipe, stored under the ``'source'`` key.

    Returns:
        A dict with the keys ``'source'``, ``'prep_time'``, ``'cook_time'``
        and ``'difficulty'``. A field whose element is not found on the
        page is set to ``'N/A'``.
    """
    # Output field -> CSS selector of the <span> holding its value.
    selector_by_field = {
        'prep_time': "div.recipe-info-tiempos-nivel .icon-k7-receta-tpreparacion span",
        'cook_time': "div.recipe-info-tiempos-nivel .icon-k7-receta-tcocinar span",
        'difficulty': "div.recipe-info-tiempos-nivel .icon-k7-receta-tdificultad span",
    }

    general_info = {'source': link}
    for field, selector in selector_by_field.items():
        node = soup.select_one(selector)
        if node:
            general_info[field] = clean(node.get_text())
        else:
            general_info[field] = 'N/A'
    return general_info

In [5]:
def get_ingredients(soup) -> list:
    """Extract the cleaned ingredient lines of a recipe page.

    Looks for the ``<div id="ingredients-original">`` container and reads
    every ``<label class="receta-containercheck">`` inside it.

    Args:
        soup: parsed recipe page; only its ``find`` method is used.

    Returns:
        A list with one cleaned string per ingredient label. The list is
        empty when the container is missing, so the result always matches
        the declared ``list`` return type and callers can keep testing it
        for truthiness.
    """
    ing_cnt = soup.find('div', id='ingredients-original')
    if ing_cnt is None:  # page has no ingredients container
        return []

    ingredients_labels = ing_cnt.find_all('label', class_='receta-containercheck')
    return [clean(label.get_text(strip=True)) for label in ingredients_labels]

In [6]:
def get_preparation(soup) -> list:
    """Extract the cleaned preparation steps of a recipe page.

    Looks for the ``<div class="recipe-intro-data-pasos-normal">`` container
    and reads every ``<label class="receta-containercheck">`` inside it.

    Args:
        soup: parsed recipe page; only its ``find`` method is used.

    Returns:
        A list with one cleaned string per step label (the function never
        returned a dict, so the annotation now says ``list``). The list is
        empty when the container is missing; callers can keep testing the
        result for truthiness.
    """
    prep_cnt = soup.find('div', class_='recipe-intro-data-pasos-normal')
    if prep_cnt is None:  # page has no preparation container
        return []

    steps_labels = prep_cnt.find_all('label', class_='receta-containercheck')
    return [clean(step.get_text(strip=True)) for step in steps_labels]

In [7]:
def make_recipe_dict(name, info, ingred, prep) -> dict:
    """Bundle the pieces of one recipe into a single dictionary.

    Args:
        name: recipe name.
        info: general-information value stored under ``'info'``.
        ingred: ingredients value stored under ``'ingredients'``.
        prep: preparation value stored under ``'preparation'``.

    Returns:
        A new dict with the keys ``'name'``, ``'info'``, ``'ingredients'``
        and ``'preparation'``, in that order.
    """
    return dict(
        name=name,
        info=info,
        ingredients=ingred,
        preparation=prep,
    )

In [8]:
# function to process each file

def process_json(filepath, encode='utf-8') -> dict:
    """Turn one raw category file into a category dict with parsed recipes.

    The file is read as JSON and must provide the keys ``'cat'``, ``'link'``,
    ``'slug'`` and ``'children'``; every child must provide ``'name'``,
    ``'link'`` and ``'html'``.

    Args:
        filepath: path of the raw category JSON file.
        encode: text encoding used to open the file.

    Returns:
        A dict with the keys ``'name'``, ``'url'``, ``'slug'`` and
        ``'recipes'``. ``'recipes'`` is the list of recipe dicts built by
        ``make_recipe_dict``; children for which no ingredients or no
        preparation steps are found are skipped.

    Raises:
        KeyError: if one of the expected keys is missing from the file.
    """
    with open(filepath, 'r', encoding=encode) as f:
        data = json.load(f)

    # Category-level metadata.
    cat = {
        'name': data['cat'],
        'url': data['link'],
        'slug': data['slug'],
    }

    recipes = []
    for child in data['children']:
        rname = child['name']
        rlink = child['link']
        soup = BeautifulSoup(child['html'], bs_parser)

        info = get_general_info(soup, rlink)

        # A recipe without ingredients or without steps is incomplete: skip it.
        ingred = get_ingredients(soup)
        if not ingred:
            continue

        prep = get_preparation(soup)
        if not prep:
            continue

        recipes.append(make_recipe_dict(rname, info, ingred, prep))

    cat['recipes'] = recipes
    return cat

## Process files

In [9]:

# Make sure the output directory exists. Done once, before the loop, so the
# final write below also works when `files_name` is empty.
os.makedirs(out_path, exist_ok=True)

# Process every raw category file and collect the resulting category dicts.
all_categories = []

for file_name in files_name:
    filepath = f"{base_path}/{file_name}.json"

    print(f"working on file {file_name} ...")

    # Process the JSON file and append the category to the list
    category = process_json(filepath)
    all_categories.append(category)

print("Process completed.")

# Save all the categories in one JSON file
with open(out_path + "/all_categories.json", "w", encoding="utf-8") as f:
    json.dump(all_categories, f, ensure_ascii=False, indent=4)


working on file bebidas ...
working on file botanas ...
working on file carnes-y-aves ...
working on file comida-para-ninos ...
working on file desayunos ...
working on file ensaladas ...
working on file faciles ...
working on file guarniciones-de-exito ...
working on file guarniciones ...
working on file navidad ...
working on file panes ...
working on file pastas ...
working on file pescados-y-mariscos ...
working on file platos-fuertes ...
working on file postres-para-ninos ...
working on file postres ...
working on file recetas-a-la-parrilla ...
working on file salsas ...
working on file saludables ...
working on file sopas ...
Process completed.


In [17]:
# Re-open the saved all_categories.json and report statistics about it.
# Every figure is derived from the file contents, so the numbers describe
# the saved artifact itself rather than whatever is left in kernel memory.
with open(out_path + "/all_categories.json", "r", encoding="utf-8") as f:
    saved_text = f.read()

length_file = len(saved_text)
saved_categories = json.loads(saved_text)

# The ":," format spec adds thousands separators to the numbers.
print(f'Number of Categories: {len(saved_categories)}')
print(f'Number of Recipes: {sum(len(cat["recipes"]) for cat in saved_categories):,}')
print(f'Number of characters: {length_file:,}')

Number of Categories: 20
Number of Recipes: 1,397
Number of characters: 2,812,864
