# Crawling Kiwilimon


In this file, we'll be downloading and formatting many recipes from <a href="https://www.kiwilimon.com/" target="_blank">kiwilimon.com</a>. Second try, let's pray 🙏


In [1]:
# Reload imported modules automatically before executing each cell.
%load_ext autoreload
%autoreload 2
# Load the dotenv extension and run its magic
# (presumably reads environment variables from a local .env file — confirm).
%load_ext dotenv
%dotenv


In [2]:
# All we need to import

import asyncio
import json
import os
import time
from urllib.parse import urljoin, urlparse

import pandas as pd
from playwright.async_api import async_playwright


In [3]:
# Crawl configuration shared by the cells below.
host = "https://www.kiwilimon.com"  # site root used to build the index URL
data_path = "../data/"  # root folder for every artifact this notebook writes
base_dir = "../data/raw"  # destination of the downloaded recipe HTML pages
delay_seconds = 5  # length of the pause used to throttle the HTML crawl
bs_parser = "html.parser"  # not used in the visible cells (presumably a BeautifulSoup parser name — confirm)

# Create every output folder up front. The intermediate files (categories.txt,
# categories_links.json, recipes.csv) are written to `<data_path>/temporary`,
# so that folder must exist before the first `open(..., "w")` call.
for output_dir in (base_dir, os.path.join(data_path, "temporary")):
    os.makedirs(output_dir, exist_ok=True)


In [4]:
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=True)
page = await browser.new_page()


### Get category links


In [5]:
# Absolute XPath of the category anchors on the recipes index page.
# NOTE(review): a positional XPath like this presumably breaks whenever the
# page layout changes — confirm it still matches before re-running the crawl.
post_xpath = "/html/body/div[10]/div[3]/div[3]/div[1]/div/a"  # example of one matching node: /html/body/div[10]/div[3]/div[3]/div[1]/div[2]/a


In [6]:
categories_links = []  # post of each reciepe
page_url = page.url

while True:
    page_url = f"{host}/recetas"
    print(page_url)
    await page.goto(page_url)
    await page.wait_for_load_state()
    if page_url != page_url:
        break
    for elm in await page.locator("xpath=" + post_xpath).element_handles():
        post_url = urljoin(page_url, await elm.get_attribute("href"))
        categories_links.append(post_url)
    break

print(len(categories_links), len(set(categories_links)))


https://www.kiwilimon.com/recetas
20 20


In [7]:
# Display the collected category URLs for a quick visual check.
categories_links


['https://www.kiwilimon.com/preferencia/faciles',
 'https://www.kiwilimon.com/recetas/postres',
 'https://www.kiwilimon.com/recetas/carnes-y-aves',
 'https://www.kiwilimon.com/temporada/recetas-a-la-parrilla',
 'https://www.kiwilimon.com/recetas/ensaladas',
 'https://www.kiwilimon.com/recetas/guarniciones',
 'https://www.kiwilimon.com/temporada/navidad',
 'https://www.kiwilimon.com/recetas/pescados-y-mariscos',
 'https://www.kiwilimon.com/recetas/platos-fuertes',
 'https://www.kiwilimon.com/recetas/botanas',
 'https://www.kiwilimon.com/recetas/pastas',
 'https://www.kiwilimon.com/recetas/comida-para-ninos',
 'https://www.kiwilimon.com/recetas/postres-para-ninos',
 'https://www.kiwilimon.com/recetas/sopas',
 'https://www.kiwilimon.com/recetas/saludables',
 'https://www.kiwilimon.com/recetas/desayunos',
 'https://www.kiwilimon.com/recetas/panes',
 'https://www.kiwilimon.com/recetas/guarniciones-de-exito',
 'https://www.kiwilimon.com/recetas/salsas',
 'https://www.kiwilimon.com/recetas/be

Using the category links, extract each category slug and save the slugs in a txt file.


In [8]:
# Keep only the category slug: the last path segment of each category URL.
categories = [
    urlparse(category_url).path.rsplit("/", 1)[-1]
    for category_url in categories_links
]


In [9]:
# Persist the category slugs, one per line, in the temporary data folder.
categories_txt_path = os.path.join(data_path, "temporary", "categories.txt")
with open(categories_txt_path, "w") as f:
    f.writelines(f"{cat}\n" for cat in categories)


In [10]:
# Turn each category URL into a record with a display name ("cat"), a slug,
# the original link and an (initially empty) list of child recipes.
# The cell rebinds `categories_links`, so entries that are already records are
# passed through untouched: running the cell a second time is a no-op instead
# of failing on `dict.rfind`.
category_records = []
for entry in categories_links:
    if isinstance(entry, dict):
        category_records.append(entry)
        continue
    slug = entry.rsplit("/", 1)[-1]  # last path segment of the URL
    category_records.append(
        {
            "cat": slug.replace("-", " ").title(),
            "slug": slug,
            "link": entry,
            "children": [],
        }
    )
categories_links = category_records
categories_links[:5]


[{'cat': 'Faciles',
  'slug': 'faciles',
  'link': 'https://www.kiwilimon.com/preferencia/faciles',
  'children': []},
 {'cat': 'Postres',
  'slug': 'postres',
  'link': 'https://www.kiwilimon.com/recetas/postres',
  'children': []},
 {'cat': 'Carnes Y Aves',
  'slug': 'carnes-y-aves',
  'link': 'https://www.kiwilimon.com/recetas/carnes-y-aves',
  'children': []},
 {'cat': 'Recetas A La Parrilla',
  'slug': 'recetas-a-la-parrilla',
  'link': 'https://www.kiwilimon.com/temporada/recetas-a-la-parrilla',
  'children': []},
 {'cat': 'Ensaladas',
  'slug': 'ensaladas',
  'link': 'https://www.kiwilimon.com/recetas/ensaladas',
  'children': []}]

## Get recipe links from each category


In [11]:
# XPath union matching the recipe anchors on a category listing page: anchors
# sitting directly under a grid cell, or nested one <div> deeper.
# NOTE(review): this rebinds `post_xpath`, which the category-index cell also
# reads, so re-running that earlier cell after this one would use this value.
post_xpath = "/html/body/div[10]/div[7]/div[2]/div/a | /html/body/div[10]/div[7]/div[2]/div/div/a"


In [14]:
# Get many links from each categorie

for idx, cat in enumerate(categories_links):
    page_url = cat["link"]
    print(page_url)
    await page.goto(page_url)
    await page.wait_for_load_state()
    if page_url != page_url:
        break

    # load more recipes clicking the "ver más" button
    for _ in range(6):
        ver_mas_button = page.locator('div:text("Ver más")')

        # validate if the button is visible
        is_visible = await ver_mas_button.is_visible()
        if not (
            await ver_mas_button.count() and is_visible
        ):  # avoiding error when there's no more "Ver más" buttons
            break
        await ver_mas_button.click()
        await page.wait_for_load_state()

    # load all the recipe links
    print("loading...")
    for elm in await page.locator("xpath=" + post_xpath).element_handles():
        post_url = urljoin(page_url, await elm.get_attribute("href"))
        # print(cat[idx]['cat'], '-', post_url)
        name = " ".join(post_url[post_url.rfind("/") + 1 :].split("-")).title()
        categories_links[idx]["children"].append({"name": name, "link": post_url})

    print(
        len(categories_links[idx]["children"]),
        f"recipes added to '{categories_links[idx]['cat']}'.",
    )

print("\nDone!")


https://www.kiwilimon.com/preferencia/faciles
loading...
104 recipes added to 'Faciles'.
https://www.kiwilimon.com/recetas/postres
loading...
98 recipes added to 'Postres'.
https://www.kiwilimon.com/recetas/carnes-y-aves
loading...
98 recipes added to 'Carnes Y Aves'.
https://www.kiwilimon.com/temporada/recetas-a-la-parrilla
loading...
98 recipes added to 'Recetas A La Parrilla'.
https://www.kiwilimon.com/recetas/ensaladas
loading...
98 recipes added to 'Ensaladas'.
https://www.kiwilimon.com/recetas/guarniciones
loading...
98 recipes added to 'Guarniciones'.
https://www.kiwilimon.com/temporada/navidad
loading...
98 recipes added to 'Navidad'.
https://www.kiwilimon.com/recetas/pescados-y-mariscos
loading...
98 recipes added to 'Pescados Y Mariscos'.
https://www.kiwilimon.com/recetas/platos-fuertes
loading...
84 recipes added to 'Platos Fuertes'.
https://www.kiwilimon.com/recetas/botanas
loading...
98 recipes added to 'Botanas'.
https://www.kiwilimon.com/recetas/pastas
loading...
98 reci

In [16]:
# Save the category records (with their recipe links) as pretty-printed JSON.
# `json` comes from the notebook's import cell.
categories_links_path = os.path.join(data_path, "temporary", "categories_links.json")
with open(categories_links_path, "w", encoding="utf-8") as f:
    json.dump(categories_links, f, indent=2)


# Clean the data before crawling


In [17]:
# Reload the category records from disk. They are grouped by category, so they
# are flattened below into one record per (category, recipe) pair that can be
# loaded straight into a dataframe.
with open(os.path.join(data_path, "temporary", "categories_links.json")) as f:
    categories_links = json.load(f)

# One flat record per recipe, carrying the category it was listed under.
recipes = [
    {
        "category": cat["cat"],
        "category_slug": cat["slug"],
        "recipe": rec["name"],
        "link": rec["link"],
    }
    for cat in categories_links
    for rec in cat["children"]
]

print(len(recipes), "recipes loaded.")


1784 recipes loaded.


In [18]:
# Load the flattened recipe records into a dataframe, one row per
# (category, recipe) pair. `pd` comes from the notebook's import cell.
df = pd.DataFrame(recipes)
df.head()


Unnamed: 0,category,category_slug,recipe,link
0,Faciles,faciles,Horchata De Pumpkin Spice,https://www.kiwilimon.com/receta/bebidas/sin-a...
1,Faciles,faciles,Tostadas De Rajas Con Carne,https://www.kiwilimon.com/receta/platos-fuerte...
2,Faciles,faciles,Rollos De Ensalada De Pollo Thai,https://www.kiwilimon.com/receta/platos-fuerte...
3,Faciles,faciles,Espresso Tonic,https://www.kiwilimon.com/receta/bebidas/espre...
4,Faciles,faciles,Atole De Pumpkin Spice,https://www.kiwilimon.com/receta/bebidas/bebid...


In [21]:
# The recipe slug is everything after the last "/" of the link; any "-" inside
# the slug is kept as-is.
df["recipe_slug"] = df["link"].str.rsplit("/", n=1).str[-1]

df.head()


Unnamed: 0,category,category_slug,recipe,link,recipe_slug
0,Faciles,faciles,Horchata De Pumpkin Spice,https://www.kiwilimon.com/receta/bebidas/sin-a...,horchata-de-pumpkin-spice
1,Faciles,faciles,Tostadas De Rajas Con Carne,https://www.kiwilimon.com/receta/platos-fuerte...,tostadas-de-rajas-con-carne
2,Faciles,faciles,Rollos De Ensalada De Pollo Thai,https://www.kiwilimon.com/receta/platos-fuerte...,rollos-de-ensalada-de-pollo-thai
3,Faciles,faciles,Espresso Tonic,https://www.kiwilimon.com/receta/bebidas/espre...,espresso-tonic
4,Faciles,faciles,Atole De Pumpkin Spice,https://www.kiwilimon.com/receta/bebidas/bebid...,atole-de-pumpkin-spice


Because we have repeated links, we need to clean them up, merging their categories, and save the result in a csv file.


In [22]:
# Write the recipes table to the temporary folder, without the index column.
recipes_csv_path = os.path.join(data_path, "temporary", "recipes.csv")
df.to_csv(recipes_csv_path, index=False)


## Get the HTML for Each post

The HTML files will be saved in the `data/raw` folder.


In [24]:
# Read the recipes table back from the csv file written earlier.
recipes_csv_path = os.path.join(data_path, "temporary", "recipes.csv")
df_recipes = pd.read_csv(recipes_csv_path)


In [25]:
# Keep only the first row for each link, so every recipe page is fetched once.
first_occurrence = ~df_recipes.duplicated(subset=["link"], keep="first")
df_recipes = df_recipes[first_occurrence]


In [26]:
# TODO: Save the data into files and not into a whole json file
# See the example in the pathway_indexer to scrap the page easier and faster
count = 1
for idx, row in df_recipes.iterrows():
    print("Crawling:", row["link"])
    if count % 50 == 0:
        time.sleep(delay_seconds)
    await page.goto(row["link"])
    await page.wait_for_load_state()
    html = await page.content()
    with open(
        os.path.join(base_dir, f"{row['recipe_slug']}.html"), "w", encoding="utf-8"
    ) as f:
        f.write(html)

    count += 1


print("Done!")


Crawling: https://www.kiwilimon.com/receta/bebidas/sin-alcohol/aguas-frescas/horchata-de-pumpkin-spice
Crawling: https://www.kiwilimon.com/receta/platos-fuertes/mexicanos/tostadas-de-rajas-con-carne
Crawling: https://www.kiwilimon.com/receta/platos-fuertes/pollo/rollos-de-ensalada-de-pollo-thai
Crawling: https://www.kiwilimon.com/receta/bebidas/espresso-tonic
Crawling: https://www.kiwilimon.com/receta/bebidas/bebidas-calientes/atole/atole-de-pumpkin-spice
Crawling: https://www.kiwilimon.com/receta/postres/postres-frios/helado/helado-de-nogada
Crawling: https://www.kiwilimon.com/receta/sopas/cremas/crema-de-flor-de-calabaza-con-elote
Crawling: https://www.kiwilimon.com/receta/bebidas/cocteles/margarita-de-chile-serrano
Crawling: https://www.kiwilimon.com/receta/botanas/botanitas-mexicanas/chicharron-de-rib-eye-con-guacamole
Crawling: https://www.kiwilimon.com/receta/postres/pays/pays-de-manzana/receta-de-pay-de-manzana
Crawling: https://www.kiwilimon.com/receta/platos-fuertes/mexicanos/