In [34]:
from urllib.parse import  urljoin
from bs4 import BeautifulSoup
import requests
import markdownify as md
import re

In [33]:
base_url = "https://www.archanaskitchen.com/"
no_pages = 337

In [5]:
def getText(element):
    if element:
        return element.text.strip()
    else:
        None

In [31]:
class NoContentException(Exception):
    pass

In [77]:
def extract_recipe_data(soup):
    data = {}
    header = soup.select_one(".recipe-header")
    if not header:
        raise NoContentException
    data["title"] = getText(header.select_one(".recipe-title"))
    data["shortdescription"] = getText(header.select_one("p"))
    data["author"] = getText(soup.find(attrs={"itemprop":"author"}))
    data["created"] = soup.select_one(".itemDateCreated").attrs["content"]
    data["image"] = urljoin(base_url,soup.select_one(".recipe-image").find("img").attrs["src"])

    html = " ".join([str(x) for x in soup.select_one(".recipedescription").contents]).strip()
    data["description"] = md.markdownify(html, heading_style="ATX")

    table = soup.select_one(".cuisineandcourse")
    if table:
        cusine = table.select_one(".cuisine")
        if cusine:
            data["cuisine"] = cusine.find(attrs={"itemprop":"recipeCuisine"}).text.lower().replace("recipes","").strip()
        course = table.select_one(".course")
        if course:
            data["course"] = course.find(attrs={"itemprop":"keywords"}).text.strip().lower()
        diet = table.select_one(".diet")
        if diet:
            data["diet"] = diet.find(attrs={"itemprop":"keywords"}).text.strip().lower()
        equipments = table.select_one(".products")
        data["equipments"] = None
        if equipments:
            data["equipments"] = [x.text.strip().lower() for x in equipments.select("a")]

    table = soup.select_one(".RecipeServesTime")
    if table:
        data["prepTime"] = re.findall(r'\d+', table.find(attrs={"itemprop":"prepTime"}).text)[0]
        data["cookTime"] = re.findall(r'\d+', table.find(attrs={"itemprop":"cookTime"}).text)[0]
        data["totalTime"] = re.findall(r'\d+', table.find(attrs={"itemprop":"totalTime"}).text)[0]
        serving = table.find(attrs={"itemprop":"recipeYield"})
        data["servings"] = None
        if serving:
            data["servings"] = re.findall(r'\d+', serving.text)[0]
    ingredients = soup.select_one(".recipeingredients")
    data["ingredients"]=None
    if ingredients:
        data["ingredients"] = [" ".join(x.text.split()) for x in ingredients.find_all("li",attrs={"itemprop":"ingredients"})]

    html = " ".join([str(x) for x in soup.select_one(".recipeinstructions").contents]).strip()
    data["instructions"] = md.markdownify(html, heading_style="ATX")

    return data

In [78]:
import threading

In [36]:
lock = threading.Lock()

soups = {}
recipies = {}

In [79]:
def append_error_url(url,e):
    global error_urls
    with lock:
        error_urls.append((url,e))

def store_soup(url,soup):
    global soups
    with lock:
        soups[url] = soup

def get_soup_from_cache(url):
    global soups
    with lock:
        try:
            return soups[url]
        except KeyError:
            return None

def store_recipie(url,recipie):
    global recipies
    with lock:
        recipies[url] = recipie

In [80]:
def extract_pages(start:int,end:int):
    for i in range(start,end):
        page_url = urljoin(base_url,f"/recipes/page-{i}")
        response = requests.get(page_url)
        s = BeautifulSoup(response.text)
        for card in s.select_one("#ak_recipe_categoryblog").select(".blogRecipe"):
            url = urljoin(base_url,card.find("a").attrs["href"])
            soup = get_soup_from_cache(url)
            if soup is None:
                response = requests.get(url)
                soup = BeautifulSoup(response.text)
                store_soup(url,soup)
            print(f"{i}: {url}")
            try:
                recipie= extract_recipe_data(soup) 
                store_recipie(url,recipie)
            except Exception as e:
                append_error_url(url,e)

In [None]:
import os
max_threads = os.cpu_count()-1

threads = []
error_urls = []

for i in range(no_pages//max_threads+1):
    args = (i*max_threads+1,min((i+1)*max_threads,no_pages+1))
    thread = threading.Thread(target=extract_pages, args=args)
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

In [82]:
len(recipies)

7525

In [89]:
import json
with open("data.json","w") as fh:
    json.dump(recipies,fh,indent=4)

In [87]:
error_urls

[('https://www.archanaskitchen.com/carrot-capsicum-mushroom-sweet-corn-tawa-sabzi-recipe',
  __main__.NoContentException()),
 ('https://www.archanaskitchen.com/open-toast-macaroni-recipe',
  __main__.NoContentException())]

In [90]:
# import dill

# with open("soups.pik","wb") as fh:
#     dill.dump(soups,fh)

In [None]:
# with open("soups.pik","rb") as fh:
#     soups = dill.load(fh)