In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementNotVisibleException as NotVisible
import pandas as pd
import numpy as np
import time
import re
import concurrent.futures

In [2]:
# Create the Chrome WebDriver that the scraping cells below pass to get_js_soup.
# Headless mode runs the browser without opening a visible window.
options = Options()
options.headless = True
# './chromedriver' is a relative path to the driver executable.
# NOTE(review): newer Selenium releases changed how the headless flag and the
# driver path are supplied - confirm against the installed Selenium version.
driver = webdriver.Chrome('./chromedriver',options=options)

In [3]:
# Render a page in the browser first so that content injected by JavaScript is part of the parsed HTML
def get_js_soup(url,driver):
    """Load `url` with `driver` and return the page body as a BeautifulSoup tree.

    Parameters:
        url: address handed to `driver.get`.
        driver: WebDriver-like object providing `get` and `execute_script`.

    Returns:
        BeautifulSoup object built from the inner HTML of `document.body`,
        parsed with 'html.parser'.
    """
    driver.get(url)
    # Ask the browser for the current DOM of <body> rather than the raw response
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body,'html.parser')

In [4]:
def scrape_categories(driver):
    """Scrape the recipe-category carousel on the allrecipes index page.

    Parameters:
        driver: WebDriver passed through to get_js_soup.

    Returns:
        DataFrame with columns 'category' (link text) and 'url' (link href),
        one row per carousel item; empty (with the same columns) when no
        carousel items are found.
    """
    url = 'https://www.allrecipes.com/recipes/'
    soup = get_js_soup(url, driver)

    # Collect plain dicts and build the frame once: DataFrame.append copied the
    # whole frame on every call and no longer exists in pandas 2.x.
    rows = []
    for category in soup.find_all('li', class_='carouselNav__listItem recipeCarousel__listItem'):
        name_tag = category.find('div', class_='carouselNav__linkText')
        link_tag = category.find('a', class_='carouselNav__link recipeCarousel__link')
        rows.append({'category': name_tag.text, 'url': link_tag['href']})

    # Passing columns keeps the schema stable even when rows is empty
    return pd.DataFrame(rows, columns=['category', 'url'])

In [5]:
def scrape_recipes(url, driver, num_pages, recipes):
    """Scrape recipe names and links from the listing pages of one category.

    Parameters:
        url: category listing URL; '?page=<i>' is appended for i in range(num_pages).
        driver: WebDriver passed through to get_js_soup.
        num_pages: number of listing pages to request.
        recipes: DataFrame of previously collected recipes.

    Returns:
        A DataFrame containing the rows of `recipes` followed by one row per
        scraped recipe ('recipe_name', 'recipe_url'), re-indexed from 0.
        `recipes` itself is returned when nothing was scraped.
    """
    # Collect rows in a list and concatenate once: DataFrame.append copied the
    # whole frame per row and no longer exists in pandas 2.x.
    rows = []
    for i in range(num_pages):
        # NOTE(review): numbering starts at page=0 - confirm the site does not
        # serve the same listing for page=0 and page=1.
        soup = get_js_soup(f'{url}?page={i}', driver)

        for recipe in soup.find_all('div', class_='component tout'):
            # Touts carrying a content flag are galleries of recipes; do not include these.
            if recipe.find('span', class_='tout__contentFlagText'):
                continue
            title_link = recipe.find('a', class_='tout__titleLink')
            rows.append({'recipe_name': title_link['title'],
                         'recipe_url': "https://www.allrecipes.com/" + title_link['href']})

    if not rows:
        return recipes
    return pd.concat([recipes, pd.DataFrame(rows)], ignore_index=True)

In [218]:
# Loop through each category and scrape recipes.
# Build the category table here so this cell does not depend on a variable
# left over from an earlier kernel session.
categories = scrape_categories(driver)
recipes = pd.DataFrame(columns=['recipe_name', 'recipe_url'])
# 40 listing pages are requested per category
for category_url in categories.url:
    recipes = scrape_recipes(category_url, driver, 40, recipes)

KeyboardInterrupt: 

In [6]:
# One-off export of the scraped recipe list; left commented out, presumably so
# that re-running the notebook does not overwrite the cached CSV.
#recipes.to_csv('recipes.csv')

# Reload the cached scrape, using the first CSV column as the index, and drop repeated rows:
# there are ~500 duplicate entries in the ~18000 recipes in recipes.csv
recipes = pd.read_csv('recipes.csv', index_col=0).drop_duplicates()

In [7]:
def scrape_ingredients(url):
    """Scrape the rating and ingredient list of one recipe page.

    Reads the module-level `driver` and `recipes`, and appends one dict per
    ingredient to the module-level list `recipes_dict` with the keys
    'recipe', 'url', 'rating', 'ingredient', 'quantity' and 'unit'.

    Parameters:
        url: recipe page URL; it must appear in `recipes.recipe_url`.

    Returns:
        None. Results are delivered through `recipes_dict`.

    Raises:
        IndexError: if `url` is not present in `recipes.recipe_url`.
    """
    soup = get_js_soup(url, driver)

    recipe_name = recipes.loc[recipes.recipe_url == url, 'recipe_name'].values[0]

    # Default to 0.0 so that pages with no rating section, an 'Unrated' rating,
    # or rating text without a number still get a defined value (previously
    # `rating` was left unbound when the rating section was missing).
    rating = 0.0
    rating_tag = soup.find('span', class_='review-star-text')
    if rating_tag and rating_tag.text != 'Rating: Unrated':
        # Ratings are written as "Rating: x.xx"; keep only the numeric value
        numbers = re.findall(r'\d+\.?\d*', rating_tag.text)
        if numbers:
            rating = float(numbers[0])

    for item in soup.find_all('li', class_='ingredients-item'):
        # All three fields live on the same checkbox <input>, so look it up once
        checkbox = item.find('input', class_='checkbox-list-input')
        recipes_dict.append({'recipe':recipe_name, 'url':url, 'rating':rating,
                             'ingredient':checkbox['value'],
                             'quantity':checkbox['data-init-quantity'],
                             'unit':checkbox['data-unit']})

In [19]:
# Load the ingredient rows cached in ingredients.csv and drop duplicate rows.
# NOTE(review): index_col mixes a column label ('recipe') with a column
# position (0) - confirm pandas resolves both as intended.
ingredients = pd.read_csv('ingredients.csv', index_col=['recipe', 0]).drop_duplicates()

In [47]:
# Work on one batch at a time: take the 4000:5000 slice of recipe URLs as a plain list.
# Requires `recipes` to be the frame that has a `recipe_url` column.
urls = list(recipes.recipe_url[4000:5000])

In [52]:
# Scrape ingredients for every recipe URL on a thread pool and time the run.
# scrape_ingredients appends its rows to the module-level recipes_dict.
recipes_dict = []
failed_urls = []
start = time.time()
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_url = {executor.submit(scrape_ingredients, url): url for url in list(recipes.recipe_url)}
    # executor.map() only re-raises a worker's exception when its result
    # iterator is consumed, so failures used to vanish silently. Inspect every
    # future instead and record which URLs failed and why.
    # NOTE(review): all workers drive the same module-level `driver` through
    # scrape_ingredients; a single browser session is unlikely to be safe for
    # concurrent use - consider one driver per thread. Confirm.
    for future in concurrent.futures.as_completed(future_to_url):
        error = future.exception()
        if error is not None:
            failed_urls.append((future_to_url[future], error))
end = time.time()
print(end-start)
if failed_urls:
    print(f'{len(failed_urls)} of {len(future_to_url)} URLs failed; first error: {failed_urls[0][1]!r}')

7.106196403503418


In [50]:
# Turn the per-ingredient dicts that scrape_ingredients appended to recipes_dict
# into a DataFrame (one row per dict).
asdf = pd.DataFrame(recipes_dict)

In [58]:
# Count the distinct values in the `recipe` column.
# NOTE(review): this needs `recipes` to have a `recipe` column, i.e. the frame
# read from 'ingredients2000-4000.csv' rather than the recipe_name/recipe_url
# frame - confirm which one is bound when this cell runs.
len(recipes.recipe.unique())

11227

In [53]:
# Rebind `recipes` to the ingredient rows saved in ingredients2000-4000.csv.
# NOTE(review): this replaces the recipe_name/recipe_url frame that
# scrape_ingredients looks recipes up in - confirm reusing the name is intended.
recipes = pd.read_csv('ingredients2000-4000.csv')

In [45]:
# Persist the scraped batch held in `asdf` so it survives a kernel restart.
asdf.to_csv('ingredients2000-4000.csv')

In [103]:
# Reset the shared accumulator that scrape_ingredients appends its rows to.
recipes_dict = []

In [48]:
# Sequential run: scrape each URL in `urls` one at a time and time the whole batch.
# scrape_ingredients appends its rows to the module-level recipes_dict, so the
# list is reset first.
recipes_dict = []
start = time.time()
# There is no error handling here: an exception from scrape_ingredients stops
# the loop, and rows gathered up to that point remain in recipes_dict.
for url in urls:
    scrape_ingredients(url)
end = time.time()
# Elapsed wall-clock seconds for the batch
print(end-start)

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: headless chrome=96.0.4664.45)


In [None]:
#ingredients.to_csv('ingredients.csv')

## Filtering by ingredients

Generating a random list of ingredients from the data set, to simulate a list of ingredients that a user may create

In [27]:
from random import sample

In [33]:
# Simulated user pantry: draw 20 distinct ingredient names at random, without replacement.
# sample() raises ValueError if there are fewer than 20 unique ingredients.
# No random seed is set here, so the selection differs between runs.
pantry = sample(list(ingredients.ingredient.unique()), 20)

## Providing Recommendations