In [204]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementNotVisibleException as NotVisible
import pandas as pd
import numpy as np
import time
import re
import concurrent.futures

In [205]:
#create a webdriver object and set options for headless browsing
options = Options()
options.headless = True
driver = webdriver.Chrome('./chromedriver',options=options)

In [206]:
#uses webdriver object to execute javascript code and get dynamically loaded webcontent
def get_js_soup(url,driver):
    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser') #beautiful soup object to be used for parsing html content
    return soup

In [207]:
def scrape_categories(driver):
    categories = pd.DataFrame(columns=['category', 'url'])
    url = 'https://www.allrecipes.com/recipes/'
    soup = get_js_soup(url, driver)
    for category in soup.find_all('li', class_='carouselNav__listItem recipeCarousel__listItem'):
        category_name = category.find('div', class_='carouselNav__linkText').text
        category_url = category.find('a', class_='carouselNav__link recipeCarousel__link')['href']
        categories = categories.append({'category':category_name, 'url':category_url}, ignore_index=True)
    return categories

In [208]:
def scrape_recipes(url, driver, num_pages, recipes):     
    for i in range(num_pages):
        soup = get_js_soup(f'{url}?page={i}', driver)

        for recipe in soup.find_all('div', class_='component tout'):
            # Check if the recipe is a Gallary of recipes. Do not include these.
            if not recipe.find('span', class_='tout__contentFlagText'):
                recipe_name = recipe.find('a', class_='tout__titleLink')['title']
                recipe_url = recipe.find('a', class_='tout__titleLink')['href']
                recipes = recipes.append({'recipe_name':recipe_name, 'recipe_url':"https://www.allrecipes.com/" + recipe_url}, ignore_index=True)
    
    return recipes

In [218]:
# Loop through each category and scrape recipes
recipes = pd.DataFrame(columns=['recipe_name', 'recipe_url'])
for i in range(len(categories)):
    recipes = scrape_recipes(categories.url[i], driver, 40, recipes)

KeyboardInterrupt: 

In [174]:
recipes.to_csv('recipes.csv')

In [212]:
# There are ~500 duplicate entries in the ~18000 recipes in recipes.csv
recipes = pd.read_csv('recipes.csv', index_col=0).drop_duplicates()

In [210]:
recipes['rating'], recipes['ingredient'], recipes['quantity'], recipes['unit'] = np.nan, np.nan, np.nan, np.nan

In [211]:
recipes

Unnamed: 0,recipe_name,recipe_url,rating,ingredient,quantity,unit
0,Herbed Pomegranate Salsa,https://www.allrecipes.com//recipe/38034/herbe...,,,,
1,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...,,,,
2,Superb Sauteed Mushrooms,https://www.allrecipes.com//recipe/222795/supe...,,,,
3,Easy Apple Strudel,https://www.allrecipes.com//recipe/47821/easy-...,,,,
4,Seven Layer Taco Dip,https://www.allrecipes.com//recipe/19673/seven...,,,,
...,...,...,...,...,...,...
18211,Black Bean and Rice Enchiladas,https://www.allrecipes.com//recipe/222598/blac...,,,,
18213,Real Chiles Rellenos,https://www.allrecipes.com//recipe/214088/real...,,,,
18215,Slovak Stuffed Cabbage,https://www.allrecipes.com//recipe/14597/slova...,,,,
18219,Authentic Mexican Picadillo,https://www.allrecipes.com//recipe/267628/auth...,,,,


In [249]:
ingredients = ingredients = pd.DataFrame(columns=['recipe', 'url', 'rating', 'ingredient', 'quantity', 'unit'])

In [262]:
ingredients = pd.DataFrame()

In [229]:
def scrape_ingredients(url, driver):
    soup = get_js_soup(url, driver)   
    
    recipe_name = recipes.loc[recipes.recipe_url == url, 'recipe_name'].values[0]
    
    # If a rating section exists on the recipe page
    if soup.find('span', class_='review-star-text'):
        
        # Some recipes have a rating section but have 'Unrated' rating. If this is not the case then extract the rating
        if soup.find('span', class_='review-star-text').text != 'Rating: Unrated':
            rating = soup.find('span', class_='review-star-text').text

            # ratings are input as "Rating: x.xx". Remove all but numeric value and convert to float
            rating = float(re.findall(r'\d+\.?\d*', rating)[0])
        else:
            # If a rating doesn't exist, give it a value of 0.00
            rating = 0.0
    
    for ingredients in soup.find_all('li', class_='ingredients-item'):        
        
        ingredient = ingredients.find('input', class_='checkbox-list-input')['value']        
        quantity = ingredients.find('input', class_='checkbox-list-input')['data-init-quantity']
        unit = ingredients.find('input', class_='checkbox-list-input')['data-unit']
        
        # Add data to recipes dataframe
        #ingredients.loc[recipe_num, ['recipe', 'url', 'rating', 'ingredient', 'quantity', 'unit']] = recipe_name, url, rating, ingredient, quantity, unit
        ingredients = pd.DataFrame([recipe_name, url, rating, ingredient, quanitity])
        ingredients = ingredients.append({'recipe':recipe_name, 'url':url, 'rating':rating, 'ingredient':ingredient, 'quantity':quantity, 'unit':unit}, ignore_index=True)

In [287]:
def scrape_ingredients(url, driver, ingredients):
    #Get the index from recipes dataframe that matches the url
    
    #index = pd.MultiIndex.from_product([[recipe_num], np.arange(0)], names=['recipe_num', 'ingredient_num'])
    #ingredients_df = pd.DataFrame(columns=['rating', 'ingredient', 'quantity', 'unit'], index=index)
    ingredients_df = pd.DataFrame(columns=['rating', 'ingredient', 'quantity', 'unit'])
    soup = get_js_soup(url, driver)   
    
    # If a rating section exists on the recipe page
    if soup.find('span', class_='review-star-text'):
        
        # Some recipes have a rating section but have 'Unrated' rating. If this is not the case then extract the rating
        if soup.find('span', class_='review-star-text').text != 'Rating: Unrated':
            rating = soup.find('span', class_='review-star-text').text

            # ratings are input as "Rating: x.xx". Remove all but numeric value and convert to float
            rating = float(re.findall(r'\d+\.?\d*', rating)[0])
        else:
            # If a rating doesn't exist, give it a value of 0.00
            rating = 0.0
    
    for ingredients in soup.find_all('li', class_='ingredients-item'):        
        ingredient = ingredients.find('input', class_='checkbox-list-input')['value']
        quantity = ingredients.find('input', class_='checkbox-list-input')['data-init-quantity']
        unit = ingredients.find('input', class_='checkbox-list-input')['data-unit']
        ingredients_df = ingredients_df.append({'rating':rating, 'ingredient':ingredient, 'quantity':quantity, 'unit':unit}, ignore_index=True)
    
    # Reindex as a multi index with the recipe as an indexer
    #recipe_num = recipes.index[recipes.recipe_url == asdf.index[0][0]][0]
    #index = pd.MultiIndex.from_product([[recipe_num], np.arange(len(ingredients_df))], names=['recipe_num', 'ingredient_num'])
    #ingredients_df = ingredients_df.set_index(index)
    
    #ingredients = pd.concat([ingredients, ingredients_df])
    
    ingredients_df['recipe_name'] = recipes.loc[recipes.recipe_url == url, 'recipe_name'].values[0]
    ingredients_df['url'] = url
    
    ingredients = ingredients.append(ingredients_df)
    
    return ingredients_df

In [281]:
asdf = scrape_ingredients(recipes.recipe_url[1], driver)

In [288]:
scrape_ingredients(recipes.recipe_url[1], driver, ingredients)

  new_child.parent = self
  new_child.previous_sibling = previous_child


AttributeError: 'DataFrame' object has no attribute '_last_descendant'

In [284]:
ingredients=ingredients.append(asdf)

In [285]:
ingredients

Unnamed: 0,rating,ingredient,quantity,unit,recipe_name,url
0,4.73,(1/2 pound) deer meat,1.0,pound,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
1,4.73,soy sauce,4.0,tablespoons,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
2,4.73,(10 fluid ounce) bottle Worcestershire sauce,4.0,tablespoons,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
3,4.73,liquid smoke,2.0,tablespoons,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
4,4.73,(28 ounce) bottle ketchup,1.0,tablespoon,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
5,4.73,black pepper,0.25,teaspoon,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
6,4.73,pinch garlic powder,0.25,teaspoon,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
7,4.73,onion salt,0.25,teaspoon,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
8,4.73,pinch salt,0.5,teaspoon,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...
0,4.73,(1/2 pound) deer meat,1.0,pound,Deer Jerky,https://www.allrecipes.com//recipe/46324/deer-...


In [275]:
scrape_ingredients(recipes.recipe_url[1], driver)

  new_child.parent = self
  new_child.previous_sibling = previous_child


AttributeError: 'DataFrame' object has no attribute '_last_descendant'

In [267]:
ingredients.append(scrape_ingredients(recipes.recipe_url[1], driver))

  new_child.parent = self
  new_child.previous_sibling = previous_child


AttributeError: 'DataFrame' object has no attribute '_last_descendant'

In [253]:
ingredients = pd.DataFrame()

In [None]:
index = pd.MultiIndex.from_product([[0], np.arange(0)], names=['recipe_num', 'ingredient_num'])
ingredients = pd.DataFrame(columns=['rating', 'ingredient', 'quantity', 'unit'], index=index)

# Loop through all recipes and get ingredients
for i in range(2000):
    temp_ingredients = scrape_ingredients(recipes.recipe_url[i], driver)

    # Reindex as a multi index with the recipe as an indexer
    index = pd.MultiIndex.from_product([[i], np.arange(len(temp_ingredients))], names=['recipe_num', 'ingredient_num'])
    temp_ingredients = temp_ingredients.set_index(index)

    ingredients = pd.concat([ingredients, temp_ingredients])


In [None]:
def scrape(url):
    temp_ingredients = scrape_ingredients(url, driver)
    # Reindex as a multi index with the recipe as an indexer
    index = pd.MultiIndex.from_product([[url], np.arange(len(temp_ingredients))], names=['url', 'ingredient_num'])
    temp_ingredients = temp_ingredients.set_index(index)

    ingredients = pd.concat([ingredients, temp_ingredients])

In [20]:
urls = recipes.recipe_url[0:100]

In [None]:
with concurrent.futures.ProcessPoolExecutor() as executor:
    f = executor.map(scrape_ingredients)

In [None]:
ingredients.to_csv('ingredients.csv')

## Filtering by ingredients

## Prividing Recomendation