In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import ElementNotVisibleException as NotVisible
import pandas as pd
import numpy as np
import time
import re
from random import sample

In [2]:
#create a webdriver object and enable headless browsing
options = Options()
options.headless = True
driver = webdriver.Chrome('./chromedriver',options=options)

In [3]:
def get_js_soup(url,driver):
    """ Uses webdriver object to execute javascript code and get dynamically loaded webcontent
        
        Args:
            url: a url for a webpage to be loaded by the webdriver
            driver: a webdriver object         
        
        Returns:
            A BeautifulSoup object containing the parsed html for the loaded webpage
    """
    driver.get(url)
    res_html = driver.execute_script('return document.body.innerHTML')
    soup = BeautifulSoup(res_html,'html.parser')
    return soup

In [4]:
def scrape_categories(driver):
    """ Gather all the categories of recipes on https://www.allrecipes.com/recipes/.
    
    Args:
        driver: The webdriver.
    
    Returns:
        A dataframe containing the name of each category and the url to all the recipes in that category.
        
    """
    categories = pd.DataFrame(columns=['category', 'url'])
    url = 'https://www.allrecipes.com/recipes/'
    soup = get_js_soup(url, driver)
    
    for category in soup.find_all('li', class_='carouselNav__listItem recipeCarousel__listItem'):
        category_name = category.find('div', class_='carouselNav__linkText').text
        category_url = category.find('a', class_='carouselNav__link recipeCarousel__link')['href']
        categories = categories.append({'category':category_name, 'url':category_url}, ignore_index=True)
    
    return categories

In [5]:
def scrape_recipes(url, driver, num_pages):
    """ Pull the names and urls for recipes on allrecipes.com
    
    Args:
        url: The url for each recipe category, which contains a list of recipes.
        driver: The webdriver.
        num_pages: The number of pages of each recipe categroy to scrape.
    
    Returns:
        An dataframe containing the name of each recipe scraped and its url.
        
    """
    recipes = pd.DataFrame(columns=['recipe_name', 'recipe_url'])
    
    for i in range(num_pages):
        soup = get_js_soup(f'{url}?page={i+1}', driver)

        for recipe in soup.find_all('div', class_='component tout'):
            
            # Check if the recipe is a Gallary of recipes. Do not include these.
            if not recipe.find('span', class_='tout__contentFlagText'):
                recipe_name = recipe.find('a', class_='tout__titleLink')['title']
                recipe_url = recipe.find('a', class_='tout__titleLink')['href']
                recipes = recipes.append({'recipe_name':recipe_name, 'recipe_url':"https://www.allrecipes.com/" + recipe_url}, ignore_index=True)
    
    return recipes

In [6]:
def scrape_ingredients(url):
    """ Scrapes the recipes webpage and pulls the recipes rating, ingredients, 
        the ingredient quantitiy, and the units for that quantity, e.g., grams or ounces.
        Appends these to the dictionary, ingredients_dict.
        
        Args:
            url: The url to a recipes webpage on allrecipes.com.

    """

    soup = get_js_soup(url, driver)   

    recipe_name = recipes.loc[recipes.recipe_url == url, 'recipe_name'].values[0]
    
    # If a rating section exists on the recipe page
    if soup.find('span', class_='review-star-text'):
        
        # Some recipes have a rating section but have 'Unrated' rating. If this is not the case then extract the rating
        if soup.find('span', class_='review-star-text').text != 'Rating: Unrated':
            rating = soup.find('span', class_='review-star-text').text

            # ratings are input as "Rating: x.xx". Remove all but numeric value and convert to float
            rating = float(re.findall(r'\d+\.?\d*', rating)[0])
        else:
            # If a rating doesn't exist, give it a value of 0.00
            rating = 0.0
    
    for ingredients in soup.find_all('li', class_='ingredients-item'):        
        ingredient = ingredients.find('input', class_='checkbox-list-input')['value']
        quantity = ingredients.find('input', class_='checkbox-list-input')['data-init-quantity']
        unit = ingredients.find('input', class_='checkbox-list-input')['data-unit']
        ingredients_dict.append({'recipe':recipe_name, 'url':url, 'rating':rating, 'ingredient':ingredient, 'quantity':quantity, 'unit':unit})

In [7]:
# gather the url for all categories
categories = scrape_categories(driver)

In [8]:
categories

Unnamed: 0,category,url
0,Appetizers and Snacks,https://www.allrecipes.com/recipes/76/appetize...
1,BBQ &amp; Grilling,https://www.allrecipes.com/recipes/88/bbq-gril...
2,Bread Recipes,https://www.allrecipes.com/recipes/156/bread/
3,Breakfast and Brunch,https://www.allrecipes.com/recipes/78/breakfas...
4,Desserts,https://www.allrecipes.com/recipes/79/desserts/
5,Dinner Recipes,https://www.allrecipes.com/recipes/17562/dinner/
6,Drinks,https://www.allrecipes.com/recipes/77/drinks/
7,Everyday Cooking,https://www.allrecipes.com/recipes/1642/everyd...
8,"Fruits, Vegetables and Other Produce",https://www.allrecipes.com/recipes/1116/fruits...
9,Holidays and Events,https://www.allrecipes.com/recipes/85/holidays...


In [9]:
# Loop through each category and scrape recipes
recipes = pd.DataFrame(columns=['recipe_name', 'recipe_url'])

for i in range(len(categories)):
    recipes = recipes.append(scrape_recipes(categories.url[i], driver, 2))
    
# Uncomment below line to save recipes to a csv file
# recipes.to_csv('recipes.csv')

In [10]:
recipes

Unnamed: 0,recipe_name,recipe_url
0,Herbed Pomegranate Salsa,https://www.allrecipes.com//recipe/38034/herbe...
1,Easy Guacamole,https://www.allrecipes.com//recipe/14064/easy-...
2,Chef John's Fresh Salmon Cakes,https://www.allrecipes.com//recipe/239541/chef...
3,Roasted Green Beans,https://www.allrecipes.com//recipe/229349/roas...
4,Balsamic Bruschetta,https://www.allrecipes.com//recipe/54165/balsa...
...,...,...
19,"Daddy Eddie's Roast Pork (Pernil), Puerto Rica...",https://www.allrecipes.com//recipe/254168/dadd...
20,Stuffed Cabbage Rolls,https://www.allrecipes.com//recipe/26661/stuff...
21,Sweet and Sour Sauce I,https://www.allrecipes.com//recipe/19670/sweet...
22,Egg Drop Soup (Better than Restaurant Quality!),https://www.allrecipes.com//recipe/115965/egg-...


In [7]:
# Read the saved recipes csv file to a dataframe. Dropping duplicates because there are ~500 duplicate entries in the ~18000 recipes in recipes.csv
recipes = pd.read_csv('recipes.csv', index_col=0).drop_duplicates()

In [52]:
# Run scrape recipes on each url in the recipes dataframe.
# This saves some time by performing this as a multithreaded process

# import concurrent.futures
# ingredients_dict = []
# start = time.time()
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     executor.map(scrape_ingredients, list(recipes.recipe_url))
# end = time.time()
# print(end-start)

7.106196403503418


In [27]:
# Run scrape recipes on each url in the recipes dataframe.
ingredients_dict = []
start = time.time()
for url in recipes.recipe_url[:5]:
    scrape_ingredients(url)
end = time.time()
print(end-start)

# Convert dictionary to a pandas dataframe 
scraped_ingredients = pd.DataFrame.from_dict(ingredients_dict)

12.172820568084717


### Ingredients table cleanup

In [13]:
# Import ingredients from file
ingredients = pd.read_csv('scraped_ingredients.csv', index_col=0).drop(['recipe_num'], axis=1).drop_duplicates()

##### Some ingredients have some additional information between parentheses, for example, "gochujang (Korean hot pepper paste)" and "lemon (for zesting)".  The following removes the parantheses and its contents.

In [15]:
ingredients.ingredient = ingredients.ingredient.str.replace('\(.*?\)', '', regex=True)

# Strip whitespace at beginning and end of ingredient and make all ingredients lower case
ingredients.ingredient = ingredients.ingredient.str.strip().str.lower()

##### Remove the word "sprig", "pinch", "jar", "stick", "bunch", and "jigger" from the start of ingredients. For example, change "pinch salt" to "salt" and "jigger lime juice" ro "lime juice".

In [16]:
ingredients.ingredient = ingredients.ingredient.str.replace('^(sprig|jar|pinch|stick|jigger|bunch) ', '', regex=True)

##### Some ingredients end with "cooked" or "raw". The follow removes these words from the ingredient name.

In [17]:
ingredients.ingredient = ingredients.ingredient.str.replace(', (raw|cooked)$', '', regex=True)

In [18]:
# Save to csv
ingredients.to_csv('recipe_ingredients.csv')

## Filtering by ingredients

Generating a random list of ingredients from the data set, to simulate a list of ingredients that a user may create

In [3]:
ingredients = pd.read_csv('recipe_ingredients.csv', index_col=0)

In [37]:
pantry = sample(list(ingredients.ingredient.unique()), 200)
pd.DataFrame(pantry, columns=['ingredients']).to_csv('pantry.csv')

In [43]:
pantry = pd.read_csv('pantry.csv', index_col=0)
pantry = pantry['ingredients'].to_list()

## Providing Recomendation

In [46]:
def get_recommendation(ingredients, pantry):
    """ Creates a dataframe of recipes that a user can make based on the ingredients in their pantry
    
    Args:
        ingredients: A dataframe of all recipes and ingredients scraped from allrecipes.com
        pantry: a list of ingredients from the user
    
    Returns:
        A dataframe containing the recipe name, rating, and url for recipes that a user can make
        based on the ingredients that the user provided. The recipes in the returned dataframe are
        sorted in descending order based on the recipe's rating.
    
    """
    ingredients = pd.read_csv('recipe_ingredients.csv', index_col=[0])
    
    # Filter ingredients dataframe to only include ingredients that are in the users pantry.
    # This will provide a list of recipes for which the user has some or all of the ingredients for.
    matched_recipes = ingredients[ingredients.isin({'ingredient': pantry}).ingredient]
    
    # Get the full ingredients list for all recipes for which the user has some of the ingredients
    full_matched_recipes = ingredients[ingredients['recipe'].isin(pd.unique(matched_recipes['recipe']))]
    
    # Compare the number of ingredients between the two dataframes. This will tell which recipes the user has all of the ingredients for.
    comparison = matched_recipes.groupby(['recipe']).nunique()['ingredient'] == full_matched_recipes.groupby(['recipe']).nunique()['ingredient']
    
    # Retreive all of the recipe names for which the user has all of the ingredients for
    user_recipes = matched_recipes.recipe.unique()[comparison]
    
    # Provide a list of recipes that the user can make ranked by the rating of the recipe
    recommendation = ingredients.loc[ingredients.recipe.isin(user_recipes)][['recipe', 'rating', 'url']].drop_duplicates().sort_values(by=['rating'], ascending=False)
    
    return recommendation

In [47]:
recommendation = get_recommendation(ingredients, pantry)

In [48]:
recommendation.head(20)

Unnamed: 0,recipe,rating,url
354,Dessert Crepes,4.8,https://www.allrecipes.com//recipe/19037/desse...
9954,Tom's Blackened Seasoning,4.7,https://www.allrecipes.com//recipe/64761/toms-...
282,Brie Cheese Appetizer,4.69,https://www.allrecipes.com//recipe/15015/brie-...
4880,Sarah's Knish,4.61,https://www.allrecipes.com//recipe/212834/sara...
6585,Habanero Salsa,4.6,https://www.allrecipes.com//recipe/38886/haban...
10445,Garlic and Herb Marinade,4.55,https://www.allrecipes.com//recipe/19777/garli...
14900,Steak Taco,4.5,https://www.allrecipes.com//recipe/240212/stea...
