In [110]:
# Import Dependencies
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [111]:
# Function for Scraping Data (plus the small private helpers it relies on)
def _clean_text(element, default, keep_breaks=True):
    """Return the stripped text of `element`, or `default` when it is None.

    When `keep_breaks` is true, newlines are encoded as '<br>' so that the
    line structure survives until the DataFrame post-processing step.
    """
    if element is None:
        return default
    text = element.text.strip()
    return text.replace('\n', '<br>') if keep_breaks else text


def _sibling_chain(first):
    """Return [first, first's next sibling tag, the one after that, ...].

    Walks `find_next_sibling()` exactly once per element, so callers can
    index into the result instead of re-walking the chain for every field.
    Returns an empty list when `first` is None.
    """
    chain = []
    node = first
    while node is not None:
        chain.append(node)
        node = node.find_next_sibling()
    return chain


def _section_rows(table, section):
    """Return the first <tr> of a table section and all its following sibling tags.

    `section` is the tag name of the section ('thead', 'tbody' or 'tfoot').
    Any missing piece (table, section, first row) yields an empty list.
    """
    part = table.find(section) if table is not None else None
    first_row = part.find('tr') if part is not None else None
    return _sibling_chain(first_row)


def _cell_text(rows, index, default, skip=0, second_cell=False):
    """Return the text of a cell in `rows[index]` with the first `skip` characters removed.

    Reads the row's first <td>, or that cell's next sibling when
    `second_cell` is true. Returns `default` if the row or cell is missing.
    """
    if index >= len(rows):
        return default
    cell = rows[index].find('td')
    if cell is not None and second_cell:
        cell = cell.find_next_sibling()
    if cell is None:
        return default
    return cell.text[skip:]


def scrape_recipe(url, timeout=30):
    """Download one recipe page and extract its fields.

    Parameters
    ----------
    url : str
        Address of the recipe page.
    timeout : float or tuple, optional
        Passed to `requests.get` so a stalled connection cannot block the
        crawl forever. Defaults to 30 seconds.

    Returns
    -------
    tuple
        A 23-tuple of strings, in this order: title, ratings, description,
        ingredients, recipe_facts, directions, nutrition_facts, tags,
        number_of_servings, calories, tot_fat, sat_fat, cholesterol, sodium,
        tot_carb, diet_fib, tot_sugar, protein, vit_c, calcium, iron,
        potassium, image_link. Any field whose markup is missing is replaced
        by its "... not found" placeholder instead of raising.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (connection error or timeout).
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # Free-text sections: newlines are kept as '<br>' markers (except ratings).
    ingredients = _clean_text(
        soup.find("div", class_="loc section-content section__content"),
        "Ingredients not found")
    description = _clean_text(
        soup.find("h2", class_="heading__subtitle"),
        "Description not found")
    recipe_facts = _clean_text(
        soup.find("div", id="project-meta_1-0", class_="comp project-meta"),
        "Recipe facts not found")
    directions = _clean_text(
        soup.find("div", id="structured-project__steps_1-0", class_="comp text-passage structured-content structured-project__steps mntl-sc-page mntl-block"),
        "Directions not found")
    nutrition_facts = _clean_text(
        soup.find("tbody", class_="nutrition-info__table--body"),
        "Nutrition facts not found")
    ratings = _clean_text(
        soup.find("p", id="recipe-rating_1-0", class_="comp recipe-rating text-block"),
        "Ratings not found",
        keep_breaks=False)
    title = _clean_text(
        soup.find("h1", class_="heading__title"),
        "Title not found")

    # Tags: the text of every link in the tag list, comma-separated.
    tag_list = soup.find("ul", id="link-list_1-0", class_="comp tag-nav__list link-list")
    if tag_list is not None:
        tags = ', '.join(link.text.strip() for link in tag_list.find_all("a"))
    else:
        tags = "Tags not found"

    # Nutrition label: each section's rows are collected once and then
    # addressed by position. A missing label/table/section gives empty lists,
    # so every lookup below falls back to its placeholder.
    nutrition_label = soup.find("div", class_="nutrition-label")
    table = nutrition_label.find('table') if nutrition_label is not None else None
    head_rows = _section_rows(table, 'thead')
    body_rows = _section_rows(table, 'tbody')
    foot_rows = _section_rows(table, 'tfoot')

    # Servings come from the whole text of the second header row.
    # NOTE(review): the slice offsets below presumably strip each row's leading
    # label (e.g. 10 characters for "Total Fat ") - confirm against live markup.
    if len(head_rows) > 1:
        number_of_servings = head_rows[1].text[10:]
    else:
        number_of_servings = "Number of servings not found"

    # Body rows: index 1 holds calories (value in the second cell); index 2 is
    # not read; indices 3-10 hold one nutrient each in their first cell.
    calories = _cell_text(body_rows, 1, "Calories not found", second_cell=True)
    tot_fat = _cell_text(body_rows, 3, "Total Fats not found", skip=10)
    sat_fat = _cell_text(body_rows, 4, "Saturated Fats not found", skip=14)
    cholesterol = _cell_text(body_rows, 5, "Cholesterol not found", skip=12)
    sodium = _cell_text(body_rows, 6, "Sodium not found", skip=7)
    tot_carb = _cell_text(body_rows, 7, "Total Carbohydrates not found", skip=19)
    diet_fib = _cell_text(body_rows, 8, "Dietary Fibers not found", skip=14)
    tot_sugar = _cell_text(body_rows, 9, "Total Sugars not found", skip=13)
    protein = _cell_text(body_rows, 10, "Protein not found", skip=7)

    # Footer rows: one micronutrient per row, in this fixed order.
    vit_c = _cell_text(foot_rows, 0, "Vitamin C not found", skip=10)
    calcium = _cell_text(foot_rows, 1, "Calcium not found", skip=8)
    iron = _cell_text(foot_rows, 2, "Iron not found", skip=5)
    potassium = _cell_text(foot_rows, 3, "Potassium not found", skip=10)

    # Image link: prefer 'src', fall back to 'data-src'. A missing <img> or a
    # missing attribute yields the placeholder instead of an exception.
    image_data = soup.find('div', class_='img-placeholder')
    img = image_data.find('img') if image_data is not None else None
    if img is None:
        image_link = "Image not found"
    else:
        image_link = img.get('src')
        if image_link is None:
            image_link = img.get('data-src', "Image not found")

    return title, ratings, description, ingredients, recipe_facts, directions, nutrition_facts, tags, number_of_servings, calories, tot_fat, sat_fat, cholesterol, sodium, tot_carb, diet_fib, tot_sugar, protein, vit_c, calcium, iron, potassium, image_link

In [112]:
# Read the CSV of pages to scrape and collect the values of its 'loc' column
url_df = pd.read_csv('SeriousEatsCSV.csv')
urls = list(url_df['loc'])

In [113]:
# Scrape every URL and store one row per recipe.
# scrape_recipe returns tags at position 7; in the stored row it is moved
# to sit right after the description, ahead of the long text sections.
results = []
for url in urls:
    scraped = scrape_recipe(url)
    lead = scraped[:3]        # title, ratings, description
    sections = scraped[3:7]   # ingredients, recipe_facts, directions, nutrition_facts
    tags = scraped[7]
    rest = scraped[8:]        # number_of_servings ... image_link
    results.append([*lead, tags, *sections, *rest])

KeyboardInterrupt: 

In [None]:
# Build the DataFrame from the scraped rows, then flatten line breaks:
# '<br>' markers become newlines, runs of newlines collapse to one,
# and the remaining newlines are turned into single spaces.
column_names = [
    "Title", "Rating", "Description", "Tags", "Ingredients", "Recipe Facts",
    "Directions", "Nutrition Facts", 'Number of Servings',
    'Calories', 'Total Fat', 'Saturated Fat', 'Cholesterol', 'Sodium',
    'Total Carbohydrate', 'Dietary Fiber', 'Total Sugars', 'Protein',
    'Vitamin C', 'Calcium', 'Iron', 'Potassium', 'Image Link',
]
df = (
    pd.DataFrame(results, columns=column_names)
    .replace({'<br>': '\n'}, regex=True)
    .replace({'\n+': '\n'}, regex=True)
    .replace({'\n+': ' '}, regex=True)
)
df.head()

In [None]:
# Match each recipe's tag string against the keyword vocabularies
course_keywords = ['breakfast', 'brunch', 'main', 'snack', 'appetizer', 'salad', 'side', 'dessert', 'condiment', 'sauce']
cuisine_keywords = ['african', 'asian', 'caribbean', 'central american', 'europe', 'middle eastern', 'north american', 'oceanic', 'south american', 'world']
diet_keywords = ['dairy-free', 'gluten-free', 'vegan', 'vegetarian']


def _keywords_in(text, keywords):
    """Return the keywords that occur as substrings of `text`, in vocabulary order."""
    return [word for word in keywords if word in text]


# Matching is case-insensitive: the tag string is lower-cased first.
lowered_tags = [tag_text.lower() for tag_text in df['Tags']]

course_list = [_keywords_in(text, course_keywords) for text in lowered_tags]
cuisine_list = [_keywords_in(text, cuisine_keywords) for text in lowered_tags]
diet_list = [_keywords_in(text, diet_keywords) for text in lowered_tags]

In [None]:
# Attach the keyword lists and source URLs as new columns
extra_columns = {
    'Course Keywords': course_list,
    'Cuisine Keywords': cuisine_list,
    'Diet Keywords': diet_list,
    'Recipe URLs': urls,
}
for column_name, column_values in extra_columns.items():
    df[column_name] = column_values

# Final column order for the exported files
export_order = [
    "Title", "Rating", "Description", "Ingredients", "Recipe Facts",
    "Directions", "Nutrition Facts", 'Number of Servings',
    'Calories', 'Total Fat', 'Saturated Fat', 'Cholesterol', 'Sodium',
    'Total Carbohydrate', 'Dietary Fiber', 'Total Sugars', 'Protein',
    'Vitamin C', 'Calcium', 'Iron', 'Potassium', "Tags", 'Course Keywords',
    'Diet Keywords', 'Cuisine Keywords', 'Recipe URLs', 'Image Link',
]
df = df[export_order]

In [None]:
# Write the DataFrame to recipes.csv, leaving out the row index
df.to_csv(path_or_buf="recipes.csv", index=False)

In [None]:
# Write the DataFrame to recipes.json using the "index" orientation
df.to_json(path_or_buf="recipes.json", orient="index")