In [3]:
# ALlrecipes.com scraper
import numpy as np
from bs4 import BeautifulSoup
import requests
import urllib.request
from urllib.request import Request, urlopen
import re
import pandas as pd
# Libraries to track the scraping
import time
from tqdm import tqdm_notebook
import random
import socks
import socket
import torpy

In [2]:
from torpy.http.requests import TorRequests

In [227]:
session = requests.session()
session.proxies = {'http': 'socks5://localhost:9150',
                   'https': 'socks5://localhost:9150'}
r = session.get('http://httpbin.org/ip')
print(r.text)

{
  "origin": "51.68.201.119"
}



In [46]:
# Set the pattern for finding the links for the pages; use this to find the links as they're stored in text on the page
url_pattern = re.compile('detailUrl":"(/recipe/[0-9]+/[a-z-]*/)')
def get_links(url):
    response = requests.get(url)
    page_urls = re.findall(url_pattern, response.text)
    return page_urls

def collect_data(link):
    # Function that scrapes the relevant data from the page
    # Parameter - link - a link to a recipe, passed one at time
    # Returns   - a dictionary {name: {rating: *, category: cats, ingredients: [], method: [], prep: #, cook: #}}
    
    url = "https://www.allrecipes.com" + str(link)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    recipe_dict = {}
    
    # Get title
    title = soup.find_all("meta",attrs = {"property":"og:title"})
    title = re.findall('="(.*)" ', str(title))[0]
    
    # Get food categories / hierarchy
    category_list = []
    categories = soup.find_all("span", class_="toggle-similar__title")
    if len(categories) == 0:
        categories = soup.find_all("span", class_="breadcrumbs__title")

    for cat in categories:
        cat = re.sub("\n", "", cat.getText()).strip()
        if cat in ["Home", "Recipes"]:
            continue
        category_list.append(cat)
   
    
    # Get rating
    rating_data = soup.find_all("meta",attrs = {"property":"og:rating"})
    if len(rating_data) == 0:
        rating_data = soup.find_all("meta",attrs = {"name":"og:rating"})
    rating = re.findall('content="([0-9][.]*[0-9]{0,2})',str(rating_data))[0]
    
    # Get ingredients
    ingredient_list = []
    ingredients = soup.find_all("span",class_="recipe-ingred_txt added")
    if len(ingredients) == 0:
        ingredients = soup.find_all("span",class_="ingredients-item-name")
    
    for ingredient in ingredients:
        ingredient_list.append(split_quantity_ingredients(ingredient.getText().strip()))
    
    # Get cooking method
    cooking_method_list = []
    cooking_method = soup.find_all("span",class_="recipe-directions__list--item")
    if len(cooking_method) == 0:
        cooking_method = soup.find_all("div", class_="section-body")
    for method in cooking_method:
        method = re.sub("\n","",method.getText()).strip()
        if len(re.findall("[0-9]* calories", method)) == 0 and len(method) > 0:
            cooking_method_list.append(method)

    # Get prep and cook times
    cooktimes = soup.find_all("time")
    if len(cooktimes) == 0:
        prep = re.findall('prepTime": "(PT[0-9]*[A-Z])',response.text)
        cook = re.findall('cookTime": "(PT[0-9]*[A-Z])',response.text)
    else:
        cooktimes = re.findall('datetime="(PT[0-9]*[A-Z])', str(cooktimes))
        prep = cooktimes[0]
        cook = cooktimes[1]
    
    # Create and returnt the dictionary
    recipe_dict[title] = {"Rating":rating,
                          "Category": category_list,
                          "Ingredients": ingredient_list,
                          "Method": cooking_method_list,
                          "Prep time": prep,
                          "Cooking time": cook}
    
    return recipe_dict

def split_quantity_ingredients(ingredient):
    # Function that splits the ingredient items from their quantity so the two can be stored separately
    # Parameters - ingredient: a single ingredient like "1 tablespoon beetroot juice"
    # Returns    - two lists, ingredients and quantity
    
    # Remove stuff in brackets (it doesn't add value) and remove doubles spaces
    ingredient = re.sub(" \(.*\)", "", ingredient)
    ingredient = re.sub("  ", " ", ingredient) 
    
    # Test to see how long the ingredients item so that items like "2 eggs" get split into two items
    # If it's only two items, assume the first is the quantity
    if len(ingredient.split(' ')) <= 2 or ingredient.split(' ')[1][-1] == ",":
        ingredient = ingredient.split(' ')
        if re.search('[0-9]',ingredient[0]) is None:
            item = ' '.join(ingredient)
            quantity = 0
        else:
            quantity = ingredient[0]
            item = ' '.join(ingredient[1:])
    
    elif len(ingredient.split(' ')) > 2: 
        # Check for those unicode 1/2, 1/4, 1/8 etc... and replace with a number
        if ord(ingredient[0]) in [189, 8531, 188, 8532, 8533, 190, 8533, 8539]:
            unicode_dict = {188: "1/4", 189: "1/2", 8531: "1/3", 8532: "2/3", 190: "3/4", 8533: "1/5",
                            8529: "1/8"}
            ingredient = unicode_dict.get(ord(ingredient[0])) + ingredient[1:]
        
        # Check if the unicodes occur in the 3rd place in the string - e.g. 1 1/2
        if ord(ingredient[2]) in [189, 8531, 188, 8532, 8533, 190, 8533, 8539]:
            unicode_dict = {188: "1/4", 189: "1/2", 8531: "1/3", 8532: "2/3", 190: "3/4", 8533: "1/5",
                            8529: "1/8"}
            ingredient = ingredient[0:2] + unicode_dict.get(ord(ingredient[2])) + ingredient[3:]
        
        # Create the patterns for getting the quantities followed by the ingredients
        quantity_pattern = re.compile("([0-9] *[0-9]*[.]*/*-*[0-9]*/*[0-9]* \w+)")
        quantity = re.findall(quantity_pattern, ingredient)
        
        if len(quantity) > 0:
        # The ingredient follows after the quantity, so look there for it
            ingredient_pattern = re.compile(str(quantity[-1]+" (.*)"))
            quantity = '/'.join(quantity)
            item = ingredient_pattern.search(ingredient)
            if item is not None:
                item = item[1]
            else:
                item = ""
        else:
            quantity = ""
            item = ingredient

    
    return quantity, item
    
    

In [174]:

base_url = "https://www.allrecipes.com/?page="
links = []
for i in tqdm_notebook(range(2,40)):
    scrape_url = base_url + str(i)
    links.extend(get_links(scrape_url))
#links


In [182]:
links[1:3]

['/recipe/269732/mexican-taco-meatloaf/',
 '/recipe/10813/best-chocolate-chip-cookies/']

In [186]:
import csv
with open("links.csv", 'w') as links_file:
    links_writer = csv.writer(links_file)
    links_writer.writerow(links)

In [205]:
# Future process for scraping pages
# Download links from pages and store in a file
# Access
#recipe_dict = {}
def get_recipes(links, start_idx, iters, db):
    # Simple function that loads #iters recipes at once and then loads these into the database
    # Parameters:  links       - All of the links currently scraped
    #              start_idx   - Index to in the links list to start scraping from
    #              iters       - The number of pages loaded at once
    #              db          - the sqlalchemy db engine
    # Returns:     Position in the links list of the last page scraped
    
    recipe_dict = {}    
    # To prevent the IP address from getting b
    # Going from the last link scraped, scrape iter numbers of pages
    for n in tqdm_notebook(range(start_idx,start_idx+iters)):
        try:
            recipe_dict.update(collect_data(links[n]))
        except:
            print("Error with link[{}]: {}".format(n, links[n]))
    
    # Put the dictionary into a DataFrame for processing (might not actually be necessary)
    recipe_df = np.transpose(pd.DataFrame(recipe_dict))
    recipe_df.reset_index(inplace = True)
    recipe_df.rename(columns={"index":"Recipe_name", "Cooking time":"Cooking_time", "Prep time":"Prep_time"}, inplace = True)
    
    # Add to the database and return the number of recipes added
    recipes_added = write_to_db(recipe_df, db)
    print("Number of recipes added: {}".format(recipes_added))
    
    return start_idx+iters

In [213]:
start_idx = 1100
for i in range(2):
    start_idx = get_recipes(links, start_idx, 10, db)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


Recipe Beef Stroganoff for Instant Pot® Recipe: already on database
Recipe Amish White Bread Recipe: already on database
Recipe Instant Pot® Hamburger Soup  Recipe: already on database
Recipe Sloppy Joe Casserole with Noodles Recipe: already on database
Recipe Grown-Up Homemade Peeps® Recipe: already on database
Recipe Banana Banana Bread Recipe: already on database
Recipe World's Best Lasagna: already on database
Recipe Best Brownies: already on database
Recipe Greek Turkey Burgers Recipe: already on database
Recipe Fluffy Pancakes Recipe: already on database
Number of recipes added: 0


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Error with link[1119]: /recipe/24010/easy-chicken-marsala/

Recipe Delicious Black Bean Burritos Recipe: already on database
Recipe Air Fryer Chicken Taquitos Recipe: already on database
Recipe Chef John's Carrot Cake Recipe: already on database
Recipe Chicken Parmesan: already on database
Recipe Mayo Chicken: already on database
Recipe Good Old Fashioned Pancakes Recipe: already on database
Recipe Creamy Au Gratin Potatoes Recipe: already on database
Recipe Better-Than-Olive Garden® Alfredo Sauce  Recipe: already on database
Number of recipes added: 1


In [218]:
len(links) - len(set(links)), len(set(links))

(3663, 99)

In [217]:
testlist = ["A", "A", "B", "B", "C"]
len(testlist) - len(set(testlist))
print(set(testlist))

{'A', 'C', 'B'}


In [188]:
recipe_df = np.transpose(pd.DataFrame(recipe_dict))
recipe_df.reset_index(inplace = True)
recipe_df.rename(columns={"index":"Recipe_name", "Cooking time":"Cooking_time", "Prep time":"Prep_time"}, inplace = True)

In [189]:
recipe_df.head()

Unnamed: 0,Recipe_name,Category,Cooking_time,Ingredients,Method,Prep_time,Rating
0,Kimchi Fried Rice with Frizzled Eggs Recipe,"[Trusted Brands: Recipes and Tips, UNCLE BEN'S®]",PT15M,"[(1 1/2 tablespoons, toasted sesame oil), (1 c...",[Heat 1 1/2 tablespoons sesame oil in a large ...,PT15M,5.0
1,Mexican Taco Meatloaf Recipe,"[World Cuisine, Latin American, Mexican]",PT45M,"[(1 1/2 pounds, lean ground beef), (1 cup, cru...",[Preheat the oven to 350 degrees F (175 degree...,PT15M,4.55
2,Best Chocolate Chip Cookies,"[Desserts, Cookies, Drop Cookies]",[PT10M],"[(1 cup, butter, softened), (1 cup, white suga...",[Preheat oven to 350 degrees F (175 degrees C)...,[PT20M],4.61
3,Brown Sugar Banana Bread Recipe,"[Breakfast and Brunch, Breakfast Bread]",PT40M,"[(4 ripe, bananas, cut into chunks), (1 1/4 cu...",[Preheat oven to 350 degrees F (175 degrees C)...,PT15M,4.49
4,Janet's Rich Banana Bread Recipe,"[Bread, Quick Bread, Fruit Bread, Banana Bread...",PT1H,"[(1/2 cup, butter, melted), (1 cup, white suga...",[Preheat oven to 350 degrees F (175 degrees C)...,PT10M,4.82


## Database
Create the database and place the recipes into the database for storage

In [101]:
import sqlalchemy
import os
import re

In [190]:
# Create the database object to connect
db_name = 'recipesdb'
db_host = 'localhost'
db_user = 'NeilSinclair'
db_pass = 'Passw0rd!'
db = sqlalchemy.create_engine('mysql+pymysql://{}:{}@{}/{}'.format(db_user, db_pass, db_host, db_name))

In [169]:
# Quick clear of the DB for testing purposes
#with db.connect() as conn:
#    conn.execute("TRUNCATE TABLE recipes")

In [198]:
recipe_df.shape[0]

97

In [212]:
def write_to_db(recipe_df, db):
    # Function that writes recipes to the database
    # Parameters: recipe_df - a dataframe containing the recipes to be written to the DB
    #             db        - the sqlalchemy database engine
    # Returns: the number of recipes added to the DB
    
    # load items into the database
    col_order = ["recipe_name", "category", "cooking_time", "ingredients", "method", "prep_time", "rating"]
    query = 'INSERT INTO recipes (' + ' ,'.join(col_order) + ') VALUES (' + '%s, %s, %s, %s, %s, %s, %s' + ')'

    with db.connect() as conn:
        counter = 0
        # Get a list of al of the recipes currently in the database
        recipes_captured = [item[0] for item in conn.execute("SELECT (recipe_name) FROM recipes").fetchall()]
        # Cycle through each line in the dataframe currently storing the recipes
        for i in range(0,recipe_df.shape[0]):
            temp_list = []
            # See if the recipe has been captured already, if so, don't add and move onto the next one  
            recipe = recipe_df.iloc[i, :]
            if recipe[0].lower() in recipes_captured:
                print("Recipe {}: already on database".format(recipe[0]))
                continue
            else:
                for item in recipe:
                    item = re.sub("\[|\]", "", str(item)).lower()
                    temp_list.append(str(item))
                conn.execute(query, tuple(temp_list))
                conn.execute("COMMIT")
                counter += 1
    
    return(counter)

In [209]:
with db.connect() as conn:
    #test = conn.execute("SELECT * FROM recipes WHERE category LIKE '%%mexican%%'").fetchall()
    test2 = list(conn.execute("SELECT (recipe_name) FROM recipes").fetchall())
    
#for item in test2:
#    print(item[0])
itemlen = [item[0] for item in test2]
len(itemlen)

97

'/recipe/10813/best-chocolate-chip-cookies/'

In [22]:
## Code here is for testing any scraping methods
response = requests.get("https://www.allrecipes.com"+str(links[5]))
soup = BeautifulSoup(response.text, "html.parser")

In [23]:
method_list = []
methods = soup.find_all("span",class_="recipe-directions__list--item")
print(len(methods))
if len(methods) == 0:
    methods = soup.find_all("div", class_="section-body")
for method in methods:
    method = method.getText().strip()
    if len(re.findall("[0-9]* calories", method)) == 0:
           method_list.append(method)
method_list


6


['Preheat the oven to 350 degrees F (175 degrees C). Brush the inside of a fluted tube pan with 1 tablespoon melted butter.',
 'Mix sugar, cinnamon, and nutmeg in a bowl. Combine 1/4 cup of the spiced sugar mixture with minced apples and stir until coated.',
 'Separate cinnamon roll dough rounds and cut into quarters. Set aside the icing.',
 'Spoon 1/2 of the apple mixture and 1/2 of the walnuts into the bottom of the prepared pan. Roll 1/2 of the cinnamon roll quarters in the remaining spiced sugar and place on top of the apples and walnuts. Spoon 1/2 of the remaining butter over dough. Repeat with remaining apples, walnuts, dough, and butter.',
 'Bake in the preheated oven until golden brown, about 45 minutes. Let bread cool in the pan for 10 minutes, then invert onto a plate. Cool for 5 more minutes before coating with icing.',
 '']

' I had a dream '