In [231]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import random

In [232]:
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

In [233]:
# Do not re-execute this!!!
ingre_dish = {}
ingre_dish_img = {}

In [234]:
ingredients = ['egg','milk','avocado','fish','beef','chicken','tofu','pork','beans','pepper','cauliflower',
              'rice','cabbage','lettuce','carrot','onion','spinash','potato','tomato','broccoli']

# initialize the 'ingredient-dishes_link' dictionary with 
# key: string - ingredient name
# value: list - list of links to recipe
for food in ingredients:
    ingre_dish[food] = []
    ingre_dish_img[food] = []

In [235]:
'''
Given the number of ingredients you want to look up,
Auto generate an url search ingre_num number of ingredients
'''
def autoURLGenerator(ingre_num):
    random.shuffle(ingredients)
    this_ingre = ingredients[0:ingre_num]
    url = 'https://www.myrecipes.com/search?q='
    # append the ingredient we want to look up behind the url
    for item in this_ingre:
        url += item+'+'
    url = url[:-1] # we don't want the extra '+' at the end
    return url

'''
Given an url of list of recipes, get all the links to specific recipe
'''
def getDishesURL(url):
    # Write byte to a file
    raw_html = simple_get(url)
    file = open('raw_html.html', 'wb')
    file.write(raw_html)
    file.close()
    html = open('raw_html.html').read()
    soup = BeautifulSoup(html,'html.parser')
    all_result = soup.find_all('a', attrs={'class':'search-result-title-link'})
    links = []
    for result in all_result:
        links.append(result.get('href'))
        
    return links

def updateDishes(links):
    for link in links:
        img_src = ''
        print(link)
        html = simple_get(link)
        soup = BeautifulSoup(html,'html.parser')
        result = soup.find('div', attrs={'class':'ingredients'})
        img_result = soup.find('meta', attrs={'name':'pinterest:media'})
        #image = img_result.find('img')
        img_src = img_result.get('content')
        print(img_src)
            
        try:
            ingredient_list = result.find_all('li')
        except:
            return      

        # turn all 'li tags' into strins for further parsing
        li_string = []
        for li in ingredient_list:
            li_string.append(li.string)

        for item in li_string:
            for word in item.split():
                if word in ingredients:
                    ingre_dish[word].append(link)
                    ingre_dish_img[word].append(img_src)

In [237]:
# train 10 epochs
for i in range(30):
    url = autoURLGenerator(random.randint(1,5))
    links = getDishesURL(url)
    updateDishes(links)

https://www.myrecipes.com/recipe/miso-carrot-dressing
https://cdn-image.myrecipes.com/sites/default/files/styles/4_3_horizontal_-_1200x900/public?itok=Y6-bwmRb
https://www.myrecipes.com/recipe/parmesan-herb-baked-flounder
https://cdn-image.myrecipes.com/sites/default/files/styles/4_3_horizontal_-_1200x900/public?itok=Y6-bwmRb
http://www.myrecipes.com/recipe/indian-style-tofu-cauliflower-with-chutney
https://cdn-image.myrecipes.com/sites/default/files/image/recipes/ck/10/04/italian-tofu-cauliflower-ck-x.jpg
https://www.myrecipes.com/recipe/tofu-pine-nut-lettuce-wraps
https://cdn-image.myrecipes.com/sites/default/files/styles/4_3_horizontal_-_1200x900/public?itok=Y6-bwmRb
https://www.myrecipes.com/recipe/tofu-carrot-ginger-dressing-with-miso
https://cdn-image.myrecipes.com/sites/default/files/image/recipes/ck/tofu-dressing-ck-522136-x.jpg
https://www.myrecipes.com/recipe/lettuce-wedge-potato-salad
https://cdn-image.myrecipes.com/sites/default/files/lettuce-wedge-potato-salad-ck.jpg
https

KeyboardInterrupt: 

In [243]:
ingre_dish_img

{'egg': ['https://cdn-image.myrecipes.com/sites/default/files/egg-noddle-stir-fry-broccoli-ck.jpg',
  'https://cdn-image.myrecipes.com/sites/default/files/image/recipes/ck/quiche-ck-491421-x.jpg',
  'https://cdn-image.myrecipes.com/sites/default/files/styles/4_3_horizontal_-_1200x900/public?itok=Y6-bwmRb',
  'https://cdn-image.myrecipes.com/sites/default/files/styles/4_3_horizontal_-_1200x900/public?itok=Y6-bwmRb',
  'https://cdn-image.myrecipes.com/sites/default/files/image/recipes/ck/06/01/cajun-quiche-ck-1142014-x.jpg',
  'https://cdn-image.myrecipes.com/sites/default/files/image/recipes/ck/06/01/cajun-quiche-ck-1142014-x.jpg',
  'https://cdn-image.myrecipes.com/sites/default/files/image/recipes/ck/06/01/cajun-quiche-ck-1142014-x.jpg',
  'https://cdn-image.myrecipes.com/sites/default/files/styles/4_3_horizontal_-_1200x900/public?itok=Y6-bwmRb',
  'https://cdn-image.myrecipes.com/sites/default/files/image/recipes/sl/14/03/classic-homemade-mayonnaise-sl-x.jpg',
  'https://cdn-image.my

In [240]:
print(len(ingre_dish['egg']))
len(ingre_dish_img['egg'])

32


32

In [241]:
for keys in ingredients:
    if len(ingre_dish[keys]) != len(ingre_dish_img[keys]):
        print(keys)