# Scraping framework - simple
Retrieves recipes from web pages, relying on a provided set of URLs. It does not perform any kind of site structure discovery. Reliable and quite fast.

In [2]:
### imports
import urllib.request

import sys
import time
import json

from bs4 import BeautifulSoup

import multiprocessing as mp

### Generators
These functions are responsible for returning the links to be crawled, each paired with a sequential number. A generator function **must** take two arguments, `begin` and `end`, which are basically Python `range` arguments. Moreover, it **must** yield two elements: *url* and *step* (within the range borders). Here is how a generator should look:

```python
def sample_generator(begin, end):
    """Yield a ``(url, step)`` pair for every step in ``range(begin, end)``."""
    url = 'http://example.com'
    for step in range(begin, end):
        yield url, step
```

Samples below:

In [20]:
def cookbooks_generator(begin, end):
    """Yield ``(url, step)`` pairs for cookbooks.com recipe ids.

    Every integer in ``range(begin, end)`` is appended to the recipe-details
    URL and is also returned as the step number.
    """
    base_url = 'http://www.cookbooks.com/Recipe-Details.aspx?id='
    for recipe_id in range(begin, end):
        yield f'{base_url}{recipe_id}', recipe_id

In [21]:
def allrecipes_generator(begin, end):
    """Yield ``(url, step)`` pairs read from ``allrecipes/links.txt``.

    The file is expected to hold one link per line; surrounding whitespace is
    stripped. ``begin`` and ``end`` are clamped to the number of links, so a
    range that runs past the end of the file simply stops instead of raising
    ``IndexError`` (same behaviour as ``recipesplus_generator``).
    """
    with open('allrecipes/links.txt') as f:
        links = [line.strip() for line in f]
    total = len(links)
    for i in range(min(begin, total), min(end, total)):
        yield links[i], i

In [1]:
def recipesplus_generator(begin, end):
    """Yield ``(url, step)`` pairs for the recipes-plus API.

    Recipe ids are read from ``recipes-plus/ids.txt`` (whitespace stripped
    from each line) and appended to the API base URL. Both range borders are
    clamped to the number of ids available.
    """
    base_url = 'http://recipes-plus.com/api/v2.0/recipes/'
    with open('recipes-plus/ids.txt') as ids_file:
        recipe_ids = [line.strip() for line in ids_file.readlines()]
    total = len(recipe_ids)
    first = min(begin, total)
    last = min(end, total)
    for step in range(first, last):
        yield base_url + recipe_ids[step], step

In [3]:
### use these assertions to check that your generator function behaves correctly
test_generator = recipesplus_generator

# materialise the first four (url, step) pairs
temp = [pair for pair in test_generator(0, 4)]
assert type(temp) is list
assert len(temp) == 4
assert type(temp[0]) is tuple
assert len(temp[0]) == 2
for t1, t2 in temp:
    # the url must be a non-empty string
    assert type(t1) is str
    assert t1 != ''
    # dummy url check
    assert t1.startswith(('http://', 'https://'))
    assert t1.find('.') > 0
    # the step must be a plain int
    assert type(t2) is int

### Ingredients
This function should find all **ingredients** on the page. It takes a single argument `soup` (BeautifulSoup) and returns a non-empty *list* of ingredients (with quantities) as simple strings.

In [22]:
# cookbooks get ingredients
def ingredients_cookbooks(soup):
    """Return the ingredient lines of a cookbooks.com recipe page.

    Takes the first ``span`` of class ``H2`` whose string is 'ingredients'
    and reads the ``<p>`` of its parent; the text nodes are joined with '|'
    and split on that separator again. Raises ``IndexError`` when no such
    span is found.
    """
    headers = soup.find_all('span', 'H2', string='ingredients')
    section = headers[0].parent
    raw_text = section.p.get_text('|')
    return raw_text.strip().split('|')

In [23]:
# allrecipes get ingredients
def ingredients_allrecipes(soup):
    """Return the ingredient lines of an allrecipes page.

    Collects the stripped text of every ``span`` with class
    ``recipe-ingred_txt`` and drops empty entries as well as the
    'Add all ingredients to list' placeholder.
    """
    placeholder = 'Add all ingredients to list'
    texts = [span.get_text().strip() for span in soup.find_all('span', 'recipe-ingred_txt')]
    return [text for text in texts if text != '' and text != placeholder]

In [7]:
# recipesplus get ingredients
def ingredients_recipesplus(recipe):
    """Return one 'amount unit ingredient' string per recipe ingredient.

    ``recipe`` is a mapping whose 'ingredients' entry is a sequence of
    mappings with the keys 'amount', 'unit' and 'ingredient'; the three
    values are concatenated with single spaces.
    """
    return [
        item["amount"] + ' ' + item["unit"] + ' ' + item['ingredient']
        for item in recipe['ingredients']
    ]

In [11]:
# Load a stored sample recipes-plus JSON document to try the extractor offline.
with open('recipes-plus/test.json') as f:
    data = json.load(f)
# The recipe passed to the extractor sits under the top-level 'data' key.
ingredients_recipesplus(data['data'])

['1 medium orange',
 '100 gram packet',
 '1 knuckle size strip of watermelon flesh',
 '1 smaller knuckle size feta cheese']

### Directions
This function should find all **directions** on the page. It takes a single argument `soup` (BeautifulSoup) and returns a non-empty *list* of directions as simple strings.

In [24]:
# cookbooks get directions
def directions_cookbooks(soup):
    """Return the preparation steps of a cookbooks.com recipe page.

    Takes the first ``span`` of class ``H2`` whose string is 'preparation'
    and reads the ``<p>`` of its parent. Raises ``IndexError`` when no such
    span is found.
    """
    headers = soup.find_all('span', 'H2', string='preparation')
    section = headers[0].parent
    raw_text = section.p.get_text('|')
    # NOTE(review): text nodes are joined with '|' but the result is split on
    # a double space, so '|' can remain inside a step - confirm this is intended.
    return raw_text.strip().split('  ')

In [25]:
# allrecipes get directions
def directions_allrecipes(soup):
    """Return the preparation steps of an allrecipes page.

    Collects the stripped text of every ``span`` with class
    ``recipe-directions__list--item`` and drops the empty entries.
    """
    steps = [span.get_text().strip() for span in soup.find_all('span', 'recipe-directions__list--item')]
    return [step for step in steps if step != '']

In [14]:
def directions_recipesplus(recipe):
    """Return whatever is stored under the 'steps' key of ``recipe``."""
    steps = recipe['steps']
    return steps

In [15]:
# Try the directions extractor on the recipe stored under data['data'].
directions_recipesplus(data['data'])

['cut the orange with knife, to 4 quarters and bring out the pulps out the skins carefully.',
 'Next make jelly drink as there on the packet is recommended.',
 'Now add the pulps into the jelly and mix a bit. After wards pour the jelly mixture into a medium serving pot. Then put it into refrigerator.',
 'Bring out the jelly after 2 hours.Cut a strip of water melon and a strip of cheese. Put the cheese on top, at the middle of jelly,s surface and put the water melon on top of the cheese.']

### Title
This function should find the **title** on the page. It takes a single argument `soup` (BeautifulSoup) and returns the title as a simple string.

In [27]:
# cookbooks get title
def title_cookbooks(soup):
    """Return the lower-cased text of the first ``p`` tag with class ``H2``.

    Raises ``IndexError`` when the page has no such tag.
    """
    headings = soup.find_all('p', 'H2')
    title_text = headings[0].get_text()
    return title_text.lower()

In [28]:
# allrecipes get title
def title_allrecipes(soup):
    """Return the stripped text of the first ``h1`` with class ``recipe-summary__h1``.

    Raises ``IndexError`` when the page has no such heading.
    """
    headings = soup.find_all('h1', 'recipe-summary__h1')
    titles = [heading.get_text().strip() for heading in headings]
    return titles[0]

### Ingredients , directions and title testbed
Test your code here

In [38]:
#
# use this code to test your functions on real-life pages
# note that not all urls from the generator may be valid
#
### provide generator and proper functions
test_generator = allrecipes_generator
test_ingredients = ingredients_allrecipes
test_directions = directions_allrecipes
test_title = title_allrecipes

for url, i in test_generator(0,4):
    # the context manager closes the HTTP response once the body has been read
    with urllib.request.urlopen(url) as response:
        test_html_doc = response.read().decode('utf-8')
    test_soup = BeautifulSoup(test_html_doc, 'html.parser')
    # assert title
    title = test_title(test_soup)
    assert type(title) is str
    assert len(title) > 0
    # assert ingredients
    ing = test_ingredients(test_soup)
    assert type(ing) is list
    assert len(ing) > 0
    # separate loop names keep the step index `i` intact
    for ingredient in ing:
        assert type(ingredient) is str
        assert len(ingredient) > 0
    # assert directions
    direc = test_directions(test_soup)
    assert type(direc) is list
    assert len(direc) > 0
    for direction in direc:
        assert type(direction) is str
        assert len(direction) > 0
    

### Prefix and suffix
These functions generate a prefix and a suffix for the filename (in case the titles themselves are not unique).  
Prefix should end with '-' or other separator.  
Suffix should start with '-' or other separator.

In [13]:
# prefix
def str_id(i):
    """Return ``i`` left-padded with zeros to 8 characters, followed by '-'.

    Values whose text is already 8 characters or longer are not truncated.
    """
    return str(i).rjust(8, '0') + "-"

In [39]:
# suffix

### Target function
Downloads, parses and saves an entire recipe - this function is executed in parallel. It is fairly universal - it is enough to provide a url and a set of proper parsing functions. Returns nothing.  
List of arguments:
* **step** `int` - number of executed step
* **url** `string` - url of page to be downloaded
* **retrieve_ingredients** `function` - function retrieving ingredients from `soup`
* **retrieve_directions** `function` - function retrieving directions from `soup`
* **retrieve_title** `function` - function retrieving title from `soup`
* **path** *optional* `string` - custom path to save retrieved recipe
* **filename_prefix** *optional* `string` - string value of filename prefix
* **filename_suffix** *optional* `string` - string value of filename suffix

In [12]:
def save_recipe(step, url, retrieve_ingredients, retrieve_directions, retrieve_title, path="./", filename_prefix="", filename_suffix=""):
    """Download one recipe page, parse it and save the result as a JSON file.

    The function is best-effort: any ordinary error (download, parsing,
    writing) is reported and swallowed, so one bad page does not stop a
    scraping run. ``KeyboardInterrupt`` and ``SystemExit`` are not caught.

    Args:
        step: number of the executed step, only used in the progress message.
        url: address of the page to download; the body is decoded as UTF-8.
        retrieve_ingredients: callable taking the parsed ``soup`` and
            returning the ingredients.
        retrieve_directions: callable taking the parsed ``soup`` and
            returning the directions.
        retrieve_title: callable taking the parsed ``soup`` and returning
            the title as a string.
        path: directory the JSON file is written to.
        filename_prefix: text put in front of the title in the file name.
        filename_suffix: text put after the title in the file name.

    Returns:
        None. The recipe is written to
        ``<path>/<prefix><title><suffix>.json``, where the title is
        lower-cased and has spaces and path separators replaced by '_'.
    """
    try:
        start = time.time()

        # get page; the context manager closes the HTTP response once read
        with urllib.request.urlopen(url) as response:
            html_doc = response.read().decode('utf-8')
        # parse
        soup = BeautifulSoup(html_doc, 'html.parser')
        # retrieve information and save to dictionary
        title = retrieve_title(soup)
        recipe = {
            'title': title,
            'ingredients': retrieve_ingredients(soup),
            'directions': retrieve_directions(soup),
            'link': url,
        }
        # a '/' or '\' inside the title would otherwise be read as a directory
        # separator and make the write fail
        safe_title = title.lower().replace(' ', '_').replace('/', '_').replace('\\', '_')
        filename = path + '/' + filename_prefix + safe_title + filename_suffix + '.json'
        # save to file
        with open(filename, 'w') as f:
            f.write(json.dumps(recipe))

        elapsed = time.time() - start

        print("Step:\t", step, "Time:\t", elapsed)

    except Exception as exc:
        print("Unable to get recipe from:\t", url)
        # keep the reason visible without changing the stdout message
        print("\t", repr(exc), file=sys.stderr)

### Core function
This function takes care of managing child processes and running scraping tasks.

In [15]:
def scrapping_core(begin, end, generator,
                   ingredients, directions, title,
                   target_func = save_recipe, path='./', prefix = None, suffix = None,
                   sleep_time = 1, max_processes = 8):
    """Run ``target_func`` in a child process for every url of ``generator``.

    Only the links produced by ``generator(begin, end)`` are downloaded. At
    most ``max_processes`` children run at once; when the limit is reached
    the function waits for the oldest child instead of dropping the url.

    Args:
        begin, end: range borders handed to ``generator``.
        generator: callable yielding ``(url, step)`` pairs.
        ingredients, directions, title: parsing functions passed through to
            ``target_func``.
        target_func: callable executed in each child process; it is called
            with ``(step, url, ingredients, directions, title, path,
            filename_prefix, filename_suffix)``.
        path: output directory passed through to ``target_func``.
        prefix, suffix: optional callables turning the step number into the
            filename prefix / suffix; an empty string is used when omitted.
        sleep_time: seconds to sleep after starting each child, to avoid
            flooding the site or to fit robots.txt rules.
        max_processes: maximum number of simultaneously running children.

    Raises:
        ValueError: if ``max_processes`` is smaller than 1.
    """
    if max_processes < 1:
        raise ValueError("max_processes must be at least 1")

    processes = []
    for url, i in generator(begin, end):
        filename_prefix = prefix(i) if prefix else ""
        filename_suffix = suffix(i) if suffix else ""
        arguments = (i, url, ingredients, directions, title, path,
                     filename_prefix, filename_suffix)

        # drop the children that have already finished
        processes = _reap_finished(processes)

        if len(processes) >= max_processes:
            print("List of processes is full")
            # block until the oldest child is done so this url is not lost
            processes[0].join()
            processes = _reap_finished(processes)

        worker = mp.Process(target=target_func, args=arguments)
        worker.start()
        processes.append(worker)

        # sleep, to avoid ddos attack or to fit in robots.txt rules
        time.sleep(sleep_time)

    # join remaining processes
    for worker in processes:
        worker.join()


def _reap_finished(processes):
    """Join every finished process and return the list of still-running ones."""
    running = []
    for proc in processes:
        if proc.is_alive():
            running.append(proc)
        else:
            proc.join()
    return running
    

## Test

In [19]:
# Scrape steps 0-4 of the allrecipes links, prefixing each file name with the zero-padded step number.
scrapping_core(0, 5, allrecipes_generator, ingredients_allrecipes, directions_allrecipes, title_allrecipes, prefix=str_id)

Step:	 0 Time:	 2.6241471767425537
Step:	 1 Time:	 2.617486000061035
Step:	 2 Time:	 2.373093843460083
Step:	 3 Time:	 2.5032308101654053
Step:	 4 Time:	 2.29815936088562
