In [None]:
from recipe_scrapers import scrape_me
import pandas as pd
from time import sleep
from requests import TooManyRedirects


def scrape_and_print(url, dont_print=True):
    """Scrape one recipe page and return its main fields.

    Parameters
    ----------
    url : str
        Address of the recipe page; passed unchanged to ``scrape_me``.
    dont_print : bool, default True
        When False, a plain-text summary of the recipe is also printed.

    Returns
    -------
    dict
        Keys ``'title'``, ``'tot_time'``, ``'ingredients'`` and
        ``'instructions'``, holding whatever the scraper's ``title()``,
        ``total_time()``, ``ingredients()`` and ``instructions()`` return.
    """
    scraper = scrape_me(url)

    # Query every field exactly once; the same values feed both the printed
    # summary and the returned dict.
    recipe = {
        'title': scraper.title(),
        'tot_time': scraper.total_time(),
        'ingredients': scraper.ingredients(),
        'instructions': scraper.instructions(),
    }

    if not dont_print:
        # One bullet per ingredient; only needed for the printed summary.
        bullet_list = '\n'.join(
            '* {}'.format(ing) for ing in recipe['ingredients']
        )
        # The blank line after {instructions} keeps the closing rule from
        # being glued onto the last instruction line.
        print(
            '=== {title} ========================\n\n'
            'Time needed: {tot_time} minutes\n\n'
            'INGREDIENTS: \n{ingredients}\n\n'
            'INSTRUCTIONS:\n{instructions}\n\n'
            '====================================='
            '\n\n\n'
            .format(**dict(recipe, ingredients=bullet_list))
        )

    return recipe

In [None]:
# A few allrecipes pages used to eyeball the printed output of the scraper.
example_urls = [
    'https://www.allrecipes.com/recipe/236700/beef-bourguignon-without-the-burgundy/',
    'https://www.allrecipes.com/recipe/12682/apple-pie-by-grandma-ople/',
    'https://www.allrecipes.com/recipe/240376/homemade-pepperoni-pizza/',
]

# Print each example in full. The returned dict is simply discarded: a call
# inside a loop body displays nothing, and binding it to `_` would overwrite
# IPython's last-output variable in the kernel namespace.
for url in example_urls:
    scrape_and_print(url, dont_print=False)

In [None]:
# Read the header-less URL dumps ./data/recipes.txt, recipes1.txt,
# recipes2.txt and recipes3.txt, then stack them into a single frame.
DF_recipe_urls = pd.concat(
    [
        pd.read_csv('./data/recipes{}.txt'.format(suffix), header=None)
        for suffix in ('', 1, 2, 3)
    ]
)

In [None]:
def strip_and_clean(x):
    """Return ``x`` without surrounding whitespace and without ``<loc>`` tags.

    The string is stripped first; afterwards every ``<loc>`` and then every
    ``</loc>`` occurrence is removed. Whitespace that sat between the tags
    and the text is therefore left in place.
    """
    cleaned = x.strip()
    for tag in ('<loc>', '</loc>'):
        cleaned = cleaned.replace(tag, '')
    return cleaned

In [None]:
# Clean each raw entry, then de-duplicate again: entries that differ only in
# surrounding whitespace or <loc> tags collapse to the same URL after
# cleaning and would otherwise be scraped more than once.
unique_urls = sorted(set(map(strip_and_clean, DF_recipe_urls[0].unique())))

In [None]:
# Bookkeeping for the crawl: successful scrapes keyed by URL, plus running
# tallies of successes and redirect failures.
scrapes = {}
counter = 0
error_counter = 0

for url in unique_urls:
    # Give up once more than 10 URLs have failed with redirect loops.
    if error_counter > 10:
        print('exceeded 10 errors')
        break

    try:
        scrape = scrape_and_print(url)
    except TooManyRedirects:
        # Record the failure and move straight on to the next URL.
        error_counter += 1
        print('error for: {}\nerror_counter = {}'.format(url, error_counter))
    else:
        scrapes[url] = scrape
        counter += 1
        # Pause one second after every successful request.
        sleep(1)
        # Progress report every 100 successful scrapes.
        if not counter % 100:
            print(counter)