In [1]:
import import_ipynb
import scraper_functions as sf
import general_functions as gf
import pandas as pd
import time

importing Jupyter notebook from scraper_functions.ipynb
importing Jupyter notebook from general_functions.ipynb


In [2]:
def main(search_keyword='samosa'):
    """Run the whole scrape-and-analyse pipeline for one search keyword.

    The steps are: fetch the first search-result page, derive the URLs of
    all result pages, download them, extract the recipes they list, fetch
    each recipe's ingredients and finally print an ingredient frequency
    table followed by the name/author/url of every recipe.

    Args:
        search_keyword: Term to search recipes for. Defaults to 'samosa',
            the value that used to be hard-coded.

    Returns:
        None. Results are printed to stdout.
    """
    # todo allow user to select how many recipes they want to look at
    # Only the parsed HTML of the first page is needed here; the URL that
    # get_search_html also returns is rebuilt inside get_page_urls.
    _, search_html = get_search_html(search_keyword)
    list_of_page_urls = get_page_urls(search_keyword, search_html)
    list_of_html = get_html_texts(list_of_page_urls)
    recipe_df = extract_recipe_info(list_of_html)
    # Fills the 'ingredients' column of recipe_df in place.
    extract_recipe_ingredients(recipe_df)
    freq_table = analyze_df(recipe_df)
    print(freq_table)
    print(recipe_df[['name', 'author', 'url']])

In [3]:
def get_search_html(search_keyword):
    """Return the search URL for a keyword together with its fetched HTML.

    The URL is produced by ``sf.clean_search_url`` and then handed to
    ``sf.get_html``.

    Args:
        search_keyword: Term to build the search URL from.

    Returns:
        tuple: ``(search_url, search_html)``.
    """
    url = sf.clean_search_url(search_keyword)
    return url, sf.get_html(url)

In [4]:
def get_page_urls(search_keyword, search_html):
    """Build the URLs of every search-result page for a keyword.

    The highest page number is read from the pagination list items
    (``li`` elements with class ``o-Pagination__a-ListItem``) of the first
    result page; one URL per page is then generated with
    ``sf.clean_search_url``.

    Args:
        search_keyword: Term passed to ``sf.clean_search_url``.
        search_html: Parsed first result page. It must offer ``find_all``
            returning elements with a ``text`` attribute (presumably a
            BeautifulSoup object - confirm against ``sf.get_html``).

    Returns:
        list: The URL of page 1 followed by the URLs of pages 2..max_page.
        When no numeric pagination entry is found only page 1 is returned.
    """
    pagination_items = search_html.find_all('li', class_='o-Pagination__a-ListItem')

    max_page = 1
    for item in pagination_items:
        label = item.text.strip()
        try:
            candidate = int(label)
        except ValueError:
            # Entries whose text is not a number are not page numbers.
            # Only this expected failure is skipped; anything else is a
            # real error and must surface.
            continue
        max_page = max(max_page, candidate)

    list_of_page_urls = [sf.clean_search_url(search_keyword)]

    # create list of recipe urls; the range is empty when there is one page
    list_of_page_urls.extend(
        sf.clean_search_url(search_keyword, page_num=number)
        for number in range(2, max_page + 1)
    )

    return list_of_page_urls

In [5]:
def get_html_texts(list_of_page_urls):
    """Fetch every search-result page and return the results in order.

    Args:
        list_of_page_urls: Iterable of page URLs to download.

    Returns:
        list: One ``sf.get_html`` result per URL, in input order.
    """
    return [sf.get_html(page_url) for page_url in list_of_page_urls]

In [6]:
def extract_recipe_info(list_of_html):
    """Collect name, author and link of every recipe on the result pages.

    Each page is searched for ``section`` elements with class
    ``o-RecipeResult o-ResultCard``; one row is produced per element.

    Args:
        list_of_html: Iterable of parsed result pages. Each must offer
            ``find_all`` (presumably BeautifulSoup objects - confirm
            against ``sf.get_html``).

    Returns:
        pandas.DataFrame: Columns ``name``, ``author``, ``ingredients`` and
        ``url`` with a 0..n-1 index. ``ingredients`` is left empty (NaN,
        object dtype) so that lists can be stored in it later.

    Raises:
        AttributeError: If a result card has no anchor or the anchor has no
            ``span`` child.
    """
    columns = ['name', 'author', 'ingredients', 'url']
    author_prefix = 'Courtesy of '
    records = []

    for html in list_of_html:
        for recipe in html.find_all('section', class_='o-RecipeResult o-ResultCard'):
            # The same anchor holds both the title and the link.
            anchor = recipe.find('a', class_='')
            recipe_name = anchor.span.text

            try:
                recipe_author = recipe.find('span', class_='m-Info__a-AssetInfo').text.strip()
            except AttributeError:
                # No author element (find returned None) or it has no text.
                recipe_author = 'Unknown Author'
            else:
                # Remove the literal prefix. str.lstrip would treat the
                # argument as a set of characters and also eat leading
                # letters of the name itself ("Chuck" -> "huck").
                if recipe_author.startswith(author_prefix):
                    recipe_author = recipe_author[len(author_prefix):].strip()

            # hrefs are protocol-relative ("//host/path"); drop the slashes.
            recipe_url = 'http://' + anchor['href'].lstrip('/')

            records.append({'name': recipe_name, 'author': recipe_author, 'url': recipe_url})

    # Build the frame once instead of appending row by row:
    # DataFrame.append is quadratic and was removed in pandas 2.0.
    recipe_df = pd.DataFrame(records, columns=columns)
    # Keep the placeholder column as object dtype so a list fits in a cell.
    recipe_df['ingredients'] = recipe_df['ingredients'].astype(object)

    return recipe_df

In [7]:
def extract_recipe_ingredients(recipe_df):
    """Fill the ``ingredients`` column of ``recipe_df`` in place.

    For every row the recipe URL is passed to
    ``sf.get_ingredients_from_url`` and the result is stored in that row's
    ``ingredients`` cell. Rows whose lookup fails get ``None`` so that one
    broken recipe page does not abort the whole run.

    Args:
        recipe_df: DataFrame with a ``url`` column and an ``ingredients``
            column able to hold arbitrary objects. It is modified in place.

    Returns:
        None.
    """
    # Iterate over (index label, url) pairs so that every row is written,
    # including rows that share a URL with an earlier row. Snapshot the
    # pairs first because the frame is mutated inside the loop.
    for row_label, recipe_url in list(recipe_df['url'].items()):
        try:
            recipe_ingredients = sf.get_ingredients_from_url(recipe_url)
        except Exception:
            # Best effort: a failed page yields no ingredients. Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit through.
            recipe_ingredients = None

        recipe_df.at[row_label, 'ingredients'] = recipe_ingredients

In [8]:
def analyze_df(recipe_df):
    """Count how often each ingredient occurs across all recipes.

    Args:
        recipe_df: DataFrame with an ``ingredients`` column whose cells are
            lists of ingredient names, or ``None`` for recipes without
            ingredient data (those rows are skipped).

    Returns:
        pandas.Series: Ingredient names as index and their occurrence
        counts as values, as produced by ``Series.value_counts``. Empty
        when no recipe has ingredients.
    """
    per_recipe = [
        pd.Series(ingredients_list, dtype=object)
        for ingredients_list in recipe_df['ingredients']
        if ingredients_list is not None
    ]

    # Concatenate once: Series.append was quadratic in a loop and was
    # removed in pandas 2.0. pd.concat rejects an empty list, hence the guard.
    if per_recipe:
        main_ingredients_series = pd.concat(per_recipe, ignore_index=True)
    else:
        main_ingredients_series = pd.Series(dtype=object)

    # todo make percentage and make another tab to put this in
    freq_table = main_ingredients_series.value_counts()

    return freq_table

In [9]:
# Entry point: runs only when this code is executed directly (as the top-level
# script or notebook), not when it is imported from elsewhere.
if __name__ == '__main__':
    # Display configuration is delegated to the general_functions helper.
    # NOTE(review): presumably this widens pandas' console output so the
    # printed tables are not wrapped - confirm in general_functions.
    gf.display_settings(display_width=2000)
    # Run the full scrape-and-analyse pipeline; results are printed.
    main()

salt                                                           18
vegetable oil                                                  11
all-purpose flour                                               9
water                                                           7
kosher salt                                                     6
cumin seeds                                                     5
ground cumin                                                    5
lemon juice                                                     5
garam masala                                                    5
lemon                                                           4
minced fresh ginger                                             4
curry powder                                                    4
potatoes                                                        4
peas                                                            4
plain yogurt                                                    4
chopped fr