# Scraping A Large Dataset of Foods

In [68]:
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests

def get_food_links(url, list_index=10, base_url="https://us.openfoodfacts.org", timeout=30):
    '''Scrape one listing page and return the food names with links to the food pages

    Args:
        url (str): url of the listing page to scrape
        list_index (int): position, among all <ul> elements of the page, of the
            list that holds the foods
        base_url (str): prefix used to resolve the hrefs found in the list
        timeout (float): seconds to wait for the server before giving up

    Returns:
        foods (dataframe): dataframe indexed by "name" with a single "url" column;
            when a name occurs more than once on the page the last link wins

    Raises:
        requests.HTTPError: if the server answers with an error status
        IndexError: if the page has no <ul> at position list_index
    '''

    # Get the data; without a timeout a stalled connection would hang the notebook
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Get unordered list of foods
    lists = soup.find_all('ul')
    try:
        food_list = lists[list_index]
    except IndexError:
        raise IndexError(
            f"expected a <ul> at position {list_index} on {url}, "
            f"but the page only has {len(lists)}"
        ) from None

    # Map title -> absolute link. A dict keeps first-seen order and lets a
    # repeated title overwrite the earlier link.
    links = {}
    for anchor in food_list.find_all('a'):
        title = anchor.get("title")
        href = anchor.get("href")
        # Anchors without a title or a target cannot become a named row
        if not title or not href:
            continue
        # urljoin copes with both relative and already-absolute hrefs
        links[title] = urljoin(base_url, href)

    # Build the frame once instead of enlarging it row by row
    names = pd.Index(list(links), dtype=object, name="name")
    foods = pd.DataFrame({"url": list(links.values())}, index=names, dtype=object)

    return foods

In [69]:
# Number of listing pages to walk through (pages are numbered from 1).
N_PAGES = 30

# Collect the per-page frames first and concatenate once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame on every pass.
page_frames = []
for page in range(1, N_PAGES + 1):
    url = "https://us.openfoodfacts.org/" + str(page)
    page_frames.append(get_food_links(url))

# All page frames share the "name" index and the "url" column.
df = pd.concat(page_frames)


In [70]:
# Display the collected frame: food names as the index, product-page links in "url".
df

Unnamed: 0_level_0,url
name,Unnamed: 1_level_1
Nutella - Ferrero - 400 g,https://us.openfoodfacts.org/product/301762042...
Prince - Lu - 300 g,https://us.openfoodfacts.org/product/762221044...
Nutella - Ferrero - 1 kg,https://us.openfoodfacts.org/product/301762042...
Coca Cola Zero - 330 ml,https://us.openfoodfacts.org/product/544900013...
"Muesli Raisin, Figue, Abricot - Bjorg - 375 g",https://us.openfoodfacts.org/product/322982012...
...,...
Rocky Mountain Marshmallow Classic (150G) - 150 g e,https://us.openfoodfacts.org/product/005430009...
Steak Sauce - A.1. - 5 OZ,https://us.openfoodfacts.org/product/005440000...
Original Sauce - A.1. Original - 10 OZ (283g),https://us.openfoodfacts.org/product/005440000...
Dijon mustard - Grey Poupon - 454 g,https://us.openfoodfacts.org/product/005440060...
