# Initial setup

In [1]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [115]:
# Start a standalone Chrome Selenium container in the background (-d).
# Port mapping: host 4455 -> container 4444 (the endpoint the Remote WebDriver cells below connect to),
# host 7901 -> container 7900 (presumably the image's browser viewer port — confirm against the image docs).
# NOTE(review): the image tag is `latest`, so the browser version is not pinned between runs.
# NOTE(review): re-running this cell while a previous container is still up will likely fail on the
# already-bound host ports — stop the old container first.
!docker run -d -p 4455:4444 -p 7901:7900 --shm-size="2g" selenium/standalone-chrome:latest

b6d6030cec7e914b10a4fd3711ba4a685266755c44f9c3a7c13abd65e9444f7c


# Scraping the product information of the top 50 most scanned products on the _Open Food Facts_ website

This notebook focuses on scraping detailed product information from the Open Food Facts website. The extracted data includes: generic_name, quantity, packaging, brands, categories, labels, origin, manufacturing_place, stores, countries, value, and barcode.

The process begins by collecting the URLs (href attributes) of the top 50 most popular (i.e., most scanned) products using the Selenium library. These URLs are then used to access individual product pages, from which the specified details are extracted using requests and BeautifulSoup.

#### 1.1. Extract the LINKS of the top 50 most scanned products with Selenium


In [None]:
# Configure Chrome to run without a visible window
chrome_options = Options()
chrome_options.add_argument("--headless")

# Listing page sorted by popularity (most scanned products first)
URL_BY_POPULARITY = 'https://world.openfoodfacts.org/?sort_by=popularity'

# Open a remote session on the Selenium server published on host port 4455
driver = webdriver.Remote(
    command_executor="http://localhost:4455/wd/hub",
    options=chrome_options
)

product_links = []
try:
    # Navigate inside the try block so the remote session is always released,
    # even when the page fails to load.
    driver.get(URL_BY_POPULARITY)

    # Wait (up to 10 s) until at least one product anchor is present in the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "list_product_a"))
    )
    anchors = driver.find_elements(By.CLASS_NAME, "list_product_a")

    # Keep only the anchors that actually carry an href
    hrefs = (anchor.get_attribute('href') for anchor in anchors)
    product_links = [href for href in hrefs if href]
finally:
    driver.quit()

print(product_links)


#### 1.2. Extract the NAMES of top 50 most scanned products by Selenium

In [None]:
# Configure Chrome to run without a visible window
chrome_options = Options()
chrome_options.add_argument("--headless")

# Open a remote session on the Selenium server published on host port 4455
# (no local chromedriver path is involved: the browser lives behind the remote endpoint)
driver = webdriver.Remote(
    command_executor="http://localhost:4455/wd/hub",
    options=chrome_options
)

POPULAR_PRODUCTS = []
try:
    # Navigate inside the try block so the remote session is always released,
    # even when the page fails to load.
    driver.get('https://world.openfoodfacts.org/?sort_by=popularity')

    # Wait (up to 10 s) until at least one product name is present in the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "list_product_name"))
    )

    # Collect the visible text of every product-name element
    products = driver.find_elements(By.CLASS_NAME, "list_product_name")
    POPULAR_PRODUCTS = [product.text for product in products]
finally:
    driver.quit()

POPULAR_PRODUCTS

#### 1.3. Scrape product information for the popular products by Requests and BeautifulSoup

In [150]:
# ids of the <p> elements looked up on every product page
info_fields = [
    'field_generic_name', 'field_quantity', 'field_packaging', 'field_brands',
    'field_categories', 'field_labels', 'field_origin', 'field_manufacturing_places',
    'field_stores', 'field_countries', 'field_value', 'barcode_paragraph'
]

# Flat list of "field:value" strings: one entry per field per product, in page order.
# Fields missing from a page are recorded with the text form of numpy's NaN so that
# every product contributes exactly len(info_fields) entries.
INFO_POPULAR_PRODUCTS = []
for link in product_links:
    # A timeout keeps one unresponsive page from hanging the whole loop
    r = requests.get(link, timeout=30)
    # Name the parser explicitly so the result does not depend on which parsers are installed
    soup = BeautifulSoup(r.content, "html.parser")
    for info in info_fields:
        section = soup.find('p', id=info)
        if section:
            raw_text = section.get_text(strip=True)
            # Drop the leading "Label:" part only; anything after the first colon belongs
            # to the value (it may itself contain colons). Text without any colon is kept whole.
            _label, separator, value = raw_text.partition(':')
            text = value if separator else raw_text
            INFO_POPULAR_PRODUCTS.append(f"{info}:{text}")
        else:
            INFO_POPULAR_PRODUCTS.append(f"{info}:{np.nan}")

In [155]:
# Empty frame whose columns mirror the scraped field ids; the column index is shown as a check
info_popular_products_df = pd.DataFrame(
    columns=info_fields,
)
info_popular_products_df.columns

Index(['field_generic_name', 'field_quantity', 'field_packaging',
       'field_brands', 'field_categories', 'field_labels', 'field_origin',
       'field_manufacturing_places', 'field_stores', 'field_countries',
       'field_value', 'barcode_paragraph'],
      dtype='object')

In [160]:
# Rebuild one record per product from the flat "field:value" list.
# Every product contributed one entry per info field, in order, so the list is consumed
# in consecutive chunks of that size (derived from info_fields instead of a literal 12).
# NOTE(review): run this before section 2.1, which rebinds `info_fields` to a shorter list.
fields_per_product = len(info_fields)

# Text the scraping loop wrote for fields that were absent from a page
missing_marker = str(np.nan)

rows = []
for start in range(0, len(INFO_POPULAR_PRODUCTS), fields_per_product):
    product_block = INFO_POPULAR_PRODUCTS[start:start + fields_per_product]
    row = {}
    for entry in product_block:
        # Split on the first colon only: the value itself may contain colons
        key, separator, value = entry.partition(':')
        if not separator:
            continue
        key = key.strip()
        value = value.strip()
        if key in info_fields:
            # Turn the textual placeholder back into a real NaN so pandas treats it as missing
            row[key] = np.nan if value == missing_marker else value
    rows.append(row)

# Passing the columns keeps their order stable and yields a well-formed frame even with no rows
info_popular_products_df = pd.DataFrame(rows, columns=info_fields)
      

In [162]:
# Map the scraped element ids to shorter, analysis-friendly column names
NewColumnNames = dict(
    field_generic_name='generic_name',
    field_quantity='quantity',
    field_packaging='packaging',
    field_brands='brands',
    field_categories='category',
    field_labels='label',
    field_origin='origin',
    field_manufacturing_places='manufacturing_place',
    field_stores='store',
    field_countries='country',
    field_value='value',
    barcode_paragraph='barcode',
)

# Apply the mapping to the existing frame, then display it
info_popular_products_df.rename(columns=NewColumnNames, inplace=True)
info_popular_products_df

Unnamed: 0,generic_name,quantity,packaging,brands,category,label,origin,manufacturing_place,store,country,value,barcode
0,,33 cl,"Plastic,Bottle",Sidi Ali,"Beverages and beverages preparations,Beverages...",,,,يوسف,Morocco,,6111035000430(EAN / EAN-13)
1,Perly fromage frais,85 g,Plastic,Jaouda,"Dairies,Fermented foods,Fermented milk product...",,,Maroc,,"Morocco,United States",,6111242100992(EAN / EAN-13)
2,,2 L,,sidi ali,"Beverages and beverages preparations,Beverages...",Green Dot,,,,Morocco,,6111035002175(EAN / EAN-13)
3,,"1,5 L","Plastic,Bottle or vial,Bottle","Les Eaux Minérales d'oulmès,Sidi Ali","Beverages and beverages preparations,Beverages...","ISO 22000,ISO 14001,ISO 45001,ISO 9001",,,,Morocco,,6111035000058(EAN / EAN-13)
4,,33cl,"Plastic,Bottle","PepsiCo,pepsi","Beverages and beverages preparations,Beverages...",,,Casablanca Morocco,"Marjane,Carrefour",Morocco,,6111252421568(EAN / EAN-13)
5,,450 ml,,Jaouda,"Dairies,Meals,Milks (liquid and powder),Milks,...",,,,,Morocco,,6111266962187(EAN / EAN-13)
6,Spring water,1500 ml,"Aluminium-can,HdpeFilm-packet,PpFilm-wrapper,L...",Cristaline,"Beverages and beverages preparations,Beverages...",it,France,"Saint-Martin de Gurson,France,24610","Carrefour,Leclerc,Auchan,Intermarché,Super U,E...","Belgium,Côte d'Ivoire,France,Germany,Guadeloup...",,3274080005003(EAN / EAN-13)
7,,1kg,Plastic,MILKY FOOD PROFESSIONAL,"Dairies,Fermented foods,Fermented milk product...",,Maroc,Maroc,,Morocco,,6111246721261(EAN / EAN-13)
8,,1L,"Multilayer-composite,Tetra Pak",Jaouda,"Dairies,Milks (liquid and powder),Milks,Homoge...",No gluten,,,,"Democratic Republic of the Congo,Mauritania,Mo...",,6111242101180(EAN / EAN-13)
9,,160g,Plastic,Jaouda,"Dairies,Fermented foods,Fermented milk product...",No gluten,,,,"Morocco,Maroc",,6111242106949(EAN / EAN-13)


# Scraping the product information and nutritional values of a specific product by barcode on the _Open Food Facts_ website

In [None]:
# URL slug of the product (marked optional by the author; presumably the barcode alone
# identifies the product — confirm against the site)
PRODUCT_NAME = 'evian-natural-mineral-water'
# Barcode that identifies the product in the URL
PRODUCT_BARCODE = '3068320014067'
URL = f'https://world.openfoodfacts.org/product/{PRODUCT_BARCODE}/{PRODUCT_NAME}'

# Fail fast: a timeout avoids hanging forever, and an HTTP error status is raised here
# instead of surfacing later as missing fields or a missing table.
r = requests.get(URL, timeout=30)
r.raise_for_status()

# Name the parser explicitly so the result does not depend on which parsers are installed
soup = BeautifulSoup(r.content, "html.parser")

# Show only the start of the pretty-printed page; dumping the whole document floods the output
print(soup.prettify()[:1000])

#### 2.1. Scrape the product information by Requests and BeautifulSoup

In [165]:
# ids of the <p> elements to print for this single product.
# NOTE: this rebinds `info_fields`, which section 1.3 defined with two extra entries.
info_fields = [
    'field_generic_name', 'field_quantity', 'field_packaging', 'field_brands',
    'field_categories', 'field_labels', 'field_origin', 'field_manufacturing_places',
    'field_stores', 'field_countries'
]

for info in info_fields:
    section = soup.find('p', id=info)
    if section:
        raw_text = section.get_text(strip=True)
        # Drop the leading "Label:" part only; anything after the first colon belongs
        # to the value (it may itself contain colons). Text without any colon is kept whole.
        _label, separator, value = raw_text.partition(':')
        text = value if separator else raw_text
        print(f"{info}: {text}")
    else:
        print(f"{info}: section not found.")

field_generic_name: Evian Water
field_quantity: 75 cl
field_packaging: Lid or cap,Bottle cap,Label,Pet-bottle
field_brands: Evian
field_categories: Beverages and beverages preparations,Beverages,Waters,Spring waters,Mineral waters,Unsweetened beverages,Natural mineral waters
field_labels: Vegetarian,Vegan,Carbon compensated product,Carbon Trust,Carbon Trust Carbon Neutral,Certified B Corporation,EAC,Green Dot
field_origin: Cachat Spring (France)
field_manufacturing_places: France
field_stores: Coop,Migros,Kiosk,Woolworths,Coles,Tesco,ASDA,Waitrose,ICA,Lucky Supermarket (Cambodia)
field_countries: Australia,Cambodia,France,Morocco,Spain,Sweden,Switzerland,United Kingdom


#### 2.2. Scrape the nutritional values by BeautifulSoup

In [166]:
# Take the first <table> on the page and read its column titles from the <th> cells
table = soup.find('table')
headers = [
    th_cell.get_text(strip=True)
    for th_cell in table.find_all('th')
]

In [167]:
# Flatten the text of every <td> cell, row by row, into a single list.
# A comprehension keeps the loop variables local, so the `rows` list built for the
# popular-products DataFrame is no longer overwritten by a table-row element.
rows_all = [
    cell.get_text(strip=True)
    for table_row in table.find_all('tr')
    for cell in table_row.find_all('td')
]

In [168]:
# Reshape the flat cell list into one row per table row.
# The column count comes from the scraped headers rather than a hard-coded 4, so the frame
# stays aligned if the table has a different number of columns.
df = pd.DataFrame(np.array(rows_all).reshape(-1, len(headers)), columns=headers)

df

Unnamed: 0,Nutrition facts,As soldfor 100 g / 100 ml,As soldper serving (250 ml),Compared to: Natural mineral waters
0,Energy,0 kj(0 kcal),0 kj(0 kcal),
1,Fat,0 g,0 g,
2,Saturated fat,0 g,0 g,
3,Carbohydrates,0 g,0 g,
4,Sugars,0 g,0 g,
5,Fiber,0 g,0 g,
6,Proteins,0 g,0 g,
7,Salt,0 g,0 g,-100%
8,Alcohol,0 % vol,0 % vol,
9,Silica,1.5 mg,3.75 mg,-53%
