In [74]:
import time

import pandas as pd
import requests

# Essential fields for everyday use: descriptive metadata first, then the
# scores, then the per-100 g nutrition values nested under "nutriments".
essential_fields = [
    # Identity and descriptive metadata
    "_id",
    "product_name",
    "brands",
    "categories",
    "countries",
    "ingredients_text",
    "labels",
    # Nutri-Score and NOVA classification
    "nutriscore_grade",
    "nutriscore_score",
    "nova_group",
    # Nutrition values, requested as "nutriments.<nutrient>_100g"
    *(
        f"nutriments.{nutrient}_100g"
        for nutrient in (
            "energy", "fat", "saturated-fat", "carbohydrates",
            "sugars", "fiber", "proteins", "salt", "sodium",
        )
    ),
]

# Search endpoint queried by the fetch cells
api_url = "https://world.openfoodfacts.org/cgi/search.pl"


In [80]:
# Query parameters for a single page of search results
params = {
    "action": "process",
    "json": 1,
    "fields": ",".join(essential_fields),  # Fetch only essential fields
    "page_size": 100,  # Number of items per request
    "page": 1          # Start with the first page
}

# Start from an empty list so that a failed request (e.g. HTTP 429 rate
# limiting) leaves `products` defined and empty, instead of undefined or
# silently pointing at data left over from an earlier kernel run.
products = []

# Fetch data from the API; the timeout keeps a stalled connection from
# blocking the kernel indefinitely (requests has no timeout by default).
response = requests.get(api_url, params=params, timeout=30)

# Check response and extract products
if response.status_code == 200:
    data = response.json()
    products = data.get("products", [])
    print(f"Fetched {len(products)} products.")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")


Failed to fetch data. Status code: 429


In [42]:
import random

# Print every field of a random subset of the fetched products
def inspect_sampled_products_full(products, sample_size=20):
    """Print all key/value pairs for a random sample of ``products``.

    Args:
        products: Sequence of product dicts.
        sample_size: Maximum number of products to show; capped at
            ``len(products)``.

    Returns:
        None. Everything is written to stdout.
    """
    if not products:
        print("No products available to inspect.")
        return

    # Never ask random.sample for more items than exist
    count = min(sample_size, len(products))
    chosen = random.sample(products, count)
    separator = "-" * 80

    print(f"Displaying all data for {count} randomly sampled products:\n")
    for position, entry in enumerate(chosen, start=1):
        title = entry.get("product_name", "Unknown Product")
        print(f"Product {position}: {title}")
        print("All Data:")
        for field_name, field_value in entry.items():
            print(f"  {field_name}: {field_value}")
        print(separator)

# Dump every field for up to 20 randomly chosen entries of `products`
inspect_sampled_products_full(products, sample_size=20)


Displaying all data for 20 randomly sampled products:

Product 1: Biscuit soja orange
All Data:
  _id: 3175680011442
  brands: Gerblé
  categories: Snacks,Petit-déjeuners,Snacks sucrés,Biscuits et gâteaux,Biscuits sucrés & biscuits apéritifs,Biscuits,Biscuits croustillants,Biscuit sec croquant allégé en matière grasse,Biscuits secs,Biscuits sablés,Biscuit sec pour petit déjeuner,Biscuit sec aux fruits
  countries: France,Guadeloupe,Martinique,La Réunion
  ingredients_text: Farine de _blé_ 57,7%, sucre de canne roux, huile de colza, flocons de _soja_ 7%, germe de _blé_ 5,4%, oranges déshydratées 1,5%, magnésium, sucre, amidon de maïs, jus concentré de citron émulsifiant : lécithines de colza, sel de mer, arôme naturel d'orange, calcium, poudres à lever : carbonates de sodium amidon, vitamines (E, PP, B5, B6, B1, B9).
  labels: Fabriqué en France,Nutriscore,Nutriscore B,Farine de blé française,Sans huile de palme,Moins 40% de sucre
  nova_group: 4
  nutriments: {'carbohydrates_100g': 66,

In [48]:
# Print a fixed summary (metadata, scores, nutrition) for a random subset of products
def inspect_sampled_products_focused(products, sample_size=20):
    """Print a focused summary for a random sample of ``products``.

    For each sampled product the name, a fixed set of descriptive fields and
    the per-100 g nutrition values found under its ``"nutriments"`` dict are
    printed. Missing descriptive fields show as "Not Specified"; missing
    nutrition values show as "Not Available".

    Args:
        products: Sequence of product dicts.
        sample_size: Maximum number of products to show; capped at
            ``len(products)``.

    Returns:
        None. Everything is written to stdout.
    """
    if not products:
        print("No products available to inspect.")
        return

    # Never ask random.sample for more items than exist
    count = min(sample_size, len(products))
    chosen = random.sample(products, count)

    # (printed label, product key) pairs in display order
    summary_fields = (
        ("Brands", "brands"),
        ("Categories", "categories"),
        ("Countries", "countries"),
        ("Ingredients", "ingredients_text"),
        ("Labels", "labels"),
        ("NutriScore Grade", "nutriscore_grade"),
        ("Nova Group", "nova_group"),
    )
    nutrient_keys = (
        "energy_100g", "fat_100g", "saturated-fat_100g",
        "carbohydrates_100g", "sugars_100g", "fiber_100g",
        "proteins_100g", "salt_100g", "sodium_100g",
    )
    separator = "-" * 80

    print(f"Displaying detailed data for {count} randomly sampled products:\n")
    for position, entry in enumerate(chosen, start=1):
        print(f"Product {position}: {entry.get('product_name', 'Unknown Product')}")
        for label, key in summary_fields:
            print(f"  {label}: {entry.get(key, 'Not Specified')}")

        nutrition = entry.get("nutriments", {})
        print("  Nutrition Data:")
        for key in nutrient_keys:
            # e.g. "saturated-fat_100g" -> "Saturated-Fat 100G"
            heading = key.replace("_", " ").title()
            print(f"    {heading}: {nutrition.get(key, 'Not Available')}")
        print(separator)

# Print the focused summary (metadata, scores, per-100g nutrition) for up to 20 random entries of `products`
inspect_sampled_products_focused(products, sample_size=20)


Displaying detailed data for 20 randomly sampled products:

Product 1: Eau de source
  Brands: Cristaline
  Categories: Boissons, Eaux, Eaux de sources, Boissons sans sucre ajouté
  Countries: Belgique,Côte d'Ivoire,France,Luxembourg,Mali,Martinique,Russie,Suisse,Royaume-Uni
  Ingredients: Eau de source
  Labels: Triman, Sans Nitrates
  NutriScore Grade: a
  Nova Group: 1
  Nutrition Data:
    Energy 100G: 0
    Fat 100G: 0
    Saturated-Fat 100G: 0
    Carbohydrates 100G: 0
    Sugars 100G: Not Available
    Fiber 100G: Not Available
    Proteins 100G: Not Available
    Salt 100G: 0.000475
    Sodium 100G: 0.00019
--------------------------------------------------------------------------------
Product 2: hepar
  Brands: Hépar,Nestlé Waters,Nestlé
  Categories: Boissons,Eaux,Eaux de sources,Eaux minérales,Eaux minérales naturelles,Eau minérale naturelle non gazeuse
  Countries: Belgique,France
  Ingredients: Pour 100ml : Calcium = 549mg / Magnésium = 119mg / Sodium = 14.2mg / Sulfate =

In [48]:
# Extended field list, specified by hand from the Open Food Facts data-fields.txt
fields = [
    # General product information
    "_id",
    "product_name",
    "brands",
    "categories",
    "countries",
    "countries_tags",
    "ingredients_text",
    "labels",
    "quantity",
    "packaging",
    "additives_tags",
    "allergens",
    "allergens_tags",
    "traces",
    "traces_tags",
    "ingredients_analysis_tags",

    # Nutri-Score and NOVA
    "nutriscore_grade",
    "nutriscore_score",
    "nova_group",

    # Nutritional information per 100 g, requested as "nutriments.<nutrient>_100g"
    *(
        f"nutriments.{nutrient}_100g"
        for nutrient in (
            "energy", "energy-kj", "energy-kcal",
            "fat", "saturated-fat", "trans-fat",
            "cholesterol", "carbohydrates", "sugars",
            "fiber", "proteins", "salt",
            "sodium", "vitamin-a", "vitamin-c",
            "calcium", "iron", "potassium",
            "magnesium", "phosphorus", "zinc",
            "copper", "manganese", "selenium",
            "vitamin-d", "vitamin-e", "vitamin-k",
            "thiamin", "riboflavin", "niacin",
            "vitamin-b6", "folate", "vitamin-b12",
            "biotin", "pantothenic-acid", "water",
        )
    ),

    # Images and links
    "image_url",
    "image_nutrition_url",
    "image_ingredients_url",
    "url",

    # Miscellaneous
    "environment_impact_level_tags",
    "ecoscore_grade",
    "ecoscore_score",
]



In [51]:
# Function to fetch multiple pages of data
def fetch_all_data(api_url, fields, max_pages=10, page_size=100,
                   timeout=30, max_retries=3, retry_wait=5.0):
    """Fetch up to ``max_pages`` pages of products and return them as one list.

    Pages are requested sequentially starting at page 1. Fetching stops early
    when a page comes back empty, when a request fails, or when the server
    answers with a non-200 status. Whatever was collected up to that point is
    still returned, so a late failure does not throw away earlier pages.

    Args:
        api_url: Search endpoint to query.
        fields: Iterable of field names, sent comma-joined as the ``fields``
            query parameter.
        max_pages: Maximum number of pages to request.
        page_size: Value sent as the ``page_size`` query parameter.
        timeout: Seconds to wait for each HTTP request before giving up.
        max_retries: How many times to retry a page that answers HTTP 429
            (rate limited) before treating it as a failure.
        retry_wait: Base delay in seconds before the first retry; doubled on
            each subsequent retry.

    Returns:
        list: The product dicts from every successfully fetched page.
    """
    all_products = []
    # Loop-invariant: build the comma-separated field list once
    fields_param = ",".join(fields)
    retries_allowed = max(0, max_retries)

    for page in range(1, max_pages + 1):  # Fetch up to max_pages
        print(f"Fetching page {page}...")
        params = {
            "action": "process",
            "json": 1,
            "fields": fields_param,
            "page_size": page_size,
            "page": page
        }

        response = None
        for attempt in range(retries_allowed + 1):
            try:
                response = requests.get(api_url, params=params, timeout=timeout)
            except requests.RequestException as exc:
                # Connection error / timeout: keep what we have instead of raising
                print(f"Request for page {page} failed: {exc}")
                response = None
                break
            if response.status_code != 429 or attempt == retries_allowed:
                break
            # Rate limited: back off exponentially, then try the same page again
            wait_seconds = retry_wait * (2 ** attempt)
            print(f"Rate limited on page {page}; retrying in {wait_seconds:.0f}s...")
            time.sleep(wait_seconds)

        if response is None:
            break
        if response.status_code != 200:
            print(f"Failed to fetch page {page}. Status code: {response.status_code}")
            break

        try:
            result = response.json()
        except ValueError as exc:  # body was not valid JSON
            print(f"Failed to decode page {page}: {exc}")
            break

        products = result.get("products", [])
        if not products:  # Stop if no more products are returned
            print("No more products found.")
            break
        all_products.extend(products)

    print(f"Total products fetched: {len(all_products)}")
    return all_products

# API URL (same search endpoint as assigned earlier in the notebook)
api_url = "https://world.openfoodfacts.org/cgi/search.pl"

# Fetch up to 10 pages using the extended `fields` list.
# NOTE(review): page_size=500 is requested, but the recorded run reports
# 1000 products for 10 pages, i.e. 100 per page -- the server appears to cap
# the page size; confirm against the API documentation.
all_products = fetch_all_data(api_url, fields, max_pages=10, page_size=500)


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Total products fetched: 1000


In [53]:
# Turn the list of fetched product dicts into a flat Pandas DataFrame
def products_to_dataframe(products):
    """Build a DataFrame from a list of product dicts.

    Nested dicts (e.g. ``nutriments``) are flattened by ``pd.json_normalize``
    into dotted column names such as ``nutriments.energy_100g``.

    Args:
        products: List of product dicts.

    Returns:
        pandas.DataFrame: One row per product, or an empty DataFrame when
        ``products`` is empty.
    """
    if products:
        frame = pd.json_normalize(products)
        row_count, column_count = frame.shape
        print(f"DataFrame created with {row_count} rows and {column_count} columns.")
        return frame

    print("No products available to create a DataFrame.")
    return pd.DataFrame()

# Flatten the fetched product dicts into a DataFrame (nested keys become dotted column names)
df = products_to_dataframe(all_products)


DataFrame created with 1000 rows and 55 columns.


In [54]:
# Show the head of a DataFrame followed by the names of all its columns
def display_dataframe_with_columns(df, rows=20):
    """Display the first ``rows`` rows of ``df`` and list every column name.

    Args:
        df: DataFrame to show.
        rows: Number of leading rows passed to ``df.head``.

    Returns:
        None. The head is rendered with ``display``; the messages and the
        column list are printed.
    """
    if df.empty:
        print("DataFrame is empty. Nothing to display.")
        return

    print(f"Displaying the first {rows} rows of the DataFrame:")
    display(df.head(rows))

    print("\nIncluded Columns:")
    # A non-empty frame always has at least one column, so the join is never blank
    print("\n".join(f"  - {name}" for name in df.columns))

# Show the first 20 rows of `df`, followed by the full list of its column names
display_dataframe_with_columns(df, rows=20)


Displaying the first 20 rows of the DataFrame:


Unnamed: 0,_id,additives_tags,allergens,allergens_tags,brands,categories,countries,countries_tags,ecoscore_grade,environment_impact_level_tags,...,nutriments.vitamin-c_100g,nutriments.vitamin-d_100g,nutriments.phosphorus_100g,nutriments.pantothenic-acid_100g,nutriments.trans-fat_100g,nutriments.cholesterol_100g,nutriments.vitamin-a_100g,nutriments.manganese_100g,nutriments.selenium_100g,nutriments.vitamin-k_100g
0,3274080005003,[],,[],Cristaline,"Boissons et préparations de boissons,Boissons,...","Belgique,Côte d'Ivoire,France,Allemagne,Guadel...","[en:belgium, en:cote-d-ivoire, en:france, en:g...",not-applicable,[],...,,,,,,,,,,
1,7622210449283,"[en:e322, en:e322i, en:e450, en:e450i, en:e500...","en:eggs,en:gluten,en:milk,en:soybeans","[en:eggs, en:gluten, en:milk, en:soybeans]","Lu,Mondelez","Snacks,Sweet snacks,Cocoa and its products","Algeria,Belgium,France,French Polynesia,German...","[en:algeria, en:belgium, en:france, en:french-...",unknown,[],...,,,,,,,,,,
2,3017620425035,[en:e322],"en:milk,en:nuts,en:soybeans","[en:milk, en:nuts, en:soybeans]",Ferrero,"Breakfasts,Spreads,Sweet spreads,fr:Pâtes à ta...","Algeria,Belgium,France,Germany,Guadeloupe,Ital...","[en:algeria, en:belgium, en:france, en:germany...",d,[],...,,,,,,,,,,
3,3175680011480,"[en:e322, en:e336, en:e500, en:e503]",,"[en:gluten, en:sesame-seeds]",Gerblé,"Snacks,Snacks sucrés,Biscuits et gâteaux,Biscu...",France,[en:france],c,[],...,,,,,,,,,,
4,5449000214911,"[en:e150d, en:e338]",,[],Coca-Cola,"Boissons et préparations de boissons,Boissons,...","Belgique,France,Allemagne,Hongrie,Italie,Serbi...","[en:belgium, en:france, en:germany, en:hungary...",not-applicable,[],...,,,,,,,,,,
5,3017620422003,"[en:e322, en:e322i]","en:milk,en:nuts,en:soybeans","[en:milk, en:nuts, en:soybeans]","Nutella,Ferrero","Breakfasts,Spreads,Sweet spreads,fr:Pâtes à ta...","Belgium,France,Germany,Italy,Luxembourg,Morocc...","[en:belgium, en:france, en:germany, en:italy, ...",d,[],...,,,,,,,,,,
6,50184453,[],"en:celery,en:gluten","[en:celery, en:gluten]","Marmite,Unilever",Yeast extract spreads,"France,Irlande,Royaume-Uni","[en:france, en:ireland, en:united-kingdom]",unknown,[],...,,,,,,,,,,
7,5449000000996,"[en:e150d, en:e290, en:e338]",,[],Coca-Cola,"Getränke und Getränkezubereitungen,Getränke,Ko...","Frankreich,Deutschland,Litauen,Vereinigtes Kön...","[en:france, en:germany, en:lithuania, en:unite...",not-applicable,[],...,,,,,,,,,,
8,3268840001008,[],,[],Cristaline,"Boissons, Eaux, Eaux de sources, Boissons sans...","Belgique,Côte d'Ivoire,France,Luxembourg,Mali,...","[en:belgium, en:cote-d-ivoire, en:france, en:l...",not-applicable,[],...,,,,,,,,,,
9,5449000214799,"[en:e150d, en:e331, en:e338, en:e950, en:e951]",fr:Aspartame,[fr:aspartame],Coca Cola,"Boissons,Boissons gazeuses,Boissons édulcorées...","Belgique,France,Allemagne,Hongrie,Italie,Espag...","[en:belgium, en:france, en:germany, en:hungary...",not-applicable,[],...,,,,,,,,,,



Included Columns:
  - _id
  - additives_tags
  - allergens
  - allergens_tags
  - brands
  - categories
  - countries
  - countries_tags
  - ecoscore_grade
  - environment_impact_level_tags
  - image_url
  - ingredients_analysis_tags
  - ingredients_text
  - labels
  - nova_group
  - nutriscore_grade
  - nutriscore_score
  - packaging
  - product_name
  - quantity
  - traces
  - traces_tags
  - url
  - nutriments.carbohydrates_100g
  - nutriments.energy-kcal_100g
  - nutriments.energy-kj_100g
  - nutriments.energy_100g
  - nutriments.fat_100g
  - nutriments.fiber_100g
  - nutriments.proteins_100g
  - nutriments.salt_100g
  - nutriments.saturated-fat_100g
  - nutriments.sodium_100g
  - nutriments.sugars_100g
  - ecoscore_score
  - nutriments.magnesium_100g
  - nutriments.vitamin-b6_100g
  - nutriments.vitamin-e_100g
  - nutriments.vitamin-b12_100g
  - nutriments.calcium_100g
  - nutriments.potassium_100g
  - nutriments.copper_100g
  - nutriments.iron_100g
  - nutriments.zinc_100g
  - n

In [59]:
# Function to sanitize and export DataFrame to Excel
def sanitize_and_export_to_excel(df, filename="openfoodfacts_cleaned.xlsx"):
    """Strip non-printable characters from string cells and write ``df`` to Excel.

    Only ``str`` cells are touched; every character for which
    ``str.isprintable()`` is false is removed. Other values are passed through
    unchanged. The input DataFrame itself is not modified.

    Export is best-effort: any exception raised while cleaning or writing is
    caught and reported on stdout rather than propagated.

    Args:
        df: DataFrame to export.
        filename: Destination path of the ``.xlsx`` file.

    Returns:
        None.
    """
    if df.empty:
        print("DataFrame is empty. Nothing to export.")
        return

    def _strip_unprintable(value):
        # Leave non-string cells (numbers, lists, NaN, ...) untouched
        if isinstance(value, str):
            return "".join(ch for ch in value if ch.isprintable())
        return value

    try:
        # DataFrame.applymap is deprecated since pandas 2.1 in favour of
        # DataFrame.map; fall back to applymap only on older versions.
        # Checking the class (not the instance) avoids a column named "map"
        # being mistaken for the method.
        if hasattr(type(df), "map"):
            df_cleaned = df.map(_strip_unprintable)
        else:
            df_cleaned = df.applymap(_strip_unprintable)

        # Export cleaned DataFrame to Excel
        df_cleaned.to_excel(filename, index=False, engine="openpyxl")
        print(f"Data successfully exported to '{filename}'.")
    except Exception as e:
        print(f"Failed to export data: {e}")

# Strip non-printable characters from string cells of `df` and write the result to an .xlsx file
sanitize_and_export_to_excel(df, filename="openfoodfacts_cleaned.xlsx")


  df_cleaned = df.applymap(


Data successfully exported to 'openfoodfacts_cleaned.xlsx'.
