In [1]:
from pathlib import Path

import pandas as pd
import requests

def fetch_products(query, query_type="search", page_size=100, page=1, sort_by="created_t", timeout=30):
    """Fetch a single page of products from the Open Food Facts API.

    Args:
        query: Brand name when ``query_type`` is ``"brand"``; otherwise the
            free-text search terms.
        query_type: ``"brand"`` queries the per-brand JSON endpoint; any other
            value queries the ``search.pl`` endpoint.
        page_size: Number of products requested per page.
        page: Page number to request.
        sort_by: Sort key forwarded to the API.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled connection
            would hang the whole paging loop.

    Returns:
        The list stored under ``"products"`` in the JSON response, or an empty
        list when that key is absent.

    Raises:
        Exception: If the HTTP status code is not 200.
        requests.exceptions.RequestException: On connection errors or when the
            timeout expires.
    """
    if query_type == "brand":
        # Brand-based search
        url = f"https://world.openfoodfacts.org/brand/{query}.json"
        params = {
            "page_size": page_size,
            "page": page,
            "sort_by": sort_by
        }
    else:
        # Search term-based search
        url = "https://world.openfoodfacts.org/cgi/search.pl"
        params = {
            "search_terms": query,
            "search_simple": 1,
            "action": "process",
            "json": 1,
            "page_size": page_size,
            "page": page,
            "sort_by": sort_by
        }

    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"API request failed: {response.status_code}")

    data = response.json()
    return data.get("products", [])

def _join_tags(product, key):
    """Return the tag list under ``key`` as a ", "-separated string ("" if missing, null or empty)."""
    return ", ".join(product.get(key) or [])


def extract_product_fields(product):
    """Flatten one product dictionary into a flat record for a DataFrame row.

    Args:
        product: A product dictionary as returned in the API's ``"products"`` list.

    Returns:
        A dict with 23 keys: identifiers, per-100g nutrition values, grades and
        label tags, and creation/modification timestamps. Missing text fields
        default to ``"N/A"``, missing tag lists to ``""`` and missing numeric or
        grade fields to ``None``.
    """
    # A key that is present but null must behave like a missing key; otherwise
    # ``.get`` on None would raise and abort the whole extraction.
    nutriments = product.get("nutriments") or {}
    # An absent, null or *empty* grade list all mean "no grade"; indexing an
    # empty list directly would raise IndexError.
    grade_tags = product.get("nutrition_grades_tags") or [None]

    return {
        # Identifiers
        "product_name": product.get("product_name", "N/A"),
        "brands": product.get("brands", "N/A"),
        "barcode": product.get("code", "N/A"),
        "categories": _join_tags(product, "categories_tags"),
        "countries": _join_tags(product, "countries_tags"),
        "image_url": product.get("image_url", "N/A"),
        "ingredients_text": product.get("ingredients_text", "N/A"),

        # Nutrition values
        "energy_kcal_100g": nutriments.get("energy-kcal_100g"),
        "fat_100g": nutriments.get("fat_100g"),
        "saturated_fat_100g": nutriments.get("saturated-fat_100g"),
        "carbohydrates_100g": nutriments.get("carbohydrates_100g"),
        "sugars_100g": nutriments.get("sugars_100g"),
        "fiber_100g": nutriments.get("fiber_100g"),
        "proteins_100g": nutriments.get("proteins_100g"),
        "salt_100g": nutriments.get("salt_100g"),

        # Nutrition grade and labels
        "nutrition_grade": grade_tags[0],
        "nova_group": product.get("nova_group"),
        "ecoscore_grade": product.get("ecoscore_grade"),
        "labels": _join_tags(product, "labels_tags"),
        "allergens": _join_tags(product, "allergens_tags"),
        "packaging": _join_tags(product, "packaging_tags"),

        # Timestamps
        "created_t": product.get("created_t"),
        "last_modified_t": product.get("last_modified_t"),
    }

def search_and_extract_all(query, query_type="search", page_size=100, sort_by="created_t", max_pages=None):
    """Page through the API results for ``query`` and collect them in a DataFrame.

    Pages are requested one at a time, starting at page 1, until a page comes
    back empty or ``max_pages`` pages have been fetched. Progress is printed
    for each request.

    Args:
        query: Brand name or search terms, passed through to ``fetch_products``.
        query_type: Forwarded to ``fetch_products``.
        page_size: Forwarded to ``fetch_products``.
        sort_by: Forwarded to ``fetch_products``.
        max_pages: Optional upper bound on the number of pages to fetch;
            ``None`` means keep going until an empty page is returned.

    Returns:
        A DataFrame with one row per product, built from
        ``extract_product_fields``.
    """
    rows = []
    current_page = 1
    keep_going = True

    while keep_going:
        print(f"Fetching page {current_page}...")
        batch = fetch_products(
            query,
            query_type=query_type,
            page_size=page_size,
            page=current_page,
            sort_by=sort_by,
        )

        if not batch:
            print("No more products found.")
            keep_going = False
        else:
            rows.extend(extract_product_fields(item) for item in batch)
            if max_pages is None or current_page < max_pages:
                current_page += 1
            else:
                print("Reached max_pages limit.")
                keep_going = False

    return pd.DataFrame(rows)


In [2]:
# Download every result page for the "oreo" brand (no max_pages cap, so paging
# continues until an empty page is returned) and collect the records in a DataFrame.
df_oreo = search_and_extract_all("oreo", query_type="brand")


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
No more products found.


In [3]:
# Summarise column dtypes and non-null counts to see which fields are sparsely populated.
df_oreo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724 entries, 0 to 723
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_name        724 non-null    object 
 1   brands              724 non-null    object 
 2   barcode             724 non-null    object 
 3   categories          724 non-null    object 
 4   countries           724 non-null    object 
 5   image_url           724 non-null    object 
 6   ingredients_text    724 non-null    object 
 7   energy_kcal_100g    606 non-null    float64
 8   fat_100g            611 non-null    float64
 9   saturated_fat_100g  596 non-null    float64
 10  carbohydrates_100g  611 non-null    float64
 11  sugars_100g         599 non-null    float64
 12  fiber_100g          377 non-null    float64
 13  proteins_100g       609 non-null    float64
 14  salt_100g           584 non-null    float64
 15  nutrition_grade     724 non-null    object 
 16  nova_gro

In [4]:
# df_oreo_original = df_oreo[df_oreo["product_name"].str.lower() == "oreo original"]

# Keep only the products whose comma-separated country tags mention Canada.
sold_in_canada = df_oreo["countries"].str.contains("en:canada", na=False)
df_oreo_canada = df_oreo.loc[sold_in_canada]

df_oreo_canada



Unnamed: 0,product_name,brands,barcode,categories,countries,image_url,ingredients_text,energy_kcal_100g,fat_100g,saturated_fat_100g,...,proteins_100g,salt_100g,nutrition_grade,nova_group,ecoscore_grade,labels,allergens,packaging,created_t,last_modified_t
19,Oreo Selena Gomez,Oreo,66721029621,,en:canada,,,483.0,24.1,8.62,...,3.45,0.517,unknown,,,,,,1749775318,1749847609
30,Sesame Flavored Thin Oreo,Oreo,6901668939562,,en:canada,,,490.0,22.4,0.0,...,6.12,0.969,unknown,,,,,,1747798716,1747798829
56,Peanut Butter Oreo,Oreo,66721029706,,en:canada,,,483.0,20.7,5.17,...,6.9,1.08,unknown,,,,,,1742678696,1746812779
127,thin oreo,Oreo,66721028990,"en:snacks, en:sweet-snacks, en:biscuits-and-ca...",en:canada,https://images.openfoodfacts.org/images/produc...,"Sugars (sugar and/or golden sugar, glucose-fru...",483.0,20.7,6.9,...,3.45,0.819,e,4.0,,en:cocoa-life,"en:gluten, en:soybeans",,1713221135,1749582173
145,Oreo à l'érable,Oreo,66721028921,"en:snacks, en:sweet-snacks, en:biscuits-and-ca...","en:canada, en:france",https://images.openfoodfacts.org/images/produc...,"sucres, farine de blé, huile de palme modifié,...",517.0,24.1,8.62,...,3.45,0.647,e,4.0,,,"en:gluten, en:soybeans",,1705355336,1724709049
173,Golden oreo,Oreo,66721028099,,en:canada,https://images.openfoodfacts.org/images/produc...,"wheat flour, sugars (sugar, glucose-fructose),...",500.0,20.588235,5.882353,...,2.941176,0.845588,unknown,4.0,,,"en:gluten, en:soybeans",,1689551884,1748705049
182,The Orginal Oreo,Oreo,66721028105,"en:snacks, en:sweet-snacks, en:biscuits-and-ca...",en:canada,https://images.openfoodfacts.org/images/produc...,"sugars (sugar, glucose-fructose), wheat flour,...",471.0,20.6,5.88,...,2.94,0.81,e,4.0,,,"en:gluten, en:soybeans",,1686521334,1746498416
198,"Oreo, double stuffed",Oreo,66721028082,"en:snacks, en:sweet-snacks, en:biscuits-and-ca...",en:canada,https://images.openfoodfacts.org/images/produc...,"Sucres (sucre, glucose-fructose), Farine de bl...",482.758621,24.137931,8.62069,...,3.448276,0.646552,e,4.0,,,"en:gluten, en:soybeans",,1678147546,1746655946
222,Oreo,Oreo,66721009555,,en:canada,https://images.openfoodfacts.org/images/produc...,,470.588235,20.588235,5.882353,...,2.941176,0.955882,unknown,,,,,,1659275985,1748606206
239,Oreo,Oreo,66721027627,,en:canada,https://images.openfoodfacts.org/images/produc...,,454.545455,20.454545,4.545455,...,4.545455,0.965909,unknown,,,,,,1650765666,1748219869


In [5]:
# Export the Canadian subset as CSV into a folder relative to the notebook's
# working directory, so the cell runs on any machine rather than depending on
# one user's Windows profile path.
EXPORT_DIR = Path("exports")  # Update the export folder as needed
EXPORT_DIR.mkdir(parents=True, exist_ok=True)  # to_csv will not create a missing folder
output_path = EXPORT_DIR / "oreo_canada.csv"

df_oreo_canada.to_csv(output_path, index=False, encoding="utf-8")