In [1]:
import requests
import pandas as pd

def fetch_products(query, query_type="search", page_size=100, page=1, sort_by="created_t", timeout=30):
    """Fetch one page of products from the Open Food Facts API.

    Parameters
    ----------
    query : str
        Brand name when ``query_type == "brand"``; otherwise the search terms.
    query_type : str
        ``"brand"`` uses the ``/brand/<name>.json`` endpoint; any other value
        uses the ``/cgi/search.pl`` endpoint.
    page_size : int
        Sent to the API as ``page_size``.
    page : int
        Sent to the API as ``page``.
    sort_by : str
        Sent to the API as ``sort_by``.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection raises
        instead of blocking forever.

    Returns
    -------
    list
        The value under ``"products"`` in the JSON response, or ``[]`` when
        that key is absent.

    Raises
    ------
    Exception
        If the response status code is not 200. Network and timeout errors
        raised by ``requests`` propagate unchanged.
    """
    from urllib.parse import quote

    if query_type == "brand":
        # Brand-based search. Percent-encode the brand so characters such as
        # "/", "?" or "#" stay inside this single path segment instead of
        # changing the structure of the URL.
        url = f"https://world.openfoodfacts.org/brand/{quote(str(query), safe='')}.json"
        params = {
            "page_size": page_size,
            "page": page,
            "sort_by": sort_by
        }
    else:
        # Search term-based search
        url = "https://world.openfoodfacts.org/cgi/search.pl"
        params = {
            "search_terms": query,
            "search_simple": 1,
            "action": "process",
            "json": 1,
            "page_size": page_size,
            "page": page,
            "sort_by": sort_by
        }

    response = requests.get(url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"API request failed: {response.status_code}")

    data = response.json()
    return data.get("products", [])

def _join_tags(product, key):
    """Join the list stored under ``key`` with ", "; missing, None or empty gives ""."""
    return ", ".join(product.get(key) or [])


def extract_product_fields(product):
    """Flatten one product dict into a single-level record.

    Parameters
    ----------
    product : dict
        A product entry as returned by ``fetch_products``. Keys may be
        missing or hold ``None``; both cases are tolerated.

    Returns
    -------
    dict
        Identifier, nutrition, grade/label and timestamp fields. Text
        identifiers default to ``"N/A"`` when the key is missing, tag lists
        are joined with ``", "``, and absent numeric values are ``None``.
    """
    # "nutriments" may be present but null; treat that the same as missing.
    nutriments = product.get("nutriments") or {}
    # An empty or null tag list would otherwise fail on the [0] lookup.
    grade_tags = product.get("nutrition_grades_tags") or [None]
    return {
        # Identifiers
        "product_name": product.get("product_name", "N/A"),
        "brands": product.get("brands", "N/A"),
        "barcode": product.get("code", "N/A"),
        "categories": _join_tags(product, "categories_tags"),
        "countries": _join_tags(product, "countries_tags"),
        "image_url": product.get("image_url", "N/A"),
        "ingredients_text": product.get("ingredients_text", "N/A"),

        # Nutrition values
        "energy_kcal_100g": nutriments.get("energy-kcal_100g"),
        "fat_100g": nutriments.get("fat_100g"),
        "saturated_fat_100g": nutriments.get("saturated-fat_100g"),
        "carbohydrates_100g": nutriments.get("carbohydrates_100g"),
        "sugars_100g": nutriments.get("sugars_100g"),
        "fiber_100g": nutriments.get("fiber_100g"),
        "proteins_100g": nutriments.get("proteins_100g"),
        "salt_100g": nutriments.get("salt_100g"),

        # Nutrition grade and labels
        "nutrition_grade": grade_tags[0],
        "nova_group": product.get("nova_group"),
        "ecoscore_grade": product.get("ecoscore_grade"),
        "labels": _join_tags(product, "labels_tags"),
        "allergens": _join_tags(product, "allergens_tags"),
        "packaging": _join_tags(product, "packaging_tags"),

        # Timestamps
        "created_t": product.get("created_t"),
        "last_modified_t": product.get("last_modified_t"),
    }

def search_and_extract_all(query, query_type="search", page_size=100, sort_by="created_t", max_pages=None):
    """Page through the API for ``query`` and return all records as a DataFrame.

    Pages are requested starting at 1 via ``fetch_products``. Paging stops
    when a page comes back empty, or once ``max_pages`` pages have been
    processed (``None`` means no cap). Every product is flattened with
    ``extract_product_fields`` before being collected.
    """
    rows = []
    current_page = 1
    finished = False

    while not finished:
        print(f"Fetching page {current_page}...")
        batch = fetch_products(query, query_type=query_type, page_size=page_size, page=current_page, sort_by=sort_by)

        if batch:
            rows.extend(extract_product_fields(item) for item in batch)

            if max_pages is not None and current_page >= max_pages:
                print("Reached max_pages limit.")
                finished = True
            else:
                current_page += 1
        else:
            print("No more products found.")
            finished = True

    return pd.DataFrame(rows)


In [None]:
# Single-brand example (uncomment the next line to run it)
# df_oreo = search_and_extract_all("oreo", query_type="brand")


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
No more products found.


In [None]:
import pandas as pd

# Read the brand ranking sheet and keep each non-null brand name once,
# in order of first appearance.
brands_df = pd.read_csv(
    r"C:\Users\jverc\Downloads\Cleaned_Brand_Rankings.csv"
)
brand_list = brands_df["Brand"].dropna().drop_duplicates().tolist()

# Per-brand result frames are accumulated here by the fetch loop.
all_brand_products = list()

In [6]:
import time

# Start from an empty list so that re-running this cell does not append a
# second copy of every brand's rows to the results of a previous run.
all_brand_products = []

# Loop through each brand and fetch data
for brand in brand_list:
    try:
        print(f"Searching brand: {brand}")
        df_brand = search_and_extract_all(brand, query_type="brand")  # Pass max_pages=... to cap pages per brand

        if df_brand.empty:
            print(f"No results for brand: {brand}")
            continue

        df_brand["searched_brand"] = brand  # Keep track of where each row came from
        all_brand_products.append(df_brand)

    except Exception as e:
        # Deliberately best-effort: one failing brand must not abort the run.
        print(f"Skipping brand '{brand}' due to error: {e}")

    finally:
        # Pause after every brand, including empty results and errors, so
        # failed lookups do not turn into back-to-back requests.
        time.sleep(1)  # Avoid hammering the API

# Combine everything into one big DataFrame. pd.concat raises ValueError on
# an empty list, so fall back to an empty frame when nothing was collected.
if all_brand_products:
    df_all_brands = pd.concat(all_brand_products, ignore_index=True)
else:
    df_all_brands = pd.DataFrame()

Searching brand: M&M's
Fetching page 1...
Fetching page 2...
Fetching page 3...
No more products found.
Searching brand: Ritz
Fetching page 1...
Fetching page 2...
No more products found.
Searching brand: Frito-Lay
Fetching page 1...
Fetching page 2...
Fetching page 3...
No more products found.
Searching brand: Lay's
Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
No more products found.
Searching brand: Betty Crocker
Fetching page 1...
Fetching page 2...
Fetching page 3...
No more products found.
Searching brand: Heinz Ketchup
Fetching page 1...
Skipping brand 'Heinz Ketchup' due to error: API request failed: 404
Searching brand: Jif
Fetching page 1...
Fetching page 2...
No more products found.
Searching br

  df_all_brands = pd.concat(all_brand_products, ignore_index=True)


In [7]:
# Show column dtypes and non-null counts for the combined brand table.
df_all_brands.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23352 entries, 0 to 23351
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   product_name        23352 non-null  object 
 1   brands              23352 non-null  object 
 2   barcode             23352 non-null  object 
 3   categories          23352 non-null  object 
 4   countries           23352 non-null  object 
 5   image_url           23352 non-null  object 
 6   ingredients_text    23352 non-null  object 
 7   energy_kcal_100g    19622 non-null  float64
 8   fat_100g            20073 non-null  float64
 9   saturated_fat_100g  19156 non-null  float64
 10  carbohydrates_100g  20046 non-null  float64
 11  sugars_100g         19396 non-null  float64
 12  fiber_100g          13234 non-null  float64
 13  proteins_100g       20092 non-null  float64
 14  salt_100g           19039 non-null  float64
 15  nutrition_grade     23351 non-null  object 
 16  nova

In [17]:
# Preview the combined (unfiltered) table of all fetched brand products.
df_all_brands

Unnamed: 0,product_name,brands,barcode,categories,countries,image_url,ingredients_text,energy_kcal_100g,fat_100g,saturated_fat_100g,...,salt_100g,nutrition_grade,nova_group,ecoscore_grade,labels,allergens,packaging,created_t,last_modified_t,searched_brand
0,Hirschenttecote,M,2186307018257,,en:germany,,,,,,...,,unknown,,,,,,1728501068,1728501068,M&M's
1,Rehschnitzel,M,2186401010256,,en:germany,,,,,,...,,unknown,,,,,,1728501011,1728501011,M&M's
2,Rindsfiletspitz,M,2170639010102,,en:germany,,,,,,...,,unknown,,,,,,1728500921,1728500921,M&M's
3,Memories soft doughnuts,M,6281016014432,,en:saudi-arabia,https://images.openfoodfacts.org/images/produc...,,,,,...,,unknown,,,,,,1725361622,1753615692,M&M's
4,Enoki Mushroom,m.,8719324855686,,en:germany,https://images.openfoodfacts.org/images/produc...,,,,,...,,unknown,,,,,,1699977550,1699977600,M&M's
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23347,Corn Chex,Chex,0016000208193,,en:united-states,,"Whole Grain Corn, Corn Meal, Sugar, Corn Starc...",385.0,2.56,0.000,...,1.7900,unknown,,,,,,1715475353,1750005657,Chex
23348,Chex Mix Remix Cheesy Pizza,Chex,0016000205536,,en:united-states,https://images.openfoodfacts.org/images/produc...,,433.0,13.30,5.000,...,2.1700,unknown,,,,,,1713149280,1749687013,Chex
23349,Honey Nut Flavour Chex,"Chex, General Mills",0065633405226,"en:plant-based-foods-and-beverages, en:plant-b...","en:canada, en:france",https://images.openfoodfacts.org/images/produc...,"Whole grain corn, Degermed corn meal, Sugar, C...",382.0,1.47,0.294,...,1.6200,e,4.0,,"en:no-gluten, en:kosher, en:no-artificial-flav...",en:nuts,,1577892911,1713560486,Chex
23350,Check Mix Bold Party Blend,Chex,0016000126077,en:snacks,en:united-states,https://images.openfoodfacts.org/images/produc...,"whole wheat, degermed yellow corn meal, enrich...",414.0,10.34,1.720,...,1.8393,d,4.0,,,"en:gluten, en:milk, en:soybeans",,1524696090,1749423732,Chex


In [18]:
# Summary statistics of the numeric columns before any filtering.
df_all_brands.describe()

Unnamed: 0,energy_kcal_100g,fat_100g,saturated_fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,nova_group,created_t,last_modified_t
count,19622.0,20073.0,19156.0,20046.0,19396.0,13234.0,20092.0,19039.0,12493.0,23352.0,23352.0
mean,364.063886,17.020787,4.7187,45.720592,16.479548,3.840949,6.699974,2.614299,3.787801,1614233000.0,1715446000.0
std,198.800476,15.19757,6.072983,26.783914,18.701916,4.272045,11.963559,46.732356,0.588505,100192600.0,52167980.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1332525000.0,1424947000.0
25%,199.25,3.03,1.0,17.1875,3.0,1.2,3.45,0.38,4.0,1536033000.0,1689323000.0
50%,412.0,15.5,3.1,53.6,8.51,3.57,6.060606,0.925,4.0,1613223000.0,1745436000.0
75%,505.0,28.6,5.9,64.3,26.0,4.8,7.666667,1.45195,4.0,1715160000.0,1748663000.0
max,10000.0,350.0,265.7,700.0,550.0,90.0,1500.0,3450.0,4.0,1754004000.0,1754006000.0


In [15]:
import re


# Nutrient columns that must be present and non-zero for a row to be kept.
cols_to_check = [
    "energy_kcal_100g", "fat_100g", "saturated_fat_100g",
    "carbohydrates_100g", "sugars_100g", "fiber_100g",
    "proteins_100g", "salt_100g"
]

# True only for rows where every checked nutrient is both non-null and != 0.
non_zero_mask = (
    df_all_brands[cols_to_check].notna().all(axis=1)
    & df_all_brands[cols_to_check].ne(0).all(axis=1)
)

# Keep rows whose nutrient values fall inside the accepted per-100g ranges,
# whose brand and product name consist solely of ASCII characters, and which
# pass the non-zero check above. Comparisons against NaN evaluate to False,
# so rows with missing values are dropped as well.
df_clean = df_all_brands.loc[
    df_all_brands["energy_kcal_100g"].gt(20)
    & df_all_brands["energy_kcal_100g"].le(900)
    & df_all_brands["fat_100g"].le(60)
    & df_all_brands["saturated_fat_100g"].le(30)
    & df_all_brands["carbohydrates_100g"].ge(5)
    & df_all_brands["carbohydrates_100g"].le(100)
    & df_all_brands["sugars_100g"].le(70)
    & df_all_brands["fiber_100g"].le(30)
    & df_all_brands["proteins_100g"].le(70)
    & df_all_brands["salt_100g"].le(4.5)
    & df_all_brands["brands"].str.match(r"^[\x00-\x7F]+$", na=False)
    & df_all_brands["product_name"].str.match(r"^[\x00-\x7F]+$", na=False)
    & non_zero_mask
]

In [16]:
# Summary statistics of the numeric columns after filtering.
df_clean.describe()

Unnamed: 0,energy_kcal_100g,fat_100g,saturated_fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,nova_group,created_t,last_modified_t
count,7171.0,7171.0,7171.0,7171.0,7171.0,7171.0,7171.0,7171.0,5241.0,7171.0,7171.0
mean,425.335741,20.48996,4.735947,52.94537,14.559793,4.168377,7.319577,1.072645,3.878268,1624905000.0,1725677000.0
std,144.239936,12.656428,4.206205,19.842508,15.498837,3.020222,4.369153,0.6123,0.370252,105591600.0,42529760.0
min,21.0,0.05,0.0064,5.0,0.1,0.01,0.1,2e-06,1.0,1332755000.0,1494573000.0
25%,379.0,8.125,1.9,50.0,3.3,2.46,5.26,0.666,4.0,1533357000.0,1723236000.0
50%,480.0,22.580645,3.57,56.6,6.5,3.61,6.67,1.0026,4.0,1639935000.0,1746251000.0
75%,523.0,30.577778,5.695,64.9,25.0,4.9,7.89,1.4,4.0,1743437000.0,1748666000.0
max,833.333333,57.1,29.4,95.8,70.0,30.0,51.6,4.5,4.0,1753966000.0,1753992000.0


In [19]:
# Preview the filtered table that is exported below.
df_clean

Unnamed: 0,product_name,brands,barcode,categories,countries,image_url,ingredients_text,energy_kcal_100g,fat_100g,saturated_fat_100g,...,salt_100g,nutrition_grade,nova_group,ecoscore_grade,labels,allergens,packaging,created_t,last_modified_t,searched_brand
26,Bohnen Kidney - M-Classic,"M, M-Classic, MClassic, Migros, Migros: M-Classic",7616800800509,"en:plant-based-foods-and-beverages, en:plant-b...","en:france, en:germany, en:switzerland",https://images.openfoodfacts.org/images/produc...,"red beans, water, cooking salt, acidifier: cit...",97.0,0.50,0.100,...,0.7500,a,3.0,,,,"en:box, en:can, en:canned",1437823409,1722792133,M&M's
27,Cheese Nibs,Ritz,0066721029195,,en:canada,,,500.0,23.30,5.000,...,1.5000,unknown,,,,,,1753052810,1753052810,Ritz
48,Ritz,Ritz,7622201747152,,en:saudi-arabia,https://images.openfoodfacts.org/images/produc...,"Wheat Flour, Non Hydrogenated Vegetable Oil [P...",510.0,26.30,11.100,...,1.0700,unknown,4.0,,,"en:gluten, en:soybeans",,1715676935,1753792573,Ritz
57,RITZ crackers original,Ritz,15983910,,en:france,https://images.openfoodfacts.org/images/produc...,,484.0,23.00,11.000,...,1.3000,unknown,,,,,,1662204254,1688203294,Ritz
60,Les cookies Coco,"Matatie, Ritz",0377009601665,"en:snacks, en:sweet-snacks, en:specific-produc...",en:france,https://images.openfoodfacts.org/images/produc...,"Farines (farine de millet*,farine de riz blanc...",529.0,30.00,11.000,...,0.3300,e,4.0,,"en:no-gluten, en:organic, en:crossed-grain-tra...",,en:packet,1627983121,1728357991,Ritz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23345,Chex mix,Chex,0016000214972,en:snack-mix,en:united-states,https://images.openfoodfacts.org/images/produc...,"Degermed Yellow Corn Meal, Whole Wheat, Enrich...",414.0,12.10,1.720,...,2.1600,d,4.0,,,"en:gluten, en:soybeans",,1724275343,1753306635,Chex
23346,Chex Mix Snack Mix Cheddar,Chex,0016000206984,en:crackers,en:united-states,https://images.openfoodfacts.org/images/produc...,"Degermed Yellow Corn Meal, Whole Wheat, Enrich...",433.0,11.70,1.670,...,1.7500,d,4.0,,,"en:gluten, en:milk, en:soybeans",,1723499763,1748462124,Chex
23348,Chex Mix Remix Cheesy Pizza,Chex,0016000205536,,en:united-states,https://images.openfoodfacts.org/images/produc...,,433.0,13.30,5.000,...,2.1700,unknown,,,,,,1713149280,1749687013,Chex
23349,Honey Nut Flavour Chex,"Chex, General Mills",0065633405226,"en:plant-based-foods-and-beverages, en:plant-b...","en:canada, en:france",https://images.openfoodfacts.org/images/produc...,"Whole grain corn, Degermed corn meal, Sugar, C...",382.0,1.47,0.294,...,1.6200,e,4.0,,"en:no-gluten, en:kosher, en:no-artificial-flav...",en:nuts,,1577892911,1713560486,Chex


In [20]:
import os

# Resolve the current user's home directory. The login name returned by
# os.getlogin() is not guaranteed to match the profile folder name, and the
# call can raise OSError when there is no controlling terminal.
home_dir = os.path.expanduser("~")

# Construct full export path inside your OneDrive project folder
output_dir = os.path.join(home_dir, "OneDrive", "02.DataScienceOD", "exports")

# Create the export folder if it does not exist yet so to_csv cannot fail on
# a missing directory.
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "top_100_brands.csv")

# Save the DataFrame
df_clean.to_csv(output_path, index=False, encoding="utf-8")

print(f"File exported to: {output_path}")

File exported to: C:\Users\jverc\OneDrive\02.DataScienceOD\exports\top_100_brands.csv
