# Food Scan Benchmark


This notebook provides a complete, end-to-end workflow for benchmarking Multimodal Large Language Models (MLLMs) on January's food image dataset (JFID).

**The process is as follows:**

1.  **Setup:** Install dependencies and configure API keys.
2.  **Define Components:** Set up Pydantic schemas, the dataset loader, the model wrapper, and evaluation metrics.
3.  **Run Evaluation:** Loop through the dataset, send images to a chosen MLLM, and collect predictions.
4.  **Analyze Results:** Calculate metrics and summarize the model's performance.

The dataset is automatically downloaded from a public S3 bucket and cached locally.


## Setup


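Add your API keys to the environment before launching Jupyter (use `export`; `echo` would only print the text without setting anything):

```
export OPENAI_API_KEY="sk-..."
export GEMINI_API_KEY="..."
```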


In [1]:
# Install packages
%pip install --upgrade litellm boto3 pandas tqdm python-dotenv pydantic tabulate

/Users/amirhosseinian/January/food-scan-benchmarks/.venv/bin/python: No module named pip
Note: you may need to restart the kernel to use updated packages.


## Core Components


### Schema Definition


In [2]:
from pydantic import BaseModel, Field
from typing import List


class Ingredient(BaseModel):
    # One ingredient entry of a meal: a name, an amount (quantity + unit), and
    # per-ingredient estimates for calories, carbs, protein and fat.
    #
    # NOTE(review): `#` comments are used instead of a class docstring on
    # purpose. This model is nested inside FoodAnalysis, which is handed to the
    # model call as `response_format`; the `description=` strings (and
    # presumably a class docstring too) end up in the generated schema, so
    # treat them as runtime text rather than documentation — confirm before
    # editing them.
    name: str = Field(description="Name of the ingredient, e.g., 'scrambled eggs'")
    # `quantity` is only meaningful together with `unit` below.
    quantity: float = Field(description="Numerical quantity of the ingredient")
    unit: str = Field(description="Unit of measurement, e.g., 'cup', 'slice', 'g'")
    calories: float = Field(description="Estimated calories for this ingredient")
    carbs: float = Field(
        description="Estimated grams of carbohydrates for this ingredient"
    )
    protein: float = Field(description="Estimated grams of protein for this ingredient")
    fat: float = Field(description="Estimated grams of fat for this ingredient")


class TotalMacros(BaseModel):
    # Meal-level totals for the four macros. The field names here (calories,
    # carbs, protein, fat) are the same four keys that `macro_mae` looks up in
    # the prediction, so renaming a field would silently score it as 0.
    #
    # NOTE(review): the `description=` strings are part of the schema passed
    # along with FoodAnalysis; presumably visible to the model — treat them as
    # runtime text.
    calories: float = Field(description="Total estimated calories for the entire meal")
    carbs: float = Field(
        description="Total estimated grams of carbohydrates for the entire meal"
    )
    protein: float = Field(
        description="Total estimated grams of protein for the entire meal"
    )
    fat: float = Field(description="Total estimated grams of fat for the entire meal")


class FoodAnalysis(BaseModel):
    # Top-level structured answer requested from the model: a meal name, the
    # list of identified ingredients, and the meal-level macro totals.
    # `LiteModel.analyse` passes this class as `response_format` and validates
    # the parsed JSON reply by constructing it.
    #
    # Nothing in this schema enforces that `total_macros` equals the sum of the
    # per-ingredient values; the description below only asks for it.
    meal_name: str = Field(
        description="A descriptive name for the meal, e.g., 'Breakfast Platter'"
    )
    ingredients: List[Ingredient] = Field(
        description="A list of all identified ingredients and their nutritional information"
    )
    total_macros: TotalMacros = Field(
        description="The sum of macros for all ingredients"
    )


### Model Costs Configuration


In [17]:
# Price table keyed by model name. Each entry holds an "input" and an "output"
# rate, expressed per one million tokens (see the division in calculate_cost).
MODEL_COSTS = {
    "gpt-4.1": {"input": 2.00, "output": 8.00},
    "gpt-4o": {"input": 2.50, "output": 10.00},
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    "gemini/gemini-2.5-flash-preview-05-20": {"input": 0.15, "output": 0.60},
    "gemini/gemini-2.5-pro-preview-06-05": {"input": 1.25, "output": 10.00},
}


def calculate_cost(model_name: str, input_tokens: int, output_tokens: int) -> float:
    """Price a single request from its token counts.

    Args:
        model_name: Key into ``MODEL_COSTS``.
        input_tokens: Number of prompt tokens consumed.
        output_tokens: Number of completion tokens produced.

    Returns:
        The combined input + output cost rounded to 6 decimal places, or
        ``0.0`` when ``model_name`` has no entry in ``MODEL_COSTS``.
    """
    rates = MODEL_COSTS.get(model_name)
    if rates is None:
        # Unknown model: no price information, report zero cost.
        return 0.0

    tokens_per_unit = 1_000_000
    prompt_part = (input_tokens / tokens_per_unit) * rates["input"]
    completion_part = (output_tokens / tokens_per_unit) * rates["output"]
    return round(prompt_part + completion_part, 6)

In [4]:
import base64
from pathlib import Path


def img2b64(path: Path) -> str:
    """Encode an image file as a base64 ``data:`` URL for API calls.

    The MIME type is derived from the file extension so that non-JPEG images
    (e.g. ``.png``) are labelled correctly. When the extension is unknown or
    does not map to an ``image/*`` type, ``image/jpeg`` is used, which matches
    the previous hard-coded behaviour.

    Args:
        path: Location of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        OSError: If the file cannot be read.
    """
    import mimetypes  # stdlib; imported locally so the cell stays self-contained

    mime_type, _ = mimetypes.guess_type(path.name)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    encoded = base64.b64encode(path.read_bytes()).decode()
    return f"data:{mime_type};base64,{encoded}"

### Dataset Class


In [5]:
import pandas as pd
import ast
import boto3
from botocore import UNSIGNED
from botocore.client import Config
import tarfile


class FoodScanDataset:
    """Handles downloading, caching, and loading the food dataset.

    On construction the dataset CSV is looked up under ``root``; if it is
    missing, the archive is downloaded from S3 (unsigned requests) and
    extracted there. Items are then served by position via ``ds[idx]``.
    """

    _S3_BUCKET = "january-food-image-dataset-public"
    _S3_KEY = "food-scan-benchmark-dataset.tar.gz"

    def __init__(self, root: Path):
        """Load the dataset from ``root``, downloading it first if needed.

        Args:
            root: Cache directory; ``~`` is expanded.
        """
        self.root = root.expanduser()
        self.img_dir = self.root / "food-scan-benchmark-dataset" / "fsb_images"
        self.csv_path = (
            self.root / "food-scan-benchmark-dataset" / "food_scan_bench_v1.csv"
        )

        # The CSV's presence is the cache marker for the whole dataset.
        if not self.csv_path.exists():
            self._download_and_extract()

        self.df = pd.read_csv(self.csv_path)

    def _download_and_extract(self):
        """Download the dataset archive into ``root`` and unpack it.

        The temporary archive is removed afterwards, including when the
        download or the extraction fails, so a broken partial file is never
        left behind in the cache directory.
        """
        print(f"Dataset not found in {self.root}. Downloading from S3...")
        self.root.mkdir(parents=True, exist_ok=True)
        local_archive = self.root / "fsb.tar.gz"

        s3 = boto3.client(
            "s3",
            config=Config(signature_version=UNSIGNED),
        )
        try:
            with open(local_archive, "wb") as f:
                s3.download_fileobj(self._S3_BUCKET, self._S3_KEY, f)

            print("Download complete. Extracting...")
            self._extract_archive(local_archive)
        finally:
            if local_archive.exists():
                local_archive.unlink()
        print("Extraction complete.")

    def _extract_archive(self, archive: Path) -> None:
        """Unpack ``archive`` into ``self.root``.

        Uses tarfile's "data" extraction filter when the running Python
        provides it, so members that would land outside ``self.root`` (for
        example ``../`` paths or absolute paths) are not written.
        """
        with tarfile.open(archive) as tar:
            if hasattr(tarfile, "data_filter"):
                tar.extractall(path=self.root, filter="data")
            else:
                # Older Python without extraction filters: previous behaviour.
                tar.extractall(path=self.root)

    def __len__(self):
        """Number of rows in the dataset CSV."""
        return len(self.df)

    def __getitem__(self, idx: int):
        """Return the sample at position ``idx``.

        Returns:
            A dict with ``image_id``, ``image_path`` and ``gt``; ``gt`` holds
            the meal name, the parsed ingredient list and the four total
            macros taken from the CSV row.
        """
        row = self.df.iloc[idx]
        img_path = self.img_dir / row.image_filename

        # `ingredients_list` is stored as a Python-literal string in the CSV.
        # Unparseable values are treated as "no ingredients" (best effort).
        try:
            ingredients = ast.literal_eval(row.ingredients_list)
        except (ValueError, SyntaxError):
            ingredients = []

        return {
            "image_id": row.image_id,
            "image_path": img_path,
            "gt": {
                "meal_name": row.meal_name,
                "ingredients": ingredients,
                "macros": {
                    "calories": row.total_calories,
                    "carbs": row.total_carbs,
                    "protein": row.total_protein,
                    "fat": row.total_fat,
                },
            },
        }

### Model Wrapper


In [6]:
import litellm
from litellm.exceptions import APIError
from typing import Optional
import json


class LiteModel:
    """A robust wrapper around any LiteLLM-supported vision model."""

    def __init__(self, model_name: str, **litellm_kwargs):
        """Store the model name and extra keyword arguments for the LiteLLM call.

        Args:
            model_name: Model identifier passed to ``litellm.acompletion``.
            **litellm_kwargs: Extra keyword arguments forwarded on every call.
                A ``temperature`` given here overrides the default of ``0.0``.
        """
        self.model_name = model_name
        self.kwargs = litellm_kwargs

    async def analyse(self, img_path: Path) -> Optional[dict]:
        """Analyzes an image and returns a structured dict with cost info, or None on failure.

        Args:
            img_path: Path of the image to send to the model.

        Returns:
            The validated ``FoodAnalysis`` as a dict with an extra
            ``cost_usd`` key, or ``None`` if reading the image, the API call,
            JSON parsing or schema validation fails (the error is printed).
        """
        try:
            # Read the image inside the try block so an unreadable file is
            # reported as a failed sample instead of aborting the whole run.
            b64_img = img2b64(img_path)

            messages = [
                {
                    "role": "system",
                    "content": "You are an expert nutritionist. Analyze the food image and provide a detailed breakdown in the requested JSON format.",
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Identify the food, its ingredients with quantities and macros, and the total macros.",
                        },
                        {"type": "image_url", "image_url": {"url": b64_img}},
                    ],
                },
            ]

            # Merge defaults with user kwargs. Passing `temperature=0.0`
            # alongside `**self.kwargs` would raise a duplicate-keyword
            # TypeError whenever the caller supplied their own temperature.
            call_kwargs = {"temperature": 0.0, **self.kwargs}

            resp = await litellm.acompletion(
                model=self.model_name,
                messages=messages,
                response_format=FoodAnalysis,
                **call_kwargs,
            )
            raw = resp.choices[0].message.content.strip()
            data = json.loads(raw)

            # Usage may be absent on the response; count it as zero tokens.
            usage = resp.usage
            input_tokens = usage.prompt_tokens if usage else 0
            output_tokens = usage.completion_tokens if usage else 0
            cost = calculate_cost(self.model_name, input_tokens, output_tokens)

            # Validate against the schema before returning plain dict data.
            result = FoodAnalysis(**data).model_dump()
            result["cost_usd"] = cost

            return result

        except APIError as e:
            print(f"API Error for {img_path.name}: {e}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred for {img_path.name}: {e}")
            return None

### Metrics


In [7]:
import numpy as np


def ingredients_f1(gt, pred) -> float:
    """
    F1 score between two collections of ingredient names.

    Names are compared as sets after lower-casing and stripping whitespace.
    Each argument may be an iterable of strings or of dicts with a "name"
    key, or a string holding a Python literal; a string that cannot be parsed
    is treated as a single ingredient name. Two empty sets score 1.0; exactly
    one empty set, or no overlap, scores 0.0.
    """

    def _as_items(value):
        # Only strings are parsed; anything else is used as given.
        if not isinstance(value, str):
            return value
        try:
            return ast.literal_eval(value)
        except Exception:
            return [value]

    def _label(entry):
        raw = entry.get("name", "") if isinstance(entry, dict) else entry
        return str(raw).lower().strip()

    truth_items = _as_items(gt)
    guess_items = _as_items(pred)

    # filter(None, ...) discards empty names.
    truth = set(filter(None, map(_label, truth_items)))
    guess = set(filter(None, map(_label, guess_items)))

    if not truth or not guess:
        return 1.0 if (not truth and not guess) else 0.0

    hits = len(truth & guess)
    if not hits:
        return 0.0

    precision = hits / len(guess)
    recall = hits / len(truth)
    return 2 * precision * recall / (precision + recall)


def macro_mae(gt_mac: dict, pred_mac: dict) -> float:
    """Mean absolute error across calories, carbs, protein and fat.

    Every key must be present in ``gt_mac``; a key missing from ``pred_mac``
    is scored as if the prediction were 0.
    """
    abs_diffs = []
    for macro in ("calories", "carbs", "protein", "fat"):
        predicted = pred_mac.get(macro, 0)
        abs_diffs.append(abs(gt_mac[macro] - predicted))
    return float(np.mean(abs_diffs))

## Evaluation Function


In [20]:
from tqdm.auto import tqdm
import asyncio
import time
from typing import Union


async def _process_sample(
    idx: int,
    ds: FoodScanDataset,
    llm: LiteModel,
    model_name: str,
) -> dict:
    """Run one dataset sample through the model and score the prediction.

    Args:
        idx: Position of the sample in ``ds``.
        ds: Dataset providing the image path and ground truth.
        llm: Model wrapper whose ``analyse`` coroutine produces the prediction.
        model_name: Label stored in the result row.

    Returns:
        A result row with ``image_id``, ``model``, ``response_time_seconds``,
        ``f1_ing``, ``mae_mac``, ``error`` and ``cost_usd``. When the model
        call fails (``analyse`` returns ``None``), ``f1_ing`` is 0.0,
        ``mae_mac`` is ``None`` and ``error`` is ``"failed"``.
    """
    sample = ds[idx]
    gt = sample["gt"]

    # Time only the model call, with a monotonic clock: time.time() follows
    # the wall clock and can jump if the system time is adjusted.
    started = time.perf_counter()
    pred = await llm.analyse(sample["image_path"])
    elapsed = time.perf_counter() - started

    item = {
        "image_id": sample["image_id"],
        "model": model_name,
        "response_time_seconds": round(elapsed, 2),
    }

    if pred is None:
        item.update(
            {"f1_ing": 0.0, "mae_mac": None, "error": "failed", "cost_usd": 0.0}
        )
    else:
        f1 = ingredients_f1(gt["ingredients"], pred.get("ingredients", []))
        mae = macro_mae(gt["macros"], pred.get("total_macros", {}))
        item.update(
            {
                "f1_ing": f1,
                "mae_mac": mae,
                "error": None,
                "cost_usd": pred.get("cost_usd", 0.0),
            }
        )

    return item


async def _run_model_evaluation(
    model_name: str,
    ds: FoodScanDataset,
    n: int,
    max_concurrent: int,
    position: int,
) -> List[dict]:
    """Evaluate one model on the first ``n`` samples of ``ds``.

    Args:
        model_name: Model identifier used to build the ``LiteModel``.
        ds: Dataset to read samples from.
        n: Number of samples to process (indices ``0 .. n-1``).
        max_concurrent: Maximum number of samples in flight at once for this
            model.
        position: Row of this model's progress bar.

    Returns:
        One result row per sample, in index order.
    """
    llm = LiteModel(model_name)
    sem = asyncio.Semaphore(max_concurrent)

    pbar = tqdm(
        total=n,
        desc=model_name.ljust(20),
        position=position,
        leave=True,
        dynamic_ncols=True,
        colour="cyan",
        bar_format="{desc}│{bar}│ {n:>2}/{total}",
    )

    async def _worker(i: int):
        # The semaphore bounds how many samples are processed concurrently.
        async with sem:
            result = await _process_sample(i, ds, llm, model_name)
            pbar.update(1)
            return result

    try:
        results = await asyncio.gather(*(_worker(i) for i in range(n)))
    finally:
        # Close the bar even when a worker raises, so it is not left open.
        pbar.close()
    return results


async def run_evaluation(
    models: Union[str, List[str]],
    cache_dir: Path,
    max_items: Optional[int] = None,
    max_concurrent: int = 5,
) -> List[dict]:
    """Benchmark one or more models on the dataset and collect result rows.

    All models are evaluated concurrently, and each model gets its own
    concurrency limit, so up to ``max_concurrent * len(models)`` requests can
    be in flight at the same time.

    Args:
        models: A single model name or a list of model names.
        cache_dir: Directory where the dataset is cached (downloaded if absent).
        max_items: Number of samples to evaluate per model. ``None`` means the
            whole dataset; ``0`` or a negative value evaluates nothing.
        max_concurrent: Per-model limit on concurrent samples; must be >= 1.

    Returns:
        A flat list of result rows, grouped by model in the order given.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
    """
    if max_concurrent < 1:
        # A semaphore with zero permits would make every worker wait forever.
        raise ValueError("max_concurrent must be at least 1")

    if isinstance(models, str):
        models = [models]

    ds = FoodScanDataset(cache_dir)

    # Compare against None explicitly: a truthiness test treats max_items=0 as
    # "no limit" and would run the entire dataset.
    if max_items is None:
        n = len(ds)
    else:
        n = max(0, min(max_items, len(ds)))

    model_tasks = [
        _run_model_evaluation(model_name, ds, n, max_concurrent, position=i)
        for i, model_name in enumerate(models)
    ]

    nested_results = await asyncio.gather(*model_tasks)
    return [item for sub in nested_results for item in sub]

## Run


In [21]:
import nest_asyncio

nest_asyncio.apply()

# Single model
MODELS = "gpt-4o"
# Multiple models
# MODELS = ["gpt-4o", "gpt-4o-mini"]
MODELS = [
    "gpt-4.1",
    "gpt-4o",
    "gemini/gemini-2.5-flash-preview-05-20",
    # "gemini/gemini-2.5-pro-preview-06-05",
]
# MODELS = ["gemini/gemini-2.5-flash-preview-05-20", "gemini/gemini-2.5-pro-preview-06-05"]

MAX_IMGS = 3
CACHE_DIR = Path(".cache/food_scan_bench")

# Run with multiple models
results = await run_evaluation(
    models=MODELS,
    cache_dir=CACHE_DIR,
    max_items=MAX_IMGS,
    max_concurrent=10,
)

pd.DataFrame(results).head(9)

gpt-4.1             │          │  0/3

gpt-4o              │          │  0/3

gemini/gemini-2.5-flash-preview-05-20│          │  0/3

Unnamed: 0,image_id,model,response_time_seconds,f1_ing,mae_mac,error,cost_usd
0,fsb_00000,gpt-4.1,14.09,0.333333,95.525,,0.006572
1,fsb_00001,gpt-4.1,14.57,0.0,26.65,,0.00638
2,fsb_00002,gpt-4.1,10.32,0.285714,84.2,,0.007572
3,fsb_00000,gpt-4o,9.56,0.666667,127.65,,0.005915
4,fsb_00001,gpt-4o,6.47,0.666667,12.675,,0.005215
5,fsb_00002,gpt-4o,11.78,0.4,23.925,,0.005785
6,fsb_00000,gemini/gemini-2.5-flash-preview-05-20,11.09,0.461538,158.35,,0.00116
7,fsb_00001,gemini/gemini-2.5-flash-preview-05-20,8.58,0.666667,17.65,,0.000818
8,fsb_00002,gemini/gemini-2.5-flash-preview-05-20,5.15,0.0,30.425,,0.000359
