In [None]:
from huggingface_hub import InferenceClient
import base64
from PIL import Image

# Single-image smoke test: send one dish photo to the vision model and print
# the raw reply so the prompt/response format can be checked by eye.
import os

# Read the token from the environment instead of embedding it in the notebook.
# NOTE(review): with HF_TOKEN unset this passes None; per the huggingface_hub
# docs the client then falls back to the locally saved login token - confirm.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))

image_path = "../data/nutrition5k_reconstructed/images/dish_1556573514.jpeg"
with open(image_path, "rb") as f:
    base64_image = base64.b64encode(f.read()).decode("utf-8")

# Do not trust the file extension: let Pillow detect the real format and use
# the matching MIME type in the data URL (falls back to PNG if unknown).
with Image.open(image_path) as img:
    print(img.format)
    image_mime = Image.MIME.get(img.format, "image/png")

image_url = f"data:{image_mime};base64,{base64_image}"

# Conversation: system role, format instructions, a scripted assistant
# acknowledgement, then the actual user turn carrying the image.
messages = [
    {
        "role": "system",
        "content": """You are a nutritionist. You will be given an image of food. Analyze the food in the image and provide its nutritional facts (calories, mass, fat, carbs, protein) in that order."""
    },
    {
        "role": "user",
        "content": """I will provide an image of food and you will analyze the food in the image and provide its nutritional facts. Your response must strictly follow this format: {calories: <calories>, mass: <mass>, fat: <fat>, carbs: <carbs>, protein: <protein>}. Do NOT include any additional text, commentary, or explanations."""
    },
    {
        "role": "assistant",
        "content": """I will answer your questions with the following format: {calories: <calories>, mass: <mass>, fat: <fat>, carbs: <carbs>, protein: <protein>}. Please provide the image of the food you would like me to analyze."""
    },
    {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Here is an image of food I would like you to analyze."
            },
            {
                "type": "image_url",
                "image_url": {"url": image_url},
            }
        ]
    }
]


completion = client.chat.completions.create(
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    messages=messages,
    max_tokens=500
)

# Only the first choice is inspected.
response = completion.choices[0].message["content"]
print(response)

PNG
Unfortunately, the problem does not provide an image of the food to analyze. However, I can provide a generic response based on the assumption that the image is of a standard salad with mixed greens, cherry tomatoes, cucumber, carrots, and a small amount of olive oil dressing.

{calories: 150, mass: 200g, fat: 10g, carbs: 20g, protein: 5g}


In [122]:
# Second prompt iteration: same four-turn conversation, but the scripted
# assistant turn now carries a worked example and both instruction turns ask
# for rounding. Uses `client` and `image_url`, which are not defined in this
# cell and must already exist in the kernel.
messages = [
    dict(
        role="system",
        content="""You are a nutritionist. You will be given an image of food. Analyze the food in the image and provide its nutritional facts (calories, mass, fat, carbs, protein) in that order. """,
    ),
    dict(
        role="user",
        content="""I will provide an image of food and you will analyze the food in the image and provide its nutritional facts. Your response must strictly follow this format: {calories: <calories>, mass: <mass>, fat: <fat>, carbs: <carbs>, protein: <protein>}. Do NOT include any additional text, commentary, or explanations, and round to the nearest 4 decimals if necessary.""",
    ),
    dict(
        role="assistant",
        content="""I will answer your questions with the following format: {calories: <calories>, mass: <mass>, fat: <fat>, carbs: <carbs>, protein: <protein>}. Please provide the image of the food you would like me to analyze. Here is an example of the format and expected response: Example: An image of ten Olives. Response: {calories: 414, mass: 36, fat: 3.85, carbs: 2.268, protein: 0.288}. Follow the format strictly and round to the nearest 4 decimals if necessary.""",
    ),
    dict(
        role="user",
        content=[
            dict(type="text", text="Here is an image of food I would like you to analyze."),
            dict(type="image_url", image_url=dict(url=image_url)),
        ],
    ),
]

# Send the conversation and show the text of the first returned choice.
completion = client.chat.completions.create(
    messages=messages,
    model="meta-llama/Llama-3.2-11B-Vision-Instruct",
    max_tokens=500,
)
response = completion.choices[0].message["content"]
print(response)

{calories: 88.7, mass: 76.2, fat: 3.38, carbs: 6.47, protein: 4.28}.


In [None]:
from huggingface_hub import InferenceClient
import base64
import re
import pandas as pd
from sklearn.metrics import mean_absolute_error
import time
import os

# Paths and client.
# The token is read from the environment so it is never stored in the notebook.
# NOTE(review): with HF_TOKEN unset this passes None; per the huggingface_hub
# docs the client then falls back to the locally saved login token - confirm.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
csv_path = "../data/nutrition5k_reconstructed/metadata/test_ids.csv"
labels_path = "../data/nutrition5k_reconstructed/labels/labels.csv"
llama_response_path = "llama_response.csv"

# Ids to evaluate: first column of the (header-less) test id file.
df = pd.read_csv(csv_path, header=None)
image_ids = df[0].tolist()

# True labels, restricted to the test ids: {id: [float, ...]} built from the
# remaining columns of each label row.
label_df = pd.read_csv(labels_path, header=None)
filtered_label_df = label_df[label_df[0].isin(image_ids)]
id_dict = filtered_label_df.set_index(0).T.to_dict('list')
id_dict = {key: [float(value) for value in values] for key, values in id_dict.items()}

# Resume support: predictions are checkpointed to CSV so a crash does not
# lose finished work.
if os.path.exists(llama_response_path):
    saved_df = pd.read_csv(llama_response_path, index_col=0)
    # The checkpoint stores one row per id. to_dict(orient="list") keys by
    # COLUMN, so the frame is transposed first; without that the dict would be
    # keyed by "0".."4" and no id would ever count as processed.
    # NaN cells (padding pandas adds for rows of unequal length) are dropped.
    llama_response = {
        key: [float(x) for x in values if pd.notna(x)]
        for key, values in saved_df.T.to_dict(orient="list").items()
    }
else:
    llama_response = {}

# Skip ids that already have a saved prediction.
remaining_ids = [i for i in image_ids if i not in llama_response]
# Number of requests made since the last checkpoint.
count = 0

In [94]:
def parse_res(response, n_values=5):
    """Extract the numeric values from a model reply.

    The reply is expected to contain a brace-delimited block such as
    ``{calories: 414, mass: 36, fat: 3.85, carbs: 2.268, protein: 0.288}``.
    Values are taken positionally (the number following each ``:``), so unit
    suffixes like ``200g`` are tolerated and key names are not inspected.

    Args:
        response: Raw text returned by the model. Non-string input (e.g.
            ``None``) is treated as an unparseable reply.
        n_values: Number of values every result must contain. Defaults to 5.

    Returns:
        A list of exactly ``n_values`` floats. If no brace block or no number
        is found, all entries are 0. If fewer numbers than ``n_values`` are
        found, the missing trailing entries are 0; extra numbers are dropped.
        A fixed length keeps downstream error metrics from failing on
        mismatched sizes.
    """
    placeholder = [0.0] * n_values
    if not isinstance(response, str):
        return placeholder

    # DOTALL lets the block span several lines (pretty-printed replies).
    block = re.search(r'\{.*\}', response, flags=re.DOTALL)
    if block is None:
        return placeholder

    # A number is digits with an optional fraction, or a bare fraction (".5").
    # Unlike a loose [\d.]+ this can never capture a lone "." or "1.2.3",
    # so float() below cannot raise.
    raw_values = re.findall(r":\s*(\d+(?:\.\d*)?|\.\d+)", block.group())
    if not raw_values:
        return placeholder

    values = [float(v) for v in raw_values[:n_values]]
    values.extend([0.0] * (n_values - len(values)))
    return values

In [None]:
# Batch inference over every id without a saved prediction, followed by MAE
# against the true labels.

EXPECTED_VALUES = 5   # calories, mass, fat, carbs, protein (order from the prompt)
SAVE_EVERY = 10       # checkpoint after this many requests
PAUSE_SECONDS = 120   # pause after each checkpoint; presumably to stay under an API rate limit

# Turns that are identical for every image; only the final user turn (which
# carries the image) is built inside the loop.
base_messages = [
    {
        "role": "system",
        "content": """You are a nutritionist. You will be given an image of food. Analyze the food in the image and provide its nutritional facts (calories, mass, fat, carbs, protein) in that order. """
    },
    {
        "role": "user",
        "content": """I will provide an image of food and you will analyze the food in the image and provide its nutritional facts. Your response must strictly follow this format: {calories: <calories>, mass: <mass>, fat: <fat>, carbs: <carbs>, protein: <protein>}. Do NOT include any additional text, commentary, or explanations, and round to the nearest 4 decimals if necessary."""
    },
    {
        "role": "assistant",
        "content": """I will answer your questions with the following format: {calories: <calories>, mass: <mass>, fat: <fat>, carbs: <carbs>, protein: <protein>}. Please provide the image of the food you would like me to analyze. Here is an example of the format and expected response: Example: An image of ten Olives. Response: {calories: 414, mass: 36, fat: 3.85, carbs: 2.268, protein: 0.288}. Follow the format strictly and round to the nearest 4 decimals if necessary."""
    },
]

try:
    for i in remaining_ids:
        if count == SAVE_EVERY:
            pd.DataFrame.from_dict(llama_response, orient="index").to_csv(llama_response_path)
            time.sleep(PAUSE_SECONDS)
            count = 0

        image_path = "../data/nutrition5k_reconstructed/images/" + i + ".jpeg"
        with open(image_path, "rb") as f:
            base64_image = base64.b64encode(f.read()).decode("utf-8")

        # NOTE(review): the MIME type is fixed to PNG although the files are
        # named .jpeg; the single-image cell printed "PNG" for one of them -
        # confirm that holds for every image.
        image_url = f"data:image/png;base64,{base64_image}"

        messages = base_messages + [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Don't tell me your thinking process, give a best estimate. Here is an image of food I would like you to analyze. Remember to follow the format, no explanation. "
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url},
                    }
                ]
            }
        ]

        completion = client.chat.completions.create(
            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
            messages=messages,
            max_tokens=500
        )

        response = completion.choices[0].message["content"]  # get response
        if not response:
            # Empty or missing reply: placeholder value.
            llama_response[i] = [0] * EXPECTED_VALUES
        else:
            # Force a fixed length (truncate extras, zero-pad missing values,
            # all zeros when nothing was parsed) so the saved table stays
            # rectangular and the MAE below never sees mismatched sizes.
            prediction = list(parse_res(response))[:EXPECTED_VALUES]
            prediction += [0] * (EXPECTED_VALUES - len(prediction))
            llama_response[i] = prediction
            print(llama_response[i])
        count += 1
finally:
    # Always persist what has been collected, including when a request fails
    # part-way through a batch; the exception still propagates afterwards.
    pd.DataFrame.from_dict(llama_response, orient="index").to_csv(llama_response_path)

# Per-id MAE plus one overall MAE pooled over every value of every scored id.
mae_scores = {}
all_truth_value = []
all_pred_value = []
for key, pred in llama_response.items():
    if key not in id_dict:
        continue
    truth = id_dict[key]
    # Predictions restored from an earlier checkpoint may differ in length
    # from the label row; align them instead of letting the metric raise.
    pred = list(pred[:len(truth)])
    pred += [0] * (len(truth) - len(pred))
    mae_scores[key] = mean_absolute_error(truth, pred)
    all_truth_value.extend(truth)
    all_pred_value.extend(pred)
overall_mae = mean_absolute_error(all_truth_value, all_pred_value)

print(overall_mae)
print(mae_scores)


[59.5, 1.216, 0.038, 1.984, 0.175]
[3.0, 100.0, 0.1, 8.3, 1.0]
[162.0, 190.0, 7.2912, 21.6256, 5.898]
[546.0, 134.0, 20.0, 61.0, 13.0]
[493.8, 14.16, 2.7212, 5.0726, 4.8402]
[800.0, 175.0, 20.36, 80.72, 41.0]
[510.0, 45.0, 23.0, 49.0, 31.0]
[218.0, 136.0, 10.95, 20.15, 4.13]
[13897.34, 1391.63, 53.08, 2279.79, 261.37]
[545.0, 44.93, 13.7, 71.264, 22.948]
[349.4, 56.9, 18.458, 23.496, 10.896]
[240.0, 20.0, 0.3, 45.76, 3.32]
[466.2, 329.1, 15.4, 57.4, 37.1]
[512.6628, 267.6614, 17.7354, 36.336, 25.668]
[422.0, 22.0, 6.25, 37.0, 8.712]
[120.0, 40.0, 7.8, 10.8, 3.2]
[497.0, 277.0, 21.16, 57.639, 9.781]
[204.0, 139.0, 11.8, 16.6, 8.67]
[640.0, 45.0, 24.76, 55.76, 24.874]
[70.51, 100.0, 0.08, 17.3, 1.85]
[409.0, 99.0, 18.2, 24.4, 31.4]
[59.0, 100.0, 0.53, 12.2, 0.77]
[96.0, 223.0, 0.231, 6.132, 10.198]
[276.0, 16.67, 0.0, 0.0, 40.0]
[397.0, 195.0, 9.1425, 52.2065, 13.0475]
[362.768, 337.373, 9.267, 44.33, 16.786]
[408.1, 152.0, 25.8, 30.8, 16.3]
[46.0, 27.0, 0.763, 2.847, 3.69]
[43.0, 33.0, 

HfHubHTTPError: 424 Client Error: Failed Dependency for url: https://api-inference.huggingface.co/models/meta-llama/Llama-3.2-11B-Vision-Instruct/v1/chat/completions (Request ID: nmt3fkzRhz1FoA5nhye9b)

Request failed during generation: Server error:

In [None]:
# Column-wise and overall MAE over the predictions collected so far
# (the original run noted 360 entries in llama_response).
import numpy as np

# Labels for exactly those ids that have a prediction; ids without a label
# row are skipped rather than raising KeyError.
sub_dict = {key: id_dict[key] for key in llama_response if key in id_dict}

true_mat = []
pred_mat = []
for key, true_values in sub_dict.items():
    # Align each prediction to its label row: drop extras, zero-pad missing
    # values, so both matrices are rectangular.
    pred_values = list(llama_response[key][:len(true_values)])
    pred_values += [0.0] * (len(true_values) - len(pred_values))
    true_mat.append(true_values)
    pred_mat.append(pred_values)

if not true_mat:
    raise ValueError("no predictions with matching labels to score")

true_mat = np.array(true_mat, dtype=float)
pred_mat = np.array(pred_mat, dtype=float)

# axis=0 averages over ids, giving one MAE per label column.
column_mae = np.mean(np.abs(true_mat - pred_mat), axis=0)
overall_mae = mean_absolute_error(true_mat.flatten(), pred_mat.flatten())
print(overall_mae)
print(column_mae)

97.6870951895533
[273.96479428 156.50676837  13.92178545  29.47684831  14.56527954]
