## Settings

In [129]:
# Prompt template sent to the vision model. `{instruction}` is filled in with
# str.format() by check_modification; the answer is expected to end with a
# `<YES>` or `<NO>` line, which assess_response parses.
PROMPT = """
You are an image classification agent. Your role is to evaluate whether a given instruction has been correctly applied to an image.
You are given the modified image and an instruction.
 Response Format:
1. Provide a step-by-step analysis of the image in relation to the instruction.  
2. Conclude your response with either `<YES>` or `<NO>` on a new line, depending on whether the instruction was applied.  
3. Ensure that `<YES>` or `<NO>` is enclosed within less than (`<`) and greater than (`>`) signs and appears on a separate line at the end of the response.  
4. Ensure the less than (`<`) and greater than (`>`) signs are only used at the end of the response and nowhere else.

Was the instruction "{instruction}" applied to the image?
"""

In [130]:
import base64
from io import BytesIO
from PIL import Image
from openai import OpenAI
import os

# OpenAI SDK client pointed at Groq's OpenAI-compatible endpoint.
# The key is read from the GROQ_API_KEY environment variable (None if unset).
client = OpenAI(
    api_key=os.environ.get("GROQ_API_KEY"),
    base_url="https://api.groq.com/openai/v1",
)

import re


def assess_response(response: str) -> bool:
    """Extract the model's final verdict from its free-text answer.

    The prompt asks the model to finish with ``<YES>`` or ``<NO>``. The model
    may also quote those tokens (or other bracketed words) earlier in its
    analysis, so the LAST ``<YES>``/``<NO>`` token is the one that counts.

    Args:
        response: Raw text returned by the model. Empty or ``None`` is
            tolerated and treated as "no verdict".

    Returns:
        True only when the final verdict token is ``<YES>`` (case-insensitive);
        False for ``<NO>`` or when no verdict token is present.
    """
    if not response:
        return False
    # Only YES/NO tokens are considered, so unrelated bracketed text such as
    # "<red>" can no longer be mistaken for the verdict.
    verdicts = re.findall(r"<(YES|NO)>", response, flags=re.IGNORECASE)
    return bool(verdicts) and verdicts[-1].upper() == "YES"


def _to_jpeg_base64(image: Image.Image) -> str:
    """Encode ``image`` as JPEG and return the bytes as a base64 text string."""
    if image.mode in ("RGBA", "LA") or (
        image.mode == "P" and "transparency" in image.info
    ):
        # JPEG has no alpha channel: flatten transparent pixels onto white
        # (assumed to be the intended page background -- TODO confirm).
        rgba = image.convert("RGBA")
        flattened = Image.new("RGB", rgba.size, (255, 255, 255))
        flattened.paste(rgba, mask=rgba.split()[-1])
        image = flattened
    elif image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        # Any other mode cannot be written as JPEG by Pillow; convert first.
        image = image.convert("RGB")

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def _image_part(image: Image.Image) -> dict:
    """Build an ``image_url`` content part carrying ``image`` as an inline JPEG data URL."""
    return {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/jpeg;base64,{_to_jpeg_base64(image)}",
            "detail": "low",
        },
    }


def check_modification(
    image_solution: Image.Image,
    instruction: str,
    image_input: "Image.Image | None" = None,
    model: str = "llama-3.2-90b-vision-preview",
) -> "tuple[bool, str]":
    """Ask the vision model whether ``instruction`` was applied to ``image_solution``.

    Args:
        image_solution: The (supposedly) modified image to judge.
        instruction: The edit instruction, inserted into ``PROMPT``.
        image_input: Optional original image. When given it is sent before the
            solution image. NOTE(review): ``PROMPT`` only mentions "the modified
            image", so the prompt text may need adapting when this is used.
        model: Model identifier passed to the chat-completions endpoint.

    Returns:
        ``(instruction_applied, response)``: the parsed verdict from
        ``assess_response`` and the raw model text it was parsed from.
    """
    content = [{"type": "text", "text": PROMPT.format(instruction=instruction)}]
    if image_input is not None:
        content.append(_image_part(image_input))
    content.append(_image_part(image_solution))

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": content}],
        temperature=1,
        max_completion_tokens=4096,
        top_p=1,
        stream=False,
    )

    # The SDK types message.content as optional; normalise to text so callers
    # and assess_response always receive a string.
    response = completion.choices[0].message.content or ""
    return assess_response(response), response

### Evaluating against the right solution

In [48]:

def classification(row):
    """Judge one benchmark row against its reference solution image.

    Adds two fields to ``row``: ``instruction_applied`` (the parsed verdict)
    and ``response`` (the raw model text), then returns the row so it can be
    used with ``Dataset.map``.
    """
    # `image_solution` is indexed with [0]: the first entry is the one judged.
    # check_modification is called with (image, instruction), matching its
    # two-parameter signature; the former third positional argument raised
    # TypeError on a fresh kernel.
    applied, response = check_modification(
        row["image_solution"][0], row["instruction"]
    )
    row["instruction_applied"] = applied
    row["response"] = response
    return row

from datasets import load_dataset

# Load the TikZ benchmark split, keep only the columns used by the judge,
# and run `classification` on every row (one model call per row).
ds = (
    load_dataset("CharlyR/varbench", "tikz", split="benchmark")
    .select_columns(["id", "instruction", "image_solution", "image_input"])
    .map(classification)
)


Map: 100%|██████████| 100/100 [05:06<00:00,  3.06s/ examples]


In [None]:
# Publish the judged rows as the "test" split of the given config.
# NOTE(review): confirm the config name matches the model and inputs actually
# used by check_modification when this cell is re-run.
ds.push_to_hub("CharlyR/vTikz-vlm_oracl_benchmark","input_provided_gpt4o-mini", split="test")

Map: 100%|██████████| 100/100 [00:00<00:00, 6847.62 examples/s]t/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 166.57ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/CharlyR/vTikz-vlm_oracl_benchmark/commit/c18c623f552654b99226f18344850e71f48f23d9', commit_message='Upload dataset', commit_description='', oid='c18c623f552654b99226f18344850e71f48f23d9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CharlyR/vTikz-vlm_oracl_benchmark', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CharlyR/vTikz-vlm_oracl_benchmark'), pr_revision=None, pr_num=None)

In [55]:
# Reload both published result sets (reference-solution runs) for comparison.
vlm_dataset = load_dataset(
    "CharlyR/vTikz-vlm_oracl_benchmark",
    name="default",
    split="test",
)
input_provided_dataset = load_dataset(
    "CharlyR/vTikz-vlm_oracl_benchmark",
    name="input_provided_gpt4o-mini",
    split="test",
)


Generating test split: 100%|██████████| 100/100 [00:00<00:00, 10380.66 examples/s]
Generating test split: 100%|██████████| 100/100 [00:00<00:00, 9784.46 examples/s]


In [56]:
# Number of rows each run judged as "instruction applied".
vlm_df = vlm_dataset.to_pandas()
print((vlm_df["instruction_applied"] == True).sum())

input_provided_df = input_provided_dataset.to_pandas()
print((input_provided_df["instruction_applied"] == True).sum())


49
72


### Evaluating against a wrong solution

In [55]:

from datasets import load_dataset

# Reload the benchmark, this time keeping `code` as well: rows sharing the
# same `code` are grouped later to build wrong-solution pairs.
ds = load_dataset("CharlyR/varbench", "tikz", split="benchmark").select_columns(
    ["id", "instruction", "image_solution", "image_input", "code"]
)



In [56]:
# Move to pandas and keep only the first entry of each `image_solution` list
df = ds.to_pandas()
df["image_solution"] = df["image_solution"].map(lambda solutions: solutions[0])
df

Unnamed: 0,id,instruction,image_solution,image_input,code
0,beam_coord_change,Move the coordinate system between EA and F.,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5pt]{standalone}\n\..."
1,bee_eyes,"Add eyes to the bee with pupils, on the front ...",{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
2,bee_longer_body,"Make the body of the bee longer, and pointy",{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
3,bee_mirrored,Mirror the bee vertically,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
4,bee_red_stripes,Change the color of the stripes to red,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
...,...,...,...,...,...
95,vima_no_256,Remove the measurements for 256kb,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
96,vr_two_motors,Split the motors into two motors next to eacho...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
97,workflow_box_label,Add a label k_n to each filled black rectangle...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."
98,workflow_loop_removed,Remove the entire bottom section of the diagra...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,"\documentclass[tikz,border=5]{standalone}\n\us..."


In [62]:
import pandas as pd


# Compute non_values
def compute_non_values(group: pd.DataFrame):
    """Give every row of ``group`` the solutions of the other rows.

    A ``wrong_solutions`` column is added: for each row it holds the group's
    ``image_solution`` values with one occurrence of the row's own solution
    removed (an empty list when the group has a single row).
    """
    solutions_in_group = group["image_solution"].tolist()

    def _attach_other_solutions(record):
        # Copy first so each row removes its own solution from a fresh list.
        others = list(solutions_in_group)
        others.remove(record["image_solution"])
        record["wrong_solutions"] = others
        return record

    return group.apply(_attach_other_solutions, axis=1)


# Build (instruction, wrong solution) pairs: rows sharing the same `code` act
# as wrong solutions for each other.
# The groups are iterated explicitly rather than through groupby(...).apply():
# letting apply() operate on the grouping column is deprecated in pandas (it
# emits a DeprecationWarning), and iterating keeps the `code` column in the
# result regardless of the pandas version.
df_full = pd.concat(
    [compute_non_values(group) for _, group in df.groupby("code")]
).reindex(df.index)  # restore the original row order, as apply() did
# One row per wrong solution; groups with a single member produce an empty
# list, which explode() turns into NaN and dropna() then removes.
df_full = df_full.explode(["wrong_solutions"])
df_full = df_full.dropna().reset_index()

  df_full = df.groupby("code", group_keys=False).apply(compute_non_values)


##### debug

In [58]:
# Position of the next df_full row to inspect; re-run this cell to start over.
i=0

In [None]:
from PIL import Image
import io
import matplotlib.pyplot as plt

def display(row):
    """Print the row's instruction and plot its three images side by side.

    ``row`` must provide ``instruction`` plus ``image_input``,
    ``wrong_solutions`` and ``image_solution``, each a mapping whose
    ``"bytes"`` entry holds the encoded image.
    """
    print(row["instruction"])  # Display the instruction text

    panels = (
        ("Input Image", "image_input"),
        ("Wrong Solution", "wrong_solutions"),
        ("Right Solution", "image_solution"),
    )

    # Decode all three images up front, before any figure is created
    decoded = [
        (title, Image.open(io.BytesIO(row[column]["bytes"])))
        for title, column in panels
    ]

    # One subplot per image, axes hidden
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    for ax, (title, picture) in zip(axes, decoded):
        ax.imshow(picture)
        ax.set_title(title)
        ax.axis("off")

    plt.show()
    
    
# Show the row at position i, then advance i so that re-running this cell
# steps to the next row (state lives in the kernel, not in the notebook).
entry = df_full.iloc[i]
display(entry)
i+=1

##### Execute classification

In [125]:
import datasets

# Back to a datasets.Dataset, with the three image columns declared as
# decodable Image features.
ds = (
    datasets.Dataset.from_pandas(df_full)
    .cast_column("image_input", datasets.Image(decode=True))
    .cast_column("image_solution", datasets.Image(decode=True))
    .cast_column("wrong_solutions", datasets.Image(decode=True))
)

In [132]:
def classification(row):
    """Judge one row against a WRONG solution image.

    Stores the parsed verdict in ``instruction_applied`` and the raw model
    text in ``response``, then returns the row for ``Dataset.map``.
    """
    verdict, raw_text = check_modification(
        row["wrong_solutions"], row["instruction"]
    )
    row["instruction_applied"] = verdict
    row["response"] = raw_text
    return row


# One model call per (instruction, wrong solution) pair
ds = ds.map(classification)


Map: 100%|██████████| 232/232 [04:34<00:00,  1.18s/ examples]


In [133]:
# Publish the wrong-solution verdicts as the "test" split of a per-model config.
# The commented-out line is the config name used for the gpt-4o-mini run.
#ds.push_to_hub("CharlyR/vTikz-vlm_oracl_benchmark","input_provided_gpt4o-mini_wrong_solution", split="test")
ds.push_to_hub("CharlyR/vTikz-vlm_oracl_benchmark","llama90bvision_wrong_solution", split="test")

Map: 100%|██████████| 232/232 [00:00<00:00, 7833.00 examples/s]t/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 318.85ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/CharlyR/vTikz-vlm_oracl_benchmark/commit/e9cdfa2f2a8a6173185e62c32690056efab661e5', commit_message='Upload dataset', commit_description='', oid='e9cdfa2f2a8a6173185e62c32690056efab661e5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CharlyR/vTikz-vlm_oracl_benchmark', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CharlyR/vTikz-vlm_oracl_benchmark'), pr_revision=None, pr_num=None)

In [134]:
# Reload both published wrong-solution result sets for comparison.
vlm_dataset_wrong = load_dataset(
    "CharlyR/vTikz-vlm_oracl_benchmark",
    name="llama90bvision_wrong_solution",
    split="test",
)
input_provided_dataset_wrong = load_dataset(
    "CharlyR/vTikz-vlm_oracl_benchmark",
    name="input_provided_gpt4o-mini_wrong_solution",
    split="test",
)


Generating test split: 100%|██████████| 232/232 [00:00<00:00, 10347.94 examples/s]
Generating test split: 100%|██████████| 232/232 [00:00<00:00, 14408.93 examples/s]


In [136]:
# Number of wrong solutions each run nevertheless judged as "instruction applied".
vlm_df = vlm_dataset_wrong.to_pandas()
print((vlm_df["instruction_applied"] == True).sum())

input_provided_df = input_provided_dataset_wrong.to_pandas()
print((input_provided_df["instruction_applied"] == True).sum())

48
20
