# Feature box detection via gemini
------
The goal is to evaluate if gemini models have trouble identifying features that are similar.

An initial set of features is generated from an image.    
These features are extended by calling the llm again, and telling him to give a reformulation of the feature.     
box identification via gemini is conducted, with randomized inputs.

### Generate the features dataset

In [8]:
import os

from openai import OpenAI
from build.lib.vif_agent.prompt import FEATURE_IDENTIFIER_PROMPT
from vif_agent.modules.identification.prompt import DETECTION_PROMPT
from vif_agent.modules.search.search import VLMSearchModule

client = OpenAI(
    api_key=os.environ.get("OROUTER_API_KEY_PERSO"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)
model = "google/gemini-2.0-flash-exp:free"
temperature = 0.3
vlm_search_module = VLMSearchModule(
    client=client,
    model=model,
    temperature=temperature,
)

prompt_reformulate: str = (
    'Given the following list of feature description "{features}", Give me a new array, within which each description is reformulated using different words if possible, use alternative words only when they are popular, otherwise just reformulate. Give me only the new array, I need to be able to parse your answer directly.'
)
import ast
import time

def reformulate_features(features: list[str])->list[str]:
    features_string = "["+",".join(['"'+f+'"' for f in features])+"]"
    question = prompt_reformulate.format(features = features_string)
    retry = True
    while retry:
        try:
            response = client.chat.completions.create(
                model=model,
                temperature=temperature,
                messages=[{"role": "user", "content": question}],
            )
            retry=False
        except Exception as e :
            print("Retrying reformulation" + str(e))
            time.sleep(60)
    return ast.literal_eval(response.choices[0].message.content)


def compute_features(row):
    retry=True
    while retry:
        try:
            features = vlm_search_module.get_features(row["image_input"])
        except Exception as e :
            print("Retrying get features" + str(e))
            time.sleep(60)
    row["features"] = features
    row["alternative_features"] = reformulate_features(features)
    return row

In [9]:
from datasets import load_dataset,load_from_disk
import os

import numpy as np

try:
    ds = load_from_disk(".dataset/feature_generated")
except FileNotFoundError as e:
    ds = load_dataset("CharlyR/varbench", "tikz", split="benchmark")

    ds = ds.select_columns(["type","instruction","code","image_input"])
    initial_list = ds["code"]
    _ , unique_indices = np.unique(initial_list, return_index=True, axis=0)
    filtered_dataset = ds.select(unique_indices.tolist())
    ds = ds.filter(lambda row: row["type"]=="animal").train_test_split(test_size=0.06)["test"]
    ds = ds.map(compute_features)
    ds.save_to_disk(".dataset/feature_generated")

Using the latest cached version of the dataset since CharlyR/varbench couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'tikz' at /home/creux/.cache/huggingface/datasets/CharlyR___varbench/tikz/0.0.0/8cb86a70365f8fbffb37aa80db514f9f86b93be9 (last modified on Fri Jun  6 10:42:21 2025).
Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Retrying get featuresError code: 400 - [{'error': {'code': 400, 'message': 'API key not valid. Please pass a valid API key.', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'API_KEY_INVALID', 'domain': 'googleapis.com', 'metadata': {'service': 'generativelanguage.googleapis.com'}}, {'@type': 'type.googleapis.com/google.rpc.LocalizedMessage', 'locale': 'en-US', 'message': 'API key not valid. Please pass a valid API key.'}]}}]


Map:   0%|          | 0/3 [00:02<?, ? examples/s]


KeyboardInterrupt: 

### Embedding computation and new feature set creation

In [None]:
from sentence_transformers import SentenceTransformer
import torch

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


def create_features_with_similar(row):
    new_rows_features = []
    new_rows_sim = []
    for f_i, alt_feature in enumerate(row["alternative_features"]):
        feature = row["features"][f_i]
        emb_ori_feature = embedding_model.encode(feature)
        emb_sim_feature = embedding_model.encode(alt_feature)
        similarity = embedding_model.similarity(emb_ori_feature, emb_sim_feature)[0][
            0
        ].item()
        new_rows_features.append(row["features"] + [alt_feature])
        new_rows_sim.append(similarity)
    return {
        "new_features": new_rows_features,
        "new_row_sim": new_rows_sim,
        "added_feature": row["alternative_features"],
        "original_feature": row["features"],
    }


ds2=ds.map(create_features_with_similar)

In [None]:
df2 = ds2.to_pandas().explode(["new_features","new_row_sim","added_feature","original_feature"])
df2 = df2.drop(columns=["type","code","instruction","features","alternative_features"])

### Calling gemini box detection

In [None]:
from collections import defaultdict
from vif_agent.modules.identification.utils import get_boxes
import io
from PIL import Image
import random

client = OpenAI(
    api_key=os.environ.get("GOOGLE_API_KEY"),
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)
model = "gemini-2.0-flash"
temperature = 0.3


def create_rows_shuffled(row):
    new_rows = []

    for _ in range(3):
        new_rows.append(
            random.sample(list(row["new_features"]), len(list(row["new_features"])))
        )

    row["new_features"] = new_rows
    return row


def identify_features_box(row):
    image = Image.open(io.BytesIO(row["image_input"]["bytes"]))
    try:
        detected = [
            box["label"]
            for box in get_boxes(
                client=client,
                image=image,
                model=model,
                temperature=temperature,
                features=row["new_features"],
            )
        ]
    except:
        return []

    new_rows = defaultdict(list)

    for i, feature in enumerate(row["new_features"]):
        new_rows["feature"].append(feature)
        new_rows["detected"].append(feature in detected)
        new_rows["order"].append(i)

        is_original = feature == row["original_feature"]
        is_added = feature == row["added_feature"]
        new_rows["is_original"].append(is_original)
        new_rows["is_added"].append(is_added)

        similar_feature = None
        if feature == row["added_feature"]:
            similar_feature = row["original_feature"]
        elif feature == row["original_feature"]:
            similar_feature = row["added_feature"]

        new_rows["similar_feature"].append(similar_feature)
        new_rows["prob_sim"].append(row["new_row_sim"] if is_original or is_added else None)

    return new_rows

In [None]:
df2 = df2.apply(create_rows_shuffled,axis=1).explode("new_features")

In [None]:
from datasets import Dataset
ds2 = Dataset.from_pandas(df2)
ds2t = ds2.map(identify_features_box)

Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map: 100%|██████████| 51/51 [04:06<00:00,  4.83s/ examples]


In [None]:
df2t = ds2t.to_pandas().explode(
    [
        "similar_feature",
        "prob_sim",
        "is_original",
        "is_added",
        "feature",
        "detected",
        "order",
    ]
)

In [None]:
df2t['detected'] = df2t['detected'].astype(bool)
df2t['is_original'] = df2t['is_original'].astype(bool)
df2t['is_added'] = df2t['is_added'].astype(bool)

df2t[["is_original","is_added","detected"]].groupby(["is_original","is_added"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,detected
is_original,is_added,Unnamed: 2_level_1
False,False,0.960784
False,True,0.843137
True,False,0.862745


In [None]:
df2t[["detected","order"]].groupby("order").mean()

Unnamed: 0_level_0,detected
order,Unnamed: 1_level_1
0,0.960784
1,0.960784
2,0.960784
3,0.980392
4,0.960784
5,0.960784
6,0.960784
7,0.960784
8,0.941176
9,0.941176


In [None]:
df2t[["detected","feature"]].groupby("feature").mean().sort_values("detected")

Unnamed: 0_level_0,detected
feature,Unnamed: 1_level_1
amber left eye,0.333333
dark left ear,0.333333
rose right inner ear,0.333333
azure round backdrop,0.666667
pale middle whisker,0.666667
pink left inner ear,0.921569
pink right inner ear,0.921569
pink cat nose,0.921569
white chest oval,0.941176
white middle whisker,0.941176
