[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]()

In [None]:
!pip install openai -q

# Loading the dataset with base model answers

In [None]:
from datasets import load_dataset
# Load the dataset that already contains the base model's captions; later
# cells read its "test" split.
dataset = load_dataset("leinms/flickr30k-qwen3vl-baseline")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


# Judge LLM prompt for evaluation

In [None]:
# System instructions for the judge model. The text is passed verbatim as
# `instructions=` in run_model_for_example; the criteria names listed here
# mirror the fields of ResponseStructure.
prompt = """
You are an evaluation model. Your task is to score a candidate image caption (“model_answ”)
based on (1) the input image and (2) the gold caption (“golden_answer”).
You must evaluate the candidate caption strictly according to the criteria described below.
Your output MUST follow the ResponseStructure schema and must contain only valid integers.

----------------------------------------------------------------------
EVALUATION CRITERIA (0–10 each, except final_score):

1. semantic_similarity (0–10)
   - Measures how close the meaning of model_answ is to golden_answer.
   - 0 = completely different meaning
   - 5 = partially overlapping meaning, some correct concepts
   - 10 = nearly identical meaning, all major ideas align

2. object_coverage (0–10)
   - Evaluates how well the candidate caption covers the key objects and entities
     that appear in the image AND are mentioned in the golden caption.
   - 0 = covers none of the important objects
   - 10 = fully covers all important objects

3. hallucination_score (0–10)
   - Measures the absence of hallucinations (objects or facts NOT present in the image).
   - 0 = many hallucinations
   - 5 = small mistakes or 1–2 minor hallucinated details
   - 10 = no hallucinated objects or false claims

4. completeness (0–10)
   - Checks whether the model caption includes all essential elements needed
     to describe the scene similarly to the golden caption.
   - 0 = major omissions
   - 10 = fully complete, nothing important missing

5. instruction_following (0–10)
   - Measures whether the model caption follows the format and intent
     implied by the golden caption (e.g., factual description, no storytelling,
     no added opinions, no unnecessary style).
   - 0 = fails to follow basic instructions
   - 10 = fully consistent with the expected style and constraints

6. fluency (0–10)
   - Measures textual quality: clarity, grammar, readability, and natural flow.
   - 0 = hard to read, ungrammatical
   - 10 = fluent, well-formed, natural English

7. final_score (0–100)
   - An overall integrated score.
   - Must be a reasonable weighted aggregation of the above criteria.
   - You decide the weighting, but it must correlate with quality.
   - 0 = unusable caption
   - 100 = excellent caption, nearly perfect

----------------------------------------------------------------------
ADDITIONAL RULES:

- Rely on the IMAGE first, and the golden caption second.
- Be strict: do NOT inflate scores.
- Respond ONLY with valid ResponseStructure JSON fields.
- Do NOT include any explanation, comments, or extra text.

----------------------------------------------------------------------
INPUT PROVIDED:
- An image ("image_url")
- golden_answer: the correct caption
- model_answ: the caption to evaluate

Evaluate the model caption and produce numeric scores for all fields.
"""

In [None]:
import os
import base64
from io import BytesIO
from datasets import load_dataset
from openai import OpenAI
from pydantic import BaseModel, Field

# Shared OpenAI client used by run_model_for_example.
# Prefer the OPENAI_API_KEY environment variable so the secret does not have
# to be pasted into the notebook; the literal placeholder is kept as a
# fallback for anyone who still edits the key in place.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "YOUR_API_KEY"))

def _score_field(description, upper=10):
    """Return a required integer field constrained to the range 0..upper."""
    return Field(..., ge=0, le=upper, description=description)


class ResponseStructure(BaseModel):
    # Structured output expected from the judge model: six per-criterion
    # scores plus one aggregate score.

    # Per-criterion scores, each 0-10.
    semantic_similarity: int = _score_field("Семантическая близость к золотому описанию")
    object_coverage: int = _score_field("Покрытие важных объектов")
    hallucination_score: int = _score_field("Отсутствие галлюцинаций (10 = нет ошибок)")
    completeness: int = _score_field("Полнота описания")
    instruction_following: int = _score_field("Следование формату/инструкции")
    fluency: int = _score_field("Связность и грамотность текста")

    # Aggregate score, 0-100.
    final_score: int = _score_field("Итоговый интегральный балл", upper=100)


def pil_image_to_data_url(pil_img, fmt="JPEG"):
    """Encode an image as a base64 ``data:`` URL.

    Args:
        pil_img: Image object exposing PIL's ``save(fp, format=...)``. Its
            ``mode`` and ``convert`` are consulted only for JPEG output.
        fmt: Target format name, case-insensitive. ``"jpg"`` is accepted as
            an alias for ``"JPEG"``.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.
    """
    is_jpeg = fmt.lower() in ("jpeg", "jpg")
    # Pillow registers its JPEG encoder under "JPEG" only, so the "jpg"
    # alias (already accepted for the MIME type) must be mapped for save().
    save_format = "JPEG" if is_jpeg else fmt

    # JPEG cannot store alpha or palette data; Pillow refuses to write such
    # modes, so convert to RGB first. Objects without a `mode` attribute are
    # passed through untouched.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    mode = getattr(pil_img, "mode", None)
    if is_jpeg and mode is not None and mode not in jpeg_modes:
        pil_img = pil_img.convert("RGB")

    buf = BytesIO()
    pil_img.save(buf, format=save_format)
    img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

    mime = "image/jpeg" if is_jpeg else f"image/{fmt.lower()}"
    return f"data:{mime};base64,{img_b64}"


# --- a single model call for one dataset example ---

def run_model_for_example(img, alt_text, baseline_answ):
    """Score one candidate caption with the judge model.

    Args:
        img: Image passed to ``pil_image_to_data_url`` and attached to the
            request as an inline data URL.
        alt_text: Reference caption, sent to the judge as ``golden_answer``.
        baseline_answ: Caption under evaluation, sent as ``model_answ``.

    Returns:
        The ``output_parsed`` attribute of the API response, parsed against
        ``ResponseStructure``.
    """
    image_part = {
        "type": "input_image",
        "image_url": pil_image_to_data_url(img),
    }
    caption_part = {
        "type": "input_text",
        "text": f"golden_answer: {alt_text}\nmodel_answ: {baseline_answ}\n",
    }
    # The text part goes first, followed by the image.
    user_message = {"role": "user", "content": [caption_part, image_part]}

    judged = client.responses.parse(
        model="gpt-4o-2024-08-06",
        instructions=prompt,
        text_format=ResponseStructure,
        input=[user_message],
    )
    return judged.output_parsed

In [None]:
from datasets import load_dataset

# Keep only the columns the judge needs: the image, the reference caption
# (alt_text) and the caption being evaluated (baseline_answ).
ds = dataset["test"].select_columns(["image", "alt_text", "baseline_answ"])

# Smoke test: score the first example and print the parsed result before
# running the full dataset.
ex = ds[0]
answer = run_model_for_example(ex["image"], ex["alt_text"], ex["baseline_answ"])
print(answer)


In [None]:
from datasets import load_dataset
from tqdm import tqdm

# Keep only the columns the judge needs: the image, the reference caption
# (alt_text) and the caption being evaluated (baseline_answ).
ds = dataset["test"].select_columns(["image", "alt_text", "baseline_answ"])

# One entry per dataset row, in order; None marks a row whose call failed.
all_outputs = []

# enumerate() supplies the row index used in the error message. Without it
# `i` was undefined, so the first failed call raised NameError inside the
# handler and aborted the whole run.
for i, ex in enumerate(tqdm(ds, total=len(ds), desc="Processing dataset")):
    try:
        ans = run_model_for_example(
            ex["image"],
            ex["alt_text"],
            ex["baseline_answ"],
        )
    except Exception as e:
        # Best-effort: report the failure and continue so a single bad
        # request does not discard the rest of the run.
        print(f"Ошибка на примере {i}: {e}")
        ans = None

    all_outputs.append(ans)


Processing dataset: 100%|██████████| 1000/1000 [56:46<00:00,  3.41s/it]
