In [2]:
import sys
from pathlib import Path

%load_ext autoreload
%autoreload 2

# Resolve the experiment checkout and put its `src/` directory on the import path;
# presumably this is where the project modules imported by the next cell live.
root_path = Path("/home/olivieri/exp").resolve()
src_path = root_path.joinpath("src")
sys.path.append(str(src_path))

In [3]:
from IPython.display import Markdown
from concurrent.futures import ThreadPoolExecutor, as_completed # for parallelism
import time

from prompter import *
from data import *
from utils import *
from model import *

# Inference

In [7]:
# `os` is needed for the API key lookup below. No explicit `import os` is visible in the
# notebook, so without this line the name only resolves if a wildcard import leaks it.
import os

# Tag of the answering model handed to `OllamaMLLM`.
# Other tags kept around for quick switching (assign one to `model_name` to use it):
#   "gemma3:4b-it-q4_K_M"
#   "gemma3:4b-it-qat"
#   "gemma3:12b"
#   "gemma3:12b-it-qat"
#   "gemma3:27b"
#   "gemma3:27b-it-qat"
#   "qwen2.5vl:7b-q4_K_M"
#   "hf.co/unsloth/Mistral-Small-3.1-24B-Instruct-2503-GGUF:Q4_K_M"
#   "qwen2.5vl:32b-q4_K_M"
model_name = "gemma3:4b-it-qat"

# Model that produces the answers.
vlm = OllamaMLLM(model_name)

# Model used as the judge of those answers. The API key is read from the environment
# so it never appears in the notebook; a missing variable raises KeyError here.
llm_judge: GoogleAIStudioMLLM = GoogleAIStudioMLLM(
    model_name="gemini-2.0-flash",
    api_key=os.environ["GOOGLE_AI_KEY_1"],
)

# Setting
BY_MODEL = "LRASPP_MobileNet_V3"
SPLIT_BY = "non-splitted"

# Prompt factory shared by the single-sample and pipeline sections below.
# CLASS_MAP is imported from 'class_map.py'.
promptBuilder = PromptBuilder(
    by_model=BY_MODEL,
    alpha=0.7,
    image_size=224,
    array_size=(32, 32),
    class_map=CLASS_MAP,
    color_map=COLOR_MAP_DICT,
    split_by=SPLIT_BY,
)

# Generation settings passed to both the answering model and the judge.
gen_params = GenParams(seed=get_seed(), temperature=0.1)

## Inference

### Single Sample

In [8]:
# Prompt layout for the single-sample run: colour map given by names, no support
# examples (empty `sup_set_idxs`).
promptBuilder.load_modules(
    context_module=ContextModule(variation="default"),
    color_map_module=Names_ColorMapModule(variation="default"),
    input_format_module=SepMasks_Ovr_InputFormatModule("original"),
    task_module=TaskModule(variation="default"),
    output_format_module=OutputFormatModule(variation="default"),
    support_set_module=SupportSetModule(variation="default", sup_set_idxs=()),
    query_module=QueryModule(variation="default"),
    eval_module=EvalModule(variation="7_incomplet+strict+precision+error_types+spatial_locs"),
)

In [9]:
# Index of the one query sample used by the remaining cells of this section.
query_idx = 22

In [10]:
# Build the inference prompt for the selected query sample.
inference_prompt = promptBuilder.build_inference_prompt(query_idx)
# NOTE(review): the commented-out helper call below refers to
# `class_splitted_inference_prompt`, which is not defined anywhere in the visible
# notebook — looks like a leftover from another experiment; confirm before re-enabling.
# display_prompt(class_splitted_inference_prompt[12])

In [11]:
answer_pr = await vlm.predict_one(
    inference_prompt,
    query_idx,
    gen_params=gen_params,
    only_text=True
)
answer_pr

{'img_idx': 22,
 'content': 'The prediction mask captures the motorcycle region quite accurately, mirroring the ground truth almost perfectly. There are no significant deviations observed in the segmentation of the motorcycle.'}

In [None]:
# Build the judge prompt from the answer text only. `answer_pr` is a record of the form
# {'img_idx': ..., 'content': ...}; the pipeline section of this notebook hands
# `build_eval_prompt` just the "content" field, so do the same here instead of
# passing the whole record.
eval_prompt = promptBuilder.build_eval_prompt(query_idx, answer_pr["content"])
len(eval_prompt)

In [22]:
eval_pr = await llm_judge.predict_one(
    eval_prompt,
    query_idx,
    gen_params=gen_params,
    only_text=True,
    parse_to_dict=True
)
eval_pr

{'img_idx': 22,
 'content': {'pred': 'incorrect',
  'score': 1,
  'reason': 'The predicted answer states that the MOTORBIKE region is captured quite accurately, which is the opposite of the ground truth. The ground truth mentions that the prediction mask is coarser, boundaries are less defined, and there are over-segmentation issues. The predicted answer also mentions fragmentation, but the overall assessment of accuracy is incorrect.'}}

## Pipeline

In [None]:
# The input-format module gets its own variable because its class name is reused
# below as the label of this run.
input_format = SepMasks_Ovr_InputFormatModule("original")

# Prompt layout for the pipeline run: colour map given as patches, three support examples.
promptBuilder.load_modules(
    context_module=ContextModule(variation="default"),
    color_map_module=Patches_ColorMapModule(variation="default"),
    input_format_module=input_format,
    task_module=TaskModule(variation="default"),
    output_format_module=OutputFormatModule(variation="default"),
    support_set_module=SupportSetModule(variation="default", sup_set_idxs=(16, 2, 18)),
    query_module=QueryModule(variation="default"),
    eval_module=EvalModule(variation="7_incomplet+strict+precision+error_types+spatial_locs"),
)

# Run label: the input-format class name with its "_InputFormatModule" suffix removed.
input_format_cls_name = input_format.__class__.__name__
prompt_desc = input_format_cls_name.removesuffix("_InputFormatModule")

In [5]:
# Query indices for this run: every index in [0, 80) that is not used as a support example.
epoch_idxs = [idx for idx in range(80) if idx not in promptBuilder.sup_set_idxs]
len(epoch_idxs)

77

**Save Paths**

In [6]:
# Output files for this run. They are derived from the notebook configuration
# (`root_path`, `BY_MODEL`, `SPLIT_BY`, `prompt_desc`) rather than spelled out as absolute
# paths, so the file name always matches the input format that is actually loaded.
# NOTE(review): the model directory is kept exactly as it was hard-coded; it does not
# match `model_name` used for `vlm` — confirm which model these results belong to.
results_model_dir = "gemini-2.0-flash"
results_root = root_path / "my_data" / "by_model" / BY_MODEL / SPLIT_BY
results_file_name = f"{prompt_desc}.jsonl"

answer_path = results_root / "answer_prs" / results_model_dir / "baseline" / results_file_name
eval_path = results_root / "eval_prs" / results_model_dir / "baseline" / results_file_name

### Answering

In [7]:
# One inference prompt per query index, in the same order as `epoch_idxs`.
inference_prompts = list(map(promptBuilder.build_inference_prompt, epoch_idxs))
len(inference_prompts)

77

In [9]:
# Write a header record holding the prompt-builder state to the answers file.
# NOTE(review): judging by the helper's name this appends, so re-running the cell
# would add a second header — confirm.
answer_state_record = {"state": promptBuilder.get_state()}
append_many_to_jsonl(answer_path, [answer_state_record])

In [10]:
print(f"Evaluating {prompt_desc}.")

epoch_pred_list = [{"state": promptBuilder.get_state()}]

epoch_pred_list += await vlm.predict_many(
    inference_prompts,
    epoch_idxs,
    gen_params=gen_params,
    jsonl_save_path=answer_path,
    only_text=True,
    batch_size=13,
    cooldown_period=60
)

Evaluating ConcatMasks_Ovr_Hz.


100%|[38;2;103;173;91m██████████[0m| 6/6 [05:16<00:00, 52.75s/item]


### Evaluation

In [11]:
# Read the saved answers back from disk, without the state record and without
# converting the entries to dicts.
epoch_answer_list = get_many_eval_pr(
    answer_path,
    return_state=False,
    format_to_dict=False,
)
len(epoch_answer_list)

77

In [13]:
# One judge prompt per query image, pairing each image index with the text of its answer.
# NOTE(review): `my_tqdm` is unpacked as (position, item) pairs, and answers are looked
# up by position — this assumes `epoch_answer_list` is in the same order as `epoch_idxs`;
# confirm against the helper and the saved file.
eval_prompts = []
for answer_pos, img_idx in my_tqdm(epoch_idxs):
    answer_text = epoch_answer_list[answer_pos]["content"]
    eval_prompts.append(promptBuilder.build_eval_prompt(img_idx, answer_text))
eval_prompts[:2]

100%|[38;2;103;173;91m██████████[0m| 77/77 [00:00<00:00, 2805.70item/s]


[["You are an intelligent chatbot designed for evaluating the correctness of AI assistant predictions for question-answer pairs.\nYour task is to compare the predicted answer with the ground-truth answer and determine if the predicted answer is correct or not. Here's how you can accomplish the task:\n\n------\n\n## INSTRUCTIONS:\n- Focus on the correctness, completeness and accuracy of the predicted answer with the ground-truth.\n- Consider predictions with less or more specific details (as long as they show some consistency with the ground truth) as correct evaluation.\n- Be strict with your evaluations.\n- Expect precision from the predicted answer, it is not enough for it to roughly capture the essence of the ground truth, the prediction has to overlap sufficiently to the ground truth to be considered correct.\n- Be critical of significant inconsistencies of error types and spatial locations.\n\nPlease evaluate the following answer:\n\nGround truth correct Answer:\nThe ground truth 

In [14]:
# Write a header record holding the prompt-builder state to the evaluations file.
# NOTE(review): judging by the helper's name this appends, so re-running the cell
# would add a second header — confirm.
eval_state_record = {"state": promptBuilder.get_state()}
append_many_to_jsonl(eval_path, [eval_state_record])

In [16]:
print(f"Evaluating {prompt_desc}.")

epoch_eval_list = [{"state": promptBuilder.get_state()}]

epoch_eval_list += await llm_judge.predict_many(
    eval_prompts,
    epoch_idxs,
    gen_params=gen_params,
    jsonl_save_path=eval_path,
    only_text=True,
    parse_to_dict=True,
    batch_size=13,
    cooldown_period=60
)

Evaluating ConcatMasks_Ovr_Hz.


100%|[38;2;103;173;91m██████████[0m| 6/6 [05:09<00:00, 51.55s/item]
