In [5]:
import sys
from pathlib import Path

%load_ext autoreload
%autoreload 2

# Make the project's `src` directory importable from this notebook.
# NOTE(review): the absolute path is machine-specific; adjust it when running elsewhere.
root_path = Path("/home/olivieri/exp").resolve()
src_path = root_path / "src"
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

In [6]:
from IPython.display import Markdown
import pandas as pd
import json
from glob import glob
import os
import numpy as np
import xarray as xr

from data import *
from path import get_eval_prs_path

# Eval. Annotations

In [7]:
# Settings shared by the cells below: both values are passed unchanged to the
# project helpers (get_selected_annots_path, get_many_answer_gt, get_many_eval_pr).
BY_MODEL = "LRASPP_MobileNet_V3"  # presumably the model whose annotations are evaluated — confirm
SPLIT_BY = "non-splitted"  # presumably selects the annotation split variant — confirm

In [18]:
# Folder holding the "llm_judge_assessment" files for the selected annotations.
root_prs_path = get_selected_annots_path(BY_MODEL, SPLIT_BY) / "llm_judge_assessment"

# Read the answers stored in answer_prs.jsonl (without the loader's state).
answer_prs_file = root_prs_path / "answer_prs.jsonl"
answer_prs = get_many_item(answer_prs_file, return_state=False)

In [13]:
# Load the GT answers for the same model/split selection as `answer_prs`,
# so both can be indexed side by side (NOTE(review): assumes matching keys — confirm).
answer_gts = get_many_answer_gt(BY_MODEL, SPLIT_BY)

In [29]:
# Entry to inspect.
idx = 0

# Render the GT answer first and the PR answer second, each under its label.
for label, answers in (("GTS:", answer_gts), ("PRS:", answer_prs)):
    print(label)
    display(Markdown(answers[idx]))

GTS:


The ground truth AEROPLANE regions have been segmented in a coarser and incomplete way, especially regarding the wings, and two tiny AEROPLANE patches have been hallucinated on the right edge. The prediction mask for the ground truth PERSON region on the center-bottom-left is slightly more blob-like.

PRS:


The ground truth AEROPLANE region has been segmented with inaccurate boundaries, especially on the right wing and the tail of the plane in the background. The PERSON region on the bottom has been under-segmented.


# Eval Prompt Assessment

In [20]:
# NOTE(review): `root_exp_name` is not assigned in any cell visible in this notebook;
# confirm where it is defined, otherwise this cell fails on a fresh kernel.
root_prs_path = get_selected_annots_path(BY_MODEL, SPLIT_BY) / "eval_prs" / root_exp_name

# Each *.jsonl file in the experiment folder is one variation; its name is the
# file name stripped of directory and extension.
prs_path = glob(f"{root_prs_path}/*.jsonl")
variations_names = []
for pr_p in prs_path:
    file_name = os.path.basename(pr_p)
    variations_names.append(os.path.splitext(file_name)[0])
variations_names.sort()
variations_names

['SepMasks_Ovr']

In [29]:
# levels = ["exp", "var", ""]
# 3-D container with dims (var, img_idx, metric); it is allocated on the first
# loop iteration, once the row/column labels of a variation are known.
data_da = None

# NOTE(review): this list is never filled or read in this cell.
pred_prs_per_var_list = []

# NOTE(review): `root_exp_name` (used below) is not assigned in any visible cell — confirm.
for variation_name in variations_names:
    
    # Load this variation's evaluations; each key of `eval_prs` becomes a row,
    # then rows and columns are sorted by label.
    eval_prs = get_many_eval_pr(BY_MODEL, SPLIT_BY, f"{root_exp_name}/{variation_name}", return_state=False)
    prs_per_img_idx_df = pd.DataFrame.from_dict(eval_prs, orient='index').sort_index().sort_index(axis=1)
    # Wrap the frame in a 2-D DataArray with dims (img_idx, metric).
    prs_per_img_idx_da = xr.DataArray(prs_per_img_idx_df, coords=[sorted(prs_per_img_idx_df.index), sorted(prs_per_img_idx_df.columns)], dims=["img_idx", "metric"])
    # Replace the "pred" entries with the result of comparing them to "correct".
    # NOTE(review): unlike the DataFrame cell further down, which keeps None,
    # a missing pred presumably ends up False here — confirm this is intended.
    prs_per_img_idx_da.loc[..., "pred"] = prs_per_img_idx_da.sel(metric="pred") == "correct"

    if data_da is None:
        # The labels come from all variation names plus the FIRST variation's
        # rows/columns. Assumes every variation shares those labels — TODO confirm.
        coords = [variations_names, prs_per_img_idx_df.index, prs_per_img_idx_df.columns] # indexes names
        sorted_coords = [sorted(dim_values) for dim_values in coords]
        dims = ["var", "img_idx", "metric"] # dimensions names
        shape = [len(l) for l in sorted_coords]
        # Object dtype because the cells mix different value types (booleans for "pred", other metrics as loaded).
        data_da = xr.DataArray(np.empty(shape, dtype=object), coords=sorted_coords, dims=dims)

    # Store this variation's (img_idx, metric) slice.
    data_da.loc[variation_name] = prs_per_img_idx_da

In [35]:
# Display the "pred" slice of the container: one entry per (var, img_idx).
data_da.sel(metric="pred")

In [6]:
# Build four frames with one column per variation:
#   df_pred_gts / df_score_gts — "pred" and "score" of the GT evaluations
#   df_pred_prs / df_score_prs — "pred" and "score" of the PR evaluations
#
# NOTE(review): `variations` is not assigned in any visible cell (an earlier cell
# builds `variations_names`), and get_many_eval_pr is called here with different
# arguments than in the xarray cell above — confirm both before relying on this cell.


def _pred_and_score(evals):
    """Split an evaluation mapping into its "pred" and "score" columns.

    The mapping is turned into a frame with one row per key. Each "pred" value
    becomes the result of comparing it with "correct"; None is kept as None so
    that missing verdicts can be told apart from wrong ones.

    Returns:
        (pred, score): two Series indexed by the keys of `evals`.
    """
    df_all = pd.DataFrame.from_dict(evals).transpose()
    pred = df_all["pred"].map(lambda x: x if x is None else x == "correct")
    return pred, df_all["score"]


def _one_column_per_variation(columns, names):
    """Place the Series in `columns` side by side and label them with `names`.

    An empty `columns` list gives an empty DataFrame.
    """
    if not columns:
        return pd.DataFrame()
    frame = pd.concat(columns, axis=1)
    frame.columns = names
    return frame


# The GT evaluations do not depend on the variation, so they are loaded once
# instead of once per loop iteration.
eval_gts = get_many_eval_gt(BY_MODEL, return_state=False)
_df_pred_gts, _df_score_gts = _pred_and_score(eval_gts)

# Collect the per-variation columns in lists and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every pass.
pred_prs_columns = []
score_prs_columns = []
for var in variations:
    eval_prs = get_many_eval_pr(BY_MODEL, "llm_judge_assessment", var, return_state=False)
    _df_pred_prs, _df_score_prs = _pred_and_score(eval_prs)
    pred_prs_columns.append(_df_pred_prs)
    score_prs_columns.append(_df_score_prs)

# The GT columns are repeated once per variation so that the GT frames have the
# same columns as the PR frames and can be compared element-wise.
n_variations = len(pred_prs_columns)
df_pred_gts = _one_column_per_variation([_df_pred_gts] * n_variations, variations)
df_score_gts = _one_column_per_variation([_df_score_gts] * n_variations, variations)
df_pred_prs = _one_column_per_variation(pred_prs_columns, variations)
df_score_prs = _one_column_per_variation(score_prs_columns, variations)

## Pred accuracy

In [7]:
# Only cells where both the PR and the GT pred are present take part in the mean.
both_present = df_pred_prs.notna() & df_pred_gts.notna()
# Element-wise agreement between PR and GT preds.
pred_agreement = df_pred_prs.eq(df_pred_gts)
# Per-column mean of the agreement, with the other cells masked out.
pred_acc = pred_agreement.where(both_present, None).mean()

## Score error (MAE and ME)

In [8]:
# Signed difference between GT and PR scores (positive when the GT score is higher).
score_diff = df_score_gts - df_score_prs

# Per-column mean absolute error and mean (signed) error.
score_MAE = np.abs(score_diff).mean(axis=0)
score_ME = score_diff.mean(axis=0)

## Overall summary table

In [9]:
# Summary table: one row per variation, one column per statistic.
score_table = pd.concat(
    [pred_acc, score_MAE, score_ME],
    axis=1,
    keys=["pred_acc.", "score MAE", "score ME"],
)
score_table

Unnamed: 0,pred_acc.,score MAE,score ME
