In [2]:
import re
import json
import pandas as pd
import numpy as np
from scipy.stats import beta
from prettytable import PrettyTable
from omegaconf import OmegaConf
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, multilabel_confusion_matrix, matthews_corrcoef
from scipy.stats import spearmanr
from collections import Counter

from cgeval.rating import Ratings, Label, Observation
from cgeval import Report
from cgeval.report import GenericReport
from cgeval.distribution import Beta, BetaParams

In [3]:
BASE_PATH = f'../../out/pipeline'


MODEL = {
    'Llama-3.3-70B': f'{BASE_PATH}/2025-05-15_sentiment_analysis_llama3-3',
    'Llama-2-7B': f'{BASE_PATH}/2025-05-15_sentiment_analysis_llama2',
    'Mistral-7B': f'{BASE_PATH}/2025-05-15_sentiment_analysis_mistral',
}

IMAGE_MODEL_ANIMAL = {
    'Stable Diffusion 3.5': f'{BASE_PATH}/2025-06-25_animal_detection_stable-diffusion_animal',
    'Stable Cascade': f'{BASE_PATH}/2025-06-25_animal_detection_stable-cascade_animal',
    'FLUX.1-dev': f'{BASE_PATH}/2025-06-25_animal_detection_flux_animal'
}

IMAGE_MODEL_COUNT = {
    'Stable Diffusion 3.5': f'{BASE_PATH}/2025-06-25_animal_detection_stable-diffusion_count',
    'Stable Cascade': f'{BASE_PATH}/2025-06-25_animal_detection_stable-cascade_count',
    'FLUX.1-dev': f'{BASE_PATH}/2025-06-25_animal_detection_flux_count'
}

cls_name2id = {
    'FinancialBERT': 'FIB',
    'lxyuan_DistilBert': 'DSS',
    'ollama3.2': 'LL3',
    'Yolov8': 'VO8',
    'DETR': 'DTR',
    'LLaVA': 'LLV'
}

model_name2id = {
    'Llama-3.3-70B': 'L370B',
    'Llama-2-7B': 'L27B',
    'Mistral-7B': 'Mi7B',
    'Stable Diffusion 3.5': 'SD35',
    'Stable Cascade': 'StCa',
    'FLUX.1-dev': 'FLX1'
}

In [4]:
def pretty_print_latex(latex_str):
    lines = latex_str.replace(r" \\ ", r" \\" + "\n").splitlines()
    formatted_lines = []
    indent_level = 0
    for line in lines:
        if r"\begin" in line:
            formatted_lines.append(line)
            indent_level += 1
        elif r"\end" in line:
            indent_level -= 1
            formatted_lines.append(line)
        else:
            formatted_lines.append("    " * indent_level + line)
    return "\n".join(formatted_lines)

In [5]:
def compare_beta_distributions(dist_1, dist_2):
    n_samples = 100000
    samples_1 = beta.rvs(dist_1.params.a, dist_1.params.b, size=n_samples)
    samples_2 = beta.rvs(dist_2.params.a, dist_2.params.b, size=n_samples)

    return np.mean(samples_1 > samples_2)

In [6]:
def load_dist(model_path, cls):
    report = GenericReport()
    report.load(f"{model_path}/quantify/cls_report_{cls.id}.json")

    report = vars(report)
    dist = report['dist_report']

    # oracle = dist[0]
    p = dist[1]
    # p_obs = dist[2]

    p_dist = Beta(params=BetaParams(p['a'], p['b']))
    return p_dist

In [7]:
def load_obs_dist(model_path, cls):
    report = GenericReport()
    report.load(f"{model_path}/quantify/cls_report_{cls.id}.json")

    report = vars(report)
    dist = report['dist_report']

    # oracle = dist[0]
    # p = dist[1]
    p_obs = dist[2]

    p_obs_dist = Beta(params=BetaParams(p_obs['a'], p_obs['b']))
    return p_obs_dist

In [8]:
def load_oracle_dist(model_path, cls):
    report = GenericReport()
    report.load(f"{model_path}/quantify/cls_report_{cls.id}.json")

    report = vars(report)
    dist = report['dist_report']

    oracle = dist[0]
    # p = dist[1]
    # p_obs = dist[2]

    oracle_dist = Beta(params=BetaParams(oracle['a'], oracle['b']))
    return oracle_dist

In [9]:
t = PrettyTable(['Classifier', 'P(L370B > L27B)', 'P(L370B > Mi7B)', 'P(L27B > Mi7B)'])

cfg =  OmegaConf.load(f'{next(iter(MODEL.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(MODEL.keys())
    sys_1 = MODEL[next(models)]
    sys_2 = MODEL[next(models)]
    sys_3 = MODEL[next(models)]

    sys_1_d = load_dist(sys_1, cls)
    sys_2_d = load_dist(sys_2, cls)
    sys_3_d = load_dist(sys_3, cls)

    p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
    p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)

    t.add_row([cls_name2id[cls.id], p_1_2, p_1_3, p_2_3])

sys_1_d = load_oracle_dist(sys_1, cls)
sys_2_d = load_oracle_dist(sys_2, cls)
sys_3_d = load_oracle_dist(sys_3, cls)

p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)
    
t.add_row(['Human', p_1_2, p_1_3, p_2_3])


# ClfA:  p(SysA > SysB) p(SysA > SysC) p(SysB > SysC)
# ClfB:  p(SysA > SysB) p(SysA > SysC) p(SysB > SysC)
# ClfC:  p(SysA > SysB) p(SysA > SysC) p(SysB > SysC)

print('Compare Calibrated Distributions (BCC)')
t

Compare Calibrated Distributions (BCC)


Classifier,P(L370B > L27B),P(L370B > Mi7B),P(L27B > Mi7B)
FIB,0.7318,0.86904,0.70112
DSS,0.73961,0.92693,0.80705
LL3,0.89839,0.94518,0.64502
Human,0.84647,0.92689,0.66538


In [10]:
l = pretty_print_latex(t.get_latex_string())

print(l)

\begin{tabular}{cccc}
    Classifier & P(L370B > L27B) & P(L370B > Mi7B) & P(L27B > Mi7B) \\
    FIB & 0.7318 & 0.86904 & 0.70112 \\
    DSS & 0.73961 & 0.92693 & 0.80705 \\
    LL3 & 0.89839 & 0.94518 & 0.64502 \\
    Human & 0.84647 & 0.92689 & 0.66538 \\
\end{tabular}


In [11]:
t = PrettyTable(['Classifier', 'P(L370B > L27B)', 'P(L370B > Mi7B)', 'P(L27B > Mi7B)'])

cfg =  OmegaConf.load(f'{next(iter(MODEL.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(MODEL.keys())
    sys_1 = MODEL[next(models)]
    sys_2 = MODEL[next(models)]
    sys_3 = MODEL[next(models)]

    sys_1_d = load_obs_dist(sys_1, cls)
    sys_2_d = load_obs_dist(sys_2, cls)
    sys_3_d = load_obs_dist(sys_3, cls)

    p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
    p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)

    t.add_row([cls_name2id[cls.id], p_1_2, p_1_3, p_2_3])

    
sys_1_d = load_oracle_dist(sys_1, cls)
sys_2_d = load_oracle_dist(sys_2, cls)
sys_3_d = load_oracle_dist(sys_3, cls)

p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)
    
t.add_row(['Human', p_1_2, p_1_3, p_2_3])


# ClfA:  p(SysA > SysB) p(SysA > SysC) p(SysB > SysC)
# ClfB:  p(SysA > SysB) p(SysA > SysC) p(SysB > SysC)
# ClfC:  p(SysA > SysB) p(SysA > SysC) p(SysB > SysC)

print('Compare Observed Distributions (CC)')
t

Compare Observed Distributions (CC)


Classifier,P(L370B > L27B),P(L370B > Mi7B),P(L27B > Mi7B)
FIB,1.0,1.0,1.0
DSS,7e-05,0.92387,1.0
LL3,0.66883,0.9859,0.96162
Human,0.84846,0.92555,0.6686


In [12]:
l = pretty_print_latex(t.get_latex_string())

print(l)

\begin{tabular}{cccc}
    Classifier & P(L370B > L27B) & P(L370B > Mi7B) & P(L27B > Mi7B) \\
    FIB & 1.0 & 1.0 & 1.0 \\
    DSS & 7e-05 & 0.92387 & 1.0 \\
    LL3 & 0.66883 & 0.9859 & 0.96162 \\
    Human & 0.84846 & 0.92555 & 0.6686 \\
\end{tabular}


In [13]:
t = PrettyTable(['Classifier', 'P(SD35 > StCa)', 'P(SD35 > FLX1)', 'P(StCa > FLX1)'])

# 'Stable Diffusion 3.5': 'SD35',
# 'Stable Cascade': 'StCa',
# 'FLUX.1-dev': 'FLX1'

cfg =  OmegaConf.load(f'{next(iter(IMAGE_MODEL_ANIMAL.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(IMAGE_MODEL_ANIMAL.keys())
    sys_1 = IMAGE_MODEL_ANIMAL[next(models)]
    sys_2 = IMAGE_MODEL_ANIMAL[next(models)]
    sys_3 = IMAGE_MODEL_ANIMAL[next(models)]

    sys_1_d = load_dist(sys_1, cls)
    sys_2_d = load_dist(sys_2, cls)
    sys_3_d = load_dist(sys_3, cls)

    p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
    p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)

    t.add_row([cls_name2id[cls.id], p_1_2, p_1_3, p_2_3])

sys_1_d = load_oracle_dist(sys_1, cls)
sys_2_d = load_oracle_dist(sys_2, cls)
sys_3_d = load_oracle_dist(sys_3, cls)

p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)
    
t.add_row(['Human', p_1_2, p_1_3, p_2_3])

t

Classifier,P(SD35 > StCa),P(SD35 > FLX1),P(StCa > FLX1)
VO8,1.0,0.71162,0.0
DTR,1.0,0.6917,0.0
LLV,1.0,0.48717,0.0
Human,1.0,0.49843,0.0


In [14]:
l = pretty_print_latex(t.get_latex_string())

print(l)

\begin{tabular}{cccc}
    Classifier & P(SD35 > StCa) & P(SD35 > FLX1) & P(StCa > FLX1) \\
    VO8 & 1.0 & 0.71162 & 0.0 \\
    DTR & 1.0 & 0.6917 & 0.0 \\
    LLV & 1.0 & 0.48717 & 0.0 \\
    Human & 1.0 & 0.49843 & 0.0 \\
\end{tabular}


In [20]:
t = PrettyTable(['Classifier', 'P(SD35 > StCa)', 'P(SD35 > FLX1)', 'P(StCa > FLX1)'])

# 'Stable Diffusion 3.5': 'SD35',
# 'Stable Cascade': 'StCa',
# 'FLUX.1-dev': 'FLX1'

cfg =  OmegaConf.load(f'{next(iter(IMAGE_MODEL_ANIMAL.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(IMAGE_MODEL_ANIMAL.keys())
    sys_1 = IMAGE_MODEL_ANIMAL[next(models)]
    sys_2 = IMAGE_MODEL_ANIMAL[next(models)]
    sys_3 = IMAGE_MODEL_ANIMAL[next(models)]

    sys_1_d = load_obs_dist(sys_1, cls)
    sys_2_d = load_obs_dist(sys_2, cls)
    sys_3_d = load_obs_dist(sys_3, cls)

    p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
    p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)

    t.add_row([cls_name2id[cls.id], p_1_2, p_1_3, p_2_3])

sys_1_d = load_oracle_dist(sys_1, cls)
sys_2_d = load_oracle_dist(sys_2, cls)
sys_3_d = load_oracle_dist(sys_3, cls)

p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)
    
t.add_row(['Human', p_1_2, p_1_3, p_2_3])

t

Classifier,P(SD35 > StCa),P(SD35 > FLX1),P(StCa > FLX1)
VO8,1.0,1.0,0.0
DTR,1.0,1.0,0.0
LLV,1.0,0.99868,0.0
Human,1.0,0.49837,0.0


In [None]:
l = pretty_print_latex(t.get_latex_string())

print(l)

In [15]:
t = PrettyTable(['Classifier', 'P(SD35 > StCa)', 'P(SD35 > FLX1)', 'P(StCa > FLX1)'])

# 'Stable Diffusion 3.5': 'SD35',
# 'Stable Cascade': 'StCa',
# 'FLUX.1-dev': 'FLX1'

cfg =  OmegaConf.load(f'{next(iter(IMAGE_MODEL_COUNT.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(IMAGE_MODEL_COUNT.keys())
    sys_1 = IMAGE_MODEL_COUNT[next(models)]
    sys_2 = IMAGE_MODEL_COUNT[next(models)]
    sys_3 = IMAGE_MODEL_COUNT[next(models)]

    sys_1_d = load_dist(sys_1, cls)
    sys_2_d = load_dist(sys_2, cls)
    sys_3_d = load_dist(sys_3, cls)

    p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
    p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)

    t.add_row([cls_name2id[cls.id], p_1_2, p_1_3, p_2_3])

sys_1_d = load_oracle_dist(sys_1, cls)
sys_2_d = load_oracle_dist(sys_2, cls)
sys_3_d = load_oracle_dist(sys_3, cls)

p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)
    
t.add_row(['Human', p_1_2, p_1_3, p_2_3])

t

Classifier,P(SD35 > StCa),P(SD35 > FLX1),P(StCa > FLX1)
VO8,1.0,0.07458,0.0
DTR,1.0,0.19404,0.0
LLV,1.0,0.26272,0.0
Human,1.0,0.27823,0.0


In [16]:
l = pretty_print_latex(t.get_latex_string())

print(l)

\begin{tabular}{cccc}
    Classifier & P(SD35 > StCa) & P(SD35 > FLX1) & P(StCa > FLX1) \\
    VO8 & 1.0 & 0.07458 & 0.0 \\
    DTR & 1.0 & 0.19404 & 0.0 \\
    LLV & 1.0 & 0.26272 & 0.0 \\
    Human & 1.0 & 0.27823 & 0.0 \\
\end{tabular}


In [17]:
t = PrettyTable(['Classifier', 'P(SD35 > StCa)', 'P(SD35 > FLX1)', 'P(StCa > FLX1)'])

# 'Stable Diffusion 3.5': 'SD35',
# 'Stable Cascade': 'StCa',
# 'FLUX.1-dev': 'FLX1'

cfg =  OmegaConf.load(f'{next(iter(IMAGE_MODEL_COUNT.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(IMAGE_MODEL_COUNT.keys())
    sys_1 = IMAGE_MODEL_COUNT[next(models)]
    sys_2 = IMAGE_MODEL_COUNT[next(models)]
    sys_3 = IMAGE_MODEL_COUNT[next(models)]

    sys_1_d = load_obs_dist(sys_1, cls)
    sys_2_d = load_obs_dist(sys_2, cls)
    sys_3_d = load_obs_dist(sys_3, cls)

    p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
    p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)

    t.add_row([cls_name2id[cls.id], p_1_2, p_1_3, p_2_3])

sys_1_d = load_oracle_dist(sys_1, cls)
sys_2_d = load_oracle_dist(sys_2, cls)
sys_3_d = load_oracle_dist(sys_3, cls)

p_1_2 = compare_beta_distributions(sys_1_d, sys_2_d)
p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)
p_2_3 = compare_beta_distributions(sys_2_d, sys_3_d)
    
t.add_row(['Human', p_1_2, p_1_3, p_2_3])

t

Classifier,P(SD35 > StCa),P(SD35 > FLX1),P(StCa > FLX1)
VO8,1.0,0.99756,0.0
DTR,1.0,0.19338,0.0
LLV,1.0,0.0,0.0
Human,1.0,0.27839,0.0


In [18]:
l = pretty_print_latex(t.get_latex_string())

print(l)

\begin{tabular}{cccc}
    Classifier & P(SD35 > StCa) & P(SD35 > FLX1) & P(StCa > FLX1) \\
    VO8 & 1.0 & 0.99756 & 0.0 \\
    DTR & 1.0 & 0.19338 & 0.0 \\
    LLV & 1.0 & 0.0 & 0.0 \\
    Human & 1.0 & 0.27839 & 0.0 \\
\end{tabular}


In [19]:
cfg =  OmegaConf.load(f'{next(iter(IMAGE_MODEL_COUNT.values()))}/config.yaml')
for cls in cfg.classifier:
    models = iter(IMAGE_MODEL_COUNT.keys())
    sd = IMAGE_MODEL_COUNT[next(models)]
    flx = IMAGE_MODEL_COUNT[next(models)]

    sys_1_d = load_obs_dist(sd, cls)
    sys_3_d = load_obs_dist(flx, cls)

    print()
    print(cls.id)

    print(sys_1_d.params)
    print(sys_3_d.params)

    p_1_3 = compare_beta_distributions(sys_1_d, sys_3_d)

    print(p_1_3)



Yolov8
BetaParams(a=6104.3, b=3944.65)
BetaParams(a=994.67, b=8882.48)
1.0

DETR
BetaParams(a=6229.89, b=3705.02)
BetaParams(a=890.49, b=8798.84)
1.0

LLaVA
BetaParams(a=5650.36, b=4316.89)
BetaParams(a=1.98, b=9939.01)
1.0
