In [1]:
# Run directories to evaluate. Each inner list is one group; the metrics of
# every run in a group are printed together on a single output line.
all_paths = [
    [
        '/home/dongkeun/reasoning-models-confidence/evalchemy/logs/non_reasoning/triviaqa_val_1k/Qwen__Qwen2.5-32B-Instruct',
    ],
    # Disabled group (reasoning model run):
    # ['/home/dongkeun/reasoning-models-confidence/evalchemy/logs/reasoning_force/triviaqa_val_1k/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B'],
]

In [None]:
from sklearn.metrics import roc_auc_score, brier_score_loss
import numpy as np
import json
import pandas as pd
import os

def get_full_path(path):
    """Return the path of the single regular file located directly in `path`.

    Sub-directories are ignored when counting entries.

    Raises:
        ValueError: if `path` does not contain exactly one file.
    """
    file_names = []
    for entry in os.listdir(path):
        # Keep plain files only; nested directories do not count.
        if os.path.isfile(os.path.join(path, entry)):
            file_names.append(entry)

    n_files = len(file_names)
    if n_files == 1:
        return os.path.join(path, file_names[0])

    raise ValueError(f"Expected exactly one file in {path}, but found {n_files} files.")

def get_results(path):
    """Load one evaluation log and compute calibration metrics for it.

    `path` is a directory that must contain exactly one file (resolved via
    `get_full_path`). That file is parsed as JSON; the first entry under
    `data['results']` supplies the per-example records (`'examples'`) and the
    reported `'accuracy'`.

    Each example's `model_confidence` is normalized with `clean_confidence`
    and mapped to a numeric score through `confidence_mapping`. Rows whose
    score or `correct` value is missing are excluded before computing ECE,
    Brier score and AUROC.

    Returns:
        dict with keys:
            "Faulty"      - number of examples whose cleaned confidence is not
                            one of `class_names` (missing values included),
            "Accuracy"    - the accuracy value stored in the log file (read
                            as-is, not recomputed on the filtered rows),
            "ECE", "Brier Score", "AUROC" - metrics on the filtered rows.

    Raises:
        ValueError: propagated from `get_full_path` when the directory does
            not hold exactly one file.
        NOTE(review): `roc_auc_score` may also raise or warn when the filtered
        labels contain a single class, depending on the scikit-learn version.
    """
    path = get_full_path(path)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(path) as f:
        data = json.load(f)

    # Only the first task recorded in the log is evaluated.
    data_key = list(data['results'].keys())[0]
    task = data['results'][data_key]

    df = pd.DataFrame(task['examples'])
    df['model_confidence'] = df['model_confidence'].apply(clean_confidence)

    # Anything that is not a known confidence phrase counts as faulty.
    df_faulty = df[~df['model_confidence'].isin(class_names)]

    # Unknown phrases map to NaN and are dropped together with rows that
    # have no correctness label.
    df["confidence_score"] = df["model_confidence"].map(confidence_mapping)
    df_clean = df.dropna(subset=["confidence_score", "correct"])

    probs = df_clean["confidence_score"].values
    labels = df_clean["correct"].values

    ece = compute_ece(probs, labels)
    auroc = roc_auc_score(labels, probs)
    brier = brier_score_loss(labels, probs)

    accuracy = task['accuracy']

    return {
        "Faulty": len(df_faulty),
        "Accuracy": accuracy,
        "ECE": ece,
        "Brier Score": brier,
        "AUROC": auroc
    }


def clean_confidence(x):
    """Normalize a raw confidence string into a lower-case phrase.

    The following artifacts are removed: backslashes, `$`, double quotes,
    a `text{` wrapper, surrounding braces and surrounding whitespace.
    Underscores become spaces. Any value containing the word "slight"
    (case-insensitive) is canonicalized to 'chances are slight'.

    Args:
        x: raw confidence value; expected to be a string or None.

    Returns:
        The cleaned, lower-cased string, or None when `x` is not a string
        (None, NaN, numbers, ...), so such rows are treated as missing
        instead of raising AttributeError.
    """
    # Missing or non-textual values cannot be normalized.
    if not isinstance(x, str):
        return None

    # Strip markup characters that wrap the phrase.
    for junk in ('\\', '$', '"'):
        x = x.replace(junk, '')
    x = x.replace('_', ' ')
    x = x.replace('text{', '')

    # Drop unmatched braces on either side (e.g. the closing brace left
    # behind by `\text{...}`) together with surrounding whitespace.
    x = x.strip().strip('{}').strip()

    # Lower-case before matching so "Slight" is recognized too.
    x = x.lower()
    if 'slight' in x:
        return 'chances are slight'

    return x

def compute_ece(probs, labels, n_bins=10):
    """Compute the Expected Calibration Error with equal-width bins on [0, 1].

    For every non-empty bin, the absolute gap between mean confidence and
    mean label is weighted by the fraction of samples in the bin; the
    weighted gaps are summed.

    Args:
        probs: predicted confidences (array-like of numbers).
        labels: correctness labels, 0/1 or bool (array-like, same length).
        n_bins: number of equal-width bins.

    Returns:
        The ECE as a float; 0.0 for empty input.
    """
    # Accept plain lists as well as arrays; boolean-mask indexing below
    # requires ndarrays.
    probs = np.asarray(probs, dtype=float)
    labels = np.asarray(labels, dtype=float)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    # np.digitize puts a confidence of exactly 1.0 past the last bin edge
    # (index n_bins after the -1 shift). Clip so such samples land in the
    # last bin instead of being silently dropped from the sum.
    bin_indices = np.clip(np.digitize(probs, bins) - 1, 0, n_bins - 1)

    ece = 0.0
    for i in range(n_bins):
        bin_mask = bin_indices == i
        if np.any(bin_mask):
            bin_conf = np.mean(probs[bin_mask])
            bin_acc = np.mean(labels[bin_mask])
            bin_frac = np.sum(bin_mask) / len(labels)
            ece += bin_frac * abs(bin_conf - bin_acc)
    return float(ece)

# Numeric score assigned to each verbal confidence phrase, ordered from the
# least to the most confident.
confidence_mapping = dict([
    ("almost no chance", 0.05),
    ("highly unlikely", 0.15),
    ("chances are slight", 0.25),
    ("unlikely", 0.35),
    ("less than even", 0.45),
    ("better than even", 0.55),
    ("likely", 0.65),
    ("very good chance", 0.75),
    ("highly likely", 0.85),
    ("almost certain", 0.95),
])

# Valid confidence phrases, in the same order as the mapping above.
class_names = list(confidence_mapping)


In [3]:
# Evaluate every run directory, keeping the grouping of `all_paths`:
# all_results[i][j] holds the metrics dict for all_paths[i][j].
all_results = [
    [get_results(path) for path in paths]
    for paths in all_paths
]

In [4]:
# Metrics to report, in output column order.
metric_keys = ('Accuracy', 'ECE', 'Brier Score', 'AUROC')

# One output line per group; each run contributes its metrics as
# comma-separated values followed by a trailing ", ".
for results in all_results:
    for result in results:
        print(*(result[key] for key in metric_keys), sep=', ', end=', ')
    print()

0.8, 0.20000000000000007, 0.07450000000000001, 1.0, 
