In [None]:
from sklearn.metrics import roc_auc_score, brier_score_loss
import numpy as np
import json
import pandas as pd
import os


def get_full_path(path):
    """Return the path of the only regular file located directly inside `path`.

    Sub-directories are ignored when counting.

    Args:
        path: Directory to inspect.

    Returns:
        `path` joined with the name of the single file it contains.

    Raises:
        ValueError: If the directory does not contain exactly one regular file.
    """
    candidates = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        # Only regular files count; directories are skipped.
        if os.path.isfile(candidate):
            candidates.append(candidate)

    if len(candidates) == 1:
        return candidates[0]

    raise ValueError(f"Expected exactly one file in {path}, but found {len(candidates)} files.")

def read_data(path):
    """Load the per-example results of one evaluation run into a DataFrame.

    The directory `path` must contain exactly one file (see `get_full_path`),
    which is parsed as JSON. The examples are read from
    `data['results'][<first key>]['examples']`.

    Args:
        path: Directory holding the single results JSON file.

    Returns:
        A DataFrame with one row per example, where `model_confidence` has been
        normalised by `clean_confidence` and a `confidence_score` column has
        been added by looking the normalised label up in `confidence_mapping`
        (labels missing from the mapping become NaN).
    """
    full_path = get_full_path(path)
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(full_path, encoding="utf-8") as f:
        data = json.load(f)

    results = data['results']
    # Only the first entry under 'results' is used; further entries are ignored.
    data_key = list(results.keys())[0]
    examples = results[data_key]['examples']

    df = pd.DataFrame(examples)
    df['model_confidence'] = df['model_confidence'].apply(clean_confidence)
    df["confidence_score"] = df["model_confidence"].map(confidence_mapping)

    return df

def calc_metrics(df):
    """Compute accuracy and calibration metrics for one DataFrame of examples.

    Args:
        df: DataFrame with the columns `correct`, `model_confidence` and
            `confidence_score` (as produced by `read_data`).

    Returns:
        A dict with:
          * "Faulty": number of rows whose `model_confidence` is not one of
            `class_names`.
          * "Accuracy": sum of `correct` divided by the number of rows in `df`
            (all rows, including faulty ones).
          * "ECE", "Brier Score", "AUROC": computed only on rows where both
            `confidence_score` and `correct` are non-null. "AUROC" is NaN when
            those rows contain fewer than two distinct `correct` values.
    """
    accuracy = sum(df["correct"].values) / len(df)
    # Count the rows with an unrecognised label without materialising a copy.
    n_faulty = int((~df['model_confidence'].isin(class_names)).sum())
    df_clean = df.dropna(subset=["confidence_score", "correct"])

    probs = df_clean["confidence_score"].values
    labels = df_clean["correct"].values

    ece = compute_ece(probs, labels)
    # AUROC is undefined unless both outcomes are present. Report NaN for such
    # a subset (e.g. every answer correct) instead of letting one degenerate
    # group abort the whole analysis.
    if np.unique(labels).size < 2:
        auroc = float("nan")
    else:
        auroc = roc_auc_score(labels, probs)
    brier = brier_score_loss(labels, probs)

    return {
        "Faulty": n_faulty,
        "Accuracy": accuracy,
        "ECE": ece,
        "Brier Score": brier,
        "AUROC": auroc
    }

def clean_confidence(x):
    """Normalise a raw verbal-confidence string.

    Strips one leading '{', removes backslashes, '$' and '"' characters, turns
    underscores into spaces and removes any 'text{' fragment. If the result
    contains the substring 'slight' it is canonicalised to
    'chances are slight'; otherwise the lower-cased result is returned.

    NOTE(review): a closing '}' and surrounding whitespace are not removed, so
    an input such as '\\text{likely}' cleans to 'likely}'.

    Args:
        x: Raw confidence value; expected to be a string or None.

    Returns:
        The cleaned string, or None when `x` is not a string (None, NaN from a
        missing value, a number, ...).
    """
    # Anything that is not text cannot be a label; the string methods below
    # would otherwise raise AttributeError on e.g. a float NaN.
    if not isinstance(x, str):
        return None

    if x.startswith('{'):
        x = x[1:]
    if '\\' in x:
        # remove the escape character
        x = x.replace('\\', '')
    x = x.replace('$', '')
    x = x.replace('"', '')
    x = x.replace('_', ' ')

    # Runs after the backslash removal, so '\text{' has already become 'text{'.
    x = x.replace('text{', '')

    if 'slight' in x:
        return 'chances are slight'

    # An empty string is returned unchanged.
    return x.lower() if x else x

def compute_ece(probs, labels, n_bins=10):
    """Expected Calibration Error over `n_bins` equal-width bins on [0, 1].

    For every non-empty bin the absolute gap between mean confidence and mean
    label is weighted by the fraction of samples falling in that bin; the
    weighted gaps are summed.

    Args:
        probs: Predicted confidences (array-like of floats).
        labels: Outcomes, 1/True for correct and 0/False for incorrect
            (array-like, same length as `probs`).
        n_bins: Number of equal-width bins.

    Returns:
        The ECE as a float; 0.0 when there are no samples.
    """
    probs = np.asarray(probs, dtype=float)
    labels = np.asarray(labels, dtype=float)

    bins = np.linspace(0.0, 1.0, n_bins + 1)
    # np.digitize places a value equal to the last edge (confidence == 1.0)
    # one past the final bin. Clip so that such samples land in the last bin
    # instead of being silently dropped from the score.
    bin_indices = np.clip(np.digitize(probs, bins) - 1, 0, n_bins - 1)

    ece = 0.0
    for i in range(n_bins):
        bin_mask = bin_indices == i
        if np.any(bin_mask):
            bin_conf = np.mean(probs[bin_mask])
            bin_acc = np.mean(labels[bin_mask])
            bin_frac = np.sum(bin_mask) / len(labels)
            ece += bin_frac * abs(bin_conf - bin_acc)
    return ece

def token_length(text, tokenizer):
    """Return the number of tokens `tokenizer.tokenize(text)` yields.

    Args:
        text: Text to tokenize.
        tokenizer: Any object exposing a `tokenize(text)` method whose result
            supports `len()`.
    """
    return len(tokenizer.tokenize(text))

# Confidence mapping: each verbal confidence label and the numeric score it is
# scored as, listed from least to most confident.
confidence_mapping = {
    'almost no chance': 0.05,
    'highly unlikely': 0.15,
    'chances are slight': 0.25,
    'unlikely': 0.35,
    'less than even': 0.45,
    'better than even': 0.55,
    'likely': 0.65,
    'very good chance': 0.75,
    'highly likely': 0.85,
    'almost certain': 0.95,
}

# The valid labels, in the same order as the mapping (dicts keep insertion
# order). Derived from the mapping so the two can never drift apart.
class_names = list(confidence_mapping)


In [None]:
# Directory that holds the single results JSON of this run.
# NOTE: machine-specific absolute path; adjust when running elsewhere.
run_dir = '/home/dongkeun/reasoning-models-confidence/evalchemy/logs/reasoning_slope/r1-triviaqa-slope/deepseek-ai__DeepSeek-R1-Distill-Qwen-32B'
df = read_data(run_dir)
# Show the first row as a quick check of the loaded columns.
df.head(1)

In [None]:
# Split the frame into one sub-frame per value of the 'prompt_pct' column
# (groupby yields the groups in sorted key order by default).
dfs = [df_subset for _, df_subset in df.groupby('prompt_pct')]
# Compute the metrics once per group; the loop below reuses these results.
dfs_metrics = [calc_metrics(df_subset) for df_subset in dfs]

# Collect every metric as a list with one value per group, in group order.
full_results = {
    "Accuracy": [],
    "ECE": [],
    "Brier Score": [],
    "AUROC": []
}
# Iterate over the precomputed metrics rather than `for df in dfs`: rebinding
# the global `df` would make a re-run of this cell group only the last subset.
for metrics in dfs_metrics:
    # Print all the metrics of one group on a single line.
    for metric, value in metrics.items():
        # "Faulty" is a row count, not a metric tracked across groups.
        if metric != "Faulty":
            print(value, end=", ")
            full_results[metric].append(value)
    print()

In [None]:
from scipy.stats import linregress

# Fit a least-squares line to each metric across the groups and print, for
# every metric in turn, "<slope>, <is significant>, " on one line.
for metric, values in full_results.items():
    # The regressor is the group position (0, 1, 2, ...), not the prompt_pct
    # value itself, so the slope is "change per group step".
    x = list(range(len(values)))
    y = values

    slope, intercept, r_value, p_value, std_err = linregress(x, y)
    # Test significance on the unrounded p-value: rounding first would turn
    # e.g. 0.0496 into 0.05 and wrongly report it as not significant.
    p_value_is_significant = p_value < 0.05
    # Round to three decimals for display only.
    slope = round(slope, 3)
    p_value = round(p_value, 3)
    print(slope, end=", ")
    print(p_value_is_significant, end=", ")
    
    