In [1]:
import json
import numpy as np
import glob, os

# Root directory holding the exported user-study data.
USER_STUDIES_DIR = "/home/shared/vlm_rationales_eval/user_studies_data/"

# Name of the study condition to analyse; it is also the sub-directory that is read.
setting = 'showbothmetrics_llava1.5_with_image_q20_i10_s0'
# setting = 'prodmetric_llava1.5_with_image_q20_i10_s0'
print(f"Setting: {setting}")

# glob returns matches in arbitrary order; sorting makes the merge below
# deterministic (later files win when a top-level key is repeated).
batch_pattern = os.path.join(
    USER_STUDIES_DIR, 'prolific_batches', 'batch_interaction_data', setting, '*.json'
)
files = sorted(glob.glob(batch_pattern))

# Merge every batch file into one dict. The top-level keys are treated as
# session ids (see the summary message printed below).
data = {}
for file in files:
    with open(file, encoding='utf-8') as f:
        batch = json.load(f)
    # dict.update() silently replaces existing keys, so surface any overlap
    # instead of losing sessions without notice.
    repeated_ids = data.keys() & batch.keys()
    if repeated_ids:
        print(f"Warning: {file} overwrites {len(repeated_ids)} session(s) already loaded: {sorted(repeated_ids)}")
    data.update(batch)
print(f"{len(data)} sessions loaded from {len(files)} files")

Setting: showbothmetrics_llava1.5_with_image_q20_i10_s0
30 sessions loaded from 10 files


In [2]:
# Flatten the per-session interaction lists into one list, preserving the
# iteration order of `data`. A comprehension does this in a single pass, whereas
# sum(list_of_lists, []) re-copies the accumulated list for every session.
all_instances = [
    instance
    for session in data.values()
    for instance in session['interactions']
]
len(all_instances)

300

In [None]:
# Evaluate the interactions
from collections import Counter
import math

def compute_margin(p, n):
    """Half-width of the normal-approximation 95% interval for a proportion.

    Args:
        p: Observed proportion.
        n: Number of observations the proportion is based on.

    Returns:
        ``1.96 * sqrt(p * (1 - p) / n)``, or 0 when ``n`` is not positive.
    """
    if not n > 0:
        # No observations: report a zero margin rather than dividing by zero.
        return 0
    standard_error = math.sqrt(p * (1 - p) / n)
    return 1.96 * standard_error

def compute_proportion_ci(p, n):
    """Symmetric 95% confidence interval around a proportion.

    Args:
        p: Observed proportion.
        n: Number of observations the proportion is based on.

    Returns:
        Tuple ``(lower, upper)`` equal to ``p`` minus/plus ``compute_margin(p, n)``.
        The bounds are not clipped to ``[0, 1]``.
    """
    half_width = compute_margin(p, n)
    lower = p - half_width
    upper = p + half_width
    return lower, upper

def compute_mean_ci(values):
    """95% normal-approximation confidence interval for the mean of ``values``.

    Args:
        values: Sequence or array of numbers.

    Returns:
        Tuple ``(lower, upper)`` equal to ``mean -/+ 1.96 * standard_error``,
        where the standard error uses the sample standard deviation (ddof=1).
        Degenerate inputs are handled without numpy warnings:
          * empty input   -> ``(nan, nan)``, because the mean itself is undefined;
          * a single value -> ``(mean, mean)``. No spread can be estimated from
            one observation, so the margin is reported as 0; this does not mean
            the estimate is certain.
    """
    n = len(values)
    if n == 0:
        # np.mean / np.std on an empty input would emit RuntimeWarnings and give NaN anyway.
        return (float('nan'), float('nan'))

    mean_val = np.mean(values)
    if n == 1:
        # The sample standard deviation (ddof=1) divides by n - 1 and is undefined here.
        return (mean_val, mean_val)

    std_val = np.std(values, ddof=1)
    se = std_val / np.sqrt(n)
    return (mean_val - 1.96*se, mean_val + 1.96*se)

def _ratio_with_ci(numerator, denominator):
    """Return ``(ratio, ci)`` for ``numerator / denominator`` with a 95% CI.

    When the denominator is 0 the ratio is undefined; it is reported as 0 with
    a ``(0, 0)`` interval.
    """
    if denominator == 0:
        return 0, (0, 0)
    ratio = numerator / denominator
    return ratio, compute_proportion_ci(ratio, denominator)


def evaluate_answers(stage, instances):
    """Score the users' judgements of AI correctness for one study stage.

    Label encoding:
      * ground truth: 0 means the AI is correct, 1 means the AI is incorrect
        (computed as ``1 - prediction_is_correct``);
      * user selection: compared directly against the ground truth, with 2
        meaning the user is unsure.
    The "positive" class in the confusion matrix is label 0.

    Args:
        stage: Key into each instance's ``user_selections`` dict
            (e.g. ``'answeronly'``).
        instances: Iterable of interaction dicts providing
            ``['question']['prediction_is_correct']`` and
            ``['user_selections'][stage]``.

    Returns:
        Dict with the point estimates ``total_accuracy``, ``accuracy`` (over
        answered items only), ``precision``, ``recall``, ``false_positive_rate``,
        ``f1``, ``unsure_rate`` and ``utility``; 95% intervals under the
        ``*_ci`` keys (``fpr_ci`` for the false positive rate; none for f1);
        and the raw arrays ``preds``, ``ground_truths`` and
        ``individual_utilities``. Any ratio whose denominator is 0 (for example
        accuracy when every answer is "unsure", or everything when
        ``instances`` is empty) is reported as 0 with a ``(0, 0)`` interval.
    """
    # 0 means AI is correct, 1 means AI is incorrect
    ground_truths = np.array([1 - x['question']['prediction_is_correct'] for x in instances])
    preds = np.array([x['user_selections'][stage] for x in instances])
    n_total = len(preds)

    # Confusion matrix over answered items; "unsure" (2) matches none of these.
    true_positives = np.sum(np.logical_and(preds == 0, ground_truths == 0))
    false_positives = np.sum(np.logical_and(preds == 0, ground_truths == 1))
    true_negatives = np.sum(np.logical_and(preds == 1, ground_truths == 1))
    false_negatives = np.sum(np.logical_and(preds == 1, ground_truths == 0))
    n_answered = true_positives + false_positives + true_negatives + false_negatives
    n_right = true_positives + true_negatives

    unsure_rate, unsure_rate_ci = _ratio_with_ci(np.sum(preds == 2), n_total)
    # Accuracy among answered items only; guarded because everyone may be unsure.
    accuracy, accuracy_ci = _ratio_with_ci(n_right, n_answered)
    # Accuracy over all items, where "unsure" counts as not correct.
    total_accuracy, total_accuracy_ci = _ratio_with_ci(n_right, n_total)
    precision, precision_ci = _ratio_with_ci(true_positives, true_positives + false_positives)
    recall, recall_ci = _ratio_with_ci(true_positives, true_positives + false_negatives)
    false_positive_rate, fpr_ci = _ratio_with_ci(false_positives, false_positives + true_negatives)
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    # Utility: 0 if user is unsure, 1 if user correctly predicts AI correctness,
    # -1 if user incorrectly predicts AI correctness.
    individual_utilities = np.where(preds == 2, 0, 1 - 2 * np.abs(preds - ground_truths))
    if n_total > 0:
        utility = np.mean(individual_utilities)
        utility_ci = compute_mean_ci(individual_utilities)
    else:
        utility, utility_ci = 0, (0, 0)

    return {
        'total_accuracy': total_accuracy,
        'total_accuracy_ci': total_accuracy_ci,
        'accuracy': accuracy,
        'accuracy_ci': accuracy_ci,
        'precision': precision,
        'precision_ci': precision_ci,
        'recall': recall,
        'recall_ci': recall_ci,
        'false_positive_rate': false_positive_rate,
        'fpr_ci': fpr_ci,
        'f1': f1,
        'unsure_rate': unsure_rate,
        'unsure_rate_ci': unsure_rate_ci,
        'utility': utility,
        'utility_ci': utility_ci,
        'preds': preds,
        'ground_truths': ground_truths,
        'individual_utilities': individual_utilities,
    }

# Evaluate the same interactions once per study stage, in presentation order.
answeronly_results, withexplanation_results, withexplanationquality_results = (
    evaluate_answers(stage_key, all_instances)
    for stage_key in ('answeronly', 'withexplanation', 'withexplanationquality')
)

# print("Stage                   \tUnsure Rate\tTotalAcc\tNotUnsureAcc\tPrecision\tRecall\t\tF1\t\tFPR\t\tUtility")
# print("-"*140)
# for stage, results in zip(
#     ['Answer Only', 'With Explanation', 'With Explanation + Quality'], 
#     [answeronly_results, withexplanation_results, withexplanationquality_results]
# ):
#     print(f"{stage:<25}\t{results['unsure_rate']:.1%}\t\t{results['total_accuracy']:.1%}\t\t{results['accuracy']:.1%}\t\t{results['precision']:.3f}\t\t{results['recall']:.3f}\t\t{results['f1']:.3f}\t\t{results['false_positive_rate']:.3f}\t\t{results['utility']:.3f}")
    
def _half_width(ci):
    """Half the width of a ``(lower, upper)`` interval, i.e. its ± margin."""
    lower, upper = ci
    return (upper - lower) / 2


def _half_width_or_zero(ci):
    """Like ``_half_width``, but the ``(0, 0)`` "undefined" sentinel maps to 0."""
    return 0 if ci == (0, 0) else _half_width(ci)


# Summary table: one row per stage, each metric shown as "value ± 95% margin".
print("Stage                   \tUnsure Rate\tTotalAcc\tNotUnsureAcc\tPrecision\tRecall\t\tF1\tFPR\t\tUtility")
print("-"*150)
for stage_name, results in (
    ('Answer Only', answeronly_results),
    ('With Explanation', withexplanation_results),
    ('With Explanation + Quality', withexplanationquality_results),
):
    # Margins are half the width of each CI.
    unsure_margin    = _half_width(results['unsure_rate_ci'])
    total_acc_margin = _half_width(results['total_accuracy_ci'])
    acc_margin       = _half_width(results['accuracy_ci'])
    prec_margin      = _half_width_or_zero(results['precision_ci'])
    recall_margin    = _half_width_or_zero(results['recall_ci'])
    fpr_margin       = _half_width_or_zero(results['fpr_ci'])
    utility_margin   = _half_width(results['utility_ci'])

    # Proportions are shown as percentages; f1 is a plain decimal (no CI is
    # computed for it); utility stays a plain number with its margin.
    row_cells = [
        f"{stage_name:<25}",
        f"{results['unsure_rate']:.1%} ± {unsure_margin:.1%}",
        f"{results['total_accuracy']:.1%} ± {total_acc_margin:.1%}",
        f"{results['accuracy']:.1%} ± {acc_margin:.1%}",
        f"{results['precision']:.1%} ± {prec_margin:.1%}",
        f"{results['recall']:.1%} ± {recall_margin:.1%}",
        f"{results['f1']:.3f}",
        f"{results['false_positive_rate']:.1%} ± {fpr_margin:.1%}",
        f"{results['utility']:.3f} ± {utility_margin:.3f}",
    ]
    print("\t".join(row_cells))
    
    


Stage                   	Unsure Rate	TotalAcc	NotUnsureAcc	Precision	Recall		F1	FPR		Utility
------------------------------------------------------------------------------------------------------------------------------------------------------
Answer Only              	70.3% ± 5.2%	17.0% ± 4.3%	57.3% ± 10.3%	57.5% ± 11.3%	85.7% ± 9.8%	0.689	77.5% ± 12.9%	0.043 ± 0.062
With Explanation         	31.3% ± 5.2%	38.0% ± 5.5%	55.3% ± 6.8%	55.8% ± 7.4%	85.7% ± 6.5%	0.676	80.9% ± 8.0%	0.073 ± 0.094
With Explanation + Quality	10.7% ± 3.5%	57.3% ± 5.6%	64.2% ± 5.7%	61.4% ± 6.6%	89.6% ± 5.0%	0.729	65.3% ± 8.4%	0.253 ± 0.103
