In [1]:
import json
import numpy as np
import glob, os

# Base directory of the user-study data (absolute path on the shared machine).
USER_STUDIES_DIR = "/home/shared/vlm_rationales_eval/user_studies_data/"

# Name of the study configuration; it is also the sub-directory that is scanned.
setting = 'showbothmetrics_llava1.5_with_image_q20_i10_s0'
print(f"Setting: {setting}")

# Build the pattern with os.path.join so the trailing slash on USER_STUDIES_DIR
# does not produce a doubled separator. Sort the matches: glob returns them in
# arbitrary order, and since `data.update` lets later files overwrite earlier
# ones on duplicate keys, a fixed order keeps the merged result reproducible.
files = sorted(glob.glob(os.path.join(
    USER_STUDIES_DIR, 'prolific_batches', 'batch_interaction_data', setting, '*.json'
)))

# Merge the top-level mapping of every batch file into a single dict
# (its entries are what the summary line below counts as sessions).
data = {}
for file in files:
    # JSON is read as UTF-8 explicitly instead of the platform default encoding.
    with open(file, encoding='utf-8') as f:
        data.update(json.load(f))
print(f"{len(data)} sessions loaded from {len(files)} files")

Setting: showbothmetrics_llava1.5_with_image_q20_i10_s0
16 sessions loaded from 5 files


In [2]:
# Flatten every session's 'interactions' list into one flat list.
# A nested comprehension is linear in the number of interactions, whereas
# sum(list_of_lists, []) re-copies the accumulated list on every addition.
all_instances = [
    instance
    for uid in data
    for instance in data[uid]['interactions']
]
len(all_instances)

160

In [4]:
# Evaluate the interactions
from collections import Counter

def evaluate_answers(stage, instances):
    """Score user judgements of the AI's correctness for one study stage.

    Label convention (as used by the code below):
      * ground truth: 0 means the AI is correct, 1 means the AI is incorrect
        (computed as ``1 - prediction_is_correct``);
      * user selection: 0 or 1 is compared against the ground truth,
        2 is counted as "unsure".
    The "positive" class for precision/recall/FPR/F1 is label 0.

    Args:
        stage: key into each instance's ``'user_selections'`` mapping.
        instances: iterable of dicts providing
            ``x['question']['prediction_is_correct']`` and
            ``x['user_selections'][stage]``.

    Returns:
        dict with the scalar metrics ``total_accuracy`` (correct / all
        instances), ``accuracy`` (correct / non-unsure instances),
        ``precision``, ``recall``, ``false_positive_rate``, ``f1``,
        ``unsure_rate`` and ``utility`` (mean of the per-instance utilities),
        plus the arrays ``preds``, ``ground_truths`` and
        ``individual_utilities``.

        Any ratio whose denominator is zero (e.g. every answer is "unsure",
        or ``instances`` is empty) is reported as 0 instead of ``nan``.

    Raises:
        KeyError: if an instance lacks one of the keys listed above.
    """
    def ratio(numerator, denominator):
        # Shared zero-denominator convention for every metric: report 0.
        return numerator / denominator if denominator > 0 else 0

    ground_truths = np.array([1 - x['question']['prediction_is_correct'] for x in instances])       # 0 means AI is correct, 1 means AI is incorrect
    preds = np.array([x['user_selections'][stage] for x in instances])

    # Confusion-matrix counts; "unsure" (2) answers fall into none of these.
    true_positives = np.sum((preds == 0) & (ground_truths == 0))
    false_positives = np.sum((preds == 0) & (ground_truths == 1))
    true_negatives = np.sum((preds == 1) & (ground_truths == 1))
    false_negatives = np.sum((preds == 1) & (ground_truths == 0))

    n_total = len(ground_truths)
    n_decided = true_positives + false_positives + true_negatives + false_negatives
    n_right = true_positives + true_negatives

    unsure_rate = np.mean(preds == 2) if n_total > 0 else 0.0
    # Previously unguarded: produced nan (with a RuntimeWarning) when every
    # answer was "unsure", unlike the other metrics which fall back to 0.
    accuracy = ratio(n_right, n_decided)
    total_accuracy = ratio(n_right, n_total)
    precision = ratio(true_positives, true_positives + false_positives)
    recall = ratio(true_positives, true_positives + false_negatives)
    false_positive_rate = ratio(false_positives, false_positives + true_negatives)
    f1 = ratio(2 * precision * recall, precision + recall)

    # Utility: 0 if user is unsure, 1 if user correctly predicts AI correctness, -1 if user incorrectly predicts AI correctness
    individual_utilities = np.where(preds == 2, 0, 1 - 2 * np.abs(preds - ground_truths))
    utility = np.mean(individual_utilities) if n_total > 0 else 0.0

    return {
        'total_accuracy': total_accuracy,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'false_positive_rate': false_positive_rate,
        'f1': f1,
        'unsure_rate': unsure_rate,
        'utility': utility,
        'preds': preds,
        'ground_truths': ground_truths,
        'individual_utilities': individual_utilities,
    }

# Score all interactions once per study stage.
answeronly_results = evaluate_answers('answeronly', all_instances)
withexplanation_results = evaluate_answers('withexplanation', all_instances)
withexplanationquality_results = evaluate_answers('withexplanationquality', all_instances)

# Display label paired with the metrics of the corresponding stage, in print order.
stage_table = [
    ('Answer Only', answeronly_results),
    ('With Explanation', withexplanation_results),
    ('With Explanation + Quality', withexplanationquality_results),
]

print("Stage                   \tUnsure Rate\tTotalAcc\tNotUnsureAcc\tPrecision\tRecall\t\tF1\t\tFPR\t\tUtility")
print("-"*140)
for stage, results in stage_table:
    # The label and the first metric are separated by one tab; all remaining
    # columns are separated by two tabs.
    cells = [
        f"{stage:<25}\t{results['unsure_rate']:.1%}",
        f"{results['total_accuracy']:.1%}",
        f"{results['accuracy']:.1%}",
        f"{results['precision']:.3f}",
        f"{results['recall']:.3f}",
        f"{results['f1']:.3f}",
        f"{results['false_positive_rate']:.3f}",
        f"{results['utility']:.3f}",
    ]
    print("\t\t".join(cells))


Stage                   	Unsure Rate	TotalAcc	NotUnsureAcc	Precision	Recall		F1		FPR		Utility
--------------------------------------------------------------------------------------------------------------------------------------------
Answer Only              	73.1%		16.2%		60.5%		0.625		0.926		0.746		0.938		0.056
With Explanation         	35.0%		37.5%		57.7%		0.589		0.883		0.707		0.841		0.100
With Explanation + Quality	14.4%		53.8%		62.8%		0.615		0.882		0.724		0.689		0.219
