In [1]:
%pip install numpy

Defaulting to user installation because normal site-packages is not writeable
Collecting numpy
  Downloading numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
     |████████████████████████████████| 17.3 MB 15.5 MB/s eta 0:00:01
Installing collected packages: numpy
Successfully installed numpy-1.24.4
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json
import glob, os
from itertools import combinations

USER_STUDIES_DIR = "/home/shared/vlm_rationales_eval/user_studies_data/"
settings = [
    'showbothmetrics_llava1.5_with_image_q20_i10_s0',
    'prodmetric_llava1.5_with_image_q20_i10_s0',
    'vf_numeric_llava1.5_with_image_q20_i10_s0'
]

# Maps each setting name to the set of top-level keys (treated as user ids)
# found across all of that setting's batch JSON files.
settings_user_ids = {}

for setting in settings:
    print(f"Processing setting: {setting}")
    pattern = os.path.join(
        USER_STUDIES_DIR,
        "prolific_batches",
        "batch_interaction_data",
        setting,
        "*.json",
    )
    files = glob.glob(pattern)

    user_ids = set()
    for file in files:
        with open(file) as f:
            batch_data = json.load(f)
        # The keys of each batch file are collected as participant ids;
        # a set de-duplicates ids that appear in more than one file.
        user_ids.update(batch_data.keys())

    settings_user_ids[setting] = user_ids
    print(f"{len(user_ids)} participants in {setting}")

# Report how many participants each pair of settings has in common.
print("\nOverlap between settings:")
for s1, s2 in combinations(settings, 2):
    overlap = settings_user_ids[s1] & settings_user_ids[s2]
    print(f"Overlap between {s1} and {s2}: {len(overlap)} participants")

# Participants that appear in the first three settings simultaneously.
triple_overlap = (
    settings_user_ids[settings[0]]
    & settings_user_ids[settings[1]]
    & settings_user_ids[settings[2]]
)
print(f"\nParticipants present in all three settings: {len(triple_overlap)}")


Processing setting: showbothmetrics_llava1.5_with_image_q20_i10_s0
30 participants in showbothmetrics_llava1.5_with_image_q20_i10_s0
Processing setting: prodmetric_llava1.5_with_image_q20_i10_s0
31 participants in prodmetric_llava1.5_with_image_q20_i10_s0
Processing setting: vf_numeric_llava1.5_with_image_q20_i10_s0
28 participants in vf_numeric_llava1.5_with_image_q20_i10_s0

Overlap between settings:
Overlap between showbothmetrics_llava1.5_with_image_q20_i10_s0 and prodmetric_llava1.5_with_image_q20_i10_s0: 0 participants
Overlap between showbothmetrics_llava1.5_with_image_q20_i10_s0 and vf_numeric_llava1.5_with_image_q20_i10_s0: 1 participants
Overlap between prodmetric_llava1.5_with_image_q20_i10_s0 and vf_numeric_llava1.5_with_image_q20_i10_s0: 0 participants

Participants present in all three settings: 0


In [None]:
import json
import numpy as np
import glob, os

USER_STUDIES_DIR = "/home/shared/vlm_rationales_eval/user_studies_data/"

# Exactly one setting is analysed per run; swap the active line to change it.
# setting = 'showbothmetrics_llava1.5_with_image_q20_i10_s0'
# setting = 'prodmetric_llava1.5_with_image_q20_i10_s0'
setting = 'vf_numeric_llava1.5_with_image_q20_i10_s0'
print(f"Setting: {setting}")
files = glob.glob(f'{USER_STUDIES_DIR}/prolific_batches/batch_interaction_data/{setting}/*.json')

# user_id -> list of sessions. A list is kept per user so that a user id
# appearing in several batch files accumulates sessions instead of being overwritten.
data = {}
for file in files:
    with open(file) as f:
        batch_data = json.load(f)
    for user_id, session in batch_data.items():
        data.setdefault(user_id, []).append(session)

total_sessions = sum(map(len, data.values()))
print(f"{total_sessions} total sessions loaded from {len(files)} files")

: 

In [31]:
# Flatten the interactions of every session of every user into one list.
# A nested comprehension does this in a single linear pass; the previous
# sum(list_of_lists, []) re-copied the growing accumulator on every addition.
all_instances = [
    interaction
    for sessions in data.values()
    for session in sessions
    for interaction in session['interactions']
]
print(len(all_instances))

310


In [32]:
# Evaluate the interactions
from collections import Counter
import math

def compute_margin(p, n):
    """Return the 1.96-sigma half-width of a proportion's confidence interval.

    p is the observed proportion and n the number of observations it is based
    on. When n is not positive there is nothing to estimate from and 0 is
    returned.
    """
    if n > 0:
        standard_error = math.sqrt(p*(1-p)/n)
        return 1.96*standard_error
    return 0

def compute_proportion_ci(p, n):
    """Return (lower, upper) bounds of the interval p ± compute_margin(p, n).

    The bounds are not clipped, so they may fall outside [0, 1].
    """
    half_width = compute_margin(p, n)
    lower = p - half_width
    upper = p + half_width
    return (lower, upper)

def compute_mean_ci(values):
    """Return a (lower, upper) 1.96-standard-error interval around the mean.

    Args:
        values: sized sequence or array of numbers.

    Returns:
        (mean - 1.96*se, mean + 1.96*se), where se is the sample standard
        deviation (ddof=1) divided by sqrt(n).
        * n == 0: the mean is undefined, so (nan, nan) is returned without
          triggering numpy's empty-slice warnings.
        * n == 1: the ddof=1 standard deviation is undefined, so the interval
          collapses to (mean, mean) instead of becoming (nan, nan).
    """
    n = len(values)
    if n == 0:
        return (float('nan'), float('nan'))

    mean_val = np.mean(values)
    if n < 2:
        # np.std(..., ddof=1) divides by n - 1 == 0 here; report a zero-width
        # interval rather than propagating NaN into the results table.
        return (mean_val, mean_val)

    std_val = np.std(values, ddof=1)
    se = std_val/np.sqrt(n)
    return (mean_val - 1.96*se, mean_val + 1.96*se)

def evaluate_answers(stage, instances):
    """Score users' judgements of whether the AI answer was correct, for one stage.

    Label encoding used throughout:
        ground truth: 0 = AI is correct, 1 = AI is incorrect
        user choice:  0 / 1 are compared against the ground truth, 2 = unsure
    The "positive" class for precision/recall/FPR is label 0.

    Args:
        stage: key into each instance's ``user_selections`` dict.
        instances: list of dicts providing
            ``['question']['prediction_is_correct']`` and
            ``['user_selections'][stage]``.

    Returns:
        dict with the point estimates (``total_accuracy``, ``accuracy``,
        ``precision``, ``recall``, ``false_positive_rate``, ``f1``,
        ``unsure_rate``, ``utility``), their ``*_ci`` (lower, upper) tuples
        where computed, and the raw ``preds``, ``ground_truths`` and
        ``individual_utilities`` arrays. Any ratio whose denominator is zero
        is reported as 0 with a (0, 0) interval.
    """
    ground_truths = np.array([1-x['question']['prediction_is_correct'] for x in instances])       # 0 means AI is correct, 1 means AI is incorrect
    preds = np.array([x['user_selections'][stage] for x in instances])
    n_total = len(preds)

    true_positives = np.sum(np.logical_and(preds == 0, ground_truths == 0))
    false_positives = np.sum(np.logical_and(preds == 0, ground_truths == 1))
    true_negatives = np.sum(np.logical_and(preds == 1, ground_truths == 1))
    false_negatives = np.sum(np.logical_and(preds == 1, ground_truths == 0))

    # Answers that committed to 0 or 1; this is the denominator of `accuracy`,
    # so the same count is used for its confidence interval below.
    decided_n = true_positives + false_positives + true_negatives + false_negatives

    unsure_rate = np.mean(preds == 2) if n_total > 0 else 0
    # Guarded like the other ratios: when every answer is "unsure" the
    # denominator is zero and the unguarded division produced NaN.
    accuracy = (true_positives + true_negatives) / decided_n if decided_n > 0 else 0
    total_accuracy = (true_positives + true_negatives) / n_total if n_total > 0 else 0
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
    false_positive_rate = false_positives / (false_positives + true_negatives) if false_positives + true_negatives > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    # Utility: 0 if user is unsure, 1 if user correctly predicts AI correctness, -1 if user incorrectly predicts AI correctness
    individual_utilities = np.where(preds == 2, 0, 1 - 2*np.abs(preds - ground_truths))
    utility = np.mean(individual_utilities) if n_total > 0 else 0

    # Compute 95% CIs.
    total_accuracy_ci = compute_proportion_ci(total_accuracy, n_total)
    unsure_rate_ci    = compute_proportion_ci(unsure_rate, n_total)

    accuracy_ci = compute_proportion_ci(accuracy, decided_n) if decided_n > 0 else (0, 0)

    precision_n = true_positives + false_positives
    precision_ci = compute_proportion_ci(precision, precision_n) if precision_n > 0 else (0, 0)

    recall_n = true_positives + false_negatives
    recall_ci = compute_proportion_ci(recall, recall_n) if recall_n > 0 else (0, 0)

    fpr_n = false_positives + true_negatives
    fpr_ci = compute_proportion_ci(false_positive_rate, fpr_n) if fpr_n > 0 else (0, 0)

    utility_ci = compute_mean_ci(individual_utilities) if n_total > 0 else (0, 0)

    return {
        'total_accuracy': total_accuracy,
        'total_accuracy_ci': total_accuracy_ci,
        'accuracy': accuracy,
        'accuracy_ci': accuracy_ci,
        'precision': precision,
        'precision_ci': precision_ci,
        'recall': recall,
        'recall_ci': recall_ci,
        'false_positive_rate': false_positive_rate,
        'fpr_ci': fpr_ci,
        'f1': f1,
        'unsure_rate': unsure_rate,
        'unsure_rate_ci': unsure_rate_ci,
        'utility': utility,
        'utility_ci': utility_ci,
        'preds': preds,
        'ground_truths': ground_truths,
        'individual_utilities': individual_utilities,
    }

# Score the same interactions once per study stage.
answeronly_results = evaluate_answers('answeronly', all_instances)
withexplanation_results = evaluate_answers('withexplanation', all_instances)
withexplanationquality_results = evaluate_answers('withexplanationquality', all_instances)

stage_labels = ['Answer Only', 'With Explanation', 'With Explanation + Quality']
stage_results = [answeronly_results, withexplanation_results, withexplanationquality_results]

print("Stage                   \tUnsure Rate\tTotalAcc\tNotUnsureAcc\tUtility")
print("-"*100)
for stage_name, results in zip(stage_labels, stage_results):
    # Each margin is half the width of the corresponding confidence interval.
    unsure_margin    = (results['unsure_rate_ci'][1] - results['unsure_rate_ci'][0]) / 2
    total_acc_margin = (results['total_accuracy_ci'][1] - results['total_accuracy_ci'][0]) / 2
    acc_margin       = (results['accuracy_ci'][1] - results['accuracy_ci'][0]) / 2
    utility_margin   = (results['utility_ci'][1] - results['utility_ci'][0]) / 2

    # Precision / recall / FPR margins are not printed below but are still
    # computed; a (0, 0) interval marks "not computable" and maps to margin 0.
    if results['precision_ci'] != (0, 0):
        prec_margin = (results['precision_ci'][1] - results['precision_ci'][0]) / 2
    else:
        prec_margin = 0
    if results['recall_ci'] != (0, 0):
        recall_margin = (results['recall_ci'][1] - results['recall_ci'][0]) / 2
    else:
        recall_margin = 0
    if results['fpr_ci'] != (0, 0):
        fpr_margin = (results['fpr_ci'][1] - results['fpr_ci'][0]) / 2
    else:
        fpr_margin = 0

    # Proportions are shown as percentages, utility as a plain number.
    # Each cell is formatted once and reused for both output lines.
    report_cells = [
        f"{results['unsure_rate']:.1%} ± {unsure_margin:.1%}",
        f"{results['total_accuracy']:.1%} ± {total_acc_margin:.1%}",
        f"{results['accuracy']:.1%} ± {acc_margin:.1%}",
        f"{results['utility']:.3f} ± {utility_margin:.3f}",
    ]

    # Tab-separated row for the on-screen table.
    print(f"{stage_name:<25}\t" + "\t".join(report_cells))

    # Comma-separated variant of the same cells for pasting elsewhere.
    print("copiable results:", ", ".join(report_cells))
    


Stage                   	Unsure Rate	TotalAcc	NotUnsureAcc	Utility
----------------------------------------------------------------------------------------------------
Answer Only              	71.3% ± 5.0%	16.5% ± 4.1%	57.3% ± 10.3%	0.042 ± 0.060
copiable results: 71.3% ± 5.0%, 16.5% ± 4.1%, 57.3% ± 10.3%, 0.042 ± 0.060
With Explanation         	33.2% ± 5.2%	37.1% ± 5.4%	55.6% ± 6.8%	0.074 ± 0.091
copiable results: 33.2% ± 5.2%, 37.1% ± 5.4%, 55.6% ± 6.8%, 0.074 ± 0.091
With Explanation + Quality	10.3% ± 3.4%	57.7% ± 5.5%	64.4% ± 5.6%	0.258 ± 0.102
copiable results: 10.3% ± 3.4%, 57.7% ± 5.5%, 64.4% ± 5.6%, 0.258 ± 0.102
