In [None]:
import pandas as pd
import numpy as np


# Columns to pull from the annotation workbook; every other column in the
# sheet is ignored by `usecols`.
columns_to_read = [
    'question', 'correct_answer', 'predicted_answer', 'is_correct',
    'image_path',
    'vf_questions', 'vf_answers_GPT', 'strict_sim',
    'Human Annotated VF Questions Qualities',
    'Human Annotated VF Answers Correctness',
    'Human Annotated completeness',
]

# Load the selected columns from the named sheet, then drop every row that has
# a missing value in any of them (dropna's default is how='any').
data = (
    pd.read_excel(
        '../results/Human Annotation of LLaVA+ Rationales.xlsx',
        sheet_name="LLAVA-1.5 Rationales-val 500 se",
        usecols=columns_to_read,
    )
    .dropna()
)
data

In [None]:
# (rows, columns) of the frame that remains after the dropna() in the load cell.
print(tuple(data.shape))

Number of all annotated instances to date: 71

In [None]:
import ast

# Flatten the per-row annotation lists into one entry per verification
# question, report the question-quality distribution, and line up the question
# quality, the GPT answer and the human answer in a single DataFrame.


def _flatten_literal_lists(column):
    """Parse every cell with ``ast.literal_eval`` and concatenate the results.

    Parameters
    ----------
    column : iterable of str
        Cells holding the literal text of a Python list, e.g. ``"[1, 0, -1]"``.

    Returns
    -------
    list
        All parsed elements, in row order.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid literal.
    """
    flat = []
    for cell in column:
        flat.extend(ast.literal_eval(cell))
    return flat


# Human-rated quality of each verification question; counted below for the
# values -1, 0 and 1.
question_qualities = np.array(
    _flatten_literal_lists(data['Human Annotated VF Questions Qualities'])
)
print(question_qualities, question_qualities.shape)
# print number of -1, 0, 1
print(f"Number of -1: {np.sum(question_qualities == -1)}")
print(f"Number of 0: {np.sum(question_qualities == 0)}")
print(f"Number of 1: {np.sum(question_qualities == 1)}")
print(f"Relevant questions percentage: {np.sum(question_qualities == 1) / question_qualities.shape[0]}")

# GPT answers: "Yes" -> 1, anything else -> 0. The comparison ignores case and
# surrounding whitespace so that variants such as 'yes' or 'Yes ' are not
# silently counted as a "No".
vf_answers_GPT = np.array([
    1 if isinstance(answer, str) and answer.strip().lower() == 'yes' else 0
    for answer in _flatten_literal_lists(data['vf_answers_GPT'])
])
print(vf_answers_GPT, vf_answers_GPT.shape)

# Human judgement for each verification answer, flattened the same way.
vf_answers_human = np.array(
    _flatten_literal_lists(data['Human Annotated VF Answers Correctness'])
)
print(vf_answers_human, vf_answers_human.shape)

# The three flattened arrays are paired element by element below, so they must
# have the same length; fail with a message that says which one is off.
flattened_lengths = {
    'question_quality': len(question_qualities),
    'GPT_answer': len(vf_answers_GPT),
    'human_answer': len(vf_answers_human),
}
if len(set(flattened_lengths.values())) != 1:
    raise ValueError(
        f"Flattened annotation columns differ in length: {flattened_lengths}"
    )

# Merge these three arrays into a 2D dataframe
# 1st column: question quality
# 2nd column: GPT answer
# 3rd column: human answer
df = pd.DataFrame({
    'question_quality': question_qualities,
    'GPT_answer': vf_answers_GPT,
    'human_answer': vf_answers_human
})
df

In [None]:
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import resample

# For the relevant questions (question_quality == 1), measure how well the GPT
# answers agree with the human answers: accuracy with a bootstrap confidence
# interval, Spearman correlation and Cohen's kappa.
relevant_questions = df[df['question_quality'] == 1]
print(relevant_questions['human_answer'].value_counts())

GPT_answers, human_answers = relevant_questions['GPT_answer'], relevant_questions['human_answer']

# Accuracy: fraction of relevant questions where both answers coincide.
accuracy = np.sum(GPT_answers == human_answers) / relevant_questions.shape[0]
print(f"Accuracy: {accuracy}")

# Bootstrapping for error bars (confidence intervals).
# A seeded RandomState is shared across iterations so that every resample is
# different but the reported interval is identical on each Restart & Run All.
n_iterations = 1000
bootstrap_rng = np.random.RandomState(42)
bootstrap_accuracies = []

for _ in range(n_iterations):
    # Resample rows with replacement (sklearn's default), keeping the
    # GPT/human pairing of each row intact.
    sample = resample(relevant_questions, random_state=bootstrap_rng)
    sample_accuracy = np.sum(sample['GPT_answer'] == sample['human_answer']) / sample.shape[0]
    bootstrap_accuracies.append(sample_accuracy)

# Calculate the 95% confidence interval (percentile method).
lower_bound = np.percentile(bootstrap_accuracies, 2.5)
upper_bound = np.percentile(bootstrap_accuracies, 97.5)
print(f"95% Confidence Interval for Accuracy: [{lower_bound}, {upper_bound}]")

# Correlation (Spearman)
spearman_corr, _ = spearmanr(GPT_answers, human_answers)
print(f"Spearman Correlation: {spearman_corr}")

# Agreement: Cohen's kappa
kappa = cohen_kappa_score(GPT_answers, human_answers)
print(f"Cohen's Kappa: {kappa}")

In [None]:
# Agreement between the automatic completeness flag (`strict_sim`) and the
# human completeness annotation: accuracy with a bootstrap confidence interval,
# Spearman correlation and Cohen's kappa.

# Autoevaled completeness (True --> 1, False --> 0)
autoeval_completeness = [int(flag) for flag in data['strict_sim']]
print(autoeval_completeness)


# Human annotated completeness
human_completeness = [int(flag) for flag in data['Human Annotated completeness']]
print(human_completeness)

# Per-instance agreement indicator; both lists come from the same rows of
# `data`, so position k in one corresponds to position k in the other.
completeness_matches = np.array(human_completeness) == np.array(autoeval_completeness)

# Accuracy
accuracy = np.sum(completeness_matches) / len(human_completeness)
print(f"Accuracy: {accuracy}")

# Bootstrapping for error bars (confidence intervals).
# The resampling must be done on the completeness agreement itself: the
# previous version resampled `relevant_questions` (the GPT-vs-human answer
# table), so the interval printed here described a different metric.
# Resampling the match indicator is the same as resampling (human, auto) pairs.
# A seeded RandomState keeps the interval reproducible across runs.
n_iterations = 1000
bootstrap_rng = np.random.RandomState(42)
bootstrap_accuracies = []

for _ in range(n_iterations):
    sample_matches = resample(completeness_matches, random_state=bootstrap_rng)
    sample_accuracy = np.sum(sample_matches) / sample_matches.shape[0]
    bootstrap_accuracies.append(sample_accuracy)

# Calculate the 95% confidence interval (percentile method).
lower_bound = np.percentile(bootstrap_accuracies, 2.5)
upper_bound = np.percentile(bootstrap_accuracies, 97.5)
print(f"95% Confidence Interval for Accuracy: [{lower_bound}, {upper_bound}]")

# Correlation and agreement
spearman_corr, _ = spearmanr(human_completeness, autoeval_completeness)
print(f"Spearman Correlation: {spearman_corr}")
kappa = cohen_kappa_score(human_completeness, autoeval_completeness)
print(f"Cohen's Kappa: {kappa}")