## Load the dataset

In [None]:
import pandas as pd
import numpy as np

# Workbook that holds both the human annotations and the automatic evaluation
file_path = '../results/Human Annotation of LLaVA+ Rationales.xlsx'

# Both sheets are read with header=1, i.e. the second spreadsheet row supplies the column names
anno_data, autoeval_data = (
    pd.read_excel(file_path, sheet_name=sheet, header=1)
    for sheet in ("Sentence_VF anno", "LLAVA-1.5 Rationales-50 set-Two")
)

# Human ratings: pick spreadsheet columns M, S, and V by name and drop rows missing any of them
human_columns = [
    'Sentence-wise Visual Fidelity',
    'Sentence-wise Visual Fidelity.2',
    'Sentence-wise Visual Fidelity.3',
]
anno_data = anno_data[human_columns].dropna()

# Automatic ratings: drop missing rows, then keep (by position) as many rows as there are human rows
autoeval_data = autoeval_data[['autoeval s_VF(v3)']].dropna().iloc[:len(anno_data)]

# Short column names used by the rest of the notebook
short_names = {
    'Sentence-wise Visual Fidelity': 'Rater1',
    'Sentence-wise Visual Fidelity.2': 'Rater2',
    'Sentence-wise Visual Fidelity.3': 'Rater3',
    'autoeval s_VF(v3)': 'GPT4o',
}

# NOTE(review): pd.concat(axis=1) lines rows up by index label, whereas the truncation above
# is positional -- confirm both sheets share the same row numbering after dropna().
combined = pd.concat([anno_data, autoeval_data], axis=1)
data = combined.rename(columns=short_names)

data

In [None]:
import ast
import pandas as pd

# Turn a column of list-valued cells into one long column with one element per row
def flatten_column(column):
    """Explode list-like cells of `column` into separate rows.

    Parameters:
        column: a pandas Series whose cells may hold lists.

    Returns:
        A Series with one element per row and a fresh 0..n-1 index
        (the original row labels are discarded).
    """
    exploded = column.explode()
    return exploded.reset_index(drop=True)

# Function to convert string representations of lists to actual Python lists
def convert_to_list(cell):
    """Convert one spreadsheet cell into a Python list of labels.

    Parameters:
        cell: the raw cell value. Typically a string such as "[1, 0, 1]",
            but may also be NaN/None, an empty string, an already-parsed
            list/tuple, or a bare scalar.

    Returns:
        - a list copy of `cell` when it is already a list or tuple;
        - [] for missing values (NaN/None) and for empty or whitespace-only strings;
        - the result of `ast.literal_eval` for any other string (returned as-is,
          so a string like "1" still yields the scalar 1);
        - `[cell]` for any other non-string scalar.

    Raises:
        ValueError / SyntaxError: if a non-empty string is not a valid Python literal.
    """
    # Already-parsed containers: checked first because pd.isna() on a list returns
    # an array, which cannot be used directly in an `if`.
    if isinstance(cell, (list, tuple)):
        return list(cell)
    if pd.isna(cell):
        return []  # Handle NaN or missing cells
    if isinstance(cell, (str, bytes)):
        if not cell.strip():
            return []  # Blank text is an empty cell, not a malformed literal
        return ast.literal_eval(cell)
    # Non-string scalar -- presumably a lone rating that the spreadsheet reader
    # returned as a number; literal_eval would reject it, so wrap it instead.
    return [cell]

# Apply the conversion function to all columns
df = data.map(convert_to_list)


def _rows_after_explode(cell):
    """Return how many rows `Series.explode` produces for a single cell."""
    # Scalars and empty list-likes each explode to exactly one row.
    return max(len(cell), 1) if pd.api.types.is_list_like(cell) else 1


# Each column is flattened independently below, so the columns only stay aligned
# sentence-by-sentence if every row holds the same number of labels in all columns.
# Check that up front: a single mismatched row would silently shift every later
# label and make the agreement statistics compare different sentences.
labels_per_cell = df.map(_rows_after_explode)
misaligned_rows = labels_per_cell.index[(labels_per_cell.nunique(axis=1) > 1).to_numpy()]
if len(misaligned_rows) > 0:
    raise ValueError(
        "Columns disagree on the number of sentence labels in rows "
        f"{list(misaligned_rows)}; flattening would misalign the ratings."
    )

# Flatten each column, place them side by side, drop rows that came from empty
# cells (they explode to NaN), and renumber the remaining rows from zero
flattened_df = (
    pd.concat([flatten_column(df[col]) for col in df.columns], axis=1)
    .dropna()
    .reset_index(drop=True)
)

# Show every row/column of the flattened table
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(flattened_df)

# 1. Interannotator agreement: Fleiss' Kappa

In [None]:
# %pip install statsmodels

In [None]:
import pandas as pd
from statsmodels.stats.inter_rater import fleiss_kappa

# Restrict the flattened table to the three human-rater columns
human_anno_df = flattened_df.loc[:, ['Rater1', 'Rater2', 'Rater3']]

# Build the item-by-category count table that Fleiss' kappa expects
def create_count_matrix(df):
    """Count, for every row of `df`, how many columns hold each category.

    Parameters:
        df: DataFrame with one row per rated item and one column per rater.

    Returns:
        A DataFrame with one row per input row (fresh 0..n-1 index) and one
        column per distinct value found in `df`, in sorted order; each cell
        is the number of raters that assigned that value to that item.
    """
    # Every distinct label that appears anywhere in the table, in sorted order
    labels = sorted(df.stack().unique())

    tallies = [
        [sum(ratings == label) for label in labels]
        for _, ratings in df.iterrows()
    ]

    return pd.DataFrame(tallies, columns=labels)

# Tabulate how many human raters chose each category for every item
count_matrix = create_count_matrix(human_anno_df)

# Fleiss' kappa is computed from the plain array form of that table
rating_table = count_matrix.to_numpy()
kappa = fleiss_kappa(rating_table, method='fleiss')
print(f"Fleiss' kappa: {kappa}")

# 2. Majority Vote, and how it compares to GPT annotations

In [None]:
# Row-wise mode over the three human raters; mode() may return several columns,
# and only its first column (index 0) is kept as the majority label
rater_columns = ['Rater1', 'Rater2', 'Rater3']
majority_vote = flattened_df[rater_columns].mode(axis=1)[0]
flattened_df['Majority Vote'] = majority_vote

# Flag the rows where the majority label equals the GPT4o label
flattened_df['Matches GPT4o'] = flattened_df['Majority Vote'].eq(flattened_df['GPT4o'])

# Display the updated DataFrame
print(flattened_df)

# Tally agreement between the majority vote and GPT4o
total = flattened_df.shape[0]
matches = flattened_df['Matches GPT4o'].sum()
mismatches = total - matches

print(f"Number of matches: {matches}")
print(f"Number of mismatches: {mismatches}")
print(f"Match percentage: {matches / total * 100:.2f}%")

In [None]:
# Calculate correlation
from scipy.stats import spearmanr
from sklearn.metrics import cohen_kappa_score

# Coerce both label columns to numeric dtype; values that cannot be parsed become NaN
for label_column in ('Majority Vote', 'GPT4o'):
    flattened_df[label_column] = pd.to_numeric(flattened_df[label_column], errors='coerce')

majority_labels = flattened_df['Majority Vote']
gpt4o_labels = flattened_df['GPT4o']

# Rank correlation between the human majority vote and GPT4o
spearman_corr, spearman_pval = spearmanr(majority_labels, gpt4o_labels)

print(f"Spearman correlation with GPT4o: {spearman_corr:.4f} (p-value: {spearman_pval})")

# Cohen's kappa between the same two label series
cohen_kappa = cohen_kappa_score(majority_labels, gpt4o_labels)
print(f"Cohen's kappa: {cohen_kappa}")