In [24]:
# Re-import the necessary modules and re-load the data
import pandas as pd
import krippendorff
import numpy as np

# Reload the dataset
data = pd.read_csv('../../../testing\human_eval_long.csv', encoding='utf-8')

# First, let's convert the categorical responses to numeric
# Assuming responses are in format 'A1', 'A2', etc.
for col in ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']:
    data[col] = data[col].map({'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5})
    

# Function to calculate Krippendorff's alpha for each dimension
def calculate_krippendorff_alpha(data, dimension):
    # Pivot the data to create a matrix where rows are items and columns are raters
    reliability_data = data.pivot(index='question_number', columns='id', values=dimension)
    # Convert to numpy array
    reliability_matrix = reliability_data.values
    # Calculate Krippendorff's alpha
    alpha = krippendorff.alpha(reliability_data=reliability_matrix, level_of_measurement='ordinal')
    return alpha

# Calculate agreement for each dimension
evaluation_columns = ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']
agreement_results = {}

for col in evaluation_columns:
    alpha = calculate_krippendorff_alpha(data, col)
    agreement_results[col] = alpha

# Display results
print("Inter-Annotator Agreement (Krippendorff's Alpha) for each dimension:")
for dimension, alpha in agreement_results.items():
    print(f"{dimension}: {alpha:.3f}")

Inter-Annotator Agreement (Krippendorff's Alpha) for each dimension:
hallucination: 0.218
answer_acc: 0.040
user_sat: 0.055
coherence: 0.390
context_qual: 0.178
overall: 0.162


In [27]:
import pandas as pd
import krippendorff
import numpy as np

# 1) Load your long-format data
data = pd.read_csv("../../../testing/human_eval_long.csv", encoding="utf-8")

# 2) Convert 'A1'...'A5' to numeric 1..5
for col in ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']:
    data[col] = data[col].map({'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5})

# 3) Krippendorff’s alpha calculation
def calculate_krippendorff_alpha(df, dimension):
    # Pivot: rows = items (question_number), columns = raters (id), values = ratings
    reliability_data = df.pivot(index='question_number', columns='id', values=dimension)
    reliability_matrix = reliability_data.values
    # Compute alpha for ordinal data
    alpha = krippendorff.alpha(reliability_data=reliability_matrix, level_of_measurement='ordinal')
    return alpha

# 4) Mean Spearman correlation among raters for each dimension
def calculate_mean_spearman_correlation(df, dimension):
    # Pivot the same way
    reliability_data = df.pivot(index='question_number', columns='id', values=dimension)
    # The correlation matrix: how each rater correlates with the others
    # Using Spearman since data is ordinal
    corr_matrix = reliability_data.corr(method='spearman')

    # We only want the mean of the upper triangle (excluding diagonal)
    # because the lower triangle is a duplicate of the upper.
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # .stack() collapses the matrix into a Series of correlation values
    # .mean() then averages them
    mean_corr = upper_tri.stack().mean()
    return mean_corr

# 5) Loop over your dimensions and compute both metrics
evaluation_columns = ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']
results_alpha = {}
results_corr = {}

for col in evaluation_columns:
    alpha_val = calculate_krippendorff_alpha(data, col)
    corr_val  = calculate_mean_spearman_correlation(data, col)

    results_alpha[col] = alpha_val
    results_corr[col]  = corr_val

# 6) Print everything
print("Inter-Annotator Agreement (Krippendorff's Alpha) and Mean Spearman Correlation:\n")
for dimension in evaluation_columns:
    alpha_val = results_alpha[dimension]
    corr_val  = results_corr[dimension]
    print(f"{dimension}: alpha={alpha_val:.3f} | mean_spearman_corr={corr_val:.3f}")


Inter-Annotator Agreement (Krippendorff's Alpha) and Mean Spearman Correlation:

hallucination: alpha=0.218 | mean_spearman_corr=0.011
answer_acc: alpha=0.040 | mean_spearman_corr=0.098
user_sat: alpha=0.055 | mean_spearman_corr=0.086
coherence: alpha=0.390 | mean_spearman_corr=-0.150
context_qual: alpha=0.178 | mean_spearman_corr=0.104
overall: alpha=0.162 | mean_spearman_corr=0.135
