In [None]:
import pandas as pd

def process_survey_data(input_csv, output_cleaned_csv, output_emails_csv_a1, output_emails_csv_a2):
    """
    Process a single CSV file to separate emails and clean data based on 'gsovp' column.

    Parameters
    ----------
    input_csv : str
        Path to the input CSV file.
    output_cleaned_csv : str
        File path to save the cleaned CSV (excluding the email column).
    output_emails_csv_a1 : str
        File path to save emails where 'gsovp' == 'A1'.
    output_emails_csv_a2 : str
        File path to save emails where 'gsovp' == 'A2'.
    """
    # Read the input CSV
    df = pd.read_csv(input_csv)

    # Filter only completed surveys
    df = df[df['lastpage'] == 70]

    emails_a1 = pd.DataFrame()
    emails_a2 = pd.DataFrame()

    if 'email' in df.columns:
        email_df = df[['email']].copy()

        if 'gsovp' in df.columns:
            # Separate emails based on 'gsovp' values
            emails_a1 = email_df[df['gsovp'] == 'A1']
            emails_a2 = email_df[df['gsovp'] == 'A2']

        # Drop the email column from the main DataFrame
        df = df.drop(columns=['email'])

    # Further cleaning of df
    # print rows and columns after dropping irrelevant columns
    print("Before dropping irrelevant columns ")
    print(f'Rows: {df.shape[0]}')
    print(f'Columns: {df.shape[1]}\n')
    # Drop irrelevant columns directly to ensure anonymity and clean structure
    columns_to_drop = ['email', 'submitdate', 'seed', 'startlanguage']  # Columns we don't need
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

    # Drop all columns containing 'Time' in the name
    df = df.drop(columns=[col for col in df.columns if 'Time' in col], errors='ignore')

    # print rows and columns after dropping irrelevant columns
    print("After dropping irrelevant columns ")
    print(f'Rows: {df.shape[0]}')
    print(f'Columns: {df.shape[1]}\n')

    # Drop lastpage column
    df = df.drop(columns=['lastpage'])
    # Drop 'gsovp' column
    df = df.drop(columns=['gsovp'])

    # Print rows and column after dropping incomplete surveys
    print("After dropping incomplete surveys ")
    print(f'Rows: {df.shape[0]}')
    print(f'Columns: {df.shape[1]}\n')

    # Map education status to roles
    role_mapping = {
        'A2': 'prospective',
        'A3': 'prospective',
        'A4': 'enrolled',
        'A5': 'international',
        'A1': 'other',
        'A6': 'other',
        'A7': 'other',
        'A8': 'other'
    }
    df['role'] = df['educationstatus'].map(role_mapping)


    # Save the cleaned data and email groups to separate files
    df.to_csv(output_cleaned_csv, index=False)
    emails_a1.to_csv(output_emails_csv_a1, index=False)
    emails_a2.to_csv(output_emails_csv_a2, index=False)

    print(f"Cleaned dataset saved to: {output_cleaned_csv}")
    print(f"Emails with 'gsovp' missing or 'A1' saved to: {output_emails_csv_a1}")
    print(f"Emails with 'gsovp' == 'A2' saved to: {output_emails_csv_a2}")

process_survey_data(
    input_csv="../../../../temp/results-survey3.csv",
    output_cleaned_csv="../../../testing/human_eval.csv",
    output_emails_csv_a1="../../../testing/human_mail_1.csv",
    output_emails_csv_a2="../../../testing/human_mail_a2.csv"
)


Before dropping irrelevant columns 
Rows: 23
Columns: 956

After dropping irrelevant columns 
Rows: 23
Columns: 608

After dropping incomplete surveys 
Rows: 23
Columns: 606

Cleaned dataset saved to: ../../../testing/human_eval.csv
Emails with 'gsovp' missing or 'A1' saved to: ../../../testing/human_mail_1.csv
Emails with 'gsovp' == 'A2' saved to: ../../../testing/human_mail_a2.csv


In [4]:
import pandas as pd

def transform_survey_wide_to_long(input_csv: str, output_csv: str, survey_csv: str) -> None:
    """
    Transforms the wide-format survey data into a long format such that each row
    corresponds to one question-answer pair from one participant.

    :param input_csv:  Path to the original wide CSV file.
    :param output_csv: Path to the output CSV file in long format.
    :param survey_csv: Path to the survey CSV file.
    """

    # Read the wide-format CSV
    df = pd.read_csv(input_csv)

    # Define the mapping from randomgroup -> the actual questions answered
    QUESTION_GROUPS = {
        1: range(1, 8),    # ans1 ... ans7
        2: range(8, 15),   # ans8 ... ans14
        3: range(15, 22),  # ans15 ... ans21
        4: range(22, 29),  # ans22 ... ans28
        5: range(29, 36),  # ans29 ... ans35
        6: range(36, 43),  # ans36 ... ans42
        7: range(43, 49),  # ans43 ... ans48
        8: range(49, 55),  # ans49 ... ans54
        9: range(55, 61),  # ans55 ... ans60
        10: range(61, 67), # ans61 ... ans66
    }

    # Columns at participant level you want to carry into the long dataframe.
    # Adjust this list to your needs (e.g., add or remove columns as needed).
    participant_cols = [
        'id', 'langprof', 'randomgroup', 'age', 'gender', 'gender[other]',
        'educationlevel', 'educationstatus', 'program', 'familiar', 'description',
        'interviewtime', 'role'
    ]

    # Prepare a list to collect all the long-format rows
    long_rows = []

    # Iterate over each participant (row in the wide dataframe)
    for _, row in df.iterrows():
        # Identify which questions this participant answered
        rg = row['randomgroup']
        question_list = QUESTION_GROUPS.get(rg, [])

        # For each question in that randomgroup
        for q_num in question_list:
            # Build a dict that will become one row in the long dataframe
            new_row = {}

            # 1) Bring over all participant-level data
            for col in participant_cols:
                # Some participants might have missing columns for 'gender[other]' etc.
                # so we safely do a .get() if you want to be defensive.
                # Here, we assume the column exists in df.
                new_row[col] = row[col]

            # 2) Question-specific columns: qidXX, langqXX, commentXX
            qid_col     = f'qid{q_num}'
            langq_col   = f'langq{q_num}'
            comment_col = f'comment{q_num}'

            # It's good practice to check if the columns exist, in case of partial data
            new_row['question_number'] = q_num
            new_row['qid']     = row[qid_col]     if qid_col     in df.columns else None
            new_row['langq']   = row[langq_col]   if langq_col   in df.columns else None
            new_row['comment'] = row[comment_col] if comment_col in df.columns else None

            # 3) The 6 Likert ratings: ansXX[SQ001] ... ansXX[SQ006]
            #    We'll store them in columns named dim1 ... dim6 (or pick any naming scheme)
            for sq_idx in range(1, 7):
                sq_col = f'ans{q_num}[SQ00{sq_idx}]'
                if sq_col in df.columns:
                    new_row[f'dim{sq_idx}'] = row[sq_col]
                else:
                    new_row[f'dim{sq_idx}'] = None

            # Append the newly created dictionary to our list of rows
            long_rows.append(new_row)

    # Turn that list of dictionaries into a DataFrame
    long_df = pd.DataFrame(long_rows)

    # # for whatever reason qid and langq are not in the original data for some randomgroups
    # fill them in from the df_for_survey.csv
    df_survey = pd.read_csv(survey_csv)
    # Override data_eval columns 'qid' and 'langq' with df_survey columns 'question_id' and 'df_language' 
    # for rows where df_survey 'index +1' is equal to data_eval 'question_number'
    for idx, row in df_survey.iterrows():
        question_number = idx + 1
        long_df.loc[long_df['question_number'] == question_number, 'qid'] = row['question_id']
        long_df.loc[long_df['question_number'] == question_number, 'langq'] = row['df_language']

    long_df.drop(columns=['langprof'], inplace=True)

    # Rename column
    long_df.rename(columns={'dim1': 'hallucination'}, inplace=True)
    long_df.rename(columns={'dim2': 'answer_acc'}, inplace=True)
    long_df.rename(columns={'dim3': 'user_sat'}, inplace=True)
    long_df.rename(columns={'dim4': 'coherence'}, inplace=True)
    long_df.rename(columns={'dim5': 'context_qual'}, inplace=True)
    long_df.rename(columns={'dim6': 'overall'}, inplace=True)

    # Finally, write out the long-format data to CSV
    long_df.to_csv(output_csv, index=False, quoting=1)
    return long_df

# -------------- USAGE EXAMPLE -------------- #

data = transform_survey_wide_to_long(
    input_csv="../../../testing/human_eval.csv", 
    output_csv="../../../testing/human_eval_long.csv",
    survey_csv="../../../data/human_eval/df_for_survey.csv")
data.head()

Unnamed: 0,id,randomgroup,age,gender,gender[other],educationlevel,educationstatus,program,familiar,description,...,question_number,qid,langq,comment,hallucination,answer_acc,user_sat,coherence,context_qual,overall
0,26,7,A2,A2,,A5,A4,Cognitive Science,A3,,...,43,123.0,de,Bei dieser Antwort frage ich mich wirklich war...,A3,A1,A2,A4,A2,A2
1,26,7,A2,A2,,A5,A4,Cognitive Science,A3,,...,44,153.0,en,,A3,A3,A3,A2,A4,A3
2,26,7,A2,A2,,A5,A4,Cognitive Science,A3,,...,45,328.0,de,,A5,A3,A4,A1,A3,A2
3,26,7,A2,A2,,A5,A4,Cognitive Science,A3,,...,46,123.0,en,,A3,A2,A3,A2,A2,A2
4,26,7,A2,A2,,A5,A4,Cognitive Science,A3,,...,47,298.0,de,,A4,A5,A4,A4,A4,A4


In [8]:
import pandas as pd

# Load the long-format dataset
data = pd.read_csv("../../../testing/human_eval_long.csv", encoding='utf-8')

# Convert categorical ratings to numeric for computation
rating_map = {'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5}
for col in ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']:
    data[col] = data[col].map(rating_map)

# Group by question and calculate the mean for each dimension
average_scores = data.groupby(['question_number', 'qid', 'langq']).agg({
    'hallucination': 'mean',
    'answer_acc': 'mean',
    'user_sat': 'mean',
    'coherence': 'mean',
    'context_qual': 'mean',
    'overall': 'mean'
}).reset_index()

# Save the result to a new DataFrame
average_scores_df = average_scores.rename(columns={
    'hallucination': 'avg_hallucination',
    'answer_acc': 'avg_answer_acc',
    'user_sat': 'avg_user_sat',
    'coherence': 'avg_coherence',
    'context_qual': 'avg_context_qual',
    'overall': 'avg_overall'
})


average_scores_df.to_csv("../../../testing/human_eval_aggregated.csv", index=False, quoting=1)
# show row 27
average_scores_df.iloc[27]

question_number            28
qid                       9.0
langq                      en
avg_hallucination         5.0
avg_answer_acc            4.0
avg_user_sat              3.0
avg_coherence        4.666667
avg_context_qual     3.666667
avg_overall          3.666667
Name: 27, dtype: object

In [None]:
# Re-import the necessary modules and re-load the data
import pandas as pd
import krippendorff
import numpy as np

# Reload the dataset
# data = pd.read_csv('human_eval_long.csv', encoding='utf-8')

# First, let's convert the categorical responses to numeric
# Assuming responses are in format 'A1', 'A2', etc.
for col in ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']:
    data[col] = data[col].map({'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5})
    

# Function to calculate Krippendorff's alpha for each dimension
def calculate_krippendorff_alpha(data, dimension):
    # Pivot the data to create a matrix where rows are items and columns are raters
    reliability_data = data.pivot(index='question_number', columns='id', values=dimension)
    # Convert to numpy array
    reliability_matrix = reliability_data.values
    # Calculate Krippendorff's alpha
    alpha = krippendorff.alpha(reliability_data=reliability_matrix, level_of_measurement='ordinal')
    return alpha

# Calculate agreement for each dimension
evaluation_columns = ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']
agreement_results = {}

for col in evaluation_columns:
    alpha = calculate_krippendorff_alpha(data, col)
    agreement_results[col] = alpha

# Display results
print("Inter-Annotator Agreement (Krippendorff's Alpha) for each dimension:")
for dimension, alpha in agreement_results.items():
    print(f"{dimension}: {alpha:.3f}")

0      3
1      3
2      5
3      3
4      4
5      5
6      5
7      4
8      2
9      5
10     5
11     5
12     5
13     5
14     3
15     2
16     4
17     1
18     5
19     5
20     5
21     4
22     5
23     5
24     5
25     3
26     4
27     4
28     2
29     4
30     1
31     4
32     2
33     3
34     3
35     3
36     5
37     4
38     4
39     5
40     5
41     3
42     3
43     5
44     4
45     5
46     4
47     5
48     5
49     4
50     5
51     5
52     5
53     3
54     5
55     5
56     5
57     2
58     5
59     4
60     5
61     5
62     3
63     5
64     5
65     5
66     3
67     4
68     4
69     5
70     2
71     3
72     4
73     5
74     3
75     4
76     2
77     4
78     3
79     3
80     4
81     5
82     5
83     3
84     5
85     3
86     5
87     3
88     5
89     5
90     5
91     5
92     5
93     5
94     3
95     4
96     3
97     3
98     3
99     2
100    3
101    5
102    4
103    5
104    5
105    4
106    5
107    5
108    5
109    5
110    5
1

AssertionError: 

In [19]:
import pandas as pd
from pingouin import intraclass_corr
# import pingouin as pg

# Define the mapping dictionary
mapping = {'A1': 1, 'A2': 2, 'A3': 3, 'A4': 4, 'A5': 5}

# List of columns to be converted
columns_to_convert = ['hallucination', 'answer_acc', 'user_sat', 'coherence', 'context_qual', 'overall']

# Apply the mapping to each specified column
data[columns_to_convert] = data[columns_to_convert].replace(mapping)

# only keep rows where randomgroup is 3, 4, 5, 7, 8
data = data[data['randomgroup'].isin([3, 4, 5, 7, 8])]


# 2) Run the ICC
icc_results = intraclass_corr(data=data,
                                 targets='question_number',  # your "item" (question)
                                 raters='id',    # your rater (participant)
                                 ratings='hallucination') # the rating scores

# 3) Inspect results
icc_results


ValueError: Either missing values are present in data or data are unbalanced. Please remove them manually or use nan_policy='omit'.

In [20]:
# Restructure the dataset for ICC calculation
# Each row should represent a unique combination of question_number (target), id (rater), and their rating

# Melt the dataset to long format for each rating column
melted_data = pd.melt(
    data,
    id_vars=['question_number', 'id'],
    value_vars=columns_to_convert,
    var_name='rating_type',
    value_name='rating'
)
# show all rows
pd.set_option('display.max_rows', None)

for rating_type in melted_data['rating_type'].unique():
    subset = melted_data[melted_data['rating_type'] == rating_type]
    icc = intraclass_corr(
        data=subset,
        targets='question_number',
        raters='id',
        ratings='rating',
        nan_policy='omit'
    )

AssertionError: Data must have at least 5 non-missing values.

In [21]:
data['hallucination'][0]

3