# Data cleanup notebook


## 1. Imports

In [1]:
import torch
from torchmetrics.nominal import FleissKappa
import pandas as pd
from utils import parse_ratings, count_categories

import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

## 2. Clean up messed up files

### Load files

In [2]:
# Participant 2

first_file_2 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_7a185ca4-3d37-4487-8aa7-ad9e8f6fe884.csv" # 13:10
second_file_2 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_861cdaee-9f2e-4454-8252-d8ff397eb14e.csv" # 15:48
third_file_2 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_f60e0e84-2638-44da-871b-4847b751fabb.csv" # 16:23


# Participant 6

first_file_6 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_48d1eb99-5d33-476a-a1a4-75917aa92e92.csv" # 12:21
second_file_6 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_b3d2cab5-ffca-467d-bddb-b9e188e5a85a.csv" # 11:09
third_file_6 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_c2e282a4-f8e7-4542-95ab-a9a74b6f57e0.csv" # 13:39


# Participant 10

first_file_10 =  "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_e274815b-f334-4f2c-8cda-3788070d4bee.csv" # 13:20
second_file_10 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_5a066984-b053-4ae4-97ad-675900d79540.csv" # 14:39


# Synthetic feedback

synthetic_file = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/synthetic_feedback/synthetic_feedback.csv"


### Sort out files participant 2

In [3]:
df_2_1 = pd.read_csv(first_file_2, sep=";")
df_2_2 = pd.read_csv(second_file_2, sep=";")
df_2_3 = pd.read_csv(third_file_2, sep=";")

df_2_1 = df_2_1[:186]
df_2_2 = df_2_2[186:]
df_2_3 = df_2_3[30:]

df_2 = pd.concat([df_2_1, df_2_2, df_2_3], ignore_index=True)
print(len(df_2))

281


### Sort out files participant 6

In [4]:
df_6_1 = pd.read_csv(second_file_6, sep=";")
df_6_2 = pd.read_csv(first_file_6, sep=";")
df_6_3 = pd.read_csv(third_file_6, sep=";")

print(len(df_6_1))
print(len(df_6_2))
print(len(df_6_3))

df_6_1 = df_6_1[:]
df_6_2 = df_6_2[84:]
df_6_3 = df_6_3[:]

df_6 = pd.concat([df_6_1, df_6_2, df_6_3], ignore_index=True)
print(len(df_6))

84
168
114
282


### Sort out files participant 10

In [5]:
df_10_1 = pd.read_csv(first_file_10, sep=";")
df_10_2 = pd.read_csv(second_file_10, sep=";")

df_10 = pd.concat([df_10_1, df_10_2], ignore_index=True)
print(len(df_10))

282


### Check if there are duplicate rows in the dataframes

In [6]:
duplicates_6 = df_6.iloc[:, :-2].duplicated()
duplicates_2 = df_2.iloc[:, :-2].duplicated()
print(duplicates_6.sum())
print(duplicates_2.sum())

0
0


### Sort out synthetic feedback

In [7]:
df_synthetic_both = pd.read_csv(synthetic_file, sep=";")

print("Synthetic feedback shape:", df_synthetic_both.shape)


# Save the current headers since forgot to store headers in csv file
old_headers = df_synthetic_both.columns.tolist()

# print("Old headers:", old_headers)

# Step 2: Insert the headers as the first row
df_synthetic_both.loc[-1] = old_headers # Add headers as a new row
df_synthetic_both.index = df_synthetic_both.index + 1 # Shift index
df_synthetic_both = df_synthetic_both.sort_index() # Sort index to place the new row at the top


# Step 3: Assign new headers (optional)
df_synthetic_both.columns = ['file', 
                            'frame_ID', 
                            'frame_type', 
                            'frame_text', 
                            'precondition_id', 
                            'precondition_text', 
                            'precondition_position', 
                            'response_text', 
                            'prompt_config_examples', 
                            'prompt_config_chain_of_thought', 
                            'feedback_extraction', 
                            'feedback_detection', 
                            'additional_feedback',
                            'synthetic_feedback',
                ]

print(df_synthetic_both.columns)


df_synthetic_both['prompt_config_examples'] = (df_synthetic_both['prompt_config_examples']                                              
                                                .astype(str)
                                                .str.strip()
                                                .str.lower()
                                                .map({'true': True, 'false': False})
)

df_synthetic_both['prompt_config_chain_of_thought'] = (df_synthetic_both['prompt_config_chain_of_thought']
                                                .astype(str)
                                                .str.strip()
                                                .str.lower()
                                                .map({'true': True, 'false': False})
)



print("Synthetic feedback shape:", df_synthetic_both.shape)



# Odd rows (index 0, 2, 4, ...)
df_synthetic_1 = df_synthetic_both.iloc[::2]

# Even rows (index 1, 3, 5, ...)
df_synthetic_2 = df_synthetic_both.iloc[1::2]

print("Synthetic feedback 1 shape:", df_synthetic_1.shape)
print("Synthetic feedback 2 shape:", df_synthetic_2.shape)

# print("Odd rows:\n", df_synthetic_1.head())

df_synth_test_1 = df_synthetic_1.iloc[:, :-4]
df_synth_test_2 = df_synthetic_2.iloc[:, :-4]


common = pd.merge(df_synth_test_1, df_synth_test_2, how='inner')
print("Number of common rows in df_synth_1 and df_synth_2:", len(common))

Synthetic feedback shape: (563, 14)
Index(['file', 'frame_ID', 'frame_type', 'frame_text', 'precondition_id',
       'precondition_text', 'precondition_position', 'response_text',
       'prompt_config_examples', 'prompt_config_chain_of_thought',
       'feedback_extraction', 'feedback_detection', 'additional_feedback',
       'synthetic_feedback'],
      dtype='object')
Synthetic feedback shape: (564, 14)
Synthetic feedback 1 shape: (282, 14)
Synthetic feedback 2 shape: (282, 14)
Number of common rows in df_synth_1 and df_synth_2: 282


## 3. Load all proper files (except for the ones that were already created)


In [8]:
FILE_1 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_b2339fe6-2896-43ab-a9c8-24c8aacfbbd1.csv"
FILE_2 = None
FILE_3 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_6430b6fd-0ddd-4cc6-a4a0-216d5603143e.csv"
FILE_4 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_5fa87112-3702-4263-ba81-1779b3b24d16.csv"
FILE_5 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_7b177fba-3ddb-465b-9a25-6f4481eeb492.csv"
FILE_6 = None
FILE_7 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_ab53866b-7831-4f33-a628-3b6dbf01ead1.csv"
FILE_8 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_b1ed9f35-7d6a-439c-8a46-089311e8e340.csv"
FILE_9 = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback/human_feedback_a8ec999a-935f-476d-ac5c-f328a1288c7c.csv"
FILE_10 = None

### Load all dataframes

In [9]:
df_1 = pd.read_csv(FILE_1, sep=";")
df_3 = pd.read_csv(FILE_3, sep=";")
df_4 = pd.read_csv(FILE_4, sep=";")
df_5 = pd.read_csv(FILE_5, sep=";")
df_7 = pd.read_csv(FILE_7, sep=";")
df_8 = pd.read_csv(FILE_8, sep=";")
df_9 = pd.read_csv(FILE_9, sep=";")

print(df_1['feedback_detection'])

0                               Duidelijk
1                               Duidelijk
2                               Duidelijk
3                               Duidelijk
4                               Duidelijk
                      ...                
136    Onbestemde positie in ground truth
137    Onbestemde positie in ground truth
138               Helemaal niet duidelijk
139               Helemaal niet duidelijk
140               Helemaal niet duidelijk
Name: feedback_detection, Length: 141, dtype: object


### Check number of common rows

In [10]:
df_test_1 = df_3.iloc[:, :-3]
df_test_2 = df_8.iloc[:, :-3]


common = pd.merge(df_test_1, df_test_2, how='inner')
print("Number of common rows in df1 and df5:", len(common))
print(common)

# TODO: print file and frame type for common rows
print(f"common files: {common['file'].unique()}")
print(f"common frames: {common['frame_type'].unique()}")

Number of common rows in df1 and df5: 56
                                                 file  \
0   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
1   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
2   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
3   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
4   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
5   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
6   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
7   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
8   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
9   Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
10  Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
11  Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
12  Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
13  Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
14  Interpretatie_Vw_over_besluiten_op_aanvragen_v...   
15  Interpretatie_Vw_over_besluiten_op_aanvrage

### Cast ratings to numeric values to use fleiss kappa

In [11]:
print(repr(df_2.loc[df_2['feedback_detection'].str.contains("Niet goed", na=False), 'feedback_detection'].unique()))

array(['Niet goed'], dtype=object)


In [12]:
# parse ratings in all dataframes to be able to calculate Fleiss kappa
def apply_parse_ratings(df, number):
    df['feedback_extraction'] = [parse_ratings(feedback) for feedback in df['feedback_extraction']]
    # print(number)
    # print(df['feedback_detection'])
    df['feedback_detection'] = [parse_ratings(feedback_detec) for feedback_detec in df['feedback_detection']]
    # print(df['feedback_detection'])
    return df

# Apply to each DataFrame
df1 = apply_parse_ratings(df_1, 1)
df2 = apply_parse_ratings(df_2, 2)
df3 = apply_parse_ratings(df_3, 3)
df4 = apply_parse_ratings(df_4, 4)
df5 = apply_parse_ratings(df_5, 5)
df6 = apply_parse_ratings(df_6, 6)
df7 = apply_parse_ratings(df_7, 7)
df8 = apply_parse_ratings(df_8, 8)
df9 = apply_parse_ratings(df_9, 9)
df10 = apply_parse_ratings(df_10, 10)

df_synthetic1 = apply_parse_ratings(df_synthetic_1, "synthetic1")
df_synthetic2 = apply_parse_ratings(df_synthetic_2, "synthetic2")

# Check if any feedback column contains Nan values


print(df1['feedback_detection'].isna().any())
print(df2['feedback_detection'].isna().any())
print(df3['feedback_detection'].isna().any())
print(df4['feedback_detection'].isna().any())
print(df5['feedback_detection'].isna().any())
print(df6['feedback_detection'].isna().any())
print(df7['feedback_detection'].isna().any())
print(df8['feedback_detection'].isna().any())
print(df9['feedback_detection'].isna().any())
print(df10['feedback_detection'].isna().any())
print(df_synthetic1['feedback_detection'].isna().any())
print(df_synthetic2['feedback_detection'].isna().any())

print(df1['feedback_extraction'].isna().any())
print(df2['feedback_extraction'].isna().any())
print(df3['feedback_extraction'].isna().any())
print(df4['feedback_extraction'].isna().any())
print(df5['feedback_extraction'].isna().any())
print(df6['feedback_extraction'].isna().any())
print(df7['feedback_extraction'].isna().any())
print(df8['feedback_extraction'].isna().any())
print(df9['feedback_extraction'].isna().any())
print(df10['feedback_extraction'].isna().any())
print(df_synthetic1['feedback_extraction'].isna().any())
print(df_synthetic2['feedback_extraction'].isna().any())


False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['feedback_extraction'] = [parse_ratings(feedback) for feedback in df['feedback_extraction']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['feedback_detection'] = [parse_ratings(feedback_detec) for feedback_detec in df['feedback_detection']]


In [13]:
# Collect the first 50 entries from each DataFrame for each column
dfs = [df1, df5, df7, df9, df10]
# dfs = [df5, df8]


# synthetic dataframes
dfs_synthetic = [df_synthetic1, df_synthetic2]


# Compare first 50 rows, excluding the last 3 columns
subset1 = df1.iloc[:50, :-3]
subset2 = df2.iloc[:50, :-3]
subset3 = df3.iloc[:50, :-3]
subset4 = df4.iloc[:50, :-3]
subset5 = df5.iloc[:50, :-3]
subset6 = df6.iloc[:50, :-3]
subset7 = df7.iloc[:50, :-3]
subset8 = df8.iloc[:50, :-3]
subset9 = df9.iloc[:50, :-3]
subset10 = df10.iloc[:50, :-3]

# Check if they are equal
are_equal_1_2 = subset1.equals(subset2)
are_equal_1_3 = subset1.equals(subset3)
are_equal_1_4 = subset1.equals(subset4)
are_equal_1_5 = subset1.equals(subset5)
are_equal_1_6 = subset1.equals(subset6)
are_equal_1_7 = subset1.equals(subset7)
are_equal_1_8 = subset1.equals(subset8)
are_equal_1_9 = subset1.equals(subset9)
are_equal_1_10 = subset1.equals(subset10)

# Print results of how equal stuff is
print("Are the first 50 rows of df1 and df2 equal (excluding last 3 columns)?", are_equal_1_2)
print("Are the first 50 rows of df1 and df3 equal (excluding last 3 columns)?", are_equal_1_3)
print("Are the first 50 rows of df1 and df4 equal (excluding last 3 columns)?", are_equal_1_4)
print("Are the first 50 rows of df1 and df5 equal (excluding last 3 columns)?", are_equal_1_5)
print("Are the first 50 rows of df1 and df6 equal (excluding last 3 columns)?", are_equal_1_6)
print("Are the first 50 rows of df1 and df7 equal (excluding last 3 columns)?", are_equal_1_7)
print("Are the first 50 rows of df1 and df8 equal (excluding last 3 columns)?", are_equal_1_8)
print("Are the first 50 rows of df1 and df9 equal (excluding last 3 columns)?", are_equal_1_9)
print("Are the first 50 rows of df1 and df10 equal (excluding last 3 columns)?", are_equal_1_10)

# Helper function to extract and stack ratings
def prepare_data(dfs, column):
    # print(column)
    data = [df[column].iloc[:50].tolist() for df in dfs]

    # print(data)
    return torch.tensor(list(zip(*data))) # shape: (50 items, 8 raters)


# # Convert feedback extraction column to int
# for df in dfs:
#     df['feedback_extraction'] = df['feedback_extraction'].astype(int)
#     df['feedback_detection'] = df['feedback_detection'].astype(int)

# Prepare data
extraction_tensor = prepare_data(dfs, 'feedback_extraction')
detection_tensor = prepare_data(dfs, 'feedback_detection')

# couont categories to pass into fleiss kappa
extraction_categories = [os.getenv("EXTRACTION_FEEDBACK_0"), os.getenv("EXTRACTION_FEEDBACK_1"), os.getenv("EXTRACTION_FEEDBACK_2"), os.getenv("EXTRACTION_FEEDBACK_3")]
detection_categories = [os.getenv("DETECTION_FEEDBACK_0"), os.getenv("DETECTION_FEEDBACK_1"), os.getenv("DETECTION_FEEDBACK_NONEXISTENT")]

categories_count_extraction = count_categories(extraction_tensor, extraction_categories)
categories_count_detection = count_categories(detection_tensor, detection_categories)



Are the first 50 rows of df1 and df2 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df3 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df4 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df5 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df6 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df7 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df8 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df9 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df10 equal (excluding last 3 columns)? True


In [14]:
kappa = FleissKappa(mode='counts')

print("Fleiss' Kappa for feedback_extraction:", kappa(categories_count_extraction))
print("Fleiss' Kappa for feedback_detection:", kappa(categories_count_detection))

Fleiss' Kappa for feedback_extraction: tensor(0.4705)
Fleiss' Kappa for feedback_detection: tensor(0.5817)


# Check distribution of acts/facts in different groups

In [15]:
big_df = pd.concat(dfs, ignore_index=True)
print("Total number of entries in the combined DataFrame:", len(big_df))
print("Number of fact entries in the combined DataFrame:", (big_df['frame_type']=='fact').sum())
print("Number of act entries in the combined DataFrame:", (big_df['frame_type']=='act').sum())

Total number of entries in the combined DataFrame: 929
Number of fact entries in the combined DataFrame: 502
Number of act entries in the combined DataFrame: 427


In [16]:
big_synthetic_df = pd.concat(dfs_synthetic, ignore_index=True)
print("Total number of entries in the combined synthetic DataFrame:", len(big_synthetic_df))
print("Number of fact entries in the combined synthetic DataFrame:", (big_synthetic_df['frame_type']=='fact').sum())
print("Number of act entries in the combined synthetic DataFrame:", (big_synthetic_df['frame_type']=='act').sum())

Total number of entries in the combined synthetic DataFrame: 564
Number of fact entries in the combined synthetic DataFrame: 260
Number of act entries in the combined synthetic DataFrame: 304


# Check act and fact inter-annotator agreement for rest of data

### Df10 contains both datasets and thus needs to be split accordingly

In [17]:
# extract dataset 1 and 2 from df10 which contains both

df10_group1 = pd.concat([df_10_1.head(56), df_10_2], ignore_index=True)
df10_group1 = apply_parse_ratings(df10_group1, 11)

df10_group2 = apply_parse_ratings(df_10_1, 12)


### Df1 has a very different order to its data and thus needs to be restructured

In [18]:
# Find matching rows
df1_reduced = df1.iloc[:, :-3]  # Exclude the last 3 columns
df5_reduced = df5.iloc[:, :-3]  # Exclude the last 3 columns --> group 2
df7_reduced = df3.iloc[:, :-3]  # Exclude the last 3 columns --> group 1

matching_indices1_group1 = df1_reduced[df1_reduced.isin(df7_reduced.to_dict(orient='list')).all(axis=1)].index
matching_indices2_group2 = df1_reduced[df1_reduced.isin(df5_reduced.to_dict(orient='list')).all(axis=1)].index
# Extract matching rows
df1_group1 = df1.loc[matching_indices1_group1].reset_index(drop=True)
df1_group2 = df1.loc[matching_indices2_group2].reset_index(drop=True)
# Print the number of matching rows for each group
print("Number of matching rows in group 1:", len(df1_group1))
print("Number of matching rows in group 2:", len(df1_group2))

#TODO: find specific order, re-order all dataframes according to it since otherwise Kappa will be wrong OR jut exclude this dataframe from the analysis? 
# NO, SINCE IT COUNTS INTO FLEISS KAPPA FOR EACH GROUP NO MATTER WHAT

Number of matching rows in group 1: 103
Number of matching rows in group 2: 94


### Dfs synthetic need to be re-stuctured according to the two datasets sice they were just run on the whole dataset at once --> need to plit them into the two datasets

In [19]:
# get two data rows for first and second dataset

rows_group1 = df7.iloc[:, :-3]
rows_group2 = df5.iloc[:, :-3]

# get the important columns from the synthetic dataframe
matching_columns1 = df_synthetic1.columns[:-4] 

# Merge with df1 to find matching rows
df_synthetic1_group1 = df_synthetic1.merge(rows_group1, on=list(matching_columns1), how='inner')
df_synthetic1_group2 = df_synthetic1.merge(rows_group2, on=list(matching_columns1), how='inner')

# get the important columns from the synthetic dataframe
matching_columns2 = df_synthetic2.columns[:-4] 

# Merge with df1 to find matching rows
df_synthetic2_group1 = df_synthetic2.merge(rows_group1, on=list(matching_columns2), how='inner')
df_synthetic2_group2 = df_synthetic2.merge(rows_group2, on=list(matching_columns2), how='inner')


print("Number of matching rows in synthetic group 1:", len(df_synthetic1_group1))
print("Number of matching rows in synthetic group 2:", len(df_synthetic1_group2))
print("Number of matching rows in synthetic group 1:", len(df_synthetic2_group1))
print("Number of matching rows in synthetic group 2:", len(df_synthetic2_group2))
print(len(df7))
print(len(df5))

Number of matching rows in synthetic group 1: 170
Number of matching rows in synthetic group 2: 168
Number of matching rows in synthetic group 1: 170
Number of matching rows in synthetic group 2: 168
170
168


## Sort dfs by unique key (combination of act and precondition)

### For dataset 1

In [20]:
# Create a composite key
df7['key'] = list(zip(df7['frame_ID'], df7['precondition_id'],df7['response_text']))
df1_group1['key'] = list(zip(df1_group1['frame_ID'], df1_group1['precondition_id'], df1_group1['response_text']))
df10_group1['key'] = list(zip(df10_group1['frame_ID'], df10_group1['precondition_id'], df10_group1['response_text']))
df_synthetic1_group1['key'] = list(zip(df_synthetic1_group1['frame_ID'], df_synthetic1_group1['precondition_id'], df_synthetic1_group1['response_text']))
df_synthetic2_group1['key'] = list(zip(df_synthetic2_group1['frame_ID'], df_synthetic2_group1['precondition_id'], df_synthetic2_group1['response_text']))


# get the order of keys in df1_group1
key_order1 = {key: i for i, key in enumerate(df1_group1['key'])}

# Map this order to df7
df7['sort_order'] = df7['key'].map(key_order1)

# Sort df7 — first by sort_order (NaNs last), then by key or index and get its order of keys
df7_sorted = df7.sort_values(by=['sort_order', 'key'], na_position='last').reset_index(drop=True)
ordered_keys_df7 = df7_sorted['key'].tolist()


#sort df1_group1 and df10_group1 according to the initial key order determined for df1_group1 and the order of keys in df7_sorted respectively
df1_group1_sorted = df1_group1.set_index('key').reindex(key_order1).dropna(how='all')
df1_group1_sorted = df1_group1_sorted.reset_index()
df10_group1_sorted = df10_group1.set_index('key').reindex(ordered_keys_df7).dropna(how='all').reset_index()


# Sort synthetic dataframes according to the order of keys in df7_sorted
df_synthetic1_group1_sorted = df_synthetic1_group1.set_index('key').reindex(ordered_keys_df7).dropna(how='all').reset_index()
df_synthetic2_group1_sorted = df_synthetic2_group1.set_index('key').reindex(ordered_keys_df7).dropna(how='all').reset_index()

In [21]:
df1_group1_sorted.head()

Unnamed: 0,level_0,level_1,level_2,file,frame_ID,frame_type,frame_text,precondition_id,precondition_text,precondition_position,response_text,prompt_config_examples,prompt_config_chain_of_thought,feedback_extraction,feedback_detection,additional_feedback
0,7e8d151e-a6ad-4877-85db-5eda2990ac67,341a8e52-ca51-42ec-b84e-f606887369e7,1. Subfact: Onze Minister\n 2. ...,Interpretatie_Vw_over_besluiten_op_aanvragen_v...,7e8d151e-a6ad-4877-85db-5eda2990ac67,fact,Onze Minister,341a8e52-ca51-42ec-b84e-f606887369e7,Onze Minister in de Vreemdelingenwet en de daa...,Artikel 1 IN Vreemdelingenwet 2024,1. Subfact: Onze Minister\n 2. ...,False,True,2,4,
1,7e8d151e-a6ad-4877-85db-5eda2990ac67,7e8d151e-a6ad-4877-85db-5eda2990ac67,1. Subfact: Onze Minister\n 2. ...,Interpretatie_Vw_over_besluiten_op_aanvragen_v...,7e8d151e-a6ad-4877-85db-5eda2990ac67,fact,Onze Minister,7e8d151e-a6ad-4877-85db-5eda2990ac67,Onze Minister,Artikel 1 IN Vreemdelingenwet 2024,1. Subfact: Onze Minister\n 2. ...,False,True,2,4,
2,7e8d151e-a6ad-4877-85db-5eda2990ac67,dd1b844c-4239-4d34-97ef-aa7022f430db,1. Subfact: Onze Minister\n 2. ...,Interpretatie_Vw_over_besluiten_op_aanvragen_v...,7e8d151e-a6ad-4877-85db-5eda2990ac67,fact,Onze Minister,dd1b844c-4239-4d34-97ef-aa7022f430db,Onze Minister van Veiligheid en Justitie,Artikel 1 IN Vreemdelingenwet 2024,1. Subfact: Onze Minister\n 2. ...,False,True,2,4,
3,7e8d151e-a6ad-4877-85db-5eda2990ac67,341a8e52-ca51-42ec-b84e-f606887369e7,1. Subfact: Onze Minister\n\n 2...,Interpretatie_Vw_over_besluiten_op_aanvragen_v...,7e8d151e-a6ad-4877-85db-5eda2990ac67,fact,Onze Minister,341a8e52-ca51-42ec-b84e-f606887369e7,Onze Minister in de Vreemdelingenwet en de daa...,Artikel 1 IN Vreemdelingenwet 2024,1. Subfact: Onze Minister\n\n 2...,False,True,2,4,
4,7e8d151e-a6ad-4877-85db-5eda2990ac67,7e8d151e-a6ad-4877-85db-5eda2990ac67,1. Subfact: Onze Minister\n\n 2...,Interpretatie_Vw_over_besluiten_op_aanvragen_v...,7e8d151e-a6ad-4877-85db-5eda2990ac67,fact,Onze Minister,7e8d151e-a6ad-4877-85db-5eda2990ac67,Onze Minister,Artikel 1 IN Vreemdelingenwet 2024,1. Subfact: Onze Minister\n\n 2...,False,True,3,4,


### For dataset 2

In [22]:
# Create a composite key
df5['key'] = list(zip(df5['frame_ID'], df5['precondition_id'], df5['response_text']))
df1_group2['key'] = list(zip(df1_group2['frame_ID'], df1_group2['precondition_id'], df1_group2['response_text']))
df10_group2['key'] = list(zip(df10_group2['frame_ID'], df10_group2['precondition_id'], df10_group2['response_text']))
df9['key'] = list(zip(df9['frame_ID'], df9['precondition_id'], df9['response_text']))
df_synthetic1_group2['key'] = list(zip(df_synthetic1_group2['frame_ID'], df_synthetic1_group2['precondition_id'], df_synthetic1_group2['response_text']))
df_synthetic2_group2['key'] = list(zip(df_synthetic2_group2['frame_ID'], df_synthetic2_group2['precondition_id'], df_synthetic2_group2['response_text']))

# get the order of keys in df1_group1
key_order2 = {key: i for i, key in enumerate(df1_group2['key'])}

# Map this order to df5
df5['sort_order'] = df5['key'].map(key_order2)

# Sort df5 — first by sort_order (NaNs last), then by key or index and get its order of keys
df5_sorted = df5.sort_values(by=['sort_order', 'key'], na_position='last').reset_index(drop=True)
ordered_keys_df5 = df5_sorted['key'].tolist()


#sort df1_group1 and df10_group1 according to the initial key order determined for df1_group1 and the order of keys in df7_sorted respectively
df1_group2_sorted = df1_group2.set_index('key').reindex(key_order2).dropna(how='all').reset_index()
df10_group2_sorted = df10_group2.set_index('key').reindex(ordered_keys_df5).dropna(how='all').reset_index()
df9_sorted = df9.set_index('key').reindex(ordered_keys_df5).dropna(how='all').reset_index()

# Sort synthetic dataframes according to the order of keys in df5_sorted
df_synthetic1_group2_sorted = df_synthetic1_group2.set_index('key').reindex(ordered_keys_df5).dropna(how='all').reset_index()
df_synthetic2_group2_sorted = df_synthetic2_group2.set_index('key').reindex(ordered_keys_df5).dropna(how='all').reset_index()


## Compute fleiss kappa for proper Dfs

In [23]:
# Get all acts and facts for the different participants
df1_group1_acts = df1_group1_sorted[df1_group1_sorted['frame_type'] != 'fact']
df1_group2_acts = df1_group2_sorted[df1_group2_sorted['frame_type'] != 'fact']
df5_acts = df5_sorted[df5_sorted['frame_type'] != 'fact']
df7_acts = df7_sorted[df7_sorted['frame_type'] != 'fact']
df9_acts = df9_sorted[df9_sorted['frame_type'] != 'fact']
df10_group1_acts = df10_group1_sorted[df10_group1_sorted['frame_type'] != 'fact']
df10_group2_acts = df10_group2_sorted[df10_group2_sorted['frame_type'] != 'fact']

# Synthetic acts and facts
df_synthetic1_group1_acts = df_synthetic1_group1_sorted[df_synthetic1_group1_sorted['frame_type'] != 'fact']
df_synthetic2_group1_acts = df_synthetic2_group1_sorted[df_synthetic2_group1_sorted['frame_type'] != 'fact']
df_synthetic1_group2_acts = df_synthetic1_group2_sorted[df_synthetic1_group2_sorted['frame_type'] != 'fact']
df_synthetic2_group2_acts = df_synthetic2_group2_sorted[df_synthetic2_group2_sorted['frame_type'] != 'fact']

df1_group1_facts = df1_group1_sorted[df1_group1_sorted['frame_type'] != 'act']
df1_group2_facts = df1_group2_sorted[df1_group2_sorted['frame_type'] != 'act']
df5_facts = df5_sorted[df5_sorted['frame_type'] != 'act']
df7_facts = df7_sorted[df7_sorted['frame_type'] != 'act']
df9_facts = df9_sorted[df9_sorted['frame_type'] != 'act']
df10_group1_facts = df10_group1_sorted[df10_group1_sorted['frame_type'] != 'act']
df10_group2_facts = df10_group2_sorted[df10_group2_sorted['frame_type'] != 'act']

#Synthetic acts and facts
df_synthetic1_group1_facts = df_synthetic1_group1_sorted[df_synthetic1_group1_sorted['frame_type'] != 'act']
df_synthetic2_group1_facts = df_synthetic2_group1_sorted[df_synthetic2_group1_sorted['frame_type'] != 'act']
df_synthetic1_group2_facts = df_synthetic1_group2_sorted[df_synthetic1_group2_sorted['frame_type'] != 'act']
df_synthetic2_group2_facts = df_synthetic2_group2_sorted[df_synthetic2_group2_sorted['frame_type'] != 'act']

### Human data Fleiss Kappa computation

In [24]:
# Prepare dataframes for comparison

dfs_acts_human_comp1 = [df1_group1_acts, df7_acts, df10_group1_acts]
dfs_acts_human_comp2 = [df1_group2_acts, df5_acts, df9_acts, df10_group2_acts]

dfs_facts_human_comp1 = [df1_group1_facts, df7_facts, df10_group1_facts]
dfs_facts_human_comp2 = [df1_group2_facts, df5_facts, df9_facts, df10_group2_facts]


# Prepare data for acts
extraction_tensor_acts_human_comp1 = prepare_data(dfs_acts_human_comp1, 'feedback_extraction')
detection_tensor_acts_human_comp1 = prepare_data(dfs_acts_human_comp1, 'feedback_detection')

extraction_tensor_acts_human_comp2 = prepare_data(dfs_acts_human_comp2, 'feedback_extraction')
detection_tensor_acts_human_comp2 = prepare_data(dfs_acts_human_comp2, 'feedback_detection')

# Prepare data for facts

extraction_tensor_facts_human_comp1 = prepare_data(dfs_facts_human_comp1, 'feedback_extraction')
detection_tensor_facts_human_comp1 = prepare_data(dfs_facts_human_comp1, 'feedback_detection')

extraction_tensor_facts_human_comp2 = prepare_data(dfs_facts_human_comp2, 'feedback_extraction')
detection_tensor_facts_human_comp2 = prepare_data(dfs_facts_human_comp2, 'feedback_detection')


# Get categories counts to pass into Fleiss Kappa

categories_count_extraction_acts_human_comp1 = count_categories(extraction_tensor_acts_human_comp1, extraction_categories)
categories_count_detection_acts_human_comp1 = count_categories(detection_tensor_acts_human_comp1, detection_categories)

categories_count_extraction_acts_human_comp2 = count_categories(extraction_tensor_acts_human_comp2, extraction_categories)
categories_count_detection_acts_human_comp2 = count_categories(detection_tensor_acts_human_comp2, detection_categories)

categories_count_extraction_facts_human_comp1 = count_categories(extraction_tensor_facts_human_comp1, extraction_categories)
categories_count_detection_facts_human_comp1 = count_categories(detection_tensor_facts_human_comp1, detection_categories)

categories_count_extraction_facts_human_comp2 = count_categories(extraction_tensor_facts_human_comp2, extraction_categories)
categories_count_detection_facts_human_comp2 = count_categories(detection_tensor_facts_human_comp2, detection_categories)

# Compute Fleiss' Kappa for acts

kappa = FleissKappa(mode='counts')


print("Fleiss' Kappa for feedback_extraction (acts) - group 1:", kappa(categories_count_extraction_acts_human_comp1))
print("Fleiss' Kappa for feedback_detection (acts) - group 1:", kappa(categories_count_detection_acts_human_comp1))
print("Fleiss' Kappa for feedback_extraction (acts) - group 2:", kappa(categories_count_extraction_acts_human_comp2))
print("Fleiss' Kappa for feedback_detection (acts) - group 2:", kappa(categories_count_detection_acts_human_comp2))

# Compute Fleiss' Kappa for facts
print("Fleiss' Kappa for feedback_extraction (facts) - group 1:", kappa(categories_count_extraction_facts_human_comp1))
print("Fleiss' Kappa for feedback_detection (facts) - group 1:", kappa(categories_count_detection_facts_human_comp1))
print("Fleiss' Kappa for feedback_extraction (facts) - group 2:", kappa(categories_count_extraction_facts_human_comp2))
print("Fleiss' Kappa for feedback_detection (facts) - group 2:", kappa(categories_count_detection_facts_human_comp2))

Fleiss' Kappa for feedback_extraction (acts) - group 1: tensor(0.5197)
Fleiss' Kappa for feedback_detection (acts) - group 1: tensor(0.7484)
Fleiss' Kappa for feedback_extraction (acts) - group 2: tensor(0.5068)
Fleiss' Kappa for feedback_detection (acts) - group 2: tensor(0.4484)
Fleiss' Kappa for feedback_extraction (facts) - group 1: tensor(0.4867)
Fleiss' Kappa for feedback_detection (facts) - group 1: tensor(0.4757)
Fleiss' Kappa for feedback_extraction (facts) - group 2: tensor(0.4993)
Fleiss' Kappa for feedback_detection (facts) - group 2: tensor(0.6800)


In [25]:
# Calculate average Fleiss' Kappa for acts and facts
average_kappa_extraction_acts_human = (kappa(categories_count_extraction_acts_human_comp1) + kappa(categories_count_extraction_acts_human_comp2)) / 2
average_kappa_detection_acts_human = (kappa(categories_count_detection_acts_human_comp1) + kappa(categories_count_detection_acts_human_comp2)) / 2
average_kappa_extraction_facts_human = (kappa(categories_count_extraction_facts_human_comp1) + kappa(categories_count_extraction_facts_human_comp2)) / 2
average_kappa_detection_facts_human = (kappa(categories_count_detection_facts_human_comp1) + kappa(categories_count_detection_facts_human_comp2)) / 2
print("Average Fleiss' Kappa for feedback_extraction (acts):", average_kappa_extraction_acts_human)
print("Average Fleiss' Kappa for feedback_detection (acts):", average_kappa_detection_acts_human)
print("Average Fleiss' Kappa for feedback_extraction (facts):", average_kappa_extraction_facts_human)
print("Average Fleiss' Kappa for feedback_detection (facts):", average_kappa_detection_facts_human)

Average Fleiss' Kappa for feedback_extraction (acts): tensor(0.5133)
Average Fleiss' Kappa for feedback_detection (acts): tensor(0.5984)
Average Fleiss' Kappa for feedback_extraction (facts): tensor(0.4930)
Average Fleiss' Kappa for feedback_detection (facts): tensor(0.5778)


In [26]:
print("Average Fleiss' Kappa for feedback_extraction:", (average_kappa_extraction_acts_human + average_kappa_extraction_facts_human) / 2)
print("Average Fleiss' Kappa for feedback_detection:", (average_kappa_detection_acts_human + average_kappa_detection_facts_human) / 2)


Average Fleiss' Kappa for feedback_extraction: tensor(0.5031)
Average Fleiss' Kappa for feedback_detection: tensor(0.5881)


### Synthetic data Fleiss Kappa computation

In [27]:
# Prepare dataframes for comparison

dfs_acts_synth_comp1 = [df_synthetic1_group1_acts, df_synthetic2_group1_acts]
dfs_acts_synth_comp2 = [df_synthetic1_group2_acts, df_synthetic2_group2_acts]

dfs_facts_synth_comp1 = [df_synthetic1_group1_facts, df_synthetic2_group1_facts]
dfs_facts_synth_comp2 = [df_synthetic1_group2_facts, df_synthetic2_group2_facts]


# Prepare data for acts
extraction_tensor_acts_synth_comp1 = prepare_data(dfs_acts_synth_comp1, 'feedback_extraction')
detection_tensor_acts_synth_comp1 = prepare_data(dfs_acts_synth_comp1, 'feedback_detection')

extraction_tensor_acts_synth_comp2 = prepare_data(dfs_acts_synth_comp2, 'feedback_extraction')
detection_tensor_acts_synth_comp2 = prepare_data(dfs_acts_synth_comp2, 'feedback_detection')

# Prepare data for facts

extraction_tensor_facts_synth_comp1 = prepare_data(dfs_facts_synth_comp1, 'feedback_extraction')
detection_tensor_facts_synth_comp1 = prepare_data(dfs_facts_synth_comp1, 'feedback_detection')

extraction_tensor_facts_synth_comp2 = prepare_data(dfs_facts_synth_comp2, 'feedback_extraction')
detection_tensor_facts_synth_comp2 = prepare_data(dfs_facts_synth_comp2, 'feedback_detection')


# Get categories counts to pass into Fleiss Kappa

categories_count_extraction_acts_synth_comp1 = count_categories(extraction_tensor_acts_synth_comp1, extraction_categories)
categories_count_detection_acts_synth_comp1 = count_categories(detection_tensor_acts_synth_comp1, detection_categories)

categories_count_extraction_acts_synth_comp2 = count_categories(extraction_tensor_acts_synth_comp2, extraction_categories)
categories_count_detection_acts_synth_comp2 = count_categories(detection_tensor_acts_synth_comp2, detection_categories)

categories_count_extraction_facts_synth_comp1 = count_categories(extraction_tensor_facts_synth_comp1, extraction_categories)
categories_count_detection_facts_synth_comp1 = count_categories(detection_tensor_facts_synth_comp1, detection_categories)

categories_count_extraction_facts_synth_comp2 = count_categories(extraction_tensor_facts_synth_comp2, extraction_categories)
categories_count_detection_facts_synth_comp2 = count_categories(detection_tensor_facts_synth_comp2, detection_categories)

# Compute Fleiss' Kappa for acts

print("Fleiss' Kappa for feedback_extraction (acts) - group 1:", kappa(categories_count_extraction_acts_synth_comp1))
print("Fleiss' Kappa for feedback_detection (acts) - group 1:", kappa(categories_count_detection_acts_synth_comp1))
print("Fleiss' Kappa for feedback_extraction (acts) - group 2:", kappa(categories_count_extraction_acts_synth_comp2))
print("Fleiss' Kappa for feedback_detection (acts) - group 2:", kappa(categories_count_detection_acts_synth_comp2))

# Compute Fleiss' Kappa for facts

print("Fleiss' Kappa for feedback_extraction (facts) - group 1:", kappa(categories_count_extraction_facts_synth_comp1))
print("Fleiss' Kappa for feedback_detection (facts) - group 1:", kappa(categories_count_detection_facts_synth_comp1))
print("Fleiss' Kappa for feedback_extraction (facts) - group 2:", kappa(categories_count_extraction_facts_synth_comp2))
print("Fleiss' Kappa for feedback_detection (facts) - group 2:", kappa(categories_count_detection_facts_synth_comp2))

Fleiss' Kappa for feedback_extraction (acts) - group 1: tensor(1.0000)
Fleiss' Kappa for feedback_detection (acts) - group 1: tensor(1.0000)
Fleiss' Kappa for feedback_extraction (acts) - group 2: tensor(0.6461)
Fleiss' Kappa for feedback_detection (acts) - group 2: tensor(0.6381)
Fleiss' Kappa for feedback_extraction (facts) - group 1: tensor(0.7219)
Fleiss' Kappa for feedback_detection (facts) - group 1: tensor(0.9350)
Fleiss' Kappa for feedback_extraction (facts) - group 2: tensor(0.7219)
Fleiss' Kappa for feedback_detection (facts) - group 2: tensor(0.9350)


In [28]:
# Calculate average Fleiss' Kappa for acts and facts
average_kappa_extraction_acts_synth = (kappa(categories_count_extraction_acts_synth_comp1) + kappa(categories_count_extraction_acts_synth_comp2)) / 2
average_kappa_detection_acts_synth = (kappa(categories_count_detection_acts_synth_comp1) + kappa(categories_count_detection_acts_synth_comp2)) / 2
average_kappa_extraction_facts_synth = (kappa(categories_count_extraction_facts_synth_comp1) + kappa(categories_count_extraction_facts_synth_comp2)) / 2
average_kappa_detection_facts_synth = (kappa(categories_count_detection_facts_synth_comp1) + kappa(categories_count_detection_facts_synth_comp2)) / 2
print("Average Fleiss' Kappa for feedback_extraction (acts):", average_kappa_extraction_acts_synth)
print("Average Fleiss' Kappa for feedback_detection (acts):", average_kappa_detection_acts_synth)
print("Average Fleiss' Kappa for feedback_extraction (facts):", average_kappa_extraction_facts_synth)
print("Average Fleiss' Kappa for feedback_detection (facts):", average_kappa_detection_facts_synth)

Average Fleiss' Kappa for feedback_extraction (acts): tensor(0.8230)
Average Fleiss' Kappa for feedback_detection (acts): tensor(0.8190)
Average Fleiss' Kappa for feedback_extraction (facts): tensor(0.7219)
Average Fleiss' Kappa for feedback_detection (facts): tensor(0.9350)


In [29]:
print("Average Fleiss' Kappa for feedback_extraction:", (average_kappa_extraction_acts_synth + average_kappa_extraction_facts_synth) / 2)
print("Average Fleiss' Kappa for feedback_detection:", (average_kappa_detection_acts_synth + average_kappa_detection_facts_synth) / 2)

Average Fleiss' Kappa for feedback_extraction: tensor(0.7725)
Average Fleiss' Kappa for feedback_detection: tensor(0.8770)


### Fleiss Kappa for human-computer agreement

In [30]:
# Prepare dataframes for comparison

dfs_acts_mix_comp1 = [df1_group1_acts, df7_acts, df10_group1_acts, df_synthetic1_group1_acts, df_synthetic2_group1_acts]
dfs_acts_mix_comp2 = [df1_group2_acts, df5_acts, df9_acts, df10_group2_acts, df_synthetic1_group2_acts, df_synthetic2_group2_acts]

dfs_facts_mix_comp1 = [df1_group1_facts, df7_facts, df10_group1_facts, df_synthetic1_group1_facts, df_synthetic2_group1_facts]
dfs_facts_mix_comp2 = [df1_group2_facts, df5_facts, df9_facts, df10_group2_facts, df_synthetic1_group2_facts, df_synthetic2_group2_facts]


# Prepare data for acts
extraction_tensor_acts_mix_comp1 = prepare_data(dfs_acts_mix_comp1, 'feedback_extraction')
detection_tensor_acts_mix_comp1 = prepare_data(dfs_acts_mix_comp1, 'feedback_detection')

extraction_tensor_acts_mix_comp2 = prepare_data(dfs_acts_mix_comp2, 'feedback_extraction')
detection_tensor_acts_mix_comp2 = prepare_data(dfs_acts_mix_comp2, 'feedback_detection')

# Prepare data for facts

extraction_tensor_facts_mix_comp1 = prepare_data(dfs_facts_mix_comp1, 'feedback_extraction')
detection_tensor_facts_mix_comp1 = prepare_data(dfs_facts_mix_comp1, 'feedback_detection')

extraction_tensor_facts_mix_comp2 = prepare_data(dfs_facts_mix_comp2, 'feedback_extraction')
detection_tensor_facts_mix_comp2 = prepare_data(dfs_facts_mix_comp2, 'feedback_detection')


# Get categories counts to pass into Fleiss Kappa

categories_count_extraction_acts_mix_comp1 = count_categories(extraction_tensor_acts_mix_comp1, extraction_categories)
categories_count_detection_acts_mix_comp1 = count_categories(detection_tensor_acts_mix_comp1, detection_categories)

categories_count_extraction_acts_mix_comp2 = count_categories(extraction_tensor_acts_mix_comp2, extraction_categories)
categories_count_detection_acts_mix_comp2 = count_categories(detection_tensor_acts_mix_comp2, detection_categories)

categories_count_extraction_facts_mix_comp1 = count_categories(extraction_tensor_facts_mix_comp1, extraction_categories)
categories_count_detection_facts_mix_comp1 = count_categories(detection_tensor_facts_mix_comp1, detection_categories)

categories_count_extraction_facts_mix_comp2 = count_categories(extraction_tensor_facts_mix_comp2, extraction_categories)
categories_count_detection_facts_mix_comp2 = count_categories(detection_tensor_facts_mix_comp2, detection_categories)

# Compute Fleiss' Kappa for acts

print("Fleiss' Kappa for feedback_extraction (acts) - group 1:", kappa(categories_count_extraction_acts_mix_comp1))
print("Fleiss' Kappa for feedback_detection (acts) - group 1:", kappa(categories_count_detection_acts_mix_comp1))
print("Fleiss' Kappa for feedback_extraction (acts) - group 2:", kappa(categories_count_extraction_acts_mix_comp2))
print("Fleiss' Kappa for feedback_detection (acts) - group 2:", kappa(categories_count_detection_acts_mix_comp2))

# Compute Fleiss' Kappa for facts

print("Fleiss' Kappa for feedback_extraction (facts) - group 1:", kappa(categories_count_extraction_facts_mix_comp1))
print("Fleiss' Kappa for feedback_detection (facts) - group 1:", kappa(categories_count_detection_facts_mix_comp1))
print("Fleiss' Kappa for feedback_extraction (facts) - group 2:", kappa(categories_count_extraction_facts_mix_comp2))
print("Fleiss' Kappa for feedback_detection (facts) - group 2:", kappa(categories_count_detection_facts_mix_comp2))

Fleiss' Kappa for feedback_extraction (acts) - group 1: tensor(0.1320)
Fleiss' Kappa for feedback_detection (acts) - group 1: tensor(0.4746)
Fleiss' Kappa for feedback_extraction (acts) - group 2: tensor(0.2019)
Fleiss' Kappa for feedback_detection (acts) - group 2: tensor(0.1887)
Fleiss' Kappa for feedback_extraction (facts) - group 1: tensor(0.4097)
Fleiss' Kappa for feedback_detection (facts) - group 1: tensor(0.5771)
Fleiss' Kappa for feedback_extraction (facts) - group 2: tensor(0.4250)
Fleiss' Kappa for feedback_detection (facts) - group 2: tensor(0.7327)


In [31]:
# Calculate average Fleiss' Kappa for acts and facts
average_kappa_extraction_acts_mix = (kappa(categories_count_extraction_acts_mix_comp1) + kappa(categories_count_extraction_acts_mix_comp2)) / 2
average_kappa_detection_acts_mix = (kappa(categories_count_detection_acts_mix_comp1) + kappa(categories_count_detection_acts_mix_comp2)) / 2
average_kappa_extraction_facts_mix = (kappa(categories_count_extraction_facts_mix_comp1) + kappa(categories_count_extraction_facts_mix_comp2)) / 2
average_kappa_detection_facts_mix = (kappa(categories_count_detection_facts_mix_comp1) + kappa(categories_count_detection_facts_mix_comp2)) / 2
print("Average Fleiss' Kappa for feedback_extraction (acts):", average_kappa_extraction_acts_mix)
print("Average Fleiss' Kappa for feedback_detection (acts):", average_kappa_detection_acts_mix)
print("Average Fleiss' Kappa for feedback_extraction (facts):", average_kappa_extraction_facts_mix)
print("Average Fleiss' Kappa for feedback_detection (facts):", average_kappa_detection_facts_mix)

Average Fleiss' Kappa for feedback_extraction (acts): tensor(0.1669)
Average Fleiss' Kappa for feedback_detection (acts): tensor(0.3317)
Average Fleiss' Kappa for feedback_extraction (facts): tensor(0.4174)
Average Fleiss' Kappa for feedback_detection (facts): tensor(0.6549)


In [32]:
print("Average Fleiss' Kappa for feedback_extraction:", (average_kappa_extraction_acts_mix + average_kappa_extraction_facts_mix) / 2)
print("Average Fleiss' Kappa for feedback_detection:", (average_kappa_detection_acts_mix + average_kappa_detection_facts_mix) / 2)

Average Fleiss' Kappa for feedback_extraction: tensor(0.2922)
Average Fleiss' Kappa for feedback_detection: tensor(0.4933)


**Conclusion:**

It seems that the language model evaluates the extraction task very differently than the huan experts, whereas it agrees a bit more on the detection side but still does pull the score down. We can hence conclude that the computer views this task differently OR may even just have answered all wrong or all right on the data input.