# Data cleanup notebook


## 1. Imports

In [1]:
import torch
from torchmetrics.nominal import FleissKappa
import pandas as pd
import numpy as np

from utils import parse_ratings, count_categories

## 2. Clean up messed up files

### Load files

In [2]:
# Directory holding every human-feedback CSV export. Kept in one place so the
# location only has to be changed once when the notebook runs on another machine.
HUMAN_FEEDBACK_DIR = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback"

# Participant 2: feedback is spread over three files.
# The trailing comments are the time-of-day notes kept from the original cell
# (presumably when each file was written -- TODO confirm).
first_file_2 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_7a185ca4-3d37-4487-8aa7-ad9e8f6fe884.csv"  # 13:10
second_file_2 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_861cdaee-9f2e-4454-8252-d8ff397eb14e.csv"  # 15:48
third_file_2 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_f60e0e84-2638-44da-871b-4847b751fabb.csv"  # 16:23

# Participant 6: also three files. Note that the variable names do not follow
# the noted times (second_file_6 carries the earliest one).
first_file_6 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_48d1eb99-5d33-476a-a1a4-75917aa92e92.csv"  # 12:21
second_file_6 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_b3d2cab5-ffca-467d-bddb-b9e188e5a85a.csv"  # 11:09
third_file_6 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_c2e282a4-f8e7-4542-95ab-a9a74b6f57e0.csv"  # 13:39

### Sort out files participant 2

In [3]:
# Participant 2's feedback lives in three exports; a different row range is
# kept from each one. read_csv yields a default integer index here, so the
# positional .iloc slices select the same rows as plain [a:b] slicing.
df_2_1 = pd.read_csv(first_file_2, sep=";").iloc[:186]   # rows 0..185 of the first file
df_2_2 = pd.read_csv(second_file_2, sep=";").iloc[186:]  # rows 186.. of the second file
df_2_3 = pd.read_csv(third_file_2, sep=";").iloc[30:]    # rows 30.. of the third file

# Stitch the pieces together and renumber the rows from 0.
participant_2_parts = [df_2_1, df_2_2, df_2_3]
df_2 = pd.concat(participant_2_parts, ignore_index=True)
print(len(df_2))

281


### Sort out files participant 6

In [4]:
# Participant 6: second_file_6 is deliberately loaded as the first piece,
# followed by first_file_6 and third_file_6.
df_6_1, df_6_2, df_6_3 = (
    pd.read_csv(path, sep=";")
    for path in (second_file_6, first_file_6, third_file_6)
)

# Show the raw length of every piece before trimming.
for piece in (df_6_1, df_6_2, df_6_3):
    print(len(piece))

# Pieces one and three are used in full; the middle piece is used from row 84 on.
df_6_1 = df_6_1.iloc[:]
df_6_2 = df_6_2.iloc[84:]
df_6_3 = df_6_3.iloc[:]

participant_6_parts = [df_6_1, df_6_2, df_6_3]
df_6 = pd.concat(participant_6_parts, ignore_index=True)
print(len(df_6))

84
168
114
282


### Check if there are duplicate rows in the dataframes

In [5]:
# Flag rows that repeat an earlier row, comparing every column except the
# last two, for the two stitched-together participants.
duplicates_6, duplicates_2 = (
    stitched.iloc[:, :-2].duplicated() for stitched in (df_6, df_2)
)

# Number of flagged rows per participant (6 first, then 2).
for duplicate_flags in (duplicates_6, duplicates_2):
    print(duplicate_flags.sum())

0
0


## 3. Load the remaining feedback files (participants 2 and 6 were already assembled above)


In [6]:
# Directory holding every human-feedback CSV export (the same location the
# participant 2 / 6 files are read from). Declared in this cell so the cell
# runs on its own and the location only has to be edited once.
HUMAN_FEEDBACK_DIR = "/home/jacques.furst/development/RAG/flintfiller-precondition-rl/data/human_feedback"

# One export per participant. Entries set to None have no single file to load
# here: participants 2 and 6 are stitched together from several files earlier
# in the notebook, and no file is given for participant 10.
FILE_1 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_b2339fe6-2896-43ab-a9c8-24c8aacfbbd1.csv"
FILE_2 = None
FILE_3 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_6430b6fd-0ddd-4cc6-a4a0-216d5603143e.csv"
FILE_4 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_5fa87112-3702-4263-ba81-1779b3b24d16.csv"
FILE_5 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_7b177fba-3ddb-465b-9a25-6f4481eeb492.csv"
FILE_6 = None
FILE_7 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_ab53866b-7831-4f33-a628-3b6dbf01ead1.csv"
FILE_8 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_b1ed9f35-7d6a-439c-8a46-089311e8e340.csv"
FILE_9 = f"{HUMAN_FEEDBACK_DIR}/human_feedback_a8ec999a-935f-476d-ac5c-f328a1288c7c.csv"
FILE_10 = None

### Load all dataframes

In [7]:
# Read the single-file participants (1, 3, 4, 5, 7, 8, 9); the exports are
# semicolon-separated.
df_1, df_3, df_4, df_5, df_7, df_8, df_9 = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (FILE_1, FILE_3, FILE_4, FILE_5, FILE_7, FILE_8, FILE_9)
)

# Peek at the raw (still textual) detection feedback of participant 1.
print(df_1['feedback_detection'])

0                               Duidelijk
1                               Duidelijk
2                               Duidelijk
3                               Duidelijk
4                               Duidelijk
                      ...                
136    Onbestemde positie in ground truth
137    Onbestemde positie in ground truth
138               Helemaal niet duidelijk
139               Helemaal niet duidelijk
140               Helemaal niet duidelijk
Name: feedback_detection, Length: 141, dtype: object


### Cast ratings to numeric values so that Fleiss' kappa can be computed

In [8]:
# List the exact spellings of the "Niet goed" label in participant 2's
# detection feedback; repr() makes stray whitespace or odd characters visible.
detection_feedback_2 = df_2['feedback_detection']
niet_goed_mask = detection_feedback_2.str.contains("Niet goed", na=False)
niet_goed_variants = df_2.loc[niet_goed_mask, 'feedback_detection'].unique()
print(repr(niet_goed_variants))

array(['Niet goed'], dtype=object)


In [9]:
# parse ratings in all dataframes to be able to calculate Fleiss kappa
def apply_parse_ratings(df, number):
    """Return a copy of ``df`` whose two feedback columns went through ``parse_ratings``.

    The input frame is left untouched. The previous version overwrote the
    columns of the caller's frame, so re-running the calling cell passed
    already-parsed values to ``parse_ratings`` a second time and the raw
    ratings could no longer be inspected.

    Args:
        df: DataFrame with ``feedback_extraction`` and ``feedback_detection``
            columns holding the raw ratings.
        number: Participant number. Not used by the computation; kept so
            existing calls keep working (it only served ad-hoc debug prints).

    Returns:
        A new DataFrame with both feedback columns replaced by the values
        returned by ``parse_ratings``; all other columns are copied as-is.
    """
    parsed = df.copy()
    for column in ('feedback_extraction', 'feedback_detection'):
        # Read from the untouched input, write into the copy.
        parsed[column] = [parse_ratings(raw_rating) for raw_rating in df[column]]
    return parsed

# Parse the ratings of participants 1-9 (df_N -> dfN).
# FILE_10 is None, so no df10 is built.
df1, df2, df3, df4, df5, df6, df7, df8, df9 = (
    apply_parse_ratings(raw_frame, participant)
    for participant, raw_frame in enumerate(
        (df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9), start=1
    )
)

# Check whether any parsed feedback column contains NaN values: first the
# detection column for participants 1..9, then the extraction column.
parsed_frames = (df1, df2, df3, df4, df5, df6, df7, df8, df9)
for feedback_column in ('feedback_detection', 'feedback_extraction'):
    for parsed_frame in parsed_frames:
        print(parsed_frame[feedback_column].isna().any())


False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [10]:

# Frames whose first 50 ratings enter the agreement computation below.
# NOTE(review): only participants 1, 3, 5, 7 and 9 are listed -- confirm that
# leaving out 2, 4, 6 and 8 is intended.
dfs = [df1, df3, df5, df7, df9]

# First 50 rows of every participant, without the last 3 columns.
subset1, subset2, subset3, subset4, subset5, subset6, subset7, subset8, subset9 = (
    parsed.iloc[:50, :-3]
    for parsed in (df1, df2, df3, df4, df5, df6, df7, df8, df9)
)

# Compare every participant's subset against participant 1's.
(
    are_equal_1_2,
    are_equal_1_3,
    are_equal_1_4,
    are_equal_1_5,
    are_equal_1_6,
    are_equal_1_7,
    are_equal_1_8,
    are_equal_1_9,
) = (
    subset1.equals(other_subset)
    for other_subset in (
        subset2, subset3, subset4, subset5, subset6, subset7, subset8, subset9
    )
)

# Report the outcome of each comparison, participants 2 through 9 in order.
comparison_results = (
    are_equal_1_2,
    are_equal_1_3,
    are_equal_1_4,
    are_equal_1_5,
    are_equal_1_6,
    are_equal_1_7,
    are_equal_1_8,
    are_equal_1_9,
)
for other_participant, matches in zip(range(2, 10), comparison_results):
    print(
        f"Are the first 50 rows of df1 and df{other_participant} equal (excluding last 3 columns)?",
        matches,
    )

# Helper function to extract and stack ratings
def prepare_data(dfs, column, n_items=50):
    """Stack one rating column from several raters into an item-by-rater tensor.

    Args:
        dfs: Iterable of DataFrames, one per rater, each containing ``column``.
        column: Name of the rating column to extract.
        n_items: Number of leading rows taken from every frame. Defaults to
            50, the value that used to be hard-coded.

    Returns:
        A ``torch.Tensor`` whose row ``i`` holds every rater's rating for
        item ``i``. With ``R`` frames of at least ``n_items`` rows each the
        shape is ``(n_items, R)``. If a frame has fewer rows, ``zip`` stops
        at the shortest frame, so the result has that many rows instead.
    """
    # One list of ratings per rater (rater-major layout).
    ratings_per_rater = [df[column].iloc[:n_items].tolist() for df in dfs]

    # zip(*...) transposes to item-major rows: one tuple of ratings per item.
    return torch.tensor(list(zip(*ratings_per_rater)))

# Build the rating tensors for both feedback columns from the frames in `dfs`.
extraction_tensor, detection_tensor = (
    prepare_data(dfs, rating_column)
    for rating_column in ('feedback_extraction', 'feedback_detection')
)

# Count categories to pass into Fleiss' kappa: the rating codes handed to
# count_categories for each feedback column.
extraction_categories = [0, 1, 2, 3]
detection_categories = [4, 5, 6]

categories_count_extraction = count_categories(extraction_tensor, extraction_categories)
categories_count_detection = count_categories(detection_tensor, detection_categories)




Are the first 50 rows of df1 and df2 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df3 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df4 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df5 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df6 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df7 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df8 equal (excluding last 3 columns)? True
Are the first 50 rows of df1 and df9 equal (excluding last 3 columns)? True


In [11]:
# Number of distinct rating values that actually occur in each tensor
# (informational; the kappa computation below does not use these).
num_classes_extraction = len(set(extraction_tensor.reshape(-1).tolist()))
num_classes_detection = len(set(detection_tensor.reshape(-1).tolist()))

# Compute Fleiss' kappa from the category counts.
# TODO: data structure seems fine, but check that the same data was taken for every person
kappa_extraction, kappa_detection = FleissKappa(mode='counts'), FleissKappa(mode='counts')

for feedback_name, kappa_metric, category_counts in (
    ("feedback_extraction", kappa_extraction, categories_count_extraction),
    ("feedback_detection", kappa_detection, categories_count_detection),
):
    print(f"Fleiss' Kappa for {feedback_name}:", kappa_metric(category_counts))

Fleiss' Kappa for feedback_extraction: tensor(0.4586)
Fleiss' Kappa for feedback_detection: tensor(0.3678)
