# Interannotator agreement

Basic work to calculate interannotator agreement on the 1914 QA task.

In [1]:
import krippendorff # https://github.com/pln-fing-udelar/fast-krippendorff
import numpy  as np
import pandas as pd

In [2]:
# Ted's judged gold data; later cells read its 'user' and 'plausibly1914' columns.
ted = pd.read_json('4oBigUntunedJudged.json')

In [3]:
def find_target_text(text, df=None, col='user'):
    '''Find rows of Ted's gold data whose `col` entry contains the given text.

    The match is a case-insensitive substring test. Each cell of `col` may be
    an iterable of strings (any one element may match) or a single string,
    which is treated as one candidate rather than iterated per character.

    Parameters
    ----------
    text : str
        Text to search for. Non-string values (e.g. the NaN pandas produces
        for an empty spreadsheet cell) match nothing.
    df : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level `ted` frame, looked up
        at call time so that a reloaded `ted` is picked up.
    col : str
        Name of the column to search.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df` (possibly none), keeping their original index.
    '''
    if df is None:
        # Late lookup: a `df=ted` default would freeze whatever `ted` was when
        # this cell ran, hiding any later reload of the data.
        df = ted
    if not isinstance(text, str):
        # NaN / None / numbers have no .lower(); report "no match" instead of raising.
        return df.iloc[0:0]
    needle = text.lower()

    def _contains_needle(cell):
        # Normalise the cell to a list of candidate strings.
        if isinstance(cell, str):
            candidates = [cell]
        else:
            try:
                candidates = list(cell)
            except TypeError:
                # Non-iterable cell (e.g. NaN / None): nothing to search.
                return False
        # Built-in any() short-circuits on the first hit; non-string items are ignored.
        return any(isinstance(item, str) and needle in item.lower() for item in candidates)

    # astype(bool) keeps the mask a valid boolean indexer even for an empty frame.
    mask = df[col].apply(_contains_needle).astype(bool)
    return df.loc[mask]

In [4]:
# Spreadsheet of questions with answers; later cells read its 'user',
# '4omini-raw-ok', '4omini-ft-ok' and '4obig-ok' columns.
excel = pd.read_excel('questions_with_answers.xlsm')

In [5]:
# match entries across datasets
# Build (excel index, ted index) pairs for every spreadsheet question that
# matches exactly one row of Ted's data.
idx_pairs = []
for excel_idx, excel_text in excel['user'].items():
    # An empty spreadsheet cell is read as NaN (a float, so no .lower()), and a
    # blank string is a substring of every entry; neither is a usable question.
    if not isinstance(excel_text, str) or not excel_text.strip():
        continue
    ted_row = find_target_text(excel_text)
    # Keep only unambiguous matches; zero or multiple hits are dropped.
    if len(ted_row) == 1:
        idx_pairs.append((excel_idx, ted_row.index[0]))

In [6]:
# combine labels
# Split the matched (excel index, ted index) pairs into two parallel lists.
# Comprehensions (rather than zip(*idx_pairs)[n]) also work when no pairs
# were matched, yielding an empty frame instead of an IndexError.
excel_ids = [excel_idx for excel_idx, _ in idx_pairs]
ted_ids = [ted_idx for _, ted_idx in idx_pairs]

# Both selections are re-indexed 0..n-1 so that row i of each side belongs to
# the same matched pair; rows with any missing value are then dropped.
# NOTE: both sides contribute a 'user' column, so the result has two columns
# with that name.
labeled_answers = pd.concat(
    [
        excel.loc[
            excel_ids,
            ['user', '4omini-raw-ok', '4omini-ft-ok', '4obig-ok']
        ].reset_index(drop=True),
        ted.loc[
            ted_ids,
            ['plausibly1914', 'user']
        ].reset_index(drop=True)
    ],
    axis='columns',
).dropna()

In [7]:
# Summary statistics and inter-rater reliability (IRR/IAA)
label_cols = ['4omini-raw-ok', '4omini-ft-ok', '4obig-ok', 'plausibly1914']
n_labeled = len(labeled_answers)
print('Labeled answers:', f'{n_labeled:3}')

print('\nPass rates:')
for col in label_cols:
    # count of passing labels and its share of all labeled answers, in percent
    n_pass = labeled_answers[col].sum()
    pct = round(n_pass/n_labeled*100, 1)
    print(f'  {col:13}: {n_pass:3} ({pct}%)')

# Transpose so that each of the two label columns becomes one row of the
# array handed to krippendorff.alpha.
ratings = labeled_answers[['4obig-ok', 'plausibly1914']].astype(int).T
irr = krippendorff.alpha(ratings)
print(f"IRR (Krippendorff's alpha; MW/TU on 4obig): {round(irr, 3)}")

Labeled answers:  25

Pass rates:
  4omini-raw-ok:  10 (40.0%)
  4omini-ft-ok :  19 (76.0%)
  4obig-ok     :  14 (56.0%)
  plausibly1914:  11 (44.0%)
IRR (Krippendorff's alpha; MW/TU on 4obig): 0.294


IRR interpretation: Generally want alpha > 0.67, [per Krippendorff](https://www.k-alpha.org/methodological-notes). The alpha of 0.294 reported above is well below that threshold.