# Expert dataset (Kili)

# Define function and metrics for computing Krippendorff's alpha on processed Kili output

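#### Note that Krippendorff's alpha is built from disagreement, not agreement: alpha = 1 - Do/De (observed disagreement over expected disagreement).
#### The final score is at most 1 and is usually between 0 and 1 (but it can also be negative), where 1 = full reliability (no disagreement)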

Alpha is computed on LABELS only (classification part of each task), then on position index OVERLAP (entity extraction part of each task), then on OVERLAP + text SIMILARITY (to cover cases where the labeled texts are similar but not overlapping). Finally the function eval_metric merges everything into one.  

In [1]:
import pandas as pd
import numpy as np
import itertools
import statistics

In [2]:
# Load the relevance labels CSV from the Kili data folder.
kilidata = pd.read_csv('../data/Kili/relevance_en.csv')
# Rename the columns positionally to the names used in the rest of the notebook.
kilidata.columns = ['id', 'labeler', 'value']
# Optional filter, currently disabled: drop the 'N_A' labels before computing alpha.
# kilidata = kilidata[kilidata.value!='N_A']
# Preview the first three rows, transposed so that each row is shown as a column.
kilidata.head(3).T

Unnamed: 0,0,1,2
id,EN2652359,EN2422390,EN2718256
labeler,FP,FP,FP
value,RELEVANT,NOT_RELEVANT,RELEVANT


In [3]:
# Shape of the array of distinct `id` values, i.e. how many different assets were labeled.
kilidata.id.unique().shape

(197,)

In [4]:
# Count how many times each label value occurs across all rows.
kilidata.value.value_counts()

NOT_RELEVANT    136
RELEVANT        122
N_A               6
Name: value, dtype: int64

## Define Krippendorff's alpha

In [5]:
# define distance metrics between two values (0 = agreement, larger = more disagreement)
def nominal_metric(a, b):
    """Nominal distance: the result of ``a != b`` (truthy when the values differ)."""
    is_different = a != b
    return is_different


def interval_metric(a, b):
    """Interval distance: the squared difference between the two values."""
    delta = a - b
    return delta ** 2


def ratio_metric(a, b):
    """Ratio distance: the squared difference relative to the sum, ((a-b)/(a+b))**2.

    Two zeros are identical values, so their distance is 0.0 instead of the
    ZeroDivisionError that the bare formula would raise for 0/0.
    """
    total = a + b
    # Only the a == b == 0 case is special-cased; any other zero denominator
    # (e.g. -1 and 1) is outside the domain of a ratio scale and still raises.
    if total == 0 and a == b:
        return 0.0
    return ((a - b) / total) ** 2


def krippendorff_alpha(units, metric=interval_metric):
    """Compute Krippendorff's alpha for a set of labeled units.

    Parameters:
        units: dict mapping each unit (e.g. an asset id) to the collection of
            values assigned to it, one value per labeler.
        metric: function of two values returning their distance
            (0 when they agree).

    Returns:
        1 - Do/De, where Do is the observed and De the expected disagreement.
        Returns 1.0 when there is no observed disagreement.

    Raises:
        ValueError: if no unit has at least two values to compare.

    The intermediate values n, Do and De are printed as they are computed.
    """
    # A unit with fewer than two values has nothing to pair: it would divide
    # by zero in the Do loop and must not count towards n or De.
    pairable = [grades for grades in units.values() if len(grades) > 1]

    n = sum(len(grades) for grades in pairable)  # number of pairable values
    print (f'n: {n}')

    if n == 0:
        raise ValueError("No items to compare.")

    # Observed disagreement: pairwise distances within each unit.
    Do = 0.
    for grades in pairable:
        Du = sum(metric(gi, gj) for gi in grades for gj in grades)
        Do += Du/float(len(grades)-1)
    Do /= float(n)
    print (f'Do: {Do}')

    if Do == 0:
        return 1.

    # Expected disagreement: pairwise distances across all pairable values.
    De = 0.
    for g1 in pairable:
        for g2 in pairable:
            De += sum(metric(gi, gj) for gi in g1 for gj in g2)
    De /= float(n*(n-1))
    print (f'De: {De}')

    return 1.-Do/De if (Do and De) else 1.

# For each task, prepare data for computing alpha

## Task: RELEVANCE

In [6]:
# Build the table mapping each asset id to the labels it received for this task
df_type = kilidata[['id', 'labeler', 'value']]
# Keep a single row per (id, labeler) pair: the last value within each group
df_type = df_type.groupby(['id', 'labeler']).last().reset_index()

# One entry per id, holding the array of label values given to that id
units = df_type.groupby('id').apply(lambda x: x.value.values)
# Keep only ids with more than one label: alpha needs at least two values per unit to form pairs
units_type = units[units.map(lambda x: len(x)>1)]
units_type.head()

id
EN2401380    [NOT_RELEVANT, NOT_RELEVANT, NOT_RELEVANT]
EN2433725        [RELEVANT, NOT_RELEVANT, NOT_RELEVANT]
EN2434557            [NOT_RELEVANT, RELEVANT, RELEVANT]
EN2445660                          [RELEVANT, RELEVANT]
EN2468074    [NOT_RELEVANT, NOT_RELEVANT, NOT_RELEVANT]
dtype: object

In [7]:
def type_metric_str(a, b):
    """Distance between two relevance labels: 0 = agreement, 1 = disagreement.

    Identical labels agree, and 'NOT_RELEVANT' and 'N_A' are treated as the
    same label; every other pair disagrees.
    """
    # Sorting makes the check order-independent: 'NOT_RELEVANT' sorts before
    # 'N_A', so a single ordered comparison covers both argument orders.
    low, high = sorted((a, b))
    same_label = low == high
    nr_with_na = low == 'NOT_RELEVANT' and high == 'N_A'
    return 0 if (same_label or nr_with_na) else 1

# Krippendorff's alpha for the RELEVANCE task, using type_metric_str as the label distance
krippendorff_alpha(units_type.to_dict(), type_metric_str)

n: 103
Do: 0.13592233009708737
De: 0.5010470207500476


0.7287234042553192