# Expert dataset (Kili)

# Define function and metrics for computing Krippendorff's alpha on processed Kili output

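#### Note that Krippendorff's alpha is computed from disagreement rather than agreement: alpha = 1 - Do/De, where Do is the observed and De the expected disagreement.
#### The final score is at most 1 (1 = full reliability, i.e. no disagreement); 0 means the observed disagreement equals the expected one, and the score can also be negative.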

Alpha is computed on LABELS only (classification part of each task), then on position index OVERLAP (entity extraction part of each task), then on OVERLAP + text SIMILARITY (to cover cases where the labeled texts are similar but not overlapping). Finally the function eval_metric merges everything into one.  

In [1]:
import pandas as pd
import numpy as np
import itertools
import statistics

In [2]:
# Load the processed Kili output; the path is relative to the working directory
kilidata = pd.read_csv('../data/Kili/complete_en.csv')
# Show the first three rows transposed, so that every column is displayed as a row
kilidata.head(3).T

Unnamed: 0,0,1,2
id,EN2652359,EN2652359,EN2652359
published,"(2019, 6, 24, 'MON')","(2019, 6, 24, 'MON')","(2019, 6, 24, 'MON')"
labeler,Fabio Poletto,Fabio Poletto,Fabio Poletto
taskID,TYPE,FACT,FACT
value,SUMMARY,RETURN,REFUGEE
offset_from,,636.0,636.0
offset_to,,885.0,885.0
content,,Refugees and Congolese returnees from elsewher...,Refugees and Congolese returnees from elsewher...


In [3]:
# Shape of the array of distinct values found in the 'id' column
kilidata.id.unique().shape

(95,)

In [4]:
# Number of rows for each distinct taskID
kilidata.taskID.value_counts()

LOCATION_ORIGIN    280
FACT               270
CAUSE              251
QUANTITY           232
TYPE               133
DATE                61
Name: taskID, dtype: int64

## Define Krippendorff's alpha

In [5]:
# define metrics for agreement on each task
    
def nominal_metric(a, b):
    """Nominal disagreement: the result of ``a != b``.

    For plain values this is ``True`` (counts as 1 when summed) if the two
    values differ and ``False`` (counts as 0) if they are equal.
    """
    differs = a != b
    return differs


def interval_metric(a, b):
    """Interval disagreement: the squared difference between ``a`` and ``b``."""
    difference = a - b
    return difference ** 2


def ratio_metric(a, b):
    """Ratio disagreement: ``((a - b) / (a + b)) ** 2``.

    a, b = the two numeric values to compare (scalars)

    Returns 0.0 when the two values are equal. This is what the formula
    yields for equal non-zero values, and it also covers ``a == b == 0``,
    where evaluating the formula would divide zero by zero.
    Raises ZeroDivisionError for plain Python numbers with ``a == -b != 0``.
    """
    if a == b:
        # Identical values never disagree; answering here avoids 0/0 for (0, 0).
        return 0.0
    return ((a - b) / (a + b)) ** 2


def type_metric_str(a, b):
    """Disagreement between two TYPE labels: 0 = agreement, 1 = disagreement.

    Two labels agree when they are equal, or when one of them is 'BOTH'
    and the other is 'NEWS' or 'SUMMARY'. The argument order is irrelevant
    because the two labels are sorted before being compared.
    """
    first, second = sorted((a, b))
    if first == second:
        return 0
    # After sorting, 'BOTH' precedes both 'NEWS' and 'SUMMARY'.
    if first == 'BOTH' and second in ('NEWS', 'SUMMARY'):
        return 0
    return 1

In [6]:
def krippendorff_alpha(units, metric=interval_metric):
    """Compute Krippendorff's alpha as ``1 - Do/De``.

    units  = dict mapping a unit identifier to the sequence of values
             assigned to that unit
    metric = function ``metric(a, b)`` returning the disagreement between
             two values (0 meaning no disagreement)

    Units holding fewer than two values are ignored: a single value has
    nothing to be paired with, and would otherwise cause a division by zero
    in the observed disagreement.

    The intermediate quantities n (number of pairable values), Do (observed
    disagreement) and De (expected disagreement) are printed.

    Returns 1.0 when Do is 0 or De is 0, otherwise ``1 - Do/De``.
    Raises ValueError when there is no pairable value at all.
    """
    # Only units with at least two values contribute pairs.
    pairable = [grades for grades in units.values() if len(grades) > 1]

    n = sum(len(grades) for grades in pairable)  # number of pairable values
    print(f'n: {n}')

    if n == 0:
        raise ValueError("No items to compare.")

    # Observed disagreement: all ordered pairs inside each unit, each unit
    # weighted by 1/(m-1) where m is its number of values.
    Do = 0.
    for grades in pairable:
        Du = sum(metric(gi, gj) for gi in grades for gj in grades)
        Do += Du/float(len(grades)-1)
    Do /= float(n)
    print(f'Do: {Do}')

    if Do == 0:
        return 1.

    # Expected disagreement: all ordered pairs of values across all units.
    # n >= 2 here, so n*(n-1) cannot be zero.
    De = 0.
    for g1 in pairable:
        for g2 in pairable:
            De += sum(metric(gi, gj) for gi in g1 for gj in g2)
    De /= float(n*(n-1))
    print(f'De: {De}')

    # Do is non-zero at this point, so only De needs guarding.
    return 1.-Do/De if De else 1.

# For each task, prepare data for computing alpha

## Task: TYPE

In [7]:
# Build the asset-id -> labels table for the TYPE task
# rows of the TYPE task, restricted to the columns needed here
df_type = kilidata.loc[kilidata['taskID'] == 'TYPE', ['id', 'labeler', 'value']]
# one row per (asset id, labeler) pair
df_type = df_type.groupby(['id', 'labeler']).last().reset_index()

# asset id -> array of the labels assigned to that asset
units = df_type.groupby('id').apply(lambda grp: grp['value'].values)
# keep only the assets that received more than one annotation
units_type = units[units.map(len) > 1]
units_type.head()

id
EN2433725             [NEWS, NEWS, NEWS]
EN2434557             [NEWS, NEWS, NEWS]
EN2445660                   [NEWS, NEWS]
EN2468666        [N_A, SUMMARY, SUMMARY]
EN2469866    [SUMMARY, SUMMARY, SUMMARY]
dtype: object

In [8]:
# Alpha for the TYPE task, using the TYPE-specific disagreement function
krippendorff_alpha(units_type.to_dict(), type_metric_str)

n: 58
Do: 0.1724137931034483
De: 0.45977011494252873


0.625

## Compute alpha for labels

In [9]:
tasks = ['TYPE', 'FACT', 'CAUSE', 'QUANTITY', 'LOCATION_ORIGIN', 'DATE']

# task name -> Series mapping each asset id to the labels it received
units_dict = {}
for task in tasks:
    # rows of the current task, restricted to the relevant columns
    df_task = kilidata.loc[kilidata['taskID'] == task, ['id', 'labeler', 'value']]
    # one row per (asset id, labeler) pair
    df_task = df_task.groupby(['id', 'labeler']).last().reset_index()

    # asset id -> array of labels
    units = df_task.groupby('id').apply(lambda grp: grp['value'].values)
    # keep only the assets with more than one annotation
    units = units[units.map(len) > 1]
    units_dict[task] = units

In [10]:
# Disagreement function used for the label of each task:
# TYPE has its own function, every other task uses plain inequality.
metrics = {'TYPE': type_metric_str,
           'FACT':nominal_metric,
           'CAUSE':nominal_metric,
           'QUANTITY':nominal_metric,
           'LOCATION_ORIGIN':nominal_metric,
           'DATE':nominal_metric
           # alternative left disabled: 'DATE':(lambda x,y: 0)
          }

# task name -> alpha computed on the labels of that task
kalphas = {}
# krippendorff_alpha prints, for each task, n (number of pairable items),
# Do (observed disagreement) and De (expected disagreement)
for task in tasks:
    ka = krippendorff_alpha(units_dict[task].to_dict(), metrics[task]) 
    kalphas[task] = ka

n: 58
Do: 0.1724137931034483
De: 0.45977011494252873
n: 42
Do: 0.42857142857142855
De: 0.7328687572590011
n: 40
Do: 0.1
De: 0.4564102564102564
n: 40
Do: 0.15
De: 0.18846153846153846
n: 39
Do: 0.717948717948718
De: 0.7004048582995951
n: 9
Do: 0.2222222222222222
De: 0.3888888888888889


In [11]:
# Alpha per task, computed on the labels only
kalphas

{'TYPE': 0.625,
 'FACT': 0.41521394611727414,
 'CAUSE': 0.7808988764044944,
 'QUANTITY': 0.20408163265306123,
 'LOCATION_ORIGIN': -0.025048169556840083,
 'DATE': 0.4285714285714286}

In [12]:
# Same display as the previous cell: alpha per task, computed on the labels only
kalphas

{'TYPE': 0.625,
 'FACT': 0.41521394611727414,
 'CAUSE': 0.7808988764044944,
 'QUANTITY': 0.20408163265306123,
 'LOCATION_ORIGIN': -0.025048169556840083,
 'DATE': 0.4285714285714286}

## Compute alpha for text overlap

In [13]:
# define function for overlap agreement on text selection (offset_from, offset_to)
def getOverlap(a, b):
    """Length shared by the spans ``a`` and ``b``; 0 when they do not intersect.

    a, b = sequences whose first two items are the start and the end of a span
    """
    latest_start = max(a[0], b[0])
    earliest_end = min(a[1], b[1])
    return max(0, earliest_end - latest_start)

def getUnion(a, b):
    """Distance from the earliest start to the latest end of ``a`` and ``b``.

    a, b = sequences whose first two items are the start and the end of a span

    For spans that do not touch, the gap between them is included in the
    returned length. The result is never negative.
    """
    earliest_start = min(a[0], b[0])
    latest_end = max(a[1], b[1])
    return max(0, latest_end - earliest_start)

def textoverlap_metric(a, b):
    '''Binary disagreement between the text selections of two labelers.

    a, b = (offset_from, offset_to) pairs

    The relative overlap (overlap/union) of the two selections is computed;
    the result is 0 (agreement) as soon as that ratio is positive, and
    1 (disagreement) otherwise.
    '''
    (start_a, end_a), (start_b, end_b) = a, b
    shared = getOverlap((start_a, end_a), (start_b, end_b))
    covered = getUnion((start_a, end_a), (start_b, end_b))

    # an empty union leaves nothing to divide by: treat it as no overlap
    if covered == 0:
        ratio = 0
    else:
        ratio = shared/covered

    if ratio > 0:
        return 0
    return 1

def group_offsets(subgroup):
    """Return one (offset_from, offset_to) tuple per row of ``subgroup``.

    subgroup = DataFrame with the columns 'offset_from' and 'offset_to'
    """
    return [
        (row['offset_from'], row['offset_to'])
        for _, row in subgroup.iterrows()
    ]

In [14]:
###
tasks = ['FACT', 'CAUSE', 'QUANTITY', 'LOCATION_ORIGIN', 'DATE']

# task name -> Series mapping each asset id to the offsets selected for it
units_dict = {}
for task in tasks:
    # rows of the current task, restricted to the relevant columns
    df_task = kilidata.loc[
        kilidata['taskID'] == task,
        ['id', 'labeler', 'value', 'offset_from', 'offset_to'],
    ]
    # one row per (asset id, labeler) pair
    df_task = df_task.groupby(['id', 'labeler']).last().reset_index()

    # asset id -> list of tuples built by group_offsets
    units = df_task.groupby('id').apply(group_offsets)
    # keep only the assets with more than one annotation
    units = units[units.map(len) > 1]
    units_dict[task] = units

In [15]:
###
# task name -> alpha computed on the overlap of the selected text only
kalphas_overlap = {}
for task in tasks:
    # textoverlap_metric is 0 for overlapping selections, 1 otherwise
    ka = krippendorff_alpha(units_dict[task].to_dict(), textoverlap_metric)
    kalphas_overlap[task] = ka

n: 42
Do: 0.42857142857142855
De: 0.9279907084785134
n: 40
Do: 0.625
De: 0.9769230769230769
n: 40
Do: 0.45
De: 0.9730769230769231
n: 39
Do: 0.5897435897435898
De: 0.97165991902834
n: 9
Do: 0.8888888888888888
De: 0.9722222222222222


In [16]:
# print Krippendorff's alpha score based on text selection overlap only
# for each NER task
kalphas_overlap

{'FACT': 0.5381727158948686,
 'CAUSE': 0.36023622047244097,
 'QUANTITY': 0.5375494071146245,
 'LOCATION_ORIGIN': 0.3930555555555555,
 'DATE': 0.08571428571428574}

## Compute alpha for text overlap + text similarity

In [17]:
from nltk.tokenize import RegexpTokenizer
# Tokenizer built on the pattern \w+ (runs of word characters)
tokenizer = RegexpTokenizer(r'\w+')
import re
from nltk.corpus import stopwords 
# English stop words from NLTK, held in a set for membership tests
stop_words = set(stopwords.words('english'))

In [18]:
def textoverlap(a, b):
    '''Relative overlap (overlap/union) of the text selected by two labelers.

    a, b = (offset_from, offset_to) pairs

    Returns 0 when the union is empty, otherwise overlap divided by union.
    '''
    (start_a, end_a), (start_b, end_b) = a, b
    shared = getOverlap((start_a, end_a), (start_b, end_b))
    covered = getUnion((start_a, end_a), (start_b, end_b))

    # an empty union leaves nothing to divide by
    if covered == 0:
        return 0
    return shared/covered
#    return 0 if aou > 0 else 1

In [19]:
# returns a distance between a and b, based on overlap and jaccard

def text_similarity_metric(a, b, overlap_threshold=0, jaccard_threshold=0):
    '''Binary disagreement between the text selections of two labelers.

    The selections agree (0 is returned) when their relative overlap
    (overlap/union of the offsets) exceeds overlap_threshold or, failing
    that, when the Jaccard similarity between the two selected texts,
    regardless of their position, exceeds jaccard_threshold.
    Otherwise they disagree (1 is returned).

    a, b = sequences containing values from columns "offset_from", "offset_to", "content"
    overlap_threshold = how much overlap is needed for agreement
    jaccard_threshold = how much similarity is needed for agreement if no overlap

    A content that is not a string (e.g. a missing value) is treated as
    having no words. When neither text has any word left after stop-word
    removal, the Jaccard similarity is taken to be 0 instead of dividing
    by an empty union.
    '''
    (l1_from, l1_to, content1), (l2_from, l2_to, content2) = a, b
    if textoverlap((l1_from, l1_to), (l2_from, l2_to)) > overlap_threshold:
        return 0

    def content_words(text):
        # TEXT PREPROCESSING: lowercase, tokenize, remove stopwords
        if not isinstance(text, str):
            return set()
        return {w for w in tokenizer.tokenize(text.lower()) if w not in stop_words}

    s1 = content_words(content1)
    s2 = content_words(content2)

    all_words = s1 | s2
    # no words on either side: nothing supports a similarity
    jaccard = len(s1 & s2)/len(all_words) if all_words else 0
    if jaccard > jaccard_threshold:
        return 0

    return 1


def group_offsets(subgroup):
    """Return one (offset_from, offset_to, content) tuple per row of ``subgroup``.

    subgroup = DataFrame with the columns 'offset_from', 'offset_to' and 'content'

    NOTE: this definition replaces the earlier group_offsets of this file,
    which returned (offset_from, offset_to) pairs only.
    """
    return [
        (row['offset_from'], row['offset_to'], row['content'])
        for _, row in subgroup.iterrows()
    ]

In [20]:
tasks = ['FACT', 'CAUSE', 'QUANTITY', 'LOCATION_ORIGIN', 'DATE']

# task name -> Series mapping each asset id to the offsets and text selected for it
units_dict = {}
for task in tasks:
    # rows of the current task, restricted to the relevant columns
    df_task = kilidata.loc[
        kilidata['taskID'] == task,
        ['id', 'labeler', 'value', 'offset_from', 'offset_to', 'content'],
    ]
    # one row per (asset id, labeler) pair
    df_task = df_task.groupby(['id', 'labeler']).last().reset_index()

    # asset id -> list of tuples built by group_offsets
    units = df_task.groupby('id').apply(group_offsets)
    # keep only the assets with more than one annotation
    units = units[units.map(len) > 1]
    units_dict[task] = units

In [21]:
# task name -> alpha computed on text overlap + text similarity
kalphas_overlap_sim = {}
for task in tasks:
    # both thresholds are passed explicitly as 0
    ka = krippendorff_alpha(units_dict[task].to_dict(), lambda x, y: text_similarity_metric(x, y, 0, 0))
    kalphas_overlap_sim[task] = ka

n: 42
Do: 0.21428571428571427
De: 0.4436701509872242
n: 40
Do: 0.3
De: 0.9384615384615385
n: 40
Do: 0.3
De: 0.7551282051282051
n: 39
Do: 0.4358974358974359
De: 0.9608636977058029
n: 9
Do: 0.6666666666666666
De: 0.9444444444444444


In [22]:
# print Krippendorff's alpha score based on text selection overlap + text similarity
# for each NER task
kalphas_overlap_sim

{'FACT': 0.5170157068062828,
 'CAUSE': 0.680327868852459,
 'QUANTITY': 0.6027164685908319,
 'LOCATION_ORIGIN': 0.5463483146067416,
 'DATE': 0.2941176470588236}

In [23]:
# print again, for comparison, Krippendorff's alpha score based on
# text selection overlap only, for each NER task
kalphas_overlap

{'FACT': 0.5381727158948686,
 'CAUSE': 0.36023622047244097,
 'QUANTITY': 0.5375494071146245,
 'LOCATION_ORIGIN': 0.3930555555555555,
 'DATE': 0.08571428571428574}

## Merge all metrics
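### First compute the disagreement for overlap+similarity, then the disagreement for labels, but only on assets where the selections overlap or are similar. (Note: `eval_metric` as written below averages the two disagreements for every pair of annotations, whether or not their selections overlap or are similar.)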

In [24]:
def group_data(subgroup):
    """Return one (value, offset_from, offset_to, content) tuple per row of ``subgroup``.

    subgroup = DataFrame with the columns 'value', 'offset_from',
               'offset_to' and 'content'
    """
    return [
        (row['value'], row['offset_from'], row['offset_to'], row['content'])
        for _, row in subgroup.iterrows()
    ]

tasks = ['FACT', 'CAUSE', 'QUANTITY', 'LOCATION_ORIGIN', 'DATE']

# task name -> Series mapping each asset id to its labels, offsets and selected text
units_dict = {}
for task in tasks:
    # rows of the current task, restricted to the relevant columns
    df_task = kilidata.loc[
        kilidata['taskID'] == task,
        ['id', 'labeler', 'value', 'offset_from', 'offset_to', 'content'],
    ]
    # one row per (asset id, labeler) pair
    df_task = df_task.groupby(['id', 'labeler']).last().reset_index()

    # asset id -> list of tuples built by group_data
    units = df_task.groupby('id').apply(group_data)
    # keep only the assets with more than one annotation
    units = units[units.map(len) > 1]
    units_dict[task] = units

In [25]:
def eval_metric(a, b, class_metric=None):
    """Merged disagreement between two annotations of the same asset.

    a, b = (value, offset_from, offset_to, content) tuples
    class_metric = disagreement function applied to the two labels
                   (``a[0]``, ``b[0]``). When omitted, ``metrics[task]`` is
                   looked up from the module-level names ``metrics`` and
                   ``task`` at call time, so the function then has to be
                   called while ``task`` names the task being evaluated.

    Returns the mean of the text disagreement (text_similarity_metric on the
    offsets and content) and the label disagreement.
    """
    score_text = text_similarity_metric(a[1:], b[1:])
    if class_metric is None:
        # Backward-compatible fallback on the global loop variable `task`.
        class_metric = metrics[task]
    score_class = class_metric(a[0], b[0])
    return (score_text + score_class)/2

In [26]:
# task name -> alpha computed with the merged metric (text + label).
# NOTE: this reuses the name kalphas_overlap_sim, so the overlap + similarity
# scores stored under that name by an earlier cell are replaced.
kalphas_overlap_sim = {}
for task in tasks:
    # eval_metric reads the loop variable `task` to select the label metric
    ka = krippendorff_alpha(units_dict[task].to_dict(), eval_metric)
    kalphas_overlap_sim[task] = ka

n: 42
Do: 0.32142857142857145
De: 0.5882694541231127
n: 40
Do: 0.2
De: 0.6974358974358974
n: 40
Do: 0.225
De: 0.4717948717948718
n: 39
Do: 0.5769230769230769
De: 0.8306342780026991
n: 9
Do: 0.4444444444444444
De: 0.6666666666666666


In [27]:
# Alpha per task as last stored in kalphas_overlap_sim
kalphas_overlap_sim

{'FACT': 0.45360315893385983,
 'CAUSE': 0.713235294117647,
 'QUANTITY': 0.5230978260869565,
 'LOCATION_ORIGIN': 0.30544272948822104,
 'DATE': 0.33333333333333337}