In [3]:
import json
import pandas as pd
import numpy as np
import pickle
from nltk.metrics import agreement
from nltk.metrics.agreement import AnnotationTask
from nltk.metrics import masi_distance, jaccard_distance
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from joblib import Parallel, delayed
from scipy.spatial import distance
from itertools import permutations, combinations
import random

In [6]:
# The string emotion labels have previously been translated to 2-D coordinates using the
# dictionary below (it is kept here as a reference; the visible cells do not apply it again).
# The four axis emotions sit exactly on the unit circle; the diagonal ones use 0.707,
# presumably a rounded sqrt(2)/2 (so they are only approximately unit length).
# Opposite emotions are placed at opposite points, e.g. 'Vreugde' (0, 1) vs 'Verdriet' (0, -1).
translation_dict = {
    'Angst': (1,0),
    'Aanvaarding': (0.707,0.707),
    'Boosheid (woede)': (-1, 0),
    'Anticipatie': (-0.707, 0.707),
    'Walging': (-0.707, -0.707),
    'Vreugde': (0, 1),
    'Verdriet': (0, -1),
    'Verassing': (0.707, -0.707),
    # Sentinel coordinates, far away from the unit circle. custom_distance_function treats
    # (100, 100) as 'no emotion' and (-100, -100) as 'not usable' and never measures real
    # distances to them in its dedicated branches.
    # Both capitalisations of the 'no emotion' key map to the same sentinel
    # (the keys look truncated in the raw data -- TODO confirm against the annotation export).
    'Geen emoti': (100, 100),
    'Geen Emoti': (100, 100),
    'Niet Bruikbaar': (-100, -100)
}

In [None]:
# Load the annotation set whose labels are already numerical (presumably produced with
# translation_dict -- TODO confirm). Unpickling can execute arbitrary code, so only load
# files from a trusted source. Note: this name is not referenced by the cells visible here.
numerical_annotations = pd.read_pickle('numerical_agreement_set.pkl')

In [10]:
def custom_distance_function(label1, label2):
    '''
        Distance between two (possibly multi-valued) numerical emotion labels, meant to be
        passed as the `distance` argument of an NLTK AnnotationTask.

        Each label is an iterable (NLTK hands over frozensets) of 2-D coordinate tuples.
        Two sentinel coordinates get special treatment:
            (100, 100)   -> 'no emotion'
            (-100, -100) -> 'not usable'

        Rules, applied in this order:
            1. A label that is exactly {'no emotion', 'not usable'} is collapsed to 'not usable'.
               This is done for BOTH labels independently.
            2. 'no emotion' vs 'no emotion', or 'not usable' vs 'not usable'      -> 0
            3. 'no emotion' vs 'not usable'                                       -> 0.5
            4. a lone sentinel vs anything else                                   -> 1
            5. 'no emotion' present in only one of the labels                     -> 1
            6. 'not usable' combined with emotions in only one of the labels      ->
               mean pairwise euclidean distance of the remaining emotions / 2, plus 0.5
            7. otherwise: mean pairwise euclidean distance / 2 (the maximum distance
               between two unit-circle emotions is 2, hence the division).

        Returns a number (int for the fixed cases, float otherwise).
        Raises AssertionError when either label is empty.

        NOTE(review): rule 6 can return up to 1.5 for unit-circle coordinates, which is
        outside the 0..1 range NLTK documents for distance functions -- confirm this is intended.
        NOTE(review): with rule 7 two identical multi-emotion labels get a non-zero distance
        (the mean includes the cross pairs) -- confirm this is intended.
    '''
    no_emotion = (100, 100)
    not_usable = (-100, -100)

    label1 = list(label1)
    label2 = list(label2)
    assert len(label1) > 0 and len(label2) > 0, "Labels have length zero"

    def is_sentinel_pair(label):
        # True when the label consists of exactly 'no emotion' and 'not usable' (any order).
        return len(label) == 2 and no_emotion in label and not_usable in label

    def mean_pairwise_distance(first, second):
        # Every euclidean() call receives two single points (1-D input). Passing the
        # one-element lists themselves would be 2-D input, which newer SciPy releases reject.
        distances = [distance.euclidean(one, two) for one in first for two in second]
        return sum(distances) / len(distances)

    # If a label indicates 'no emotion' AND 'not usable' change it to 'not usable'.
    # Two independent checks: with if/elif the second label was left untouched whenever
    # the first one matched, so two identical such labels were scored as full disagreement.
    if is_sentinel_pair(label1):
        label1 = [not_usable]
    if is_sentinel_pair(label2):
        label2 = [not_usable]

    lone_sentinels = ([no_emotion], [not_usable])

    # If both labels indicate 'no emotion' distance is zero (full agreement)
    if label1 == [no_emotion] and label2 == [no_emotion]:
        return 0
    # If both labels indicate 'not usable' distance is zero (full agreement)
    elif label1 == [not_usable] and label2 == [not_usable]:
        return 0
    # If one label indicates 'no emotion' and the other 'not usable', we argue that there is some agreement
    elif (label1 == [not_usable] and label2 == [no_emotion]) or (label2 == [not_usable] and label1 == [no_emotion]):
        return 0.5
    # If one label is only 'no emotion' or only 'not usable' and the other is anything else
    # (the matching cases were handled above), there is no agreement
    elif label1 in lone_sentinels or label2 in lone_sentinels:
        return 1
    # If one label contains 'no emotion' and the other does not, there is no agreement (distance 1)
    elif (no_emotion in label1) != (no_emotion in label2):
        return 1
    # Special case: when someone used 'not usable' together with an emotion, and another did not, we calculate
    # the distance between the emotion labels, and we add 1 for the 'not usable' label (normalised it becomes 1/2).
    # A lone 'not usable' can no longer occur here, so at least one emotion remains after the removal.
    elif not_usable in label1 and not_usable not in label2:
        label1.remove(not_usable)
        return (mean_pairwise_distance(label1, label2) / 2) + 0.5
    # The same thing, but now for label2
    elif not_usable in label2 and not_usable not in label1:
        label2.remove(not_usable)
        return (mean_pairwise_distance(label1, label2) / 2) + 0.5
    # One emotion per person or multiple labels: for a single pair the mean is just that distance
    else:
        return mean_pairwise_distance(label1, label2) / 2

In [13]:
def powerset(s, min_size):
    """
    Enumerate the subsets of an indexable sequence that have MORE than `min_size` members.

    Subsets are generated in bitmask order: bit `pos` of the counter decides whether
    `s[pos]` is included, so the result order is deterministic. Note that `min_size`
    is an exclusive lower bound (`powerset(s, 2)` yields subsets of three or more items).

    Returns a list of lists.
    """
    n_items = len(s)
    all_subsets = (
        [s[pos] for pos in range(n_items) if (mask >> pos) & 1]
        for mask in range(2 ** n_items)
    )
    return [subset for subset in all_subsets if len(subset) > min_size]

def agreement_measures(annotator_dataframe, annotators):
    """
    Compute five inter-annotator agreement scores for one group of annotators.

    The annotation data is obtained from `triples_list(annotator_dataframe, annotators)`
    (defined outside the visible cells; presumably it yields the data format NLTK's
    AnnotationTask expects -- TODO confirm). Three AnnotationTask objects are built on
    that same data, one per distance function.

    Returns a 5-tuple, in this order:
        alpha with MASI distance,
        alpha with Jaccard distance,
        alpha with custom_distance_function,
        multi_kappa with custom_distance_function,
        pi with custom_distance_function.
    """
    triples = triples_list(annotator_dataframe, annotators)

    def build_task(metric):
        # One task per distance function, all sharing the same annotation data.
        return AnnotationTask(data=triples, distance=metric)

    masi_task = build_task(masi_distance)
    jaccard_task = build_task(jaccard_distance)
    custom_task = build_task(custom_distance_function)

    # Single-metric evaluation of the annotation task
    return (
        masi_task.alpha(),
        jaccard_task.alpha(),
        custom_task.alpha(),
        custom_task.multi_kappa(),
        custom_task.pi(),
    )
    
def find_best_team(df, annotators):
    """
    Score a single candidate team of annotators.

    Despite its name this function does not compare teams: it returns the agreement
    metrics of the given team so that the caller can rank many teams afterwards.

    Returns an empty dict when the team has fewer than two members (agreement is
    undefined), otherwise a dict with the key 'team' plus one key per metric
    returned by agreement_measures.
    """
    if len(annotators) < 2:
        return {}

    metric_names = ('alpha_masi', 'alpha_jaccard', 'alpha_custom', 'multi_kappa', 'multi_pi')
    scores = agreement_measures(df, annotators)

    summary = {'team': annotators}
    summary.update(zip(metric_names, scores))
    return summary

In [46]:
# Score every annotator subset with more than two members, using all available cores.
# NOTE(review): `result` and `annotators` are not defined in the cells visible here --
# they must exist in the kernel before this cell is run.
teams = Parallel(n_jobs=-1, verbose=1)(
    delayed(find_best_team)(result[0], team)
    for team in powerset(annotators, 2)
)

# One row per team (rows without a team are dropped), indexed by the team itself
# and ranked from highest to lowest alpha under the custom distance.
df_results = (
    pd.DataFrame(teams)
    .dropna(subset=['team'])
    .set_index('team')
    .sort_values(by='alpha_custom', ascending=False)
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 536 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 2936 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done 4058 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 4608 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 5258 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 6008 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 6858 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 7808 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 8858 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 10008 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 11258 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done 12608 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 14058 tasks  

In [49]:
# Persist the per-team agreement table so the multi-minute parallel run above
# does not have to be repeated.
df_results.to_pickle('results_totaal.pkl')

In [61]:
def research_permutations(df, annotators):
    """
    Compute the agreement metrics for every ordering of the given annotators.

    Each permutation of `annotators` is passed (as a tuple) to agreement_measures, so the
    amount of work grows factorially with the team size.

    Returns a list with one dict per permutation, in the order produced by
    itertools.permutations, holding the five metrics under the keys 'alpha_masi',
    'alpha_jaccard', 'alpha_custom', 'multi_kappa' and 'multi_pi'.
    """
    metric_names = ('alpha_masi', 'alpha_jaccard', 'alpha_custom', 'multi_kappa', 'multi_pi')

    def score(ordering):
        scores = agreement_measures(df, ordering)
        return {name: scores[position] for position, name in enumerate(metric_names)}

    return [score(ordering) for ordering in permutations(annotators)]

In [62]:
# Permutation experiment: draw 100 random six-annotator teams and evaluate every ordering
# of each team in parallel.
# The teams are drawn from a private, seeded random.Random instance (the seed value 42 is
# arbitrary) so that the same teams are selected on every run and the global `random`
# state is left untouched. The unseeded random.sample used before made the experiment
# impossible to reproduce.
# sample() raises ValueError when fewer than 100 six-annotator combinations exist.
# NOTE(review): `result` and `annotators` are not defined in the cells visible here.
results_permutation = Parallel(n_jobs=-1, verbose=1)(
    delayed(research_permutations)(result[0], team)
    for team in random.Random(42).sample(list(combinations(annotators, 6)), 100)
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 25.7min finished


In [65]:
# research_permutations returns one list of metric dicts per sampled team;
# flatten those lists into a single list of metric dicts (team boundaries are not kept).
final_results_permutation = [item for sublist in results_permutation for item in sublist]

In [70]:
# Store the flattened permutation results on disk; the context manager closes the file
# even if pickling fails.
with open('results_permutation_experiment.pkl', 'wb') as f:
    pickle.dump(final_results_permutation, f)