# Intro
A sample from the DWUG DE data set was additionally annotated with sense descriptions extracted from two historical dictionaries. Annotators were instructed to choose only one of the provided descriptions and could choose one generic description 'others' for senses which were not provided in the descriptions. In the first round only annotatorA annotated the data and also provided some additional sense descriptions. These were then added to the previous descriptions and presented to two further annotators. We analyze the agreement of all three annotators and the correspondence of the annotated data with different cleaning methods (majority label conditions) to the inferred SemEval clusterings.

In [1]:
import sys
sys.path.append('../')
from os.path import exists
import os
import pandas as pd
from pandas import DataFrame
import numpy as np
from pathlib import Path
import csv
from collections import defaultdict, Counter
from itertools import combinations
import seaborn as sb
from sklearn.metrics import cohen_kappa_score, hamming_loss, accuracy_score
import krippendorff_ as krippendorff
from sklearn.metrics.cluster import adjusted_rand_score
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
nice_colors = [x for x in mcolors.get_named_colors_mapping().values() if isinstance(x, str)] # Nice colors
colors_global = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00'] # color-blind colors

# Load inferred clusterings

In [2]:
# Collect every row of every tab-separated file found below input_path.
input_path = '../../data/dwug_de_1.1.0/dwug_de/clusters/semeval'
data = []
for root, subdirectories, files in os.walk(input_path):
    for file_name in files:
        # The part of the file name before the first dot is stored as the lemma.
        lemma_tag = {'lemma': file_name.split('.')[0]}
        with open(os.path.join(root, file_name), encoding='utf-8') as handle:
            rows = csv.DictReader(handle, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
            data.extend(row | lemma_tag for row in rows)

# Every field except 'identifier' becomes a column; 'identifier' becomes the index.
variable_names = list(data[0].keys())
variable_names.remove('identifier')
index = [row['identifier'] for row in data]
variables = [[row[name] for row in data] for name in variable_names]
df = DataFrame(np.array(variables).T, index=index, columns=variable_names)
# Cluster ids are read as text; convert them to integers.
df["cluster"] = df["cluster"].astype(str).astype(int)

# Load dwug_de sense description annotation

In [3]:
# Walk the data directory and read the sense judgments and the sense inventories.
input_path = '../../data/dwug_de_1.1.0/data'
judgments_senses = []
data_senses = []
for root, subdirectories, files in os.walk(input_path):
    for f in files:
        path = os.path.join(root, f)
        if f == 'judgments_senses.csv':
            with open(path, encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
                judgments_senses.extend(reader)
        elif f == 'senses.csv':
            # The lemma is the name of the directory that contains the file.
            # Taking it from `root` avoids assuming '/' as the path separator.
            lemma_dir = os.path.basename(os.path.normpath(root))
            with open(path, encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
                data_senses.extend(row | {'lemma': lemma_dir} for row in reader)

# Transform data format: one row per (identifier, lemma), one column per annotator
annotators = sorted(set(row['annotator'] for row in judgments_senses))
identifier2annotator2judgment = defaultdict(lambda: {})
for row in judgments_senses:
    identifier2annotator2judgment[(row['identifier'], row['lemma'])][row['annotator']] = row['identifier_sense']
# An annotator without a judgment for an instance gets the string 'None',
# which is the marker the later conversion step already treats as missing.
judgments_senses_transformed = [
    {'identifier': identifier}
    | {annotator: annotator2judgment.get(annotator, 'None') for annotator in annotators}
    | {'lemma': lemma}
    for (identifier, lemma), annotator2judgment in identifier2annotator2judgment.items()
]
variable_names = list(judgments_senses_transformed[0].keys())
variable_names.remove('identifier')
index = [row['identifier'] for row in judgments_senses_transformed]
variables = [[row[name] for row in judgments_senses_transformed] for name in variable_names]
df_description = DataFrame(np.transpose(variables), index=index, columns=variable_names)

# Get sense labels: lemma -> sense description -> sense identifier
lemma2description2label = defaultdict(lambda: {})
for row in data_senses:
    lemma2description2label[row['lemma']][row['description_sense']] = row['identifier_sense']

In [4]:
def sense2index(label):
    """Return the numeric sense index encoded at the end of a sense label.

    The index is the last run of decimal digits in ``label``; a single
    trailing non-digit character is ignored. The whole run is used, so a
    label ending in "12" yields 12 rather than only its first digit, which
    would collide with the label ending in "1".

    Args:
        label: sense identifier string.

    Returns:
        The index as an ``int``.

    Raises:
        ValueError: if no digits are found at the end of ``label``.
    """
    # Ignore one trailing non-digit character, if there is one.
    stem = label if label[-1:].isdigit() else label[:-1]
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError('no numeric sense index in label: {0!r}'.format(label))
    return int(digits)

# Get lemmas
lemmas = sorted(set(df_description['lemma']))
# Each lemma gets its own block of 100 codes so sense codes are unique across lemmas.
lemma2index = {lemma: position * 100 for position, lemma in enumerate(lemmas)}


def _encode_judgments(row):
    """Replace the sense labels of one instance by numeric codes (missing -> NaN)."""
    offset = lemma2index[row['lemma']]
    # Select the annotator columns by name so the result does not depend on column order.
    codes = [offset + sense2index(label) if label != 'None' else np.nan
             for label in row[annotators]]
    return pd.Series(codes + [row['lemma']], index=df_description.columns)


# Transform data, convert to numbers, set nans
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
df_description = df_description.apply(_encode_judgments, axis=1)
# Numeric sense code -> sense description
index2description = {lemma2index[lemma] + sense2index(label): description
                     for lemma, description2label in lemma2description2label.items()
                     for description, label in description2label.items()}

# Join data frames

In [5]:
# Add one empty column to df for every annotator column of df_description.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for column in df_description.columns:
    if column == 'lemma':
        continue
    df[column] = np.nan
# Copy the annotations cell by cell; identifiers that are not yet in df
# are added as new rows by the .loc assignment.
for identifier, row in df_description.iterrows():
    for column in df_description.columns:
        df.loc[identifier, column] = row[column]

lemmas = sorted(set(df['lemma']))

print(df)

                                          cluster           lemma  annotatorA  \
treitschke_geschichte02_1882-7387-3           0.0  Ausnahmegesetz         NaN   
2532889X_1978-04-01_01_229.tcf.xml-40-11      0.0  Ausnahmegesetz         NaN   
treitschke_geschichte02_1882-8135-23          0.0  Ausnahmegesetz         NaN   
treitschke_geschichte03_1885-129-10           0.0  Ausnahmegesetz         NaN   
26120215_1963_06_01_01_068.tcf.xml-26-4       0.0  Ausnahmegesetz         NaN   
...                                           ...             ...         ...   
beyer_poetik01_1882-7723-51                   NaN        Mißklang       801.0   
robert_griechische_1881-1366-6                NaN           Titel         NaN   
savigny_system01_1840-2586-21                 NaN           Titel      1506.0   
26120215_1966_07_27_01_394.tcf.xml-9-7        NaN           Titel         NaN   
2532889X_1966-05-09_01_147.tcf.xml-3-5        NaN           Titel      1504.0   

                           

In [6]:
# Add andere column
def extract_label_count(row, labels, columns, index, name):
    """Append to a row how many of its values in `columns` are one of `labels`.

    Returns a new Series holding the values of `row` followed by the count,
    indexed by `index` plus `name`.
    """
    tally = Counter(row[columns])
    matching_counts = [count for value, count in tally.items() if value in labels]
    total = np.sum(matching_counts)
    extended_values = row.to_list()
    extended_values.append(total)
    return pd.Series(extended_values, index=index + [name])

# Numeric sense codes whose description is 'andere'
# (presumably the generic 'others' description mentioned in the intro).
indexes_andere = [index for index, description in index2description.items() if description == 'andere']
# Add a column counting, per instance, how many annotators chose such a code.
df = df.apply(lambda x: extract_label_count(x, indexes_andere, annotators, 
                 list(df.columns), 'label_count_andere'), axis=1)
#print(df)

# Number of instances before and after the filter
print(len(df.index))
df = df.drop(df[(df['label_count_andere'] > 0)].index) # drops every instance with at least one 'andere' annotation; comment this line out to keep them
print(len(df.index))

8488
8375


# Pairwise Cohen's Kappa

In [7]:
def _kappa_for_pair(frame, a, b):
    """Cohen's kappa of annotators a and b on the rows both annotated (NaN if none)."""
    both = frame[frame[a].notnull() & frame[b].notnull()]
    if both.empty:
        # Nothing to compare: report NaN directly instead of scoring empty input.
        return np.nan
    return cohen_kappa_score(both[a], both[b])


# Full data
cohs = []
for a, b in combinations(annotators, 2):
    coh = _kappa_for_pair(df, a, b)
    print('full', a, b, coh)
    cohs.append(coh)

mean = np.mean(cohs)
print('mean', mean)

# By lemma
# Iterating the groupby yields each lemma together with its own rows, so a
# label can never be attached to the rows of a different lemma.
for lemma, lemma_rows in df.groupby(by="lemma"):
    cohs = []
    for a, b in combinations(annotators, 2):
        coh = _kappa_for_pair(lemma_rows, a, b)
        print(lemma, a, b, coh)
        cohs.append(coh)

    # The mean ignores NaN scores; it is NaN if no pair could be scored.
    mean = np.nan if np.all(np.isnan(cohs)) else np.nanmean(cohs)
    print('mean', mean)

full annotatorA annotatorB 0.8519102638032974
full annotatorA annotatorC 0.8882536723351926
full annotatorB annotatorC 0.8862587056179285
mean 0.8754742139188062
Abgesang annotatorA annotatorB 0.837962962962963
Abgesang annotatorA annotatorC 1.0
Abgesang annotatorB annotatorC 0.837962962962963
mean 0.8919753086419754
Ackergerät annotatorA annotatorB nan
Ackergerät annotatorA annotatorC nan
Ackergerät annotatorB annotatorC nan
mean nan
Armenhaus annotatorA annotatorB 1.0
Armenhaus annotatorA annotatorC 1.0
Armenhaus annotatorB annotatorC 1.0
mean 1.0
Ausnahmegesetz annotatorA annotatorB nan
Ausnahmegesetz annotatorA annotatorC nan
Ausnahmegesetz annotatorB annotatorC nan
mean nan
Dynamik annotatorA annotatorB 0.5685618729096991
Dynamik annotatorA annotatorC 0.6606538895152199
Dynamik annotatorB annotatorC 0.6226415094339622
mean 0.6172857572862936
Einreichung annotatorA annotatorB nan
Einreichung annotatorA annotatorC nan
Einreichung annotatorB annotatorC nan
mean nan
Eintagsfliege anno

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np


mean 0.8404096720347815
beimischen annotatorA annotatorB nan
beimischen annotatorA annotatorC nan
beimischen annotatorB annotatorC nan
mean nan
packen annotatorA annotatorB 1.0
packen annotatorA annotatorC 0.9042253521126761
packen annotatorB annotatorC 0.9090909090909091
mean 0.9377720870678617
verbauen annotatorA annotatorB nan
verbauen annotatorA annotatorC nan
verbauen annotatorB annotatorC nan
mean nan
vergönnen annotatorA annotatorB nan
vergönnen annotatorA annotatorC nan
vergönnen annotatorB annotatorC nan
mean nan
voranstellen annotatorA annotatorB nan
voranstellen annotatorA annotatorC nan
voranstellen annotatorB annotatorC nan
mean nan
vorliegen annotatorA annotatorB nan
vorliegen annotatorA annotatorC nan
vorliegen annotatorB annotatorC nan
mean nan
vorweisen annotatorA annotatorB nan
vorweisen annotatorA annotatorC nan
vorweisen annotatorB annotatorC nan
mean nan
weitgreifend annotatorA annotatorB nan
weitgreifend annotatorA annotatorC nan
weitgreifend annotatorB annotator

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  mean = np.nanmean(cohs)


# Pairwise Percentage Agreement
- see https://dl.acm.org/doi/10.1162/coli.07-034-R2
- as far as I understand (please verify), this is the same as the inter-tagger agreement (ITA) used in early word sense annotation studies (e.g. Senseval).
- "With WordNet, the sense inventory currently most widely used in word sense annotation, ITA ranges from 67% to 78%" (Erk et al. 2013)

In [8]:
def _agreement_for_pair(frame, a, b):
    """Share of identical labels of a and b on the rows both annotated (NaN if none)."""
    both = frame[frame[a].notnull() & frame[b].notnull()]
    if both.empty:
        # Nothing to compare: report NaN directly instead of scoring empty input.
        return np.nan
    return 1 - hamming_loss(both[a], both[b])


# Full data
pers = []
for a, b in combinations(annotators, 2):
    per = _agreement_for_pair(df, a, b)
    print('full', a, b, per)
    pers.append(per)

mean = np.mean(pers)
print('mean', mean)

# By lemma
# Iterating the groupby yields each lemma together with its own rows, so a
# label can never be attached to the rows of a different lemma.
for lemma, lemma_rows in df.groupby(by="lemma"):
    pers = []
    for a, b in combinations(annotators, 2):
        per = _agreement_for_pair(lemma_rows, a, b)
        print(lemma, a, b, per)
        pers.append(per)

    # The mean ignores NaN scores; it is NaN if no pair could be scored.
    mean = np.nan if np.all(np.isnan(pers)) else np.nanmean(pers)
    print('mean', mean)

full annotatorA annotatorB 0.8551724137931034
full annotatorA annotatorC 0.8907646474677259
full annotatorB annotatorC 0.8887841658812441
mean 0.8782404090473578
Abgesang annotatorA annotatorB 0.9428571428571428
Abgesang annotatorA annotatorC 1.0
Abgesang annotatorB annotatorC 0.9428571428571428
mean 0.9619047619047619
Ackergerät annotatorA annotatorB nan
Ackergerät annotatorA annotatorC nan
Ackergerät annotatorB annotatorC nan
mean nan
Armenhaus annotatorA annotatorB 1.0
Armenhaus annotatorA annotatorC 1.0
Armenhaus annotatorB annotatorC 1.0
mean 1.0
Ausnahmegesetz annotatorA annotatorB nan
Ausnahmegesetz annotatorA annotatorC nan
Ausnahmegesetz annotatorB annotatorC nan
mean nan
Dynamik annotatorA annotatorB 0.7906976744186046
Dynamik annotatorA annotatorC 0.8372093023255813
Dynamik annotatorB annotatorC 0.8222222222222222
mean 0.8167097329888028
Einreichung annotatorA annotatorB nan
Einreichung annotatorA annotatorC nan
Einreichung annotatorB annotatorC nan
mean nan
Eintagsfliege an

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)


Ohrwurm annotatorA annotatorB 1.0
Ohrwurm annotatorA annotatorC 1.0
Ohrwurm annotatorB annotatorC 1.0
mean 1.0
Pachtzins annotatorA annotatorB nan
Pachtzins annotatorA annotatorC nan
Pachtzins annotatorB annotatorC nan
mean nan
Rezeption annotatorA annotatorB 0.9591836734693877
Rezeption annotatorA annotatorC 0.9795918367346939
Rezeption annotatorB annotatorC 0.92
mean 0.9529251700680272
Schmiere annotatorA annotatorB 0.875
Schmiere annotatorA annotatorC 0.8181818181818181
Schmiere annotatorB annotatorC 0.8636363636363636
mean 0.8522727272727272
Seminar annotatorA annotatorB 0.782608695652174
Seminar annotatorA annotatorC 0.7555555555555555
Seminar annotatorB annotatorC 0.7777777777777778
mean 0.7719806763285023
Sensation annotatorA annotatorB 0.782608695652174
Sensation annotatorA annotatorC 0.7391304347826086
Sensation annotatorB annotatorC 0.88
mean 0.8005797101449276
Spielball annotatorA annotatorB 0.9591836734693877
Spielball annotatorA annotatorC 0.9791666666666666
Spielball anno

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)


beimischen annotatorA annotatorB nan
beimischen annotatorA annotatorC nan
beimischen annotatorB annotatorC nan
mean nan
packen annotatorA annotatorB 1.0
packen annotatorA annotatorC 0.9411764705882353
packen annotatorB annotatorC 0.9444444444444444
mean 0.9618736383442266
verbauen annotatorA annotatorB nan
verbauen annotatorA annotatorC nan
verbauen annotatorB annotatorC nan
mean nan
vergönnen annotatorA annotatorB nan
vergönnen annotatorA annotatorC nan
vergönnen annotatorB annotatorC nan
mean nan
voranstellen annotatorA annotatorB nan
voranstellen annotatorA annotatorC nan
voranstellen annotatorB annotatorC nan
mean nan
vorliegen annotatorA annotatorB nan
vorliegen annotatorA annotatorC nan
vorliegen annotatorB annotatorC nan
mean nan
vorweisen annotatorA annotatorB nan
vorweisen annotatorA annotatorC nan
vorweisen annotatorB annotatorC nan
mean nan
weitgreifend annotatorA annotatorB nan
weitgreifend annotatorA annotatorC nan
weitgreifend annotatorB annotatorC nan
mean nan
zersetzen 

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mean = np.nanmean(pers)


# Krippendorff's alpha

In [9]:
# Full data: one row per annotator, one column per instance.
# The annotator columns are selected by name, independent of column order.
data = np.transpose(df[annotators].values)
kri = krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')
print('full', kri)

# Pairwise
for a, b in combinations(annotators, 2):
    data = [df[a].values, df[b].values]
    kri = krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')
    print('full', a, b, kri)


# By lemma
# Iterating the groupby yields each lemma together with its own rows, so a
# label can never be attached to the rows of a different lemma.
for lemma, lemma_rows in df.groupby(by="lemma"):
    data = np.transpose(lemma_rows[annotators].values)
    try:
        kri = krippendorff.alpha(reliability_data=data, level_of_measurement='nominal')
    except AssertionError:
        # NOTE(review): the local krippendorff_ module presumably asserts when
        # alpha cannot be computed for the given data -- confirm.
        kri = np.nan
    print(lemma, kri)
    
    

full 0.8734491989957522
full annotatorA annotatorB 0.8519030313743345
full annotatorA annotatorC 0.8882605729326465
full annotatorB annotatorC 0.8862822946006377
Abgesang 0.8930041152263375
Ackergerät nan
Armenhaus 1.0
Ausnahmegesetz nan
Dynamik 0.6264150943396227
Einreichung nan
Eintagsfliege 0.9061102831594635
Engpaß 0.9171461449942463
Entscheidung nan
Festspiel nan
Frechheit nan
Fuß 0.9359653346172364
Gesichtsausdruck nan
Knotenpunkt 0.3261627906976744
Kubikmeter nan
Lyzeum nan
Manschette 0.7934368590682683
Mißklang 0.5689655172413792
Mulatte nan
Naturschönheit nan
Ohrwurm 1.0
Pachtzins nan
Rezeption 0.7705426356589147
Schmiere 0.8061628345269506
Seminar 0.6218834080717488
Sensation 0.4288340336134454
Spielball 0.6547619047619048
Tier nan
Titel 0.7679558011049723
Tragfähigkeit nan
Truppenteil nan
Unentschlossenheit nan
abbauen 0.8418015707645456
abdecken 0.4898085237801112
abgebrüht 0.972972972972973
artikulieren 0.0
aufrechterhalten nan
ausspannen 0.8423423423423424
beimischen nan


# Extract majority labels

In [10]:
def extract_majority_label(row, columns, threshold, index):
    """Append to a row the label chosen by at least `threshold` annotators.

    Missing values (NaN) in `columns` are not counted as labels, so a missing
    annotation can never be returned in place of a real label. If several
    labels reach the threshold, one of them is drawn with np.random.choice
    (the result is then not deterministic unless the NumPy seed is fixed).
    If no label reaches the threshold the result is NaN.

    Args:
        row: pandas Series holding one instance.
        columns: labels of the annotator entries in `row`.
        threshold: minimum number of annotators that must share a label.
        index: list of index labels for the existing values of `row`.

    Returns:
        A new Series with the values of `row` followed by the chosen label,
        stored under 'maj_<threshold>'.
    """
    label2count = Counter(row[columns].dropna())
    majority_labels = [label for label, count in label2count.items() if count >= threshold]
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    label = np.random.choice(majority_labels) if majority_labels else np.nan
    return pd.Series(row.to_list() + [label], index=index + ['maj_' + str(threshold)])
        
# Add the columns maj_1, maj_2 and maj_3: the label shared by at least
# 1, 2 and 3 annotators respectively (NaN where no label reaches the threshold).
# Each call rebuilds df, so the next call sees the column added by the previous one.
df = df.apply(lambda x: extract_majority_label(x, annotators, 1, index=list(df.columns)), axis=1)
df = df.apply(lambda x: extract_majority_label(x, annotators, 2, index=list(df.columns)), axis=1)
df = df.apply(lambda x: extract_majority_label(x, annotators, 3, index=list(df.columns)), axis=1)
print(df)


                                          cluster           lemma  annotatorA  \
treitschke_geschichte02_1882-7387-3           0.0  Ausnahmegesetz         NaN   
2532889X_1978-04-01_01_229.tcf.xml-40-11      0.0  Ausnahmegesetz         NaN   
treitschke_geschichte02_1882-8135-23          0.0  Ausnahmegesetz         NaN   
treitschke_geschichte03_1885-129-10           0.0  Ausnahmegesetz         NaN   
26120215_1963_06_01_01_068.tcf.xml-26-4       0.0  Ausnahmegesetz         NaN   
...                                           ...             ...         ...   
beyer_poetik01_1882-7723-51                   NaN        Mißklang       801.0   
robert_griechische_1881-1366-6                NaN           Titel         NaN   
savigny_system01_1840-2586-21                 NaN           Titel      1506.0   
26120215_1966_07_27_01_394.tcf.xml-9-7        NaN           Titel         NaN   
2532889X_1966-05-09_01_147.tcf.xml-3-5        NaN           Titel      1504.0   

                           

# Compare to inferred clusterings

In [11]:
# By lemma: adjusted Rand index between the inferred clusters and the sense
# labels of each condition, computed on the instances that have both.
majs = ['annotatorA', 'maj_2','maj_3']
namess = []
file2variables = {'semeval-correlation-2.5':[]}
for maj in majs:
    print('-----','condition', maj, '-----')
    names = []
    aris = []
    # Iterating the groupby yields each lemma together with its own rows, so a
    # score can never be attributed to a different lemma.
    for lemma, lemma_rows in df.groupby(by="lemma"):
        pairs = lemma_rows.reindex(columns=['cluster', maj]).dropna()
        if pairs.empty:
            # No instance of this lemma has both a cluster and a label.
            continue
        names.append(lemma)
        aris.append(adjusted_rand_score(pairs['cluster'], pairs[maj]))

    namess.append(names)
    file2variables['semeval-correlation-2.5'].append(aris)
    mean = np.mean(aris)
    print('mean', mean)

# All conditions must cover the same lemmas in the same order, because the
# lemma list of the first condition labels the scores of every condition.
for (names1, names2) in combinations(namess,2):
    if (names1 != names2):
        print(names1, '\n', names2)
        sys.exit('Breaking: names in columns don\'t match.')

names = namess[0]

----- condition annotatorA -----
mean 0.5025746601524207
----- condition maj_2 -----
mean 0.5829602562214008
----- condition maj_3 -----
mean 0.6521181994175511


In [24]:
# Plot results: per-lemma ARI of the maj_3 condition, sorted ascending.
variable_names = majs
for (i, name) in enumerate(variable_names):

    if name != 'maj_3':
        continue

    ylabel = 'ARI'
    file_names = list(file2variables.keys())

    variable_over_files = np.transpose(np.array([file2variables[filename][i] for filename in file_names]))
    # A separate name is used for the plot table: rebinding `df` here would
    # replace the annotation frame that the following cells still read
    # (df['maj_3'], df['label_count_andere'], ...).
    df_plot = DataFrame(variable_over_files, index=names, columns=file_names)
    df_plot.sort_values([file_names[0]], ascending=[True], inplace=True)
    plt.figure(figsize=(9,6))
    plt.ylabel(ylabel, fontsize='large')
    # One tick per lemma, spaced three units apart.
    positions = np.arange(0, len(df_plot.index)*3, 3)
    plt.xticks(positions, df_plot.index, fontsize='medium', rotation=45)
    for filename in file_names:
        variable = df_plot[filename]
        plt.plot(np.arange(0,len(variable)*3,3), variable, label=filename[:20], marker='o', color='k', linestyle='None')
    plt.tight_layout()
    plt.show()
    #plt.savefig('ari-scores.png', dpi=300)
    plt.close()


# Statistics

In [None]:
def extract_nan_count(row, columns, index):
    """Append to a row the number of missing values among its `columns` entries.

    The count is stored under 'nans_' followed by the column names joined
    with underscores.
    """
    missing = pd.isna(row[columns]).sum()
    count_label = 'nans_' + '_'.join(columns)
    extended_values = row.to_list()
    extended_values.append(missing)
    return pd.Series(extended_values, index=index + [count_label])
        
# Add a column holding the number of missing annotations per instance.
df_description = df_description.apply(
    lambda x: extract_nan_count(x, annotators, index=list(df_description.columns)),
    axis=1)

# Instances for which at least one of the first three annotators has no annotation.
any_missing = (np.isnan(df_description[annotators[0]])
               | np.isnan(df_description[annotators[1]])
               | np.isnan(df_description[annotators[2]]))
indexes_none = df_description[any_missing].index
print('total number of sense description annotations:', len(df_description.index))
nan_column = 'nans_'+'_'.join(annotators)
for i in [1,2,3]:
    with_i_missing = df_description[df_description[nan_column] == i]
    print('number of instances with exactly {0} missing annotations:'.format(i),
          len(with_i_missing.index))

In [None]:
# Split the instances by the highest majority threshold they reach.
has_maj_1 = df['maj_1'].notnull()
has_maj_2 = df['maj_2'].notnull()
has_maj_3 = df['maj_3'].notnull()
indexes_disagreement_0 = df[has_maj_3].index
indexes_disagreement_1 = df[has_maj_2 & ~has_maj_3].index
indexes_disagreement_3 = df[has_maj_1 & ~has_maj_2].index
print('number of instances with exactly 3 agreeing annotators:', len(indexes_disagreement_0))
print('number of instances with exactly 2 agreeing annotators:', len(indexes_disagreement_1))
print('number of instances with exactly 0 agreeing annotators:', len(indexes_disagreement_3))

In [None]:
# Number of instances per count of 'andere' annotations.
for i in [1,2,3]:
    with_i_andere = df[df['label_count_andere'] == i]
    print('number of instances with exactly {0} \'andere\' annotations:'.format(i),
          len(with_i_andere.index))

# Conclusion
- agreement on sense description annotation is high with 0.82 Krippendorff's alpha, percentage agreement (ITA) and pairwise Cohen's Kappa.
- Nikolai: how is agreement on binary WiC annotation, e.g. recent competition you participated in?
- low agreement between annotators A versus B and C on *zersetzen* comes from this: the word has mainly two senses 'to destroy' and 'to dissolve'. Annotator A chose to annotate instances of a chemical or physical dissolving with the sense 'to destroy', while the others chose to annotate it with the sense 'to dissolve'.
- correspondence between sense description annotation and inferred clusterings was 0.5 (ARI) for annotatorA and is now considerably higher with 0.59/0.65 for a threshold of 2/3 agreeing annotations of the three annotators.
- using opt clusterings instead of semeval yields the same results (less than 0.01 improvement).
- removing instances annotated with at least one 'andere' label increases agreement scores to 0.88, but has no effect on correspondence with inferred clusterings.

# To do
- in future: annotate semantic proximity of sense descriptions

# To come
- what effect different clustering methods have on the correspondence
- what effect additional data has on the graph clusterings

# Load pairwise proximity annotation

In [None]:
# Read every judgments.csv found below the data directory.
input_path = '../../data/dwug_de_1.1.0/data'
judgments_pairwise = []
for root, subdirectories, files in os.walk(input_path):
    for f in files:
        if f != 'judgments.csv':
            continue
        with open(os.path.join(root, f), encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
            judgments_pairwise.extend(reader)

# Transform data format: one row per (pair of identifiers, lemma), one column per annotator
annotators_pairwise = sorted(set(row['annotator'] for row in judgments_pairwise))
identifier2annotator2judgment = defaultdict(lambda: {})
for row in judgments_pairwise:
    # A sorted tuple identifies the pair regardless of the order in which it
    # was judged. Unlike a frozenset it unpacks in a fixed order and still
    # has two elements when both identifiers are equal.
    pair = tuple(sorted((row['identifier1'], row['identifier2'])))
    identifier2annotator2judgment[(pair, row['lemma'])][row['annotator']] = float(row['judgment'])
# Annotators without a judgment for a pair get NaN
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0).
judgment_pairwise_transformed = [
    {'identifier': identifier1 + '__' + identifier2}
    | {annotator: annotator2judgment.get(annotator, np.nan) for annotator in annotators_pairwise}
    | {'lemma': lemma}
    for ((identifier1, identifier2), lemma), annotator2judgment in identifier2annotator2judgment.items()
]
variable_names = list(judgment_pairwise_transformed[0].keys())
variable_names.remove('identifier')
index = [row['identifier'] for row in judgment_pairwise_transformed]
variables = [[row[name] for row in judgment_pairwise_transformed] for name in variable_names]
df_pairwise = DataFrame(np.transpose(variables), index=index, columns=variable_names)
for annotator in annotators_pairwise:
    df_pairwise[annotator] = df_pairwise[annotator].astype(float)
    # A judgment of 0.0 is treated as missing.
    df_pairwise[annotator] = df_pairwise[annotator].replace(0.0, np.nan)
df_pairwise.head()

In [None]:
def extract_median(row, columns, index):
    """Append to a row the NaN-ignoring median of its `columns` entries.

    Returns a new Series with the values of `row` followed by the median,
    stored under 'median'.
    """
    judgments = row[columns].to_list()
    extended_values = row.to_list()
    extended_values.append(np.nanmedian(judgments))
    return pd.Series(extended_values, index=index + ['median'])
def extract_std(row, columns, index):
    """Append to a row the NaN-ignoring standard deviation of its `columns` entries.

    Returns a new Series with the values of `row` followed by the standard
    deviation, stored under 'std'.
    """
    judgments = row[columns].to_list()
    extended_values = row.to_list()
    extended_values.append(np.nanstd(judgments))
    return pd.Series(extended_values, index=index + ['std'])
         
# Add annotator number column (number of non-missing judgments per pair)
df_pairwise['annotator_no'] = df_pairwise[annotators_pairwise].count(axis=1)

# Add median and std column
df_pairwise = df_pairwise.apply(lambda x: extract_median(x, annotators_pairwise, list(df_pairwise.columns)), axis=1)
df_pairwise = df_pairwise.apply(lambda x: extract_std(x, annotators_pairwise, list(df_pairwise.columns)), axis=1)

# Add binarize column
# (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
df_pairwise['binarized'] = np.nan
# Only pairs judged by more than one annotator, all with the same value, are binarized.
unanimous = (df_pairwise['std'] == 0.0) & (df_pairwise['annotator_no'] > 1)
# Median 4 becomes 1.0; median 1 or 2 becomes 0.0.
indices_1 = df_pairwise[(df_pairwise['median'] == 4.0) & unanimous].index
indices_0 = df_pairwise[((df_pairwise['median'] == 1.0) | (df_pairwise['median'] == 2.0)) & unanimous].index

In [None]:
df_pairwise.loc[indices_1,'binarized'] = 1.0
df_pairwise.loc[indices_0,'binarized'] = 0.0

# Add columns with pairwise annotation derived from sense annotation:
# one column per column of df (except 'lemma'), initialised as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# NOTE(review): a column of df that has the same name as an existing column
# of df_pairwise overwrites that column here -- confirm this is intended.
derived_columns = [column for column in df.columns if column != 'lemma']
for column in derived_columns:
    df_pairwise[column] = np.nan

for ((identifier1, identifier2), lemma) in identifier2annotator2judgment:
    if identifier1 not in df.index or identifier2 not in df.index:
        continue
    # Find the row of this pair once; it may be stored under either order
    # of the two identifiers. An unknown pair is added under the first order.
    forward = identifier1 + '__' + identifier2
    backward = identifier2 + '__' + identifier1
    if forward in df_pairwise.index:
        target = forward
    elif backward in df_pairwise.index:
        target = backward
    else:
        target = forward
    for column in derived_columns:
        judgment1, judgment2 = df.loc[identifier1, column], df.loc[identifier2, column]
        # 1.0 if both uses carry the same value, 0.0 if they differ,
        # missing if either value is missing.
        if np.isnan(judgment1) or np.isnan(judgment2):
            judgment_derived = np.nan
        elif judgment1 == judgment2:
            judgment_derived = 1.0
        else:
            judgment_derived = 0.0
        df_pairwise.loc[target, column] = judgment_derived

indexes_binarized = df_pairwise[(~df_pairwise['binarized'].isnull())].index
indexes_annotatorA = df_pairwise[(~df_pairwise['annotatorA'].isnull())].index
indexes_maj_2 = df_pairwise[(~df_pairwise['maj_2'].isnull())].index
indexes_maj_3 = df_pairwise[(~df_pairwise['maj_3'].isnull())].index
print('number of binarized instances from pairwise proximity annotation:', len(indexes_binarized))
print('number of inferred pairwise instances from annotatorA condition on sense annotation:', len(indexes_annotatorA))
print('number of inferred pairwise instances from maj_2 condition on sense annotation:', len(indexes_maj_2))
print('number of inferred pairwise instances from maj_3 condition on sense annotation:', len(indexes_maj_3))

# Evaluate accuracy between (reliable) pairwise annotation and inferred pairwise annotation from senses

In [None]:
def _report_accuracy(frame, a, b, *prefix, skip_empty=False):
    """Print accuracy of column b against column a on rows where both are set.

    Also prints the better of the two constant baselines (all 0.0 / all 1.0)
    and the number of rows evaluated. With no such rows the score is reported
    as NaN, or nothing is printed when `skip_empty` is true.
    """
    data = frame[frame[a].notnull() & frame[b].notnull()]
    if data.empty:
        # Nothing to evaluate: do not score empty input.
        if not skip_empty:
            print(*prefix, a, b, np.nan)
            print('number of evaluation instances:', 0)
        return
    score = accuracy_score(data[a], data[b])
    print(*prefix, a, b, score)
    print('majority class baseline:', max([accuracy_score(data[a], [0.0]*len(data.index)),
                                           accuracy_score(data[a], [1.0]*len(data.index))]))
    print('number of evaluation instances:', len(data.index))


# Full data
for a in ['binarized']:
    for b in ['annotatorA', 'maj_2', 'maj_3']:
        _report_accuracy(df_pairwise, a, b)
print('-----')

# By lemma
lemmas = sorted(list(set(df_pairwise['lemma'])))
# Iterating the groupby yields each lemma together with its own rows, so a
# score can never be attributed to a different lemma.
for lemma, data_full in df_pairwise.groupby(by="lemma"):
    for a in ['binarized']:
        for b in ['annotatorA', 'maj_2', 'maj_3']:
            _report_accuracy(data_full, a, b, lemma, skip_empty=True)
           

# Conclusion
- accuracy of cleaned and binarized pairwise annotation with inferred pairwise annotation from sense description annotation is 0.87/0.94/0.99 for conditions annotatorA/maj_2/maj_3. This means that if annotators show high agreement on either type of annotation they lead to the same (0.99) results in terms of binary sense distinctions.