# Intro
We analyze how well the clusterings inferred from DWUG DE 2.0.0 (which contains additional pairwise judgments) correspond to the annotated sense descriptions.

In [1]:
from os.path import exists
import os
import pandas as pd
from pandas import DataFrame
import numpy as np
from pathlib import Path
import csv
from collections import defaultdict, Counter
from itertools import combinations
import seaborn as sb
from sklearn.metrics import cohen_kappa_score, hamming_loss, accuracy_score
import krippendorff
from sklearn.metrics.cluster import adjusted_rand_score
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
nice_colors = [x for x in mcolors.get_named_colors_mapping().values() if isinstance(x, str)] # Nice colors
colors_global = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00'] # color-blind colors

# Load inferred clusterings

In [2]:
# Directory that is walked recursively for tab-separated cluster files (opt clusterings)
input_path = '../../data/dwug_de/clusters'
data = []
for root, _subdirs, filenames in os.walk(input_path):
    for filename in filenames:
        # The part of the file name before the first dot is used as the lemma
        lemma = filename.split('.')[0]
        with open(os.path.join(root, filename), encoding='utf-8') as handle:
            rows = csv.DictReader(handle, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
            data.extend(row | {'lemma': lemma} for row in rows)

# Every field except 'identifier' becomes a column; 'identifier' becomes the row index.
variable_names = list(data[0])
variable_names.remove('identifier')
index = [record['identifier'] for record in data]
variables = [[record[name] for record in data] for name in variable_names]
df = DataFrame(np.transpose(variables), index=index, columns=variable_names)
# All values were read as text; cluster ids are converted to integers.
df["cluster"] = df["cluster"].astype(str).astype(int)

# Load dwug_de sense description annotation

In [3]:
# Walk the data directory and collect the rows of every judgments_senses.csv
# and senses.csv file found below it.
input_path = '../../data/dwug_de/data'
judgments_senses = []
data_senses = []
for root, subdirectories, files in os.walk(input_path):
    for f in files:
        path = os.path.join(root, f)
        if f == 'judgments_senses.csv':
            with open(path, encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
                judgments_senses.extend(reader)
        if f == 'senses.csv':
            # The lemma is the name of the directory containing the file.
            # os.path is used instead of splitting on '/' so that this also
            # works where os.walk joins paths with a different separator.
            lemma = os.path.basename(os.path.dirname(path))
            with open(path, encoding='utf-8') as csvfile:
                reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
                data_senses.extend(row | {'lemma': lemma} for row in reader)

# Transform data format: one record per (identifier, lemma) with one field per annotator
annotators = sorted({row['annotator'] for row in judgments_senses})
identifier2annotator2judgment = defaultdict(dict)
for row in judgments_senses:
    identifier2annotator2judgment[(row['identifier'], row['lemma'])][row['annotator']] = row['identifier_sense']
# NOTE(review): annotator2judgment[annotator] raises KeyError when an annotator
# has no row for an instance -- presumably the data always has one; confirm.
judgments_senses_transformed = [
    {'identifier': identifier}
    | {annotator: annotator2judgment[annotator] for annotator in annotators}
    | {'lemma': lemma}
    for (identifier, lemma), annotator2judgment in identifier2annotator2judgment.items()
]
variable_names = list(judgments_senses_transformed[0].keys())
variable_names.remove('identifier')
index = [row['identifier'] for row in judgments_senses_transformed]
variables = [[row[name] for row in judgments_senses_transformed] for name in variable_names]
df_description = DataFrame(np.transpose(variables), index=index, columns=variable_names)

# Get sense labels: lemma -> sense description -> sense identifier
lemma2description2label = defaultdict(dict)
for row in data_senses:
    lemma2description2label[row['lemma']][row['description_sense']] = row['identifier_sense']

In [4]:
def sense2index(label):
    """Return the numeric sense index encoded at the end of a sense label.

    If the label ends in two digits, both are used (e.g. ``"sense12"`` gives
    12); otherwise only the last character is used (e.g. ``"sense3"`` gives 3).

    The previous implementation inspected ``label[-2:-1]``, which is only the
    second-to-last character, so a two-digit suffix was truncated to its
    first digit.

    Raises:
        ValueError: if the last character is not a digit.
        IndexError: if ``label`` is empty.
    """
    suffix = label[-2:]
    if len(suffix) == 2 and suffix.isdigit():
        return int(suffix)
    return int(label[-1])

# Get lemmas
lemmas = sorted(set(df_description['lemma']))
# Each lemma gets a block of 100 numbers; the sense index is added to the
# block offset so that labels are unique across lemmas.
lemma2index = {lemma: position * 100 for position, lemma in enumerate(lemmas)}
# Transform data, convert to numbers, set nans.
# The label slice annotators[0]:annotators[-1] selects the annotator columns of
# a row; the string 'None' marks a missing judgment and becomes NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_description = df_description.apply(
    lambda x: pd.Series(
        [lemma2index[x['lemma']] + sense2index(y) if y != 'None' else np.nan
         for y in x[annotators[0]:annotators[-1]]] + [x['lemma']],
        index=df_description.columns),
    axis=1)
# Map every numeric label back to its textual sense description
index2description = {
    lemma2index[lemma] + sense2index(label): description
    for lemma, description2label in lemma2description2label.items()
    for description, label in description2label.items()
}

# Join data frames

In [5]:
# Add one empty column per annotator to the clustering frame.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
for column in df_description.columns:
    if column == 'lemma':
        continue
    df[column] = np.nan
# Copy every cell of the sense annotation into the clustering frame by
# identifier. The printed frame shows that identifiers without a cluster end
# up as rows whose 'cluster' value is NaN.
for identifier, row in df_description.iterrows():
    for column in df_description.columns:
        df.loc[identifier, column] = row[column]

lemmas = sorted(list(set(df['lemma'])))

print(df)

                                          cluster           lemma  annotatorA  \
mommsen_roemische03_1856-4686-25              0.0  Ausnahmegesetz         NaN   
2532889X_1975-09-20_01_251.tcf.xml-26-18      0.0  Ausnahmegesetz         NaN   
26120215_1977_08_04_01_124.tcf.xml-20-8       0.0  Ausnahmegesetz         NaN   
mommsen_roemische03_1856-3166-33              0.0  Ausnahmegesetz         NaN   
26120215_1948_09_22_01_011.tcf.xml-17-6       0.0  Ausnahmegesetz         NaN   
...                                           ...             ...         ...   
2532889X_1949-10-09_01_039.tcf.xml-11-13      NaN      Manschette       701.0   
2532889X_1967-01-15_01_271.tcf.xml-29-5       NaN      Manschette       704.0   
wedekind_erdgeist_1895-2727-9                 NaN       abgebrüht         NaN   
26120215_1950_12_24_01_388.tcf.xml-1-7        NaN             Fuß         NaN   
26120215_1966_07_27_01_394.tcf.xml-9-7        NaN           Titel         NaN   

                           

In [6]:
# Add andere column
def extract_label_count(row, labels, columns, index, name):
    """Append to ``row`` the number of its values that belong to ``labels``.

    Args:
        row: a pandas Series (one data frame row).
        labels: collection of label values to count.
        columns: the entries of ``row`` that are inspected.
        index: list of labels for the existing entries of ``row``.
        name: label of the appended count entry.

    Returns:
        A new Series holding the values of ``row`` followed by the count,
        indexed by ``index`` plus ``name``.
    """
    tallies = Counter(row[columns])
    matching_counts = [count for value, count in tallies.items() if value in labels]
    total = np.sum(matching_counts)
    return pd.Series(row.to_list() + [total], index=index + [name])

# Numeric labels whose sense description is 'andere'
indexes_andere = [label for label, description in index2description.items() if description == 'andere']
# Add a column counting, per row, how many annotators chose one of these labels
columns_before = list(df.columns)
df = df.apply(
    lambda row: extract_label_count(row, indexes_andere, annotators, columns_before, 'label_count_andere'),
    axis=1)

# Remove every instance that at least one annotator labelled as 'andere'
# (comment the next two lines out to keep these instances)
rows_with_andere = df[df['label_count_andere'] > 0].index
df = df.drop(rows_with_andere)

# Extract majority labels

In [7]:
def extract_majority_label(row, columns, threshold, index):
    """Append to ``row`` a label chosen by at least ``threshold`` annotators.

    Missing annotations (NaN/None) are not counted as votes. Previously they
    were counted like any other value, so with ``threshold=1`` a missing
    annotation could be drawn as the "majority" label even though a real
    label was available.

    Args:
        row: a pandas Series (one data frame row).
        columns: list of the entries of ``row`` that hold annotator labels.
        threshold: minimum number of annotators that must agree on a label.
        index: list of labels for the existing entries of ``row``.

    Returns:
        A new Series holding the values of ``row`` followed by the selected
        label (NaN if no label reaches the threshold), indexed by ``index``
        plus ``'maj_<threshold>'``. If several labels reach the threshold,
        one of them is drawn at random with ``np.random.choice``.
    """
    label2count = Counter(row[columns].dropna())
    majority_labels = [label for label, count in label2count.items() if count >= threshold]
    if majority_labels:
        label = np.random.choice(majority_labels)
    else:
        # np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0
        label = np.nan
    return pd.Series(row.to_list() + [label], index=index + ['maj_' + str(threshold)])
        
# Add the columns maj_1, maj_2 and maj_3, one after the other, so that each
# pass sees the columns added by the previous one.
for threshold in (1, 2, 3):
    current_columns = list(df.columns)
    df = df.apply(
        lambda row: extract_majority_label(row, annotators, threshold, index=current_columns),
        axis=1)
print(df)


                                          cluster           lemma  annotatorA  \
mommsen_roemische03_1856-4686-25              0.0  Ausnahmegesetz         NaN   
2532889X_1975-09-20_01_251.tcf.xml-26-18      0.0  Ausnahmegesetz         NaN   
26120215_1977_08_04_01_124.tcf.xml-20-8       0.0  Ausnahmegesetz         NaN   
mommsen_roemische03_1856-3166-33              0.0  Ausnahmegesetz         NaN   
26120215_1948_09_22_01_011.tcf.xml-17-6       0.0  Ausnahmegesetz         NaN   
...                                           ...             ...         ...   
2532889X_1949-10-09_01_039.tcf.xml-11-13      NaN      Manschette       701.0   
2532889X_1967-01-15_01_271.tcf.xml-29-5       NaN      Manschette       704.0   
wedekind_erdgeist_1895-2727-9                 NaN       abgebrüht         NaN   
26120215_1950_12_24_01_388.tcf.xml-1-7        NaN             Fuß         NaN   
26120215_1966_07_27_01_394.tcf.xml-9-7        NaN           Titel         NaN   

                           

# Compare to inferred clusterings

In [8]:
# By lemma: adjusted Rand index between inferred clusters and sense labels
majs = ['annotatorA', 'maj_2', 'maj_3']
for maj in majs:
    print('-----','condition', maj, '-----')
    aris = []
    # Iterate the groups together with their keys. The previous version paired
    # groups with the separately computed `lemmas` list by position, which
    # misaligns (or raises IndexError) as soon as a lemma has lost all of its
    # rows since `lemmas` was built.
    for lemma, group in df.groupby(by="lemma"):
        # Keep only instances that have both a cluster and a label for this condition
        labelled = group.reindex(columns=['cluster', maj]).dropna()
        if labelled.empty:
            continue
        clusters, senses = np.transpose(labelled.values)

        ari = adjusted_rand_score(clusters, senses)
        print(lemma, ari, len(clusters))
        print(list(Counter(clusters).values()), list(Counter(senses).values())) # label distributions
        aris.append(ari)

    mean = np.mean(aris)
    print('mean', mean)


----- condition annotatorA -----
Abgesang 0.7601509821166813 35
[27, 8] [27, 8]
Armenhaus 0.9183333333333333 49
[30, 19] [29, 20]
Dynamik 0.4881945930522862 41
[27, 14] [23, 18]
Eintagsfliege 0.8001661819692564 39
[18, 21] [20, 19]
Engpaß 0.8645580217670962 45
[23, 21, 1] [25, 20]
Fuß 0.7300248440960215 43
[25, 12, 2, 3, 1] [20, 1, 5, 12, 1, 3, 1]
Knotenpunkt 0.5600977198697068 37
[22, 11, 4] [19, 18]
Manschette 0.7853770403345475 43
[37, 3, 2, 1] [37, 3, 3]
Mißklang 0.36717439517263895 47
[34, 13] [40, 7]
Ohrwurm 1.0 33
[18, 15] [18, 15]
Rezeption 0.7105847520355293 47
[39, 5, 2, 1] [42, 5]
Schmiere 0.6601344860710855 30
[14, 11, 3, 2] [11, 4, 8, 3, 3, 1]
Seminar 0.0 45
[45] [20, 19, 6]
Sensation 0.2634547812546278 44
[36, 7, 1] [26, 18]
Spielball 0.6421319796954315 48
[46, 2] [47, 1]
Titel 0.09375648205766439 34
[24, 4, 6] [14, 17, 1, 2]
abbauen 0.7150486000637801 47
[21, 17, 8, 1] [21, 20, 6]
abdecken 0.6425053108294232 39
[21, 13, 4, 1] [20, 5, 11, 2, 1]
abgebrüht 0.959132610508757

# Conclusion
- For DWUG DE 1.1.0, the correspondence between the sense description annotation and the inferred clusterings was 0.5/0.59/0.65 for the conditions annotatorA/maj_2/maj_3. With the additional pairwise annotations this correspondence increases to 0.58/0.67/0.74 (a general increase of roughly 0.08).
- For most words the correspondence increased; 9 of 24 words have near-perfect correspondence (>0.9). For six words the correspondence decreases with the additional annotation (Seminar, Spielball, Titel, abbauen, ausspannen, packen). The decrease is mostly small.
- Some observations:
 * for Seminar, Spielball the label distribution seems to be strongly skewed (with one label dominating strongly)
 * for Spielball there seems to be at least one erroneous pairwise judgment involved.
 * artikulieren has a low correspondence of 0.0. It also has a strongly skewed label distribution.

# Evaluate accuracy between (reliable) pairwise annotation and inferred pairwise annotation from senses
- for DWUG DE 1.1.0 accuracy of cleaned and binarized pairwise annotation with inferred pairwise annotation from sense description annotation was 0.87/0.94/0.99 for conditions annotatorA/maj_2/maj_3. This slightly increases to 0.90/0.96/0.99 with DWUG DE 2.0.0. (For code see other notebook. It was removed here for readability.)

# Comparison of change scores

In [9]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import accuracy_score
from sklearn.metrics import hamming_loss

In [14]:
dataset = 'dwug_en'
clustering = 'semeval'


def _load_change_scores(path):
    """Read a tab-separated stats_groupings.csv file.

    Returns a pair of dicts keyed by lemma: the binary change labels (kept as
    the strings found in the file) and the graded change scores (as floats).
    """
    with open(path, encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE, strict=True)
        table = list(reader)
    binary = {row['lemma']: row['change_binary'] for row in table}
    # Graded scores must be numeric: as strings they are ranked
    # lexicographically, which differs from numeric order for values such as
    # '5e-05'.
    graded = {row['lemma']: float(row['change_graded']) for row in table}
    return binary, graded


# Load change scores
input_path = '../../data/{0}_1.0.0/{0}/stats/{1}/stats_groupings.csv'.format(dataset, clustering)
lemma2binary_semeval, lemma2graded_semeval = _load_change_scores(input_path)

input_path = '../../data/{0}/stats/stats_groupings.csv'.format(dataset)
lemma2binary_200, lemma2graded_200 = _load_change_scores(input_path)

# Binary change: agreement on the lemmas present in both versions.
# NOTE(review): labels are compared as strings, so '0' and '0.0' would count
# as different -- confirm both files use the same formatting.
intersection = [lemma for lemma in lemma2binary_semeval if lemma in lemma2binary_200]
y_true = [lemma2binary_semeval[lemma] for lemma in intersection]
y_pred = [lemma2binary_200[lemma] for lemma in intersection]
acc = accuracy_score(y_true, y_pred)
ham = hamming_loss(y_true, y_pred)
print('accuracy', acc, len(y_true))
print('percentage of items changing their label', ham, len(y_true))

# Graded change: rank correlation on the lemmas present in both versions
intersection = [lemma for lemma in lemma2graded_semeval if lemma in lemma2graded_200]
y_true = [lemma2graded_semeval[lemma] for lemma in intersection]
y_pred = [lemma2graded_200[lemma] for lemma in intersection]
spr, p = spearmanr(y_true, y_pred)
print('spearman', spr, len(y_true))

accuracy 0.8648648648648649 37
percentage of items changing their label 0.13513513513513514 37
spearman 0.8935665595310848 37


# Conclusion
- accuracy of binary change labels derived from opt clusterings between DWUG DE 1.1.0 and 2.0.0 is 0.88. This means that a non-negligible amount of 12% of items change their label after additional annotation and clustering.
- correlation between graded change scores is high with 0.98.
 * &rarr; graded change scores are more robust to sparsity of annotation
- doing the same comparisons between semeval clusterings from 1.1.0 (official version used for shared task) and opt from 2.0.0 yields accuracy of 0.81 and correlation of 0.98. The discrepancy for binary change scores is higher with 19% of items changing their labels. Note that this can also be because for SemEval binary change scores different thresholds were used.
- robustness of change scores for DWUG SV 1.0.0 is accuracy of 0.95/0.94 and Spearman of 0.96/0.82 for opt/semeval compared with opt from DWUG SV 2.0.0.
- robustness of change scores for DWUG EN 1.0.0 is accuracy of 0.78/0.86 and Spearman of 0.95/0.89 for opt/semeval compared with opt from DWUG EN 2.0.0.



# To do
- compare change scores to previous version
