# Aggregate judgements 

By: Iris Luden
Created: May 2023

In [1]:
import pandas as pd 
import nltk
from collections import Counter

# Read the annotators file 


In [2]:
# Load the merged annotator sheet (marked as the old file); read_table splits on tabs by default.
df_annotators = pd.read_table('Annotations_Merged_2_7_1_Annotators_experiment.tsv')
df_annotators

Unnamed: 0,Word_id,Example,Word,Corpus,Line number,Prediction,Judgement,Category,Kaya,Hanna,Laura
0,look%C1.1035374,in heroes and generals there you have limited ...,look,C1,1035374,expect something to happen or be the case,,stable,2,2,3
1,look%C1.1116016,same thing i suggest for other agi lk is to lo...,look,C1,1116016,orient oneself in order to find something,,stable,2,3,2
2,look%C2.2838548,user user deftechpat juliadavisnews looks like...,look,C2,2838548,have the appearance of being,,stable,3,3,3
3,look%C2.2035418,being thick is a trend that 's gon na die down...,look,C2,2035418,behave in a specified way,,stable,0,0,0
4,look%C1.736364,i know there are a few setting sprays on sepho...,look,C1,736364,investigate ; study,,stable,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...
395,closeness%C2.2380201,hahahah is it bad that i wanna have sex just f...,closeness,C2,2380201,the quality of being intimate,,changing,2,2,1
396,corona%C1.1241317,im the most interesting man in the world ill h...,corona,C1,1241317,a cocktail made with aromatic spices and fruit...,,changing,2,2,2
397,corona%C1.763988,the only other place i know of is game meats u...,corona,C1,763988,a deep red or yellowish-brown colour,,changing,0,0,0
398,corona%C1.3137805,having an ice cold corona at a bar that plays ...,corona,C1,3137805,a cold drink served with drinks such as fruit ...,,changing,0,0,0


In [3]:
# Tally how often each score was given, pooled over the three judges,
# separately for every (category, corpus) combination.
counters = {}
for category in ['stable', 'changing', 'emerging']:
    for corpus in ['C1', 'C2']:
        subdf = df_annotators.loc[
            (df_annotators['Category'] == category) & (df_annotators['Corpus'] == corpus)
        ]
        # fold the per-judge counters together, in the order Hanna, Kaya, Laura
        counters[(category, corpus)] = sum(
            (Counter(subdf[judge]) for judge in ('Hanna', 'Kaya', 'Laura')),
            Counter(),
        )
counters



{('stable', 'C1'): Counter({3: 107, 2: 46, 0: 38, 1: 30, -10: 19}),
 ('stable', 'C2'): Counter({3: 106, 0: 43, 2: 39, 1: 33, -10: 19}),
 ('changing', 'C1'): Counter({0: 100, 3: 54, 2: 37, 1: 27, -10: 22}),
 ('changing', 'C2'): Counter({0: 118, 3: 43, 1: 39, 2: 27, -10: 13}),
 ('emerging', 'C1'): Counter(),
 ('emerging', 'C2'): Counter({0: 148, 1: 38, -10: 30, 3: 14, 2: 10})}

In [4]:
def read_annotations(df, judges):
    ''' Print a score breakdown for every judge column.

    df     : DataFrame holding one column of scores per judge
    judges : iterable of column names to report on

    For each judge the counts of missing values, of the -10 tag, and of
    the scores 0 to 3 are printed, followed by the number of scores above 1
    ("correct") and below 2 ("incorrect", which also counts the -10 tag).
    Nothing is returned.
    '''
    for judge in judges:
        scores = df[judge]
        breakdown = (
            ("Number not judged: ", scores.isna()),
            ("Number of self-referenced:", scores == -10),
            ("Number of 0:", scores == 0),
            ("Number of 1:", scores == 1),
            ("Number of 2:", scores == 2),
            ("Number of 3:", scores == 3),
            ("Number correct:", scores > 1),
            ("Number incorrect:", scores < 2),
        )
        for label, mask in breakdown:
            # summing a boolean mask counts the rows that match
            print(label, mask.sum())
        print()
        
# Print the per-judge score breakdown for the three annotator columns.
read_annotations(df_annotators, ['Kaya', 'Hanna', 'Laura'])

Number not judged:  0
Number of self-referenced: 43
Number of 0: 137
Number of 1: 53
Number of 2: 44
Number of 3: 123
Number correct: 167
Number incorrect: 233

Number not judged:  0
Number of self-referenced: 41
Number of 0: 150
Number of 1: 49
Number of 2: 55
Number of 3: 105
Number correct: 160
Number incorrect: 240

Number not judged:  0
Number of self-referenced: 19
Number of 0: 160
Number of 1: 65
Number of 2: 60
Number of 3: 96
Number correct: 156
Number incorrect: 244



In [5]:
def boolean_annotations(df, judges):
    ''' Map the judgements onto boolean judgements and take the majority vote.

    df     : DataFrame holding one column of scores per judge
    judges : list of column names, one per judge (must not be empty)

    A score counts as "correct" (1) when it is greater than 1; everything
    else, including the -10 tag and missing values, counts as 0.
    Returns a Series of 0/1 that is 1 where strictly more than half of the
    judges voted "correct". For three judges that means at least two votes.

    Raises ValueError when `judges` is empty.
    '''
    if len(judges) == 0:
        raise ValueError("judges must contain at least one column name")

    boolean_judgements = [(df[j] > 1).astype(int) for j in judges]

    # compare against the actual panel size instead of a hard-coded 3,
    # so the vote stays a real majority for any number of judges
    votes_correct = sum(boolean_judgements)
    majority_vote = (votes_correct > len(judges) / 2).astype(int)

    return majority_vote

def averaged_annotations(df, judges):
    ''' Binary judgement from the summed scores, with the -10 tag counted as 0.

    df     : DataFrame holding one column of scores per judge
    judges : list of column names, one per judge

    Negative scores (the -10 tag) are raised to 0, the scores of all judges
    are added up per row, and the row is marked 1 when that total exceeds 4,
    otherwise 0. Returns the resulting 0/1 Series.
    '''
    # lift the -10 tag to 0 so it weighs the same as an outright "incorrect"
    non_negative = [df[judge].clip(lower=0) for judge in judges]

    total_score = sum(non_negative)
    return (total_score > 4).astype(int)

def consensus_vote(df, judges):
    ''' Mark a row as correct only when every judge marked it correct.

    df     : DataFrame holding one column of scores per judge
    judges : list of column names, one per judge (must not be empty)

    A score counts as "correct" (1) when it is greater than 1; everything
    else, including the -10 tag and missing values, counts as 0.
    Returns a Series of 0/1 that is 1 only where all judges voted "correct".

    Raises ValueError when `judges` is empty.
    '''
    if len(judges) == 0:
        raise ValueError("judges must contain at least one column name")

    boolean_judgements = [(df[j] > 1).astype(int) for j in judges]

    # consensus means all judges agree, so compare against the panel size
    # rather than a hard-coded 3
    votes_correct = sum(boolean_judgements)
    consensus = (votes_correct == len(judges)).astype(int)

    return consensus

In [6]:
# Add one aggregated 0/1 column per aggregation scheme to the original frame.
judge_columns = ['Kaya', 'Hanna', 'Laura']
for column_name, aggregate in (
    ('Boolean majority', boolean_annotations),
    ('Averaged judgements', averaged_annotations),
    ('Consensus vote', consensus_vote),
):
    df_annotators[column_name] = aggregate(df_annotators, judge_columns)

In [7]:
def correctness_per_category(df, col='Boolean majority'):
    ''' Print the share of correct rows per corpus and per category.

    df  : DataFrame with the columns 'Corpus', 'Category' and `col`
    col : name of a 0/1 column in which 1 marks a correct row

    Printed, in order:
      * C1, C2 without the 'emerging' category, and both of those combined
      * C2 including 'emerging', and the whole frame
      * each of 'changing', 'stable', 'emerging' over the whole frame
      * for corpus C1 and C2: the row count and the share per category present

    A subset without rows is reported as nan instead of raising.
    Nothing is returned.
    '''
    def ratio(n_correct, n_total):
        # an empty subset has no defined accuracy; report NaN rather than divide by zero
        return n_correct / n_total if n_total else float('nan')

    def share_correct(frame):
        # divide once at the end so no float error accumulates over the rows
        return ratio(frame[col].sum(), len(frame))

    df_C1 = df[df['Corpus'] == 'C1']
    df_C2_all = df[df['Corpus'] == 'C2']
    df_C2 = df_C2_all[df_C2_all['Category'] != 'emerging']

    print("Percentage correct C1:", share_correct(df_C1))
    print("Percentage correct C2:", share_correct(df_C2))
    # 'emerging' rows are excluded from df_C2, so this figure covers stable + changing
    print("Percentage correct stable + changing C1 + C2",
          ratio(df_C1[col].sum() + df_C2[col].sum(), len(df_C1) + len(df_C2)))

    print("percentave correct C2 including emerging", share_correct(df_C2_all))
    print("percentave correct total", share_correct(df))

    print()

    for c in ['changing', 'stable', 'emerging']:
        print(f'Percentage correct {c} over all: ', share_correct(df[df['Category'] == c]))
    print()
    for corpus in ['C1', 'C2']:
        df_corpus = df[df['Corpus'] == corpus]
        print("This corpus has a total of", len(df_corpus), "sentences")

        for c in df_corpus['Category'].unique():
            print(f'Percentage correct {c} in corpus {corpus}: ',
                  share_correct(df_corpus[df_corpus['Category'] == c]))
        print()

In [8]:
# Report the accuracy breakdown once per aggregation scheme.
for heading, column in (
    ("Boolean judgements", 'Boolean majority'),
    ("Averaged judgements", 'Averaged judgements'),
    ("Consensus vote", 'Consensus vote'),
):
    print(heading)
    correctness_per_category(df_annotators, column)
    print(" -------------- ")

# Count the rows on which all three judges gave pairwise different scores.
kaya, hanna, laura = (df_annotators[name] for name in ('Kaya', 'Hanna', 'Laura'))
print("Overall disagreement (none agree:)")
print((kaya.ne(hanna) & hanna.ne(laura) & laura.ne(kaya)).sum())

Boolean judgements
Percentage correct C1: 0.525
Percentage correct C2: 0.425
Percentage correct stable + emerging C1 + C2 0.475
percentave correct C2 including emerging 0.3125000000000001
percentave correct total 0.3975000000000003

Percentage correct changing over all:  0.3125
Percentage correct stable over all:  0.6375
Percentage correct emerging over all:  0.0875

This corpus has a total of 160 sentences
Percentage correct stable in corpus C1:  0.6625
Percentage correct changing in corpus C1:  0.3875

This corpus has a total of 240 sentences
Percentage correct stable in corpus C2:  0.6125
Percentage correct emerging in corpus C2:  0.0875
Percentage correct changing in corpus C2:  0.2375

 -------------- 
Averaged judgements
Percentage correct C1: 0.54375
Percentage correct C2: 0.45
Percentage correct stable + emerging C1 + C2 0.496875
percentave correct C2 including emerging 0.32500000000000007
percentave correct total 0.4125000000000003

Percentage correct changing over all:  0.337

In [9]:
# df_annotators.to_csv('Annotations.tsv', sep='\t', index=False)

# Krippendorff alpha

In [10]:
# Calculate Krippendorff's alpha (inter-annotator agreement) for three views of the scores.
import krippendorff

# one row per sentence, one column per judge; transposed below so each judge becomes a row
ratings = df_annotators[['Hanna', 'Kaya', 'Laura']]

# NOTE(review): alpha() is called without level_of_measurement, so the library default
# applies; confirm that default suits these scores, in particular the -10 tag.

print("Considering all values:")
# raw scores, with the -10 tag kept as its own value
print(krippendorff.alpha(reliability_data=ratings.T))
print()

# the -10 tag is folded into 0 here (not dropped); clip replaces the deprecated applymap
print("When disregarding the -10 judgement")
print(krippendorff.alpha(reliability_data=ratings.clip(lower=0).T))
print()

# When we take the boolean judgements: correct (score of 2 or more) / incorrect.
# A missing score would count as incorrect here, as it does in boolean_annotations.
print("When considering the boolean jusdgements: correct/incorrect")
print(krippendorff.alpha(reliability_data=ratings.ge(2).astype(int).T))


Considering all values:
0.6209278571501973

When disregarding the -10 judgement
0.6766731440102806

When considering the boolean jusdgements: correct/incorrect
0.6191573470089025
