In [1]:
from sklearn.metrics import cohen_kappa_score
import itertools
import os
from prettytable import PrettyTable

In [2]:
def is_annotation_line(line):
    """Tell whether a line of a Webanno TSV file carries an annotation.

    Blank (whitespace-only) lines and lines whose first non-whitespace
    character is '#' are not annotation lines; every other line is.
    """
    content = line.strip()
    if not content:
        return False
    return not content.startswith('#')

In [3]:
def get_labels_list(annots_file):
    """Get the list of labels from a Webanno TSV annotation file.

    Args:
        annots_file: Path of the TSV file to read.

    Returns:
        A list with one string per annotation line (as decided by
        is_annotation_line): the last tab-separated field of the line after
        surrounding whitespace has been stripped.

    Raises:
        IOError: If the file cannot be opened.
    """
    with open(annots_file) as f:
        # A comprehension yields a real list on both Python 2 and Python 3;
        # a map()/filter() chain would be a one-shot lazy iterator on
        # Python 3, which has no len() and never compares equal to a list.
        return [
            line.strip().split('\t')[-1]
            for line in f
            if is_annotation_line(line)
        ]

In [4]:
def calculate_kappa(user1, user2, project_path):
    """Calculate Cohen's kappa between two users of a Webanno project export.

    Every entry of '<project_path>/annotation' is treated as one annotated
    document whose labels for a user live in '<entry>/<user>.tsv'. Labels of
    all documents annotated by both users are concatenated and a single
    kappa is calculated over the aggregate. Documents that were not
    annotated by both users are reported on stdout and ignored.

    Args:
        user1: Name of the first user.
        user2: Name of the second user.
        project_path: Path to the exported project folder.

    Returns:
        The value returned by cohen_kappa_score for the aggregated labels.

    Raises:
        AssertionError: If, for some document, the two users do not have the
            same number of labels.
    """
    annotations_folder = os.path.join(project_path, 'annotation')
    error_message = 'User {} didn\'t annotate file {}.'

    total_labels1 = []
    total_labels2 = []

    # Sorted only to make the order of the printed messages stable; kappa
    # does not depend on the order in which documents are aggregated.
    for file_annotated in sorted(os.listdir(annotations_folder)):
        labels_by_user = []

        # Both users are always tried, so a document missing for both of
        # them is reported once per user.
        for user in (user1, user2):
            user_file = os.path.join(
                annotations_folder, file_annotated, '{}.tsv'.format(user))
            try:
                # list() guarantees a sequence with a length even if the
                # helper hands back a lazy iterator.
                labels_by_user.append(list(get_labels_list(user_file)))
            except IOError:
                # Single-argument call form: prints the same text under both
                # Python 2 and Python 3.
                print(error_message.format(user, file_annotated))

        if len(labels_by_user) < 2:
            continue

        labels1, labels2 = labels_by_user

        # Checked per document: comparing only the aggregated totals lets
        # opposite mismatches in two documents cancel out and silently
        # misalign every label pair that follows. Raised explicitly so the
        # check survives `python -O`.
        if len(labels1) != len(labels2):
            raise AssertionError(
                'Users {} and {} have a different number of labels in file '
                '{} ({} vs {}).'.format(
                    user1, user2, file_annotated, len(labels1), len(labels2)))

        total_labels1.extend(labels1)
        total_labels2.extend(labels2)

    return cohen_kappa_score(total_labels1, total_labels2)

In [23]:
def get_all_kappas(usernames_list, project_path):
    """Compute Cohen's kappa for every pair of users.

    The returned dictionary is keyed by (user_a, user_b) tuples, in the
    order produced by itertools.combinations over usernames_list, and holds
    each pair's kappa rounded to two decimals.
    """
    return {
        (first, second): round(calculate_kappa(first, second, project_path), 2)
        for first, second in itertools.combinations(usernames_list, 2)
    }

In [43]:
def create_agreement_table(usernames_list, project_path, output_path='results.html'):
    """Write an HTML table of pairwise Cohen's kappa values between users.

    The table has one row and one column per user. Only the cells above the
    diagonal hold a kappa value; the diagonal and the cells below it hold
    '-', since each pair is computed once.

    Args:
        usernames_list: Iterable of user names; its order fixes the order of
            rows and columns.
        project_path: Path to the project folder exported from the Webanno
            app.
        output_path: File the HTML is written to. Defaults to
            'results.html' in the current working directory.
    """
    # Materialise once so any iterable (tuple, generator, ...) can be both
    # concatenated with the header and iterated repeatedly.
    usernames = list(usernames_list)

    all_kappas = get_all_kappas(usernames, project_path)

    table = PrettyTable()
    table.field_names = ['Users'] + usernames

    for i, user1 in enumerate(usernames):
        # all_kappas is keyed by (earlier_user, later_user), so only the
        # i < j cells have an entry to look up.
        row = [user1] + [
            all_kappas[(user1, user2)] if i < j else '-'
            for j, user2 in enumerate(usernames)
        ]
        table.add_row(row)

    table.format = True
    html_table = table.get_html_string()
    with open(output_path, 'w') as f:
        f.write(html_table)
    

In [45]:
# Users whose agreement is compared: 'user1' through 'user10'.
usernames_list = list(map('user{}'.format, range(1, 11)))
# Build the agreement table for the project exported to the 'project' folder.
create_agreement_table(usernames_list, 'project')

User user2 didn't annotate file 20040614134056938.tsv.
User user3 didn't annotate file 20040614103112698.tsv.
User user3 didn't annotate file 20040614134056938.tsv.
User user4 didn't annotate file 20040614103112698.tsv.
User user4 didn't annotate file 20040614134056938.tsv.
User user5 didn't annotate file 20040614103112698.tsv.
User user5 didn't annotate file 20040614134056938.tsv.
User user6 didn't annotate file 20040614103112698.tsv.
User user6 didn't annotate file 20040614134056938.tsv.
User user7 didn't annotate file 20040614103112698.tsv.
User user7 didn't annotate file 20040614134056938.tsv.
User user8 didn't annotate file 20040614103112698.tsv.
User user8 didn't annotate file 20040614134056938.tsv.
User user9 didn't annotate file 20040614103112698.tsv.
User user9 didn't annotate file 20040614134056938.tsv.
User user10 didn't annotate file 20040614103112698.tsv.
User user10 didn't annotate file 20040614134056938.tsv.
User user3 didn't annotate file 20040614103112698.tsv.
User use