In [1]:
import math
import json

import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Calculation of Inter-Rater Reliability

This notebook calculates three measures of inter-rater reliability. The scores give insight into how reliable the coding process of the first author is by comparing it to the independent coding process of the second rater.

In [2]:
# IDs of the extractions that were also coded by the second rater, i.e., the
# extractions for which an overlap between both raters exists
overlap = [10, 27, 50, 58, 63, 69, 86, 92, 96]

# Column specifications: a list of entries, each read below via its 'name' key
# (and, for all but the first entry, its 'codes' key).
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open('./../../data//columns.json', 'r', encoding='utf-8') as f:
    columns = json.load(f)

In [3]:
data_location = './../../data/interview-data.xlsx'


def _load_coded_extractions(sheet_name):
    """Load one rater's codes for the overlapping extractions.

    Reads the columns listed in `columns` from the given sheet of the workbook
    at `data_location`, indexes the rows by 'ID', keeps only the rows whose ID
    is listed in `overlap`, and replaces missing cells with the string 'na'.

    Args:
        sheet_name: Name of the worksheet to read.

    Returns:
        A DataFrame indexed by 'ID' with one row per entry of `overlap`.

    Raises:
        ValueError: If an ID from `overlap` is not present in the sheet.
    """
    frame = (
        pd.read_excel(data_location,
                      sheet_name=sheet_name,
                      usecols=[column['name'] for column in columns],
                      # read the impact columns as text rather than letting
                      # pandas infer a dtype
                      dtype={'Impact 1': "string", 'Impact 2': "string"})
        .set_index('ID')
        .filter(items=overlap, axis=0)
        .fillna('na')
    )

    # `filter` silently drops IDs that are absent from the sheet. The cells
    # below compare both raters' columns position by position, so a missing
    # row would misalign the comparison; fail fast instead.
    missing = [paperid for paperid in overlap if paperid not in frame.index]
    if missing:
        raise ValueError(
            f"Sheet '{sheet_name}' is missing the overlap IDs {missing}")

    return frame


# codes of the first rater and of the second rater for the same extractions
df_orig = _load_coded_extractions('Data')
df_overlap = _load_coded_extractions('Overlap')

  warn(msg)
  warn(msg)


### Percentage Agreement

Percentage agreement is the simplest type of inter-rater reliability. It suffers from the fact that it does not account for agreement caused by chance.

Holsti, O. R. (1969). Content analysis for the social sciences and humanities. Reading, MA: Addison-Wesley (content analysis).

In [4]:
# One entry per (extraction, column) cell: whether both raters assigned the
# same code to that cell.
agreements = [
    df_orig.loc[paperid, column] == df_overlap.loc[paperid, column]
    for paperid in overlap
    for column in df_orig.columns
]

individual_labels = len(agreements)
matching_labels = sum(1 for agreed in agreements if agreed)

print(f'Percent agreement of {matching_labels/individual_labels:.2%} ({matching_labels}/{individual_labels})')

Percent agreement of 83.76% (98/117)


### Cohen's Kappa

Cohen's Kappa accounts for agreement caused by chance, but it estimates the expected chance agreement from the raters' observed marginal distributions, which is not appropriate for this labeling task. The measure is calculated for the sake of completeness and comparison.

Cohen, J. (1960). A coefficient of agreement for nominal scales. Educational and psychological measurement, 20(1), 37-46.

In [5]:
# Admissible codes per coded column. The first entry of `columns` is skipped,
# and 'na' is appended because missing cells were filled with 'na' on load.
codes = {spec['name']: spec['codes'] + ['na'] for spec in columns[1:]}
kappas = {}

for column in df_orig.columns:
    labels = codes[column]
    y1, y2 = df_orig[column].to_list(), df_overlap[column].to_list()

    kappa = cohen_kappa_score(y1, y2, labels=labels)
    # a NaN result is counted as a kappa of 1
    kappa = 1 if math.isnan(kappa) else kappa

    kappas[column] = kappa


# unweighted mean over the per-column kappas
kappa_values = list(kappas.values())
avg_kappa = sum(kappa_values) / len(kappa_values)

print(f"Average Cohen's Kappa of {avg_kappa:.2%}")

Average Cohen's Kappa of 71.78%


### Bennett's S-Score

Bennett's S-score is a recommended alternative to Cohen's Kappa for this task: it also accounts for agreement caused by chance, but it assumes that every code of a column is equally likely instead of deriving the expected agreement from the observed data.

Bennett, E. M., Alpert, R., & Goldstein, A. C. (1954). Communications through limited-response questioning. Public Opinion Quarterly, 18(3), 303-308.

In [6]:
def equal_proportion(y1, y2):
    """Return the proportion of positions at which two ratings agree.

    Args:
        y1: Sequence of codes assigned by the first rater.
        y2: Sequence of codes assigned by the second rater, position-aligned
            with `y1`.

    Returns:
        The number of positions i with y1[i] == y2[i], divided by len(y1).

    Raises:
        ValueError: If the sequences differ in length or are empty.
    """
    # `zip` would silently truncate to the shorter sequence while the
    # denominator stays len(y1), yielding a wrong proportion.
    if len(y1) != len(y2):
        raise ValueError(
            f'Ratings must have the same length, got {len(y1)} and {len(y2)}')
    if len(y1) == 0:
        raise ValueError('Cannot compute agreement of empty ratings')

    D = sum(1 for e1, e2 in zip(y1, y2) if e1 == e2)
    return D/len(y1)

# Bennett's S-score per coded column, filled by the loop below
sscores = {}

def bennetts_s_score(y1, y2, labels):
    """Compute Bennett's S-score for two position-aligned ratings.

    The score rescales the observed agreement p by the agreement 1/k expected
    when all k labels are equally likely: S = k/(k-1) * (p - 1/k).

    Args:
        y1: Sequence of codes assigned by the first rater.
        y2: Sequence of codes assigned by the second rater.
        labels: The codes available to the raters; only their count is used.

    Returns:
        The S-score as a float; 1 for perfect agreement, 0 when the observed
        agreement equals 1/k.

    Raises:
        ValueError: If fewer than two labels are given, since the score is
            undefined in that case (division by k-1 or k).
    """
    k = len(labels)
    if k < 2:
        raise ValueError(
            f"Bennett's S-score requires at least two labels, got {k}")

    p = equal_proportion(y1, y2)
    s = (k/(k-1)) * (p-(1/k))
    return s

# Score every coded column separately from both raters' codes for the
# overlapping extractions.
for column in df_orig.columns:
    labels = codes[column]
    y1, y2 = df_orig[column].to_list(), df_overlap[column].to_list()

    sscores[column] = bennetts_s_score(y1, y2, labels)

# unweighted mean over the per-column scores
s_score_values = list(sscores.values())
avg_sscore = sum(s_score_values) / len(s_score_values)
print(f"Average Bennet's S-Score of {avg_sscore:.2%}")

Average Bennet's S-Score of 82.30%
