In [16]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

In [2]:
# Read the 'ID' and 'V' columns of two sheets from the same workbook and index
# each resulting table by 'ID'.
# NOTE(review): 'V' presumably holds each rater's verdict — confirm against the workbook.
rating1 = (
    pd.read_excel(
        '../data/r3a-slr-experimentation.xlsx',
        sheet_name='Candidates',
        usecols=['ID', 'V'],
    )
    .set_index('ID')
)
rating2 = (
    pd.read_excel(
        '../data/r3a-slr-experimentation.xlsx',
        sheet_name='Overlap 2',
        usecols=['ID', 'V'],
    )
    .set_index('ID')
)

In [3]:
# Keep only those rows of the first table whose ID also occurs in the second table.
relevant_ids = rating2.index
rating1_relevant = rating1.loc[rating1.index.isin(relevant_ids)]

In [4]:
# Give each table's 'V' column a distinct name ('R1' / 'R2') and place the two
# columns side by side; concat(axis=1) aligns the rows on the shared 'ID' index.
# rename() is deliberately used without inplace=True: rating1_relevant is a
# filtered slice of rating1, and renaming it in place triggers pandas'
# SettingWithCopyWarning. Rebinding the names keeps the same variables and columns.
rating1_relevant = rating1_relevant.rename(columns={'V': 'R1'})
rating2 = rating2.rename(columns={'V': 'R2'})
rating = pd.concat([rating1_relevant, rating2], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating1_relevant.rename(columns={'V': 'R1'}, inplace=True)


### Percentage Agreement

Percentage agreement is the simplest measure of inter-rater reliability. Its main drawback is that it does not correct for agreement that occurs by chance.

Holsti, O. R. (1969). Content analysis for the social sciences and humanities. Reading, MA: Addison-Wesley.

In [13]:
# Percentage agreement: share of rows on which both raters gave the same value.
# The two columns are compared element-wise in one vectorised operation rather
# than with a row-by-row apply.
agreement = rating['R1'] == rating['R2']
agreement_statistics = agreement.value_counts().to_dict()
# .get() keeps this working when the raters never agree, in which case the
# counts contain no True key at all.
percentage_agreement = agreement_statistics.get(True, 0) / len(rating)
print(f'The two raters achieved a percentage agreement of {percentage_agreement:.2%}.')

The two raters achieved a percentage agreement of 94.67%.


### Cohen's Kappa

Cohen's Kappa corrects for agreement caused by chance, but it estimates the expected chance agreement from the raters' observed marginal distributions, an assumption that is not appropriate for this labeling task. The measure is reported for completeness and for comparison.

Cohen, J. (1960). A coefficient of agreement for nominal scales. Educational and psychological measurement, 20(1), 37-46.

In [18]:
# Cohen's Kappa between the two rating columns, with the label set given explicitly.
cohens_kappa = cohen_kappa_score(
    y1=rating['R1'],
    y2=rating['R2'],
    labels=[True, False],
)
print(f"The two raters achieved a Cohen's Kappa agreement of {cohens_kappa:.2%}.")

The two raters achieved a Cohen's Kappa agreement of 68.49%.


### Bennett's S-Score

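Bennett's S-score is a recommended alternative to Cohen's Kappa in this task: it also corrects for agreement caused by chance, but it assumes that all $k$ categories are equally likely instead of estimating the marginal distributions from the data. With observed agreement $p$, it is defined as $S = \frac{k}{k-1}\left(p - \frac{1}{k}\right)$.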

Bennett, E. M., Alpert, R., & Goldstein, A. C. (1954). Communications through limited-response questioning. Public Opinion Quarterly, 18(3), 303-308.

In [20]:
def equal_proportion(y1, y2):
    """Return the fraction of positions at which ``y1`` and ``y2`` hold equal values.

    Args:
        y1: Iterable with the first sequence of labels.
        y2: Iterable with the second sequence of labels; must contain as many
            items as ``y1``.

    Returns:
        The number of positions with equal items divided by the number of items.

    Raises:
        ValueError: If the inputs differ in length or contain no items.
    """
    first, second = list(y1), list(y2)
    n_items = len(first)
    # zip() would silently stop at the shorter input while the divisor stayed at
    # len(y1), yielding a wrong proportion — reject mismatched inputs instead.
    if n_items != len(second):
        raise ValueError(
            f'y1 and y2 must have the same length, got {n_items} and {len(second)}.'
        )
    if n_items == 0:
        raise ValueError('Cannot compute a proportion of agreement for empty inputs.')

    matches = sum(1 for e1, e2 in zip(first, second) if e1 == e2)
    return matches / n_items

def bennetts_s_score(y1, y2, labels):
    """Compute Bennett's S-score for two sequences of labels.

    The score is ``S = k / (k - 1) * (p - 1 / k)``, where ``k`` is the number of
    entries in ``labels`` and ``p`` is the proportion of positions on which
    ``y1`` and ``y2`` agree (as returned by ``equal_proportion``).

    Args:
        y1: First sequence of labels.
        y2: Second sequence of labels.
        labels: Collection of the possible labels; only its length is used.

    Returns:
        The S-score.

    Raises:
        ValueError: If ``labels`` contains fewer than two entries, for which
            the score is undefined (``k - 1`` or ``k`` would be zero).
    """
    k = len(labels)
    if k < 2:
        raise ValueError(f"Bennett's S requires at least two labels, got {k}.")
    p = equal_proportion(y1, y2)
    return (k / (k - 1)) * (p - (1 / k))

# Bennett's S for the two rating columns, using the two labels True and False.
s_score = bennetts_s_score(
    rating['R1'],
    rating['R2'],
    labels=[True, False],
)
print(f"The two raters achieved a Bennet's S-Score of {s_score:.2%}.")

The two raters achieved a Bennet's S-Score of 89.33%.
