In [1]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score

In [2]:
# Both sets of labels are stored in the same workbook, on different sheets.
extraction_workbook = '../data/r3a-data-extraction.xlsx'

# First rating: the 'Val' column of the 'Data' sheet.
rating1 = pd.read_excel(
    extraction_workbook,
    sheet_name='Data',
    usecols=['ID', 'Dependent Variable', 'Val'],
)

# Second rating: the 'No valuation' column of the 'Valuation Overlap' sheet.
rating2 = pd.read_excel(
    extraction_workbook,
    sheet_name='Valuation Overlap',
    usecols=['ID', 'Dependent Variable', 'No valuation'],
)

In [3]:
# Kept so that any cell still reading these arrays continues to work.
relevant_ids = rating2['ID'].values
relevant_dependent_variables = rating2['Dependent Variable'].values

# Keep only the rows of the first rating whose (ID, Dependent Variable) *pair*
# also occurs in the second rating. Testing the two columns independently would
# also keep rows whose ID matches one overlap row while the dependent variable
# matches a different one.
key_columns = ['ID', 'Dependent Variable']
overlap_keys = pd.MultiIndex.from_frame(rating2[key_columns])
rating1_keys = pd.MultiIndex.from_frame(rating1[key_columns])
rating1_relevant = rating1[rating1_keys.isin(overlap_keys)]

In [6]:
# Preview the de-duplicated first rating, indexed by 'ID' (nothing is assigned here).
(
    rating1_relevant
    .drop_duplicates()
    .set_index('ID')
)

Unnamed: 0_level_0,Dependent Variable,Val
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
E54,Comprehension Level. It measures the comprehen...,False
E148,File Version History [...] Each team submitted...,True
E218,"The comprehension level, which was measured as...",False
E384,the understandability and manageability of the...,False
E569,"Completeness: Is the test scenario complete, e...",False
E591,Threat-phase association: A percentage that es...,True
E626,Required number of decisions: for the first fo...,True
E677,The average time to conclude a prioritization ...,False
E730,both eye movement and verbal data were recorde...,True
E952,To get a single measure that represents a trad...,False


In [7]:
# Index both rating tables by 'ID' so they can be aligned on it later.
rating1_relevant = (
    rating1_relevant
    .drop_duplicates()   # remove exact duplicate rows before indexing
    .set_index('ID')
)
rating2 = rating2.set_index(keys='ID')

In [8]:
# Give each rater's label column a short name (R1 / R2) ...
rating1_relevant = rating1_relevant.rename(columns={'Val': 'R1'})
rating2 = rating2.rename(columns={'No valuation': 'R2'})

# ... and place the two tables side by side, aligned on their shared 'ID' index.
rating = pd.concat(
    [rating1_relevant, rating2],
    axis=1,
)

In [12]:
# Display the combined table: one row per ID with both raters' labels (R1, R2).
rating

Unnamed: 0_level_0,Dependent Variable,R1,Dependent Variable,R2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
E54,Comprehension Level. It measures the comprehen...,False,Comprehension Level. It measures the comprehen...,False
E148,File Version History [...] Each team submitted...,True,File Version History [...] Each team submitted...,True
E218,"The comprehension level, which was measured as...",False,"The comprehension level, which was measured as...",False
E384,the understandability and manageability of the...,False,the understandability and manageability of the...,True
E569,"Completeness: Is the test scenario complete, e...",False,"Completeness: Is the test scenario complete, e...",False
E591,Threat-phase association: A percentage that es...,True,Threat-phase association: A percentage that es...,False
E626,Required number of decisions: for the first fo...,True,Required number of decisions: for the first fo...,True
E677,The average time to conclude a prioritization ...,False,The average time to conclude a prioritization ...,False
E730,both eye movement and verbal data were recorde...,True,both eye movement and verbal data were recorde...,True
E952,To get a single measure that represents a trad...,False,To get a single measure that represents a trad...,False


### Percentage Agreement

Percentage agreement is the simplest measure of inter-rater reliability: the share of items on which both raters assigned the same label. Its main weakness is that it does not account for agreement that occurs by chance.

Holsti, O. R. (1969). Content analysis for the social sciences and humanities. Reading, MA: Addison-Wesley (content analysis).

In [10]:
# Element-wise comparison of the two raters' labels (True where they agree).
agreement = rating['R1'] == rating['R2']
agreement_statistics = agreement.value_counts().to_dict()

# value_counts() omits the True key entirely when the raters never agree,
# so fall back to 0 instead of raising a KeyError.
percentage_agreement = agreement_statistics.get(True, 0) / len(rating)
print(f'The two raters achieved a percentage agreement of {percentage_agreement:.2%}.')

The two raters achieved a percentage agreement of 66.67%.


### Cohen's Kappa

Cohen's Kappa accounts for agreement caused by chance, but it estimates the expected chance agreement from the raters' observed marginal label distributions, an assumption that is not appropriate for this labeling task. The measure is reported for the sake of completeness and comparison.

Cohen, J. (1960). A coefficient of agreement for nominal scales. Educational and psychological measurement, 20(1), 37-46.

In [11]:
# Chance-corrected agreement between the two raters' label columns.
cohens_kappa = cohen_kappa_score(
    y1=rating['R1'],
    y2=rating['R2'],
    labels=[True, False],
)
print(f"The two raters achieved a Cohen's Kappa agreement of {cohens_kappa:.2%}.")

The two raters achieved a Cohen's Kappa agreement of 33.33%.


### Bennett's S-Score

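Bennett's S-score is a recommended alternative to Cohen's Kappa in this task: it also corrects for agreement caused by chance, but instead of estimating the chance level from the observed marginal distributions it assumes that each of the $k$ categories is equally likely, i.e. $S = \frac{k}{k-1}\left(P_o - \frac{1}{k}\right)$, where $P_o$ is the observed proportion of agreement.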

Bennett, E. M., Alpert, R., & Goldstein, A. C. (1954). Communications through limited-response questioning. Public Opinion Quarterly, 18(3), 303-308.

In [13]:
def equal_proportion(y1, y2):
    """Return the fraction of positions at which ``y1`` and ``y2`` are equal.

    Parameters
    ----------
    y1, y2 : sized iterables
        The two label sequences to compare position by position.

    Returns
    -------
    float
        Number of positions with equal values divided by the number of items.

    Raises
    ------
    ValueError
        If the sequences differ in length (``zip`` would otherwise silently
        drop the surplus items and distort the proportion) or are empty.
    """
    n_items = len(y1)
    if n_items != len(y2):
        raise ValueError(
            f'y1 and y2 must have the same length, got {n_items} and {len(y2)}.'
        )
    if n_items == 0:
        raise ValueError('Cannot compute an agreement proportion for empty input.')

    matches = sum(1 for e1, e2 in zip(y1, y2) if e1 == e2)
    return matches / n_items

def bennetts_s_score(y1, y2, labels):
    """Compute Bennett's S-score for two label sequences.

    The score is ``(k / (k - 1)) * (p - 1 / k)``, where ``k`` is the number of
    entries in ``labels`` and ``p`` is the proportion of positions at which
    ``y1`` and ``y2`` agree (as returned by ``equal_proportion``).

    Parameters
    ----------
    y1, y2 : sized iterables
        The two raters' labels, compared position by position.
    labels : sized collection
        The available categories; only its length is used.

    Returns
    -------
    float
        The S-score.

    Raises
    ------
    ValueError
        If fewer than two labels are given, since the formula divides by
        ``k - 1`` and by ``k``.
    """
    k = len(labels)
    # Validate before computing anything: k == 0 or k == 1 would divide by zero.
    if k < 2:
        raise ValueError(f"Bennett's S-score needs at least two labels, got {k}.")

    observed_agreement = equal_proportion(y1, y2)
    return (k / (k - 1)) * (observed_agreement - (1 / k))

# Two possible labels (True / False), so chance agreement is taken to be 1/2.
s_score = bennetts_s_score(rating['R1'], rating['R2'], labels=[True, False])
# The message now spells the author's name as in the cited reference (Bennett).
print(f"The two raters achieved a Bennett's S-Score of {s_score:.2%}.")

The two raters achieved a Bennet's S-Score of 33.33%.
