Imports

In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.inter_rater as IR

Data Processing

In [2]:
# Own data: semicolon-separated sheet. Later cells read its 'file_path',
# 'flow_prediction', 'flow_annotated', 'control_flow' and 'control_no_flow' columns.
df = pd.read_csv('Selction_Frame.csv', sep=';')

In [3]:
# Survey data: raw responses export. It is transposed in the next cell so that
# every column of the transposed frame is handled as one participant.
survey_df = pd.read_csv('responses.csv')

In [4]:
# Transpose and extract survey answers.
# After the transpose the original columns of responses.csv are rows and the
# original rows are columns. Positional rows 1..150 are kept and row 0 is skipped.
# NOTE(review): presumably row 0 is a non-answer field (e.g. a timestamp) and the
# 150 kept rows line up one-to-one with the rows of `df` - confirm against the CSVs.
FIRST_ANSWER_ROW = 1
LAST_ANSWER_ROW = 151  # exclusive upper bound of the positional slice
survey_answers = (
    survey_df
    .transpose()
    .iloc[FIRST_ANSWER_ROW:LAST_ANSWER_ROW]
    .reset_index(drop=True)
    .add_suffix('_survey')
)
# Every remaining column (now suffixed with '_survey') is treated as one participant.
participants = survey_answers.columns

# Extract relevant data from own answers
team_answers = df[['flow_prediction', 'flow_annotated', 'control_flow', 'control_no_flow']]

# Combine datasets side by side; rows are matched on their index labels.
answers = pd.concat([team_answers, survey_answers], axis=1)

# Binarise the model score: 1 when the prediction is at least the threshold, else 0.
# Vectorised comparison instead of a per-row Python lambda; the result stays int64.
MODEL_LABEL_THRESHOLD = 0.5
answers['model_label'] = (answers['flow_prediction'] >= MODEL_LABEL_THRESHOLD).astype('int64')

In [5]:
# Split the rows by the two control flags.
# Control subset: at least one of the flags is True.
flagged_flow = answers['control_flow'] == True
flagged_no_flow = answers['control_no_flow'] == True
answers_control = answers.loc[flagged_flow | flagged_no_flow]

# Non-control subset: both flags are explicitly False.
unflagged_flow = answers['control_flow'] == False
unflagged_no_flow = answers['control_no_flow'] == False
answers_no_control = answers.loc[unflagged_flow & unflagged_no_flow]

# Agreement Calculation

## Labelers

In [6]:
# Share of rows on which each participant's answer equals the team's
# 'flow_annotated' label, computed separately for the control rows, the
# non-control rows and all rows.
agreement_control = {
    name: (answers_control['flow_annotated'] == answers_control[name]).mean()
    for name in participants
}
agreement_no_control = {
    name: (answers_no_control['flow_annotated'] == answers_no_control[name]).mean()
    for name in participants
}
agreement_complete = {
    name: (answers['flow_annotated'] == answers[name]).mean()
    for name in participants
}

In [7]:
# One row per participant, one column per row subset (agreement with 'flow_annotated').
agreement_df_label = pd.concat(
    [
        pd.DataFrame.from_dict(scores, orient='index', columns=[subset])
        for subset, scores in (
            ('complete', agreement_complete),
            ('no_control', agreement_no_control),
            ('control', agreement_control),
        )
    ],
    axis=1,
)

# Per-participant values first, then their distribution across participants.
for heading, table in (
    ('Averages:', agreement_df_label),
    ('\nSummary Statistics:', agreement_df_label.describe()),
):
    print(heading)
    print(table)

Averages:
          complete  no_control  control
0_survey  0.720000        0.74     0.68
1_survey  0.666667        0.64     0.72
2_survey  0.606667        0.59     0.64
3_survey  0.706667        0.72     0.68
4_survey  0.733333        0.73     0.74
5_survey  0.713333        0.73     0.68
6_survey  0.740000        0.72     0.78

Summary Statistics:
       complete  no_control   control
count  7.000000    7.000000  7.000000
mean   0.698095    0.695714  0.702857
std    0.046780    0.057404  0.046803
min    0.606667    0.590000  0.640000
25%    0.686667    0.680000  0.680000
50%    0.713333    0.720000  0.680000
75%    0.726667    0.730000  0.730000
max    0.740000    0.740000  0.780000


## Model

In [8]:
# Share of rows on which each participant's answer equals the thresholded
# model output ('model_label'), for the control rows, the non-control rows
# and all rows. These rebind the dictionaries used for the labeler comparison.
agreement_control = {
    name: (answers_control['model_label'] == answers_control[name]).mean()
    for name in participants
}
agreement_no_control = {
    name: (answers_no_control['model_label'] == answers_no_control[name]).mean()
    for name in participants
}
agreement_complete = {
    name: (answers['model_label'] == answers[name]).mean()
    for name in participants
}

In [9]:
# One row per participant, one column per row subset (agreement with 'model_label').
agreement_df_model = pd.concat(
    [
        pd.DataFrame.from_dict(scores, orient='index', columns=[subset])
        for subset, scores in (
            ('complete', agreement_complete),
            ('no_control', agreement_no_control),
            ('control', agreement_control),
        )
    ],
    axis=1,
)

# Per-participant values first, then their distribution across participants.
for heading, table in (
    ('Averages:', agreement_df_model),
    ('\nSummary Statistics:', agreement_df_model.describe()),
):
    print(heading)
    print(table)


Averages:
          complete  no_control  control
0_survey  0.660000        0.67     0.64
1_survey  0.620000        0.55     0.76
2_survey  0.613333        0.60     0.64
3_survey  0.700000        0.69     0.72
4_survey  0.673333        0.64     0.74
5_survey  0.653333        0.62     0.72
6_survey  0.626667        0.59     0.70

Summary Statistics:
       complete  no_control   control
count  7.000000    7.000000  7.000000
mean   0.649524    0.622857  0.702857
std    0.031472    0.048206  0.046803
min    0.613333    0.550000  0.640000
25%    0.623333    0.595000  0.670000
50%    0.653333    0.620000  0.720000
75%    0.666667    0.655000  0.730000
max    0.700000    0.690000  0.760000


# Control Group

In [10]:
# Row labels of every 'file_path' that occurs more than once, collected as one
# tuple of row labels per distinct path (groups come out in sorted path order).
file_paths = df['file_path']
repeated_paths = file_paths[file_paths.duplicated(keep=False)]
dupli = [
    tuple(same_file.index)
    for _, same_file in repeated_paths.groupby(list(repeated_paths))
]

In [11]:
# Self-consistency per participant: for every group of rows that share a
# 'file_path', check whether the participant gave the same answer to all copies,
# then average those checks over the groups.
control_measures = dict()
for participant in participants:
    sample = list()
    for couple in dupli:
        first_answer = answers.loc[couple[0], participant]
        # Compare every further copy with the first one. For a plain pair this is
        # exactly answer[0] == answer[1]; files duplicated three or more times are
        # now checked in full instead of silently ignoring the extra copies.
        sample.append(
            all(answers.loc[row, participant] == first_answer for row in couple[1:])
        )

    control_measures[participant] = np.mean(sample)

agreement_df_control = pd.DataFrame.from_dict(control_measures, orient='index')
# The per-participant table is scaled to percent for display; the summary
# statistics below it are printed on the unscaled 0..1 values.
print('Averages:')
print(agreement_df_control*100)
print('\nSummary Statistics:')
print(agreement_df_control.describe())

Averages:
             0
0_survey  68.0
1_survey  64.0
2_survey  92.0
3_survey  84.0
4_survey  76.0
5_survey  84.0
6_survey  90.0

Summary Statistics:
              0
count  7.000000
mean   0.797143
std    0.107349
min    0.640000
25%    0.720000
50%    0.840000
75%    0.870000
max    0.920000


# Kappas

## Full

In [12]:
# Rater matrix for all rows: after dropping the model/control columns what is
# left is 'flow_annotated' plus the '*_survey' columns, so the team annotation
# is counted as one more rater.
kappa_data = answers.drop(
    columns=['flow_prediction', 'control_flow', 'control_no_flow', 'model_label']
)

# Per-row category counts: the row sum is used as the number of 'flow' votes
# (assumes answers are coded 0/1 - TODO confirm), the remainder as 'no_flow'.
n_raters = len(kappa_data.columns)
flow_votes = kappa_data.sum(axis=1).astype(int)
ratings = pd.DataFrame({
    'flow': flow_votes,
    'no_flow': (n_raters - flow_votes).astype(int),
})

count_table = ratings[['flow', 'no_flow']]
for label, method in (("Fleiss:", 'fleiss'), ("Randolph’s:", 'unif')):
    print(label, round(IR.fleiss_kappa(count_table, method=method) * 100, 2))

Fleiss: 30.4
Randolph’s: 31.67


## No Control

In [13]:
# Rater matrix restricted to the non-control rows ('flow_annotated' plus the
# '*_survey' columns). This rebinds `kappa_data`: the later cells that read
# `kappa_data` see this subset when the notebook is run top to bottom.
kappa_data = answers_no_control.drop(
    columns=['flow_prediction', 'control_flow', 'control_no_flow', 'model_label']
)

# Per-row category counts: the row sum is used as the number of 'flow' votes
# (assumes answers are coded 0/1 - TODO confirm), the remainder as 'no_flow'.
n_raters = len(kappa_data.columns)
flow_votes = kappa_data.sum(axis=1).astype(int)
ratings = pd.DataFrame({
    'flow': flow_votes,
    'no_flow': (n_raters - flow_votes).astype(int),
})

count_table = ratings[['flow', 'no_flow']]
for label, method in (("Fleiss:", 'fleiss'), ("Randolph’s:", 'unif')):
    print(label, round(IR.fleiss_kappa(count_table, method=method) * 100, 2))

Fleiss: 29.55
Randolph’s: 30.79


# Cronbach

https://learnaitech.com/how-to-compute-inter-rater-reliablity-metrics-cohens-kappa-fleisss-kappa-cronbach-alpha-kripndorff-alpha-scotts-pi-inter-class-correlation-in-python/

In [14]:
import pingouin as pg

In [15]:
# Transpose so that every rater is a row and every rated sample is a column,
# coerce each column to a numeric dtype, then hand the table to pingouin.
# NOTE(review): `kappa_data` is whatever the last "Kappas" cell assigned - confirm
# that the intended row subset is being measured here.
numeric_ratings = kappa_data.T.apply(pd.to_numeric)
pg.cronbach_alpha(data=numeric_ratings)

(0.9244152989228714, array([0.826, 0.982]))

In [16]:
def CronbachAlpha(itemscores):
    """Compute Cronbach's alpha for a 2-D table of scores.

    The first axis is treated as the items and the second axis as the
    observations::

        alpha = k / (k - 1) * (1 - sum(item variances) / variance(totals))

    where ``k`` is the number of rows, the item variances are taken along each
    row, and ``totals`` are the column sums. Sample variances (``ddof=1``) are
    used throughout.

    Parameters
    ----------
    itemscores : array-like, shape (n_items, n_observations)
        Scores to analyse. The values are converted to ``float`` first, so
        object-dtype tables (e.g. a DataFrame built from a transposed frame)
        and numeric strings are accepted.

    Returns
    -------
    float
        Cronbach's alpha for the table.
    """
    # Force a numeric array: without this an object/string array reaches the
    # variance computation as-is and either fails or depends on element types.
    itemscores = np.asarray(itemscores, dtype=float)
    itemvars = itemscores.var(axis=1, ddof=1)   # one variance per row (item)
    tscores = itemscores.sum(axis=0)            # one total per column (observation)
    nitems = len(itemscores)

    return nitems / (nitems-1.) * (1 - itemvars.sum() / tscores.var(ddof=1))

In [17]:
# Cross-check with the hand-written formula: the rows of `kappa_data` are passed
# as the items and its columns (the raters) as the observations.
CronbachAlpha(kappa_data)

0.9244152989228714

# Krippendorff

In [18]:
from nltk import agreement

In [19]:
# Flatten the rater matrix into [rater number, row number, answer] triples.
# Both numbers are 1-based positions (not the original labels), and the triples
# are emitted rater by rater, each rater running through all rows in order.
formatted_codes = [
    [rater_no, row_no, kappa_data.loc[row_label, rater_label]]
    for rater_no, rater_label in enumerate(kappa_data.columns, start=1)
    for row_no, row_label in enumerate(kappa_data.index, start=1)
]

In [20]:
# Build the nltk annotation task from the [rater, row, answer] triples and
# report three chance-corrected agreement coefficients.
ratingtask = agreement.AnnotationTask(data=formatted_codes)

krippendorff_alpha = ratingtask.alpha()
scotts_pi = ratingtask.pi()
# NOTE(review): nltk describes multi_kappa as the Davies & Fleiss (1982)
# coefficient, which is not the same statistic as the Fleiss kappa printed in
# the "Kappas" section - confirm that the label below is the intended one.
multi_kappa = round(ratingtask.multi_kappa(), 4)

print("Krippendorff's alpha:", krippendorff_alpha)
print("Scott's pi:", scotts_pi)
print("Fleiss's Kappa:", multi_kappa)

Krippendorff's alpha: 0.2963692032531854
Scott's pi: 0.2954885639581326
Fleiss's Kappa: 0.3027
