In [1]:
from sklearn.metrics import cohen_kappa_score
import pandas as pd
from itertools import combinations
import numpy as np

In [2]:
# Load every annotation CSV; cells that pandas parses as NaN are replaced by the
# placeholder string 'Missing'. Each dict maps the file stem to its DataFrame.
reddit = {
    stem: pd.read_csv('reddit_stance/' + stem + '.csv').fillna('Missing')
    for stem in ['reddit_stance_0']
}
twitter = {
    stem: pd.read_csv('twitter_stance/' + stem + '.csv').fillna('Missing')
    for stem in ['twitter_stance_' + str(part) for part in range(5)]
}

# Some annotation cells hold two labels joined by a comma:
#   twitter_stance_3 has "unclear,unclear"
#   twitter_stance_4 has "anti-mitigation,anti-mitigation"
#   reddit_stance_0 has "pro-mitigation,pro-mitigation"
# On each of those rows one annotator is always missing a label, so this is probably a
# parsing error that merged two annotators' labels into one cell.
# Fix: keep the first label where it is and move the second label to the right,
# shifting the following labels along until a 'Missing' slot absorbs the overflow.

def split_merged_labels(frame):
    """Spread comma-joined labels over the annotator columns of ``frame``, in place.

    Column 0 is skipped (it is the column the rest of this notebook reads as
    'text'); every other column is treated as one annotator.  For each cell that
    contains a comma, the first label stays in that cell and the remaining labels
    are pushed to the right: a non-'Missing' cell is displaced one step further
    right, a 'Missing' cell absorbs a label without displacing anything.

    Unlike a single ``.values.item()`` lookup, this handles any number of affected
    rows per column, works purely by position (so it does not depend on the frame
    having a default RangeIndex), and only ever stores scalar labels.

    Parameters
    ----------
    frame : pandas.DataFrame
        Annotation table; modified in place.

    Returns
    -------
    None
    """
    import warnings  # local import: only needed for the overflow warning below

    n_cols = frame.shape[1]
    for col_pos in range(1, n_cols):
        # Literal comma match; astype(str) keeps .str usable on non-string columns.
        merged = frame.iloc[:, col_pos].astype(str).str.contains(',', regex=False)
        for row_pos in np.flatnonzero(merged.to_numpy()):
            row_pos = int(row_pos)
            pieces = str(frame.iat[row_pos, col_pos]).split(',')
            pending = [piece.strip() for piece in pieces if piece.strip()]
            if not pending:
                continue  # cell was only commas/whitespace: nothing to distribute

            # the current annotator keeps the first label
            frame.iat[row_pos, col_pos] = pending.pop(0)
            for z in range(col_pos + 1, n_cols):
                if not pending:
                    break  # everything placed; cells further right stay untouched
                displaced = frame.iat[row_pos, z]
                if displaced != 'Missing':
                    # occupied slot: its label moves one step further to the right
                    pending.append(displaced)
                frame.iat[row_pos, z] = pending.pop(0)

            if pending:
                # No 'Missing' slot was available to the right, so these labels fall
                # off the end of the row.  Report it instead of losing them silently.
                warnings.warn(
                    'row %d: no free annotator slot for label(s) %r; dropped'
                    % (row_pos, pending)
                )


for frames in (twitter, reddit):
    for frame in frames.values():
        split_merged_labels(frame)

# Resulting changes:
# line 149 in twitter_stance_3 from "unclear,unclear" "unclear" "pro-mitigation" ""
#   to "unclear" "unclear" "unclear" "pro-mitigation"
# line 301 in twitter_stance_4 from "anti-mitigation,anti-mitigation" "unclear" "" "pro-mitigation"
#   to "anti-mitigation" "anti-mitigation" "unclear" "pro-mitigation"
# line 101 in reddit_stance_0 from "pro-mitigation,pro-mitigation" "pro-mitigation" "pro-mitigation" ""
#   to "pro-mitigation" "pro-mitigation" "pro-mitigation" "pro-mitigation"

# Twitter Stance

In [3]:
# Pairwise Cohen's kappa between the annotators of every Twitter file, plus each
# annotator's average kappa over the pairs they take part in.
# Result: twitter_annotators[file_key][annotator_column] -> average kappa.
twitter_annotators = {}

for k, v in twitter.items():
    current_twitter = v.iloc[:, 1:]  # every column after the first is one annotator

    # get labels for each annotator
    # NOTE(review): unlabeled cells hold the placeholder 'Missing' (set when the CSVs are
    # loaded), so they presumably count as one more category in the kappa -- confirm intended.
    annotators = {}
    for anno in current_twitter:
        annotators[anno] = current_twitter[anno].values

    # get each pair of annotators -- 1,3 and 3,1 are same
    # calculate agreement score
    scores = {}
    for pair in combinations(annotators, 2):
        scores[pair] = cohen_kappa_score(annotators[pair[0]], annotators[pair[1]])

    # calculate average kappa score per annotator
    annotators_avg = {}
    for anno in annotators:
        # Scores of the pairs this annotator belongs to.  Divide by how many there
        # actually are (n_annotators - 1) rather than a fixed 3, so files with a
        # different number of annotators are averaged correctly.
        own_scores = [score for pair, score in scores.items() if anno in pair]
        # A lone annotator has no pairs; keep the previous result of 0 in that case.
        annotators_avg[anno] = sum(own_scores) / len(own_scores) if own_scores else 0.0
    twitter_annotators[k] = annotators_avg

    print(k)
    print("Cohen's kappa agreement score:")
    print('\n'.join([str(k_) + ": " + str(v_) for k_, v_ in scores.items()]))
    print('\n')

    print('Average Kappa Score:')
    print('\n'.join([str(k_) + ": " + str(v_) for k_, v_ in annotators_avg.items()]))
    print('\n')

twitter_stance_0
Cohen's kappa agreement score:
('annotation_32', 'annotation_33'): 0.2534050781906845
('annotation_32', 'annotation_35'): 0.40854106796765366
('annotation_32', 'annotation_17'): 0.23917483660130712
('annotation_33', 'annotation_35'): 0.32735426008968616
('annotation_33', 'annotation_17'): 0.3911261152640463
('annotation_35', 'annotation_17'): 0.4136460554371002


Average Kappa Score:
annotation_32: 0.30037366091988177
annotation_33: 0.323961817848139
annotation_35: 0.3831804611648133
annotation_17: 0.3479823357674845


twitter_stance_1
Cohen's kappa agreement score:
('annotation_26', 'annotation_37'): 0.14795244385733164
('annotation_26', 'annotation_38'): 0.22745024472602615
('annotation_26', 'annotation_39'): 0.16167664670658677
('annotation_37', 'annotation_38'): 0.3147601660581204
('annotation_37', 'annotation_39'): 0.44672131147540983
('annotation_38', 'annotation_39'): 0.2834474695172874


Average Kappa Score:
annotation_26: 0.1790264450966482
annotation_37: 0.30

In [13]:
# assemble dataset: one (text, label) row per tweet, written to PRIMARY-Twitter_Stance.csv
final_data = {'text': [], 'label': []}
# for dupe texts -- "Looks like #CovidVaccine will ...", "STUDIES SHOW MORPHINE MILLIGRAM ...", etc
unique_texts = {}  # text -> (label, avg kappa) of its first occurrence
dupes = {}         # text -> (label, avg kappa) of the best occurrence, for repeated texts only
for k, v in twitter.items():
    annotators = twitter_annotators[k]

    # remove annotators with kappa < 0.2 -- unreliable
    unreliable = [anno for anno in annotators if annotators[anno] < 0.2]
    cur_data = v.drop(columns = unreliable)

    # get frequent labels for each text (ties produce several columns, padded with NaN)
    frequent_labels = cur_data.iloc[:, 1:].mode(axis = 1)
    for i in range(len(cur_data)):
        cur_text = cur_data.loc[i, 'text']

        # key: (total kappa, # annotators)
        # use freq label with higher reliability annotators
        freqs = {key: (0, 0) for key in frequent_labels.loc[i].tolist() if not pd.isnull(key)}

        # get kappas for the annotators that chose the most frequent for current text
        # (.items() replaces Series.iteritems(), which was removed in pandas 2.0)
        for anno, cur_label in cur_data.iloc[i, 1:].items():
            if cur_label in freqs:
                kappa_sum, n_votes = freqs[cur_label]
                freqs[cur_label] = (kappa_sum + annotators[anno], n_votes + 1)

        # get best label based on reliability
        max_avg_kappa = 0
        best_label = 'None'
        for cur_label, (kappa_sum, n_votes) in freqs.items():
            cur_avg_kappa = kappa_sum / n_votes if n_votes else 0
            # 'Missing' is never chosen: when it ties with real labels the best of those
            # wins; when it is the only most-frequent label the row keeps 'None'.
            if cur_avg_kappa > max_avg_kappa and cur_label != 'Missing':
                max_avg_kappa = cur_avg_kappa
                best_label = cur_label

        # if text is a duplicate, keep the label with the highest kappa
        if cur_text in unique_texts:
            # best so far: an earlier duplicate if there is one, else the first occurrence
            best_so_far = dupes.get(cur_text, unique_texts[cur_text])
            if max_avg_kappa > best_so_far[1]:
                dupes[cur_text] = (best_label, max_avg_kappa)
            else:
                dupes[cur_text] = best_so_far
        else:
            unique_texts[cur_text] = (best_label, max_avg_kappa)

        final_data['text'].append(cur_text)
        final_data['label'].append(best_label)

final_dataset = pd.DataFrame(data = final_data)
# remove duplicates and update the label with the highest kappa
# 11 dupes with 4 uniques, so remove 7
final_dataset = final_dataset.drop_duplicates(subset = ['text'])
for k in dupes: # update label for the dupes to the most frequent/kappa
    final_dataset.loc[final_dataset['text'] == k, 'label'] = dupes[k][0]

# Remove None labels -- labels that were 'Missing' even after choosing the most frequent and avg kappa scores
# 2 removed from twitter_stance_3
# twitter_stance_3 only had 1 reliable annotator (avg kappa > 0.2) so this was the annotator missing labels
FINAL = final_dataset[final_dataset['label'] != 'None']
FINAL.to_csv('PRIMARY-Twitter_Stance.csv', index = False)

In [5]:
# Display the filtered Twitter frame that the assembling cell above wrote to CSV.
# NOTE(review): the assembling cell shows execution count 13 while this cell shows 5,
# so the saved output below may predate the last assembly run -- re-run to confirm.
FINAL

Unnamed: 0,text,label
0,Follow the CDC guidelines. Don’t become a stat...,pro-mitigation
1,Do you agree with CDC guidelines that children...,unclear
2,"So, both #Pharmaceutical companies #lilly and ...",unclear
3,The CDC's guidelines are clear; you just don't...,pro-mitigation
4,CDC Updates School Guidelines For Students Ret...,unclear
...,...,...
1495,.Sprint To Develop A #COVIDVaccine – // https:...,unclear
1496,Here in the U.S. some localities have brought ...,pro-mitigation
1497,Sanitizer &amp; Mask Manufacturers After Russi...,unclear
1498,We are following all CDC guidelines through a ...,pro-mitigation


# Reddit Stance

In [6]:
# Pairwise Cohen's kappa between the annotators of every Reddit file, plus each
# annotator's average kappa over the pairs they take part in.
# Result: reddit_annotators[file_key][annotator_column] -> average kappa.
reddit_annotators = {}

for k, v in reddit.items():
    current_reddit = v.iloc[:, 1:]  # every column after the first is one annotator

    # get labels for each annotator
    # NOTE(review): unlabeled cells hold the placeholder 'Missing' (set when the CSVs are
    # loaded), so they presumably count as one more category in the kappa -- confirm intended.
    annotators = {}
    for anno in current_reddit:
        annotators[anno] = current_reddit[anno].values

    # get each pair of annotators -- 1,3 and 3,1 are same
    # calculate agreement score
    scores = {}
    for pair in combinations(annotators, 2):
        scores[pair] = cohen_kappa_score(annotators[pair[0]], annotators[pair[1]])

    # calculate average kappa score per annotator
    annotators_avg = {}
    for anno in annotators:
        # Scores of the pairs this annotator belongs to.  Divide by how many there
        # actually are (n_annotators - 1) rather than a fixed 3, so files with a
        # different number of annotators are averaged correctly.
        own_scores = [score for pair, score in scores.items() if anno in pair]
        # A lone annotator has no pairs; keep the previous result of 0 in that case.
        annotators_avg[anno] = sum(own_scores) / len(own_scores) if own_scores else 0.0
    reddit_annotators[k] = annotators_avg

    print(k)
    print("Cohen's kappa agreement score:")
    print('\n'.join([str(k_) + ": " + str(v_) for k_, v_ in scores.items()]))
    print('\n')

    print('Average Kappa Score:')
    print('\n'.join([str(k_) + ": " + str(v_) for k_, v_ in annotators_avg.items()]))
    print('\n')

reddit_stance_0
Cohen's kappa agreement score:
('annotation_24', 'annotation_10'): 0.45949087527606647
('annotation_24', 'annotation_12'): 0.525124027565971
('annotation_24', 'annotation_71'): 0.40132563609151173
('annotation_10', 'annotation_12'): 0.48822296214083105
('annotation_10', 'annotation_71'): 0.4135585268590194
('annotation_12', 'annotation_71'): 0.29262844378257624


Average Kappa Score:
annotation_24: 0.4619801796445164
annotation_10: 0.453757454758639
annotation_12: 0.4353251444964594
annotation_71: 0.3691708689110358




In [7]:
# assemble dataset: one (text, label) row per comment, written to SECONDARY-Reddit_Stance.csv
final_data = {'text': [], 'label': []}
# for dupe texts -- "[deleted]", "[removed]", etc
unique_texts = {}  # text -> (label, avg kappa) of its first occurrence
dupes = {}         # text -> (label, avg kappa) of the best occurrence, for repeated texts only
for k, v in reddit.items():
    annotators = reddit_annotators[k]

    # remove annotators with kappa < 0.2 -- unreliable
    unreliable = [anno for anno in annotators if annotators[anno] < 0.2]
    cur_data = v.drop(columns = unreliable)

    # get frequent labels for each text (ties produce several columns, padded with NaN)
    frequent_labels = cur_data.iloc[:, 1:].mode(axis = 1)
    for i in range(len(cur_data)):
        cur_text = cur_data.loc[i, 'text']

        # key: (total kappa, # annotators)
        # use freq label with higher reliability annotators
        freqs = {key: (0, 0) for key in frequent_labels.loc[i].tolist() if not pd.isnull(key)}

        # get kappas for the annotators that chose the most frequent for current text
        # (.items() replaces Series.iteritems(), which was removed in pandas 2.0)
        for anno, cur_label in cur_data.iloc[i, 1:].items():
            if cur_label in freqs:
                kappa_sum, n_votes = freqs[cur_label]
                freqs[cur_label] = (kappa_sum + annotators[anno], n_votes + 1)

        # get best label based on reliability
        max_avg_kappa = 0
        best_label = 'None'
        for cur_label, (kappa_sum, n_votes) in freqs.items():
            cur_avg_kappa = kappa_sum / n_votes if n_votes else 0
            # 'Missing' is never chosen: when it ties with real labels the best of those
            # wins; when it is the only most-frequent label the row keeps 'None'.
            if cur_avg_kappa > max_avg_kappa and cur_label != 'Missing':
                max_avg_kappa = cur_avg_kappa
                best_label = cur_label

        # if text is a duplicate, keep the label with the highest kappa
        if cur_text in unique_texts:
            # best so far: an earlier duplicate if there is one, else the first occurrence
            best_so_far = dupes.get(cur_text, unique_texts[cur_text])
            if max_avg_kappa > best_so_far[1]:
                dupes[cur_text] = (best_label, max_avg_kappa)
            else:
                dupes[cur_text] = best_so_far
        else:
            unique_texts[cur_text] = (best_label, max_avg_kappa)

        final_data['text'].append(cur_text)
        final_data['label'].append(best_label)

final_dataset = pd.DataFrame(data = final_data)
# remove duplicates and update the label with the highest kappa
# 14 dupes with 2 uniques, so remove 12
final_dataset = final_dataset.drop_duplicates(subset = ['text'])
for k in dupes: # update label for the dupes to the most frequent/kappa
    final_dataset.loc[final_dataset['text'] == k, 'label'] = dupes[k][0]

# Remove None labels -- labels that were 'Missing' even after choosing the most frequent and avg kappa scores
# no None labels tho
FINAL = final_dataset[final_dataset['label'] != 'None']
FINAL.to_csv('SECONDARY-Reddit_Stance.csv', index = False)

In [8]:
# Display the filtered Reddit frame that the assembling cell above wrote to CSV.
# FINAL was rebound by that cell, so it no longer refers to the Twitter dataset.
FINAL

Unnamed: 0,text,label
0,"Just give those people a wide berth, and do w...",pro-mitigation
1,"I would love to know as well. But then, daycar...",unclear
2,Moral people will see it as an easy way to not...,pro-mitigation
3,masking all time as i dont trust variants and ...,pro-mitigation
4,"Not just ""when out"", but when inside something...",pro-mitigation
...,...,...
295,What vaccine. Why is it presumed that a vac wi...,unclear
296,"Nope, you can still wear a mask if you want th...",pro-mitigation
297,> Too bad this was published after my district...,pro-mitigation
298,It's simple either youreally care about the ki...,unclear
