In [5]:
import ast
import os
import string
import sys
from statistics import harmonic_mean

import numpy as np
import pandas as pd
import unidecode
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm

np.random.seed(42)

In [7]:
# Step up one directory from the current working directory.
# The move is relative, so running this cell again climbs one more level.
os.chdir(os.pardir)

Make sure that your current working directory (cwd) is `ReproducingAugSS/AugmentedSocialScientist/`

In [8]:
#os.getcwd()

In [9]:
from PATHS import OFF_GS, OFF_RA_GS, OFF_MW_GS
from PATHS import ENDOEXO_GS, ENDOEXO_RA_GS, ENDOEXO_MW_GS

# Policy vs. Politics

In [10]:
### Load annotation data

# Gold Standard
pp_ass_gs = pd.read_csv(ENDOEXO_GS)
# Gold Standard annotated by Research assistants
pp_ra_gs = pd.read_csv(ENDOEXO_RA_GS)
# Gold Standard annotated by Microworkers
pp_mw_gs = pd.read_csv(ENDOEXO_MW_GS)

# The `labels` column is read from CSV as text. Parse it with ast.literal_eval,
# which only accepts Python literals (lists, numbers, strings, None, ...),
# rather than eval(), which would execute any expression stored in the file.
pp_ass_gs['labels'] = pp_ass_gs['labels'].apply(ast.literal_eval)
pp_ra_gs['labels'] = pp_ra_gs['labels'].apply(ast.literal_eval)
pp_mw_gs['labels'] = pp_mw_gs['labels'].apply(ast.literal_eval)

In [11]:
# Order every frame by its `text` column and renumber the rows from 0, so that
# row i is meant to refer to the same text in all three frames.
pp_ass_gs = (
    pp_ass_gs
    .sort_values(by='text')
    .reset_index(drop=True)
)
pp_ra_gs = (
    pp_ra_gs
    .sort_values(by='text')
    .reset_index(drop=True)
)
# Microworker rows are de-duplicated on `text` before sorting.
pp_mw_gs = (
    pp_mw_gs
    .drop_duplicates(subset=['text'])
    .sort_values(by='text')
    .reset_index(drop=True)
)

In [12]:
## Alignment by character
def char_labels(texts,labels):
    """Flatten span annotations into one label per character.

    Parameters
    ----------
    texts : sequence of str
        The annotated texts.
    labels : sequence
        ``labels[i]`` holds the spans of ``texts[i]``; each span is indexable
        as ``(start, end, tag)`` with ``end`` exclusive.

    Returns
    -------
    list
        The per-character labels of all texts, concatenated in order.
        Characters not covered by any span get ``np.nan``. When spans overlap,
        the span listed last wins. Span ends beyond the text are clipped to
        the text length.

    Raises
    ------
    AssertionError
        If the number of labels produced differs from the total number of
        characters.
    """
    flat = []
    expected_total = 0
    for idx, doc in enumerate(texts):
        n_chars = len(doc)
        expected_total += n_chars
        # One slot per character position, unlabelled by default
        per_char = dict.fromkeys(range(n_chars), np.nan)
        spans = labels[idx]
        if len(spans) > 0:
            for span in spans:
                stop = min(span[1], n_chars)
                for pos in range(span[0], stop):
                    per_char[pos] = span[2]
        flat.extend(per_char.values())
    assert len(flat)==expected_total, f"The length doesn't match:{len(flat)},{expected_total}"
    return flat

### Policy vs. Politics - Human Performance - Microworkers

In [13]:
print(
    classification_report(
        # first argument (y_true): gold standard, one label per character
        char_labels(pp_ass_gs.text.values, pp_ass_gs.labels.values),
        # second argument (y_pred): microworkers, one label per character
        char_labels(pp_mw_gs.text.values, pp_mw_gs.labels.values),
        target_names=['other','politics','policy','nan'],
    )
)

              precision    recall  f1-score   support

       other       0.47      0.70      0.56      3446
    politics       0.57      0.58      0.57     22668
      policy       0.74      0.69      0.72     33992
         nan       0.12      0.59      0.20        68

    accuracy                           0.65     60174
   macro avg       0.48      0.64      0.51     60174
weighted avg       0.66      0.65      0.65     60174



### Policy vs. Politics - Human Performance - Research Assistants

In [14]:
print(
    classification_report(
        # first argument (y_true): gold standard, one label per character
        char_labels(pp_ass_gs.text.values, pp_ass_gs.labels.values),
        # second argument (y_pred): research assistants, one label per character
        char_labels(pp_ra_gs.text.values, pp_ra_gs.labels.values),
        target_names=['other','politics','policy','nan'],
    )
)

              precision    recall  f1-score   support

       other       0.52      0.91      0.66      3446
    politics       0.77      0.77      0.77     22668
      policy       0.87      0.80      0.83     33992
         nan       0.11      0.69      0.19        68

    accuracy                           0.79     60174
   macro avg       0.57      0.79      0.61     60174
weighted avg       0.81      0.79      0.80     60174



# Off the Record

In [15]:
### Load annotation data

# Gold standard, then the same items annotated by research assistants (RA)
# and by microworkers (MW)
off_ass_gs = pd.read_csv(OFF_GS)
off_ra_gs = pd.read_csv(OFF_RA_GS)
off_mw_gs = pd.read_csv(OFF_MW_GS)

# The `labels` column is read from CSV as text. Parse it with ast.literal_eval,
# which only accepts Python literals (lists, numbers, strings, None, ...),
# rather than eval(), which would execute any expression stored in the file.
off_ass_gs['labels'] = off_ass_gs['labels'].apply(ast.literal_eval)
off_ra_gs['labels'] = off_ra_gs['labels'].apply(ast.literal_eval)
off_mw_gs['labels'] = off_mw_gs['labels'].apply(ast.literal_eval)

In [16]:
# sorting (to ensure that the texts are ordered in the same way)

# Deletion table for ASCII punctuation, built once and reused for every row
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _simplify_text(x):
    """Build the sort key of a text.

    Deletes ASCII punctuation, lowercases, removes spaces, passes the result
    through ``unidecode.unidecode`` and finally removes any '<' or '>' present
    in that output.
    """
    stripped = x.translate(_PUNCT_TABLE).lower().replace(" ","")
    return unidecode.unidecode(stripped).replace("<","").replace(">","")


off_ass_gs['simplified_text'] = off_ass_gs.text.apply(_simplify_text)
off_ra_gs['simplified_text'] = off_ra_gs.text.apply(_simplify_text)
off_mw_gs['simplified_text'] = off_mw_gs.text.apply(_simplify_text)

# Sort each frame on the simplified key and renumber the rows from 0
off_ass_gs = (
    off_ass_gs
    .sort_values(by='simplified_text')
    .reset_index(drop=True)
)
off_ra_gs = (
    off_ra_gs
    .sort_values(by='simplified_text')
    .reset_index(drop=True)
)
off_mw_gs = (
    off_mw_gs
    .sort_values(by='simplified_text')
    .reset_index(drop=True)
)

## Metric: by Span

For each span (consecutive sequence of highlighted characters) in the gold standard, if the annotator has highlighted at least half of its characters, that span is considered correctly identified by the annotator (counted as a true positive).

In [17]:
# Function to count the number of true positives according to the metric "by Span"
def count_tp(pr, gs, min_overlap=.5):
    """Count annotator/gold span pairs that overlap enough to be a match.

    The i-th row of ``pr.labels`` (by position) is compared with
    ``gs.loc[i, 'labels']`` (by index label), so ``gs`` needs a 0..n-1 index
    whose order matches the rows of ``pr``.

    Parameters
    ----------
    pr : pandas.DataFrame
        Annotator data. Each cell of its ``labels`` column is a list of spans;
        a span is indexable with start offset at [0] and end offset at [1]
        (end exclusive).
    gs : pandas.DataFrame
        Gold standard, same layout as ``pr``.
    min_overlap : float, default 0.5
        Minimum share of a gold span's length that one annotator span must
        cover for the pair to count.

    Returns
    -------
    int
        Number of (annotator span, gold span) pairs meeting the threshold.
        Every qualifying pair adds one, so a span involved in several
        qualifying pairs is counted once per pair.
    """
    tp = 0
    for i, pred_labels in enumerate(pr.labels):
        for pred_lab in pred_labels:
            # check whether pred_lab is a true positive
            for gs_lab in gs.loc[i,'labels']:
                # Length of the intersection of the two half-open intervals,
                # i.e. the number of character positions they share. Computed
                # arithmetically instead of materialising a set per span.
                overlap = max(0, min(pred_lab[1], gs_lab[1]) - max(pred_lab[0], gs_lab[0]))
                # match if at least `min_overlap` of the gold span is covered
                if overlap >= min_overlap*(gs_lab[1]-gs_lab[0]):
                    tp += 1
    return tp

### Off the Record - Human Performance - Microworkers (by Span)

In [18]:
# Count the matching spans once and reuse the count for both ratios
mw_tp = count_tp(off_mw_gs, off_ass_gs)
# precision: matches / number of spans marked by the microworkers
mw_prec = mw_tp/off_mw_gs.labels.apply(len).sum()
# recall: matches / number of spans in the gold standard
mw_recall = mw_tp/off_ass_gs.labels.apply(len).sum()
mw_f1 = harmonic_mean([mw_prec, mw_recall])
print(
    f'Microworkers Precision: {mw_prec:.2f}',
    f'Microworkers Recall: {mw_recall:.2f}',
    f'Microworkers F1: {mw_f1:.2f}',
    sep='\n',
)

Microworkers Precision: 0.67
Microworkers Recall: 0.73
Microworkers F1: 0.70


### Off the Record -  Human Performance - Research Assistants (by Span)

In [19]:
# Count the matching spans once and reuse the count for both ratios
ra_tp = count_tp(off_ra_gs, off_ass_gs)
# precision: matches / number of spans marked by the research assistants
ra_prec = ra_tp/off_ra_gs.labels.apply(len).sum()
# recall: matches / number of spans in the gold standard
ra_recall = ra_tp/off_ass_gs.labels.apply(len).sum()
ra_f1 = harmonic_mean([ra_prec, ra_recall])
print(
    f'Research Assistants Precision: {ra_prec:.2f}',
    f'Research Assistants Recall: {ra_recall:.2f}',
    f'Research Assistants F1: {ra_f1:.2f}',
    sep='\n',
)

Research Assistants Precision: 0.85
Research Assistants Recall: 0.86
Research Assistants F1: 0.86


## Metric: by Character

Character-by-character comparison of annotations 

In [20]:
## Alignment by character
def char_labels_off(texts,labels):
    """Flatten span annotations into one label per character.

    Parameters
    ----------
    texts : sequence of str
        The annotated texts.
    labels : sequence
        ``labels[i]`` is the annotation of ``texts[i]``: either a list of
        spans indexable as ``(start, end, tag)`` with ``end`` exclusive, or
        any non-list value.

    Returns
    -------
    list
        The per-character labels of all texts, concatenated in order.
        When ``labels[i]`` is a list, characters outside every span get
        ``'autre'`` and, where spans overlap, the span listed last wins.
        When ``labels[i]`` is not a list, every character of that text gets
        ``np.nan``. Span ends beyond the text are clipped to the text length.

    Raises
    ------
    AssertionError
        If the number of labels produced differs from the total number of
        characters.
    """
    flat = []
    expected_total = 0
    for idx, doc in enumerate(texts):
        n_chars = len(doc)
        expected_total += n_chars
        spans = labels[idx]
        if type(spans) is not list:
            # no usable annotation for this text: every character is NaN
            per_char = dict.fromkeys(range(n_chars), np.nan)
        else:
            # default class first, then paint each span over it
            per_char = dict.fromkeys(range(n_chars), 'autre')
            for span in spans:
                stop = min(span[1], n_chars)
                for pos in range(span[0], stop):
                    per_char[pos] = span[2]
        flat.extend(per_char.values())
    assert len(flat)==expected_total, f"The length doesn't match:{len(flat)},{expected_total}"
    return flat

### Off the Record - Human Performance - Microworkers (by Character)

In [21]:
print(
    classification_report(
        # first argument (y_true): gold standard, one label per character
        char_labels_off(off_ass_gs.text.values, off_ass_gs.labels.values),
        # second argument (y_pred): microworkers, one label per character
        char_labels_off(off_mw_gs.text.values, off_mw_gs.labels.values),
        target_names=['non off','off'],
    )
)

              precision    recall  f1-score   support

     non off       0.99      0.97      0.98    307151
         off       0.44      0.71      0.55      9424

    accuracy                           0.96    316575
   macro avg       0.72      0.84      0.77    316575
weighted avg       0.97      0.96      0.97    316575



### Off the Record - Human Performance - Research Assistants (by Character)

In [22]:
print(
    classification_report(
        # first argument (y_true): gold standard, one label per character
        char_labels_off(off_ass_gs.text.values, off_ass_gs.labels.values),
        # second argument (y_pred): research assistants, one label per character
        char_labels_off(off_ra_gs.text.values, off_ra_gs.labels.values),
        target_names=['non off','off'],
    )
)

              precision    recall  f1-score   support

     non off       0.99      0.99      0.99    307151
         off       0.75      0.83      0.79      9424

    accuracy                           0.99    316575
   macro avg       0.87      0.91      0.89    316575
weighted avg       0.99      0.99      0.99    316575

