In [47]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag, ne_chunk

# Fetch every NLTK data package this notebook uses, in a fixed order.
for resource in (
    "maxent_ne_chunker",
    "words",
    "averaged_perceptron_tagger",
    "punkt",
    "stopwords",
):
    nltk.download(resource)

# English stop words held in a set so membership tests in is_noise are cheap.
stop_words = set(stopwords.words("english"))


def is_noise(word):
    """Return True when `word` should be dropped before analysis.

    A word counts as noise if any of the following holds (checked in order):
      * its lowercase form is in the module-level `stop_words` set;
      * `str.isdigit()` is true for it;
      * it consists only of non-word characters;
      * it starts with a code point in the range U+1F600..U+1F64F
        (`re.match` anchors at the beginning of the string).
    """
    if word.lower() in stop_words:
        return True
    if word.isdigit():
        return True
    if re.match(r"^\W+$", word) is not None:
        return True
    return re.match(r"[\U0001F600-\U0001F64F]", word) is not None


def is_named_entity(token):
    """Return True if NLTK's chunker marks the lone `token` as a named entity.

    The token is POS-tagged on its own (a one-element list, so without any
    sentence context) and the tagged result is passed to `ne_chunk`. The
    function reports whether any child of the returned tree exposes a
    `label` attribute.
    """
    tree = ne_chunk(pos_tag([token]))
    return any(hasattr(node, "label") for node in tree)


def normalize(column):
    """Min-max scale `column` so its smallest value maps to 0 and largest to 1.

    Parameters
    ----------
    column : array-like with `.min()` / `.max()` (e.g. pandas Series, ndarray)
        Values to rescale.

    Returns
    -------
    Same container type as `column - column.min()`, holding
    ``(column - min) / (max - min)``.

    Notes
    -----
    When every value is identical the range is zero and the plain formula
    would compute 0/0 (all NaN). In that case an all-zero float result is
    returned instead. Non-scalar ranges (e.g. a DataFrame input) and
    missing ranges fall through to the plain formula unchanged.
    """
    lowest = column.min()
    span = column.max() - lowest
    # Guard only the scalar, non-missing, exactly-zero range; everything
    # else keeps the original arithmetic.
    if np.ndim(span) == 0 and pd.notna(span) and span == 0:
        return (column - lowest).astype(float)
    return (column - lowest) / span

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/tomas/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/tomas/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tomas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /home/tomas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [48]:
def _load_tokens(csv_path):
    """Read one attribution CSV and return it with noise tokens removed.

    The unnamed first CSV column is renamed to `token`, cast to `str`,
    and rows whose token satisfies `is_noise` are dropped.
    """
    frame = pd.read_csv(csv_path).rename(columns={'Unnamed: 0':'token'})
    frame['token'] = frame['token'].astype(str)
    noisy = frame["token"].apply(is_noise)
    return frame[~noisy]


base = _load_tokens('babe-base-annomatic.csv')
magpie = _load_tokens('magpie-annomatic.csv')
synth = _load_tokens('roberta-anno-lexical-ft.csv')

In [49]:
# Pairwise vocabulary overlap: number of distinct tokens shared by two tables.
base_tokens = base.token.tolist()
magpie_tokens = magpie.token.tolist()
synth_tokens = synth.token.tolist()

n_base_magpie = len(np.intersect1d(base_tokens, magpie_tokens))
n_base_synth = len(np.intersect1d(base_tokens, synth_tokens))
n_magpie_synth = len(np.intersect1d(magpie_tokens, synth_tokens))

print(f"BASE x MAGPIE: {n_base_magpie}")
print(f"BASE x SYNTH: {n_base_synth}")
print(f"MAGPIE x SYNTH: {n_magpie_synth}")


BASE x MAGPIE: 807
BASE x SYNTH: 792
MAGPIE x SYNTH: 783


In [50]:
# Mean attribution over named-entity tokens, restricted to rows whose
# attribution is non-negative.
magpie_ = magpie[magpie['attribution'].ge(0)]
magpie_entity_mask = magpie_['token'].apply(is_named_entity)
magpie_entities = magpie_[magpie_entity_mask]
magpie_entities['attribution'].mean()

0.06218049519729795

In [51]:
# Mean attribution over named-entity tokens, restricted to rows whose
# attribution is non-negative.
base_ = base[base['attribution'].ge(0)]
base_entity_mask = base_['token'].apply(is_named_entity)
base_entities = base_[base_entity_mask]
base_entities['attribution'].mean()

0.11847788212482634

In [52]:
# Mean attribution over named-entity tokens, restricted to rows whose
# attribution is non-negative.
synth_ = synth[synth['attribution'].ge(0)]
synth_entity_mask = synth_['token'].apply(is_named_entity)
synth_entities = synth_[synth_entity_mask]
synth_entities['attribution'].mean()

0.08675489275717088

In [53]:
# Join the three tables on `token`. Columns that collide between base and
# magpie receive the `_base` / `_magpie` suffixes; the attribution columns
# are then renamed to the short model names.
base_magpie = base.merge(magpie, on='token', suffixes=('_base', '_magpie'))
attribution_names = {
    'attribution_base':'base',
    'attribution_magpie':'magpie',
    'attribution':'synth',
}
merged_df = base_magpie.merge(synth, on='token').rename(columns=attribution_names)

In [54]:
# Per-model mean attribution over the shared tokens flagged as named entities.
shared_entity_mask = merged_df['token'].apply(is_named_entity)
merged_df[shared_entity_mask][['base','magpie','synth']].mean()

base      0.129211
magpie    0.049537
synth     0.103807
dtype: float64

In [55]:
# Mean attribution over all rows of `base`, for comparison with the
# named-entity means.
base.attribution.mean()

0.07500100158535464

In [31]:
# Show the merged rows whose token is flagged as a named entity.
entity_row_mask = merged_df['token'].apply(is_named_entity)
merged_df[entity_row_mask]

Unnamed: 0,token,base,count_base,magpie,count_magpie,synth,count
4,GOP,0.12114,4,0.040395,2,0.118335,3
5,Obamacare,0.119445,2,0.027668,3,0.066662,5
13,Obama,0.115195,4,0.077899,2,0.169631,5
16,Abortion,0.147291,2,0.150972,1,0.129188,2
32,Trump,0.146611,68,0.06015,64,0.098368,76
43,Bannon,0.129086,2,0.07529,1,0.246865,1
60,Hitler,0.208815,1,0.125999,1,0.116705,1
65,Wahhabism,0.14878,1,0.024682,1,0.047215,1
78,America,0.076736,8,0.070863,4,0.0713,7
86,Party,0.077661,2,0.036822,2,0.059879,2
