In [1]:
import pandas as pd
from scipy.stats import pearsonr

## Reading data

In [2]:
# Raw judgments file: tab-separated, no header row.
RAW_JUDGMENTS_PATH = "/private/home/gulordava/edouard_data/itwiki/testsets/mturk_data/processed/p12_processed"
df = pd.read_csv(RAW_JUDGMENTS_PATH, sep="\t", header=None)

In [3]:
# Name the nine positional columns of the header-less input.
df.columns = [
    "worker_id",
    "type",
    "class",
    "answer",
    "correct_answer",
    "opt1",
    "opt2",
    "full_id",
    "sent",
]

In [4]:
# Number of distinct items (full_id) for each sentence type.
unique_items = df.drop_duplicates(subset=["full_id", "type"])
unique_items.groupby("type").size()

type
control              250
target_generated    1097
target_original      126
dtype: int64

In [5]:
# Split every target full_id on "__" into pattern / construction / sentence ids.
targets = df[df.type.isin(["target_generated", "target_original"])]
ids = pd.DataFrame(
    [full_id.split("__") for full_id in targets.full_id],
    columns=["pattern_id", "constr_id", "sent_id"],
)

# Count the distinct sentences available for each (pattern, construction) pair.
targets = targets.reset_index(drop=True).join(ids)
t = (
    targets[["pattern_id", "constr_id", "sent_id"]]
    .drop_duplicates()
    .groupby(["pattern_id", "constr_id"])
    .size()
    .reset_index()
)

# Pairs with fewer than 10 distinct sentences, keyed by the concatenated ids.
subset = t[t[0] < 10][["pattern_id", "constr_id"]]
subset["pc"] = subset["pattern_id"] + subset["constr_id"]
#t[~(t.pattern_id + t.constr_id).isin(subset.pc)]

In [6]:
# Workers with at least 10 judgments on control items.
control_counts = df.loc[df.type == "control"].groupby("worker_id").size()
controlled_workers = control_counts[control_counts >= 10].index.tolist()
len(controlled_workers)

76

## Finding and filtering spammers

In [7]:
# Per-worker distribution of answer classes on control items, restricted to
# workers with enough control judgments.
# Both conditions are combined into ONE mask built on df's own index; chaining
# df[mask_a][mask_b] applies a mask indexed like df to an already-filtered frame
# and triggers pandas' "Boolean Series key will be reindexed" warning.
is_control_by_controlled_worker = df.worker_id.isin(controlled_workers) & (df.type == "control")
control_acc = (
    df[is_control_by_controlled_worker]
    .groupby(["worker_id", "class"])
    .size()
    .reset_index(name="N")
)

# Share of each class within a worker. Only the numeric column N is normalised;
# running a lambda transform over the whole frame would also hit the string
# column "class", which raises on current pandas versions.
control_acc["perc"] = control_acc["N"] / control_acc.groupby("worker_id")["N"].transform("sum")

# Rows for class "F" only; `t` is read again by a later cell.
t = control_acc[control_acc["class"] == "F"]

# Workers whose share of "F" control judgments exceeds 20%.
bad_workers = list(t[t.perc > 0.2].worker_id.values)
len(bad_workers)
#t[t.worker_id.isin(bad_workers)]

  """Entry point for launching an IPython kernel.


40

Blocked spammers from the first experiment

In [8]:
# Worker ids already blocked. The list is kept exactly as recorded, including
# the ids that appear twice; it is only used for membership tests.
blocked_spammers = [
    'A1JMQLQBLDOZYL',
    'A1YPX4OBE2YR2V',
    'A2MLOF6VG898LQ',
    'AF1HT6VL272QZ',
    'AWDSIX3ULD32V',
    'A352QLA6AIP5J2',
    'A1YGSIQI52QT9L',
    'A12Q6033OGVFCZ',
    'A1RO22O88I71FD',
    'A2F4FBGDUQ9VQ7',
    'A35CJT7MMT4EVW',
    'A6NN7XDLH3RON',
    'A15UR9T6I32Y2D',
    'A31OEH89H0DG82',
    'AG3WY7N4UC07C',
    'A13ISH3PAGHO0A',
    'A15UR9T6I32Y2D',
    'A1RUYCD6BDQBBS',
    'A1X5N37U5JPJ6Z',
    'A2PU4YNWITAQVL',
    'A30JJJ2M9K8F5A',
    'A31OEH89H0DG82',
    'A3US40JJ8RYD4S',
    'AEBETUY5OD68H',
    'AG3WY7N4UC07C',
    'AQ07VGUPLAGEU',
    'A15JR89YUENFHY',
    'A24KNAYGIC1V7Q',
]

New spammers to block

In [9]:
# Bad workers that are not on the blocked list yet (order of bad_workers kept).
already_blocked = set(blocked_spammers)
new_spammers = [w for w in bad_workers if w not in already_blocked]

# Show the new spammers that have more than 10 "F" control judgments.
# A single combined mask is used: chaining t[mask_a][mask_b] applies a mask
# indexed like t to an already-filtered frame and makes pandas warn.
t[t.worker_id.isin(new_spammers) & (t.N > 10)]

  


Unnamed: 0,worker_id,class,N,perc
53,A2FW2B5TJJNALN,F,22,0.5
65,A2X7BE2FSPUKCK,F,25,0.446429
92,A3PKJULC5O1V6C,F,14,0.583333
120,AK5YRGLG7QKI8,F,26,0.448276


In [10]:
#df[df.worker_id.isin(new_spammers)].groupby("worker_id").size()

In [11]:
# Controlled workers that were not flagged as bad, in their original order.
flagged = set(bad_workers)
good_workers = [worker for worker in controlled_workers if worker not in flagged]

In [12]:
# Number of judgment rows submitted by workers flagged as bad.
df[df.worker_id.isin(bad_workers)].shape[0]

13288

In [13]:
# Number of judgment rows submitted by good workers.
df[df.worker_id.isin(good_workers)].shape[0]

13924

In [14]:
# Number of workers retained as good.
len(good_workers)

36

## Data by good workers

In [15]:
# Total number of judgment rows, before any worker filtering.
df.shape[0]

29109

In [16]:
# Keep only the judgments that come from good workers.
from_good_worker = df.worker_id.isin(good_workers)
good_df = df.loc[from_good_worker]
good_df.shape[0]

13924

Some statistics: wrong and correct responses per sentence type

In [17]:
# Judgment counts for every (sentence type, answer class) combination.
by_type_and_class = good_df.groupby(["type", "class"])
by_type_and_class.size()

type              class
control           F          69
                  T        2322
target_generated  F        1221
                  T        9143
target_original   F          66
                  T        1103
dtype: int64

Average number of good judgments per example

In [18]:
# Judgments per item (full_id), averaged over all items.
judgments_per_item = good_df.groupby("full_id").size()
judgments_per_item.mean()

9.452817379497624

In [45]:
# Number of good-worker judgments for each item (full_id).
good_df.groupby("full_id").size()

full_id
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__0    10
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__1     8
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__2     8
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__3     9
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__4    12
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__5     9
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__6    10
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__7     9
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__8     9
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__17__9    10
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__0      8
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__1     11
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__2      8
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__3      9
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__4      8
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__5     10
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__6     11
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__7     10
L!DET_ADJ_NOUN!Number=Sing!Number=Plur__1__8      9
L!DE

In [970]:
# Let displayed text columns show up to 120 characters.
pd.set_option("display.max_colwidth", 120)

### Filtering out sentences stranded from merging of two batches

In [19]:
# Target (non-control) judgments from good workers.
target_df = good_df[good_df.type.isin(["target_generated", "target_original"])]

# Split each full_id on "__" into pattern / construction / sentence ids.
ids = pd.DataFrame(
    [full_id.split("__") for full_id in target_df.full_id],
    columns=["pattern_id", "constr_id", "sent_id"],
)

# Row order is unchanged, so a fresh positional index lines the ids up with their rows.
target_df = target_df.reset_index(drop=True).join(ids)

# Short pattern name: the first two "!"-separated fields of pattern_id, joined by "__".
patterns = {
    pattern_id: "__".join(pattern_id.split("!")[:2])
    for pattern_id in target_df["pattern_id"].unique()
}
target_df["pattern"] = target_df["pattern_id"].map(patterns)

Because batch 1 (the small batch) was merged with the second, larger batch, some sentences from batch 1 were not fully "un-unked" and must be filtered out. Constructions with fewer than 10 distinct sentences are therefore dropped.

The final DataFrame contains 119 original (model) sentences and 119 × 9 = 1071 generated sentences.

In [20]:
# Number of distinct sentences available for the (pattern, construction) pair of
# each row. Grouping on the two key columns directly avoids building a lookup key
# by concatenating pattern_id and constr_id without a separator (which could make
# two different pairs look identical) and avoids assigning a column on a slice.
sentences_per_construction = (
    target_df.groupby(["pattern_id", "constr_id"])["sent_id"].transform("nunique")
)

# Drop every construction that has fewer than 10 distinct sentences.
target_df = target_df[sentences_per_construction >= 10]

In [21]:
# Number of remaining judgment rows per sentence type.
target_df.groupby("type").size()

type
target_generated    10163
target_original      1113
dtype: int64

In [22]:
# Number of distinct items (full_id) left after filtering.
target_df["full_id"].drop_duplicates().shape[0]

1190

Each model sentence has 9 generated counterparts

In [23]:
# Distinct items per sentence type after filtering.
distinct_items = target_df.drop_duplicates(subset=["type", "full_id"])
distinct_items.groupby("type").size()

type
target_generated    1071
target_original      119
dtype: int64

### Saving the final clean data from good workers

In [33]:
# Shorten the type labels with an explicit mapping.
# The mapping is idempotent: a rule of the form "everything that is not
# target_original becomes generated" would turn the already-renamed "original"
# rows into "generated" if this cell were executed a second time.
# assign() returns a new frame, so no column is set on a filtered slice.
TYPE_LABELS = {"target_original": "original", "target_generated": "generated"}
target_df = target_df.assign(type=target_df["type"].replace(TYPE_LABELS))

In [977]:
# Write the cleaned target judgments as a tab-separated file without the index column.
CLEAN_OUTPUT_PATH = "/private/home/gulordava/edouard_data/itwiki/testsets/mturk_data/merged_clean/batch2.csv"
target_df.to_csv(CLEAN_OUTPUT_PATH, sep="\t", index=False)

## Some stats

Number of different model sentences per pattern

In [24]:
# Distinct constructions (constr_id) per pattern.
distinct_constructions = target_df.drop_duplicates(subset=["pattern", "constr_id"])
distinct_constructions.groupby("pattern").size()

pattern
L__DET_ADJ_NOUN                14
L__NOUN_VERB_PRON_VERB          6
L__NOUN_VERB_VERB              27
R__ADJ_ADJ_CCONJ_ADJ           13
R__NOUN_ADJ_PUNCT_PRON_VERB    10
R__NOUN_NOUN_ADV_ADJ           13
R__NOUN_NOUN_VERB              18
R__VERB_NOUN_CCONJ_VERB        18
dtype: int64

In [25]:
# Let displayed text columns show up to 140 characters.
pd.set_option("display.max_colwidth", 140)

In [26]:
# Judgment counts per (pattern, answer class) over all target sentences.
counts = target_df.groupby(["pattern", "class"]).size().reset_index(name="N")

# Share of each class within its pattern. Only the numeric column N is
# normalised; a lambda transform over the whole frame would also be applied to
# the string column "class", which raises on current pandas versions.
counts["perc"] = counts["N"] / counts.groupby("pattern")["N"].transform("sum")

In [27]:
# Render floats with two decimals and a thousands separator.
pd.set_option("display.float_format", "{:,.2f}".format)

Overall accuracy on all target sentences

In [28]:
# Rows for class "T" only; `perc` on these rows is the share of "T" judgments per pattern.
correct_rows = counts[counts["class"] == "T"]
perc_all = correct_rows[["pattern", "perc"]]
correct_rows

Unnamed: 0,pattern,class,N,perc
1,L__DET_ADJ_NOUN,T,1302,0.98
3,L__NOUN_VERB_PRON_VERB,T,560,0.95
5,L__NOUN_VERB_VERB,T,2350,0.93
7,R__ADJ_ADJ_CCONJ_ADJ,T,1216,0.98
9,R__NOUN_ADJ_PUNCT_PRON_VERB,T,842,0.9
11,R__NOUN_NOUN_ADV_ADJ,T,975,0.8
13,R__NOUN_NOUN_VERB,T,1281,0.74
15,R__VERB_NOUN_CCONJ_VERB,T,1498,0.87


Accuracy on original sentences

In [34]:
# Judgment counts per (pattern, answer class) on original sentences only.
counts = (
    target_df[target_df.type == "original"]
    .groupby(["pattern", "class"])
    .size()
    .reset_index(name="N")
)

# Share of each class within its pattern. Only the numeric column N is
# normalised; a lambda transform over the whole frame would also be applied to
# the string column "class", which raises on current pandas versions.
counts["perc_orig"] = counts["N"] / counts.groupby("pattern")["N"].transform("sum")

# Keep the class "T" share per pattern for the comparison table built later.
perc_orig = counts[counts["class"] == "T"][["pattern", "perc_orig"]]
counts[counts["class"] == "T"]

Unnamed: 0,pattern,class,N,perc_orig
1,L__DET_ADJ_NOUN,T,130,0.98
3,L__NOUN_VERB_PRON_VERB,T,53,0.93
5,L__NOUN_VERB_VERB,T,243,0.97
7,R__ADJ_ADJ_CCONJ_ADJ,T,119,0.98
9,R__NOUN_ADJ_PUNCT_PRON_VERB,T,88,0.96
11,R__NOUN_NOUN_ADV_ADJ,T,107,0.91
13,R__NOUN_NOUN_VERB,T,147,0.86
15,R__VERB_NOUN_CCONJ_VERB,T,163,0.94


Accuracy on generated sentences

In [35]:
# Judgment counts per (pattern, answer class) on generated sentences only.
counts = (
    target_df[target_df.type == "generated"]
    .groupby(["pattern", "class"])
    .size()
    .reset_index(name="N")
)

# Share of each class within its pattern. Only the numeric column N is
# normalised; a lambda transform over the whole frame would also be applied to
# the string column "class", which raises on current pandas versions.
counts["perc_gen"] = counts["N"] / counts.groupby("pattern")["N"].transform("sum")

# Keep the class "T" share per pattern for the comparison table built later.
perc_gen = counts[counts["class"] == "T"][["pattern", "perc_gen"]]
counts[counts["class"] == "T"]

Unnamed: 0,pattern,class,N,perc_gen
1,L__DET_ADJ_NOUN,T,1172,0.98
3,L__NOUN_VERB_PRON_VERB,T,507,0.95
5,L__NOUN_VERB_VERB,T,2107,0.92
7,R__ADJ_ADJ_CCONJ_ADJ,T,1097,0.98
9,R__NOUN_ADJ_PUNCT_PRON_VERB,T,754,0.9
11,R__NOUN_NOUN_ADV_ADJ,T,868,0.79
13,R__NOUN_NOUN_VERB,T,1134,0.73
15,R__VERB_NOUN_CCONJ_VERB,T,1335,0.87


In [36]:
# One row per pattern with the three "T" shares side by side: all / original / generated.
perc = perc_all.merge(perc_orig, on="pattern").merge(perc_gen, on="pattern")
perc

Unnamed: 0,pattern,perc,perc_orig,perc_gen
0,L__DET_ADJ_NOUN,0.98,0.98,0.98
1,L__NOUN_VERB_PRON_VERB,0.95,0.93,0.95
2,L__NOUN_VERB_VERB,0.93,0.97,0.92
3,R__ADJ_ADJ_CCONJ_ADJ,0.98,0.98,0.98
4,R__NOUN_ADJ_PUNCT_PRON_VERB,0.9,0.96,0.9
5,R__NOUN_NOUN_ADV_ADJ,0.8,0.91,0.79
6,R__NOUN_NOUN_VERB,0.74,0.86,0.73
7,R__VERB_NOUN_CCONJ_VERB,0.87,0.94,0.87


In [37]:
# Pairwise correlation of the three accuracy columns across patterns.
# The string column "pattern" is moved to the index first: on current pandas
# versions corr() no longer drops non-numeric columns silently and raises instead.
perc.set_index("pattern").corr()

Unnamed: 0,perc,perc_orig,perc_gen
perc,1.0,0.9,1.0
perc_orig,0.9,1.0,0.9
perc_gen,1.0,0.9,1.0


In [38]:
# Per-construction class shares for pattern L__NOUN_VERB_PRON_VERB, generated sentences.
tg = target_df[target_df.type == "generated"]
counts = (
    tg[tg.pattern == "L__NOUN_VERB_PRON_VERB"]
    .groupby(["constr_id", "class"])
    .size()
    .reset_index(name="N")
)

# Only the numeric column N is normalised; a lambda transform over the whole
# frame would also be applied to the string column "class", which raises on
# current pandas versions.
counts["perc"] = counts["N"] / counts.groupby("constr_id")["N"].transform("sum")
counts[counts["class"] == "T"]

Unnamed: 0,constr_id,class,N,perc
1,0,T,84,0.93
2,13,T,89,1.0
4,14,T,79,0.95
6,15,T,78,0.88
8,17,T,91,0.98
10,26,T,86,0.99


In [39]:
# Per-construction class shares for pattern L__NOUN_VERB_PRON_VERB, original sentences.
tg = target_df[target_df.type == "original"]
counts = (
    tg[tg.pattern == "L__NOUN_VERB_PRON_VERB"]
    .groupby(["constr_id", "class"])
    .size()
    .reset_index(name="N")
)

# Only the numeric column N is normalised; a lambda transform over the whole
# frame would also be applied to the string column "class", which raises on
# current pandas versions.
counts["perc"] = counts["N"] / counts.groupby("constr_id")["N"].transform("sum")
counts[counts["class"] == "T"]

Unnamed: 0,constr_id,class,N,perc
1,0,T,8,0.89
2,13,T,9,1.0
3,14,T,10,1.0
5,15,T,7,0.7
6,17,T,8,1.0
7,26,T,11,1.0


In [40]:
# Class "T" judgments for pattern L__NOUN_VERB_PRON_VERB, restricted to the
# columns needed to read the sentences.
# Both conditions are combined into one mask on target_df's own index; chaining
# target_df[mask_a][mask_b] applies a mask indexed like target_df to an
# already-filtered frame and makes pandas warn.
is_pattern = target_df.pattern == "L__NOUN_VERB_PRON_VERB"
is_class_t = target_df["class"] == "T"
t = target_df.loc[
    is_pattern & is_class_t,
    ["sent", "opt1", "opt2", "answer", "constr_id", "sent_id"],
]

# Distinct sentences of construction "15" (constr_id values are strings).
t[t.constr_id == "15"].drop_duplicates()

  """Entry point for launching an IPython kernel.


Unnamed: 0,sent,opt1,opt2,answer,constr_id,sent_id
2470,Nelle sovrapposizioni per rivincita il capitolo che avanza l' instaurazione per trasportare alla crosta di spirale dell' artista si,comportano,comporta,comporta,15,1
4080,Nelle lacrime per convocazione il presupposto che erge l' istituzione per salvaguardare alla signora di gola dell' agente si,riuniscono,riunisce,riunisce,15,9
4288,Nelle piste per cognata il serpente che adotta l' emanazione per bilanciare alla fattura di pubblicazione dell' artista si,superano,supera,supera,15,3
5197,Nelle teste per fetta il desiderio che acquista l' invasione per contenere alla strada di prospettiva dell' atleta si,riportano,riporta,riporta,15,5
5487,Nelle lettere per striscia il patrimonio che ama l' euforia per bere alla coscienza di rinascita dell' atleta si,comporta,comportano,comporta,15,6
6561,Nelle sostanze per terra il creolo che effettua l' impressione per temere alla coppia di dipendenza dell' ospite si,tenta,tentano,tenta,15,8
6740,Nelle liti per cucina il metallo che appoggia l' alternativa per contenere alla propensione di regia dell' adolescente si,rimane,rimangono,rimane,15,7
9418,Nelle trasmissioni per fede il ramo che apre l' ambientazione per sopravvivere alla ventina di delegazione dell' apprendista si,svolgono,svolge,svolge,15,4
9733,Nelle parti per tomba il lago che occorre l' idea per partire alla follia di spinta dell' autista si,rivelano,rivela,rivela,15,2
11067,Nelle distribuzioni per ruota il tempo che impiega l' acqua per giungere alla bocca di derivazione dell' utente si,consuma,consumano,consuma,15,0


In [990]:
# First sentence of construction "39" of pattern L!NOUN_VERB_VERB!Number=Sing!Number=Plur in `tg`.
# One combined mask replaces the chained tg[mask_a][mask_b], which applies a mask
# indexed like tg to an already-filtered frame and makes pandas warn.
is_wanted = (tg.pattern_id == "L!NOUN_VERB_VERB!Number=Sing!Number=Plur") & (tg.constr_id == "39")
tg.loc[is_wanted, "sent"].iloc[:1].values

  """Entry point for launching an IPython kernel.


array([ "( 6 ) considerando che l' obbligo di indicare il prezzo di vendita e il prezzo per unità di misura"], dtype=object)

### Correlation of construction accuracy

In [41]:
# Judgment counts per (pattern, construction, answer class) on generated sentences.
counts = (
    target_df[target_df.type == "generated"]
    .groupby(["pattern", "constr_id", "class"])
    .size()
    .reset_index(name="N")
)

# Share of each class within its construction. Only the numeric column N is
# normalised; a lambda transform over the whole frame would also be applied to
# the string column "class", which raises on current pandas versions.
counts["perc_gen"] = counts["N"] / counts.groupby(["pattern", "constr_id"])["N"].transform("sum")

# Class "T" share per construction. Constructions without any "T" row are absent here.
perc_gen = counts[counts["class"] == "T"][["pattern", "constr_id", "perc_gen"]]


In [42]:
# Judgment counts per (pattern, construction, answer class) on original sentences.
counts = (
    target_df[target_df.type == "original"]
    .groupby(["pattern", "constr_id", "class"])
    .size()
    .reset_index(name="N")
)

# Share of each class within its construction. Only the numeric column N is
# normalised; a lambda transform over the whole frame would also be applied to
# the string column "class", which raises on current pandas versions.
counts["perc_orig"] = counts["N"] / counts.groupby(["pattern", "constr_id"])["N"].transform("sum")

# Class "T" share per construction. Constructions without any "T" row are absent here.
perc_orig = counts[counts["class"] == "T"][["pattern", "constr_id", "perc_orig"]]


In [43]:
# Pearson correlation between per-construction "T" shares on generated and on
# original sentences (pearsonr is imported in the notebook's import cell).
# pd.merge defaults to an inner join, so only constructions present in both
# tables enter the correlation.
t = pd.merge(perc_gen, perc_orig, on=["pattern", "constr_id"])
pearsonr(t.perc_gen, t.perc_orig)

(0.76575394359377313, 3.5607104359515027e-24)