In [9]:
import os
import urllib
import urllib.request

import numpy as np
import pandas as pd

In [10]:
# figshare download links for the two TSV files fetched further down in this cell.
ANNOTATED_COMMENTS_URL = "https://ndownloader.figshare.com/files/7554634"
ANNOTATIONS_URL = "https://ndownloader.figshare.com/files/7554637"


def download_file(url, fname):
    """Download ``url`` and store the response body at ``fname``.

    The payload is first written to a sibling ``<fname>.part`` file and is
    only moved over ``fname`` once ``urlretrieve`` has returned. An
    interrupted or failed transfer therefore never leaves a truncated file
    under the final name, and never clobbers a good copy from an earlier run.

    Parameters
    ----------
    url : str
        Address to fetch; anything ``urllib.request.urlretrieve`` accepts.
    fname : str or os.PathLike
        Destination path of the downloaded file.

    Returns
    -------
    None

    Raises
    ------
    OSError
        Network errors (``urllib.error.URLError`` and subclasses) and file
        system errors are propagated unchanged.
    """
    partial_path = os.fsdecode(fname) + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Atomic on the same file system: readers see the old file or the new one.
        os.replace(partial_path, fname)
    finally:
        # Still present only when the transfer or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

                
# Fetch each remote TSV into the working directory under its local file name.
for source_url, local_name in (
    (ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv'),
    (ANNOTATIONS_URL, 'attack_annotations.tsv'),
):
    download_file(source_url, local_name)

In [11]:
# Load the tab-separated worker demographics table and display it.
# NOTE(review): no download_file call visible in this notebook fetches this
# file, so it has to be present in the working directory already - confirm.
demo = pd.read_csv("attack_worker_demographics.tsv", delimiter="\t")
demo

Unnamed: 0,worker_id,gender,english_first_language,age_group,education
0,833,female,0,45-60,bachelors
1,1072,male,0,30-45,bachelors
2,872,male,0,18-30,hs
3,2116,male,0,30-45,professional
4,453,male,0,30-45,hs
...,...,...,...,...,...
2185,1442,male,0,18-30,hs
2186,529,female,0,30-45,hs
2187,2036,female,0,18-30,masters
2188,393,female,0,18-30,masters


In [12]:
# Total number of rows in demo, then the row count for each listed gender value.
print(demo.shape[0])
for gender_value in ("female", "male"):
    print(len(demo.loc[demo["gender"] == gender_value]))

2190
840
1349


In [13]:
# Row counts where english_first_language equals 1, then where it equals 0.
for language_flag in (1, 0):
    print(len(demo.loc[demo["english_first_language"] == language_flag]))

402
1788


In [14]:
# Distinct values present in the education column.
{education_level for education_level in demo["education"]}

{'bachelors', 'doctorate', 'hs', 'masters', 'none', 'professional', 'some'}

In [15]:
# Row count per education value, printed in the order listed here.
for education_level in (
    'bachelors',
    'doctorate',
    'masters',
    'hs',
    'none',
    'some',
    'professional',
):
    print(len(demo.loc[demo["education"] == education_level]))

862
20
385
631
1
48
243


In [16]:
# Distinct values present in the age_group column (missing entries included).
{age_bracket for age_bracket in demo["age_group"]}

{'18-30', '30-45', '45-60', 'Over 60', 'Under 18', nan}

In [18]:
# Row count per age_group value, printed in the order listed here ...
for age_bracket in ('18-30', '30-45', '45-60', 'Over 60', 'Under 18'):
    print(len(demo.loc[demo["age_group"] == age_bracket]))
# ... followed by the number of rows whose age_group is missing.
print(len(demo.loc[demo["age_group"].isna()]))

1049
831
218
20
37
35


In [20]:
# Load the tab-separated annotations file written by the download cell and display it.
annotations = pd.read_csv("attack_annotations.tsv", delimiter="\t")
annotations

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1365212,699897151,628,0.0,0.0,0.0,0.0,0.0
1365213,699897151,15,0.0,0.0,0.0,0.0,0.0
1365214,699897151,57,0.0,0.0,0.0,0.0,0.0
1365215,699897151,1815,0.0,0.0,0.0,0.0,0.0


In [23]:
# Left join keyed on worker_id: each demo row is repeated once per annotation
# made by that worker (keeping the demo row's index label); annotations from
# workers that do not appear in demo are not carried over.
anno_attack = demo.join(
    other=annotations.set_index("worker_id"),
    on="worker_id",
    how="left",
)
anno_attack

Unnamed: 0,worker_id,gender,english_first_language,age_group,education,rev_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,833,female,0,45-60,bachelors,4763903,0.0,0.0,0.0,0.0,0.0
0,833,female,0,45-60,bachelors,5350726,0.0,0.0,0.0,0.0,0.0
0,833,female,0,45-60,bachelors,5838638,0.0,0.0,0.0,0.0,0.0
0,833,female,0,45-60,bachelors,6270546,0.0,0.0,0.0,0.0,0.0
0,833,female,0,45-60,bachelors,6735507,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2189,3876,female,1,30-45,bachelors,297618709,0.0,0.0,0.0,0.0,0.0
2189,3876,female,1,30-45,bachelors,431984398,0.0,0.0,0.0,0.0,0.0
2189,3876,female,1,30-45,bachelors,480154536,0.0,0.0,0.0,0.0,0.0
2189,3876,female,1,30-45,bachelors,515614948,0.0,0.0,0.0,0.0,0.0


In [24]:
# Row positions where the gender matches and the attack column equals 1.0.
idx_female = np.where((anno_attack["gender"] == "female") & (anno_attack["attack"] == 1.0))
idx_male = np.where((anno_attack["gender"] == "male") & (anno_attack["attack"] == 1.0))

# Number of joined rows per gender; these are the denominators of the ratios.
n_rows_female = len(anno_attack.loc[anno_attack["gender"] == "female"])
n_rows_male = len(anno_attack.loc[anno_attack["gender"] == "male"])

# Fraction of each gender's rows with attack == 1.0, then the raw row counts.
print(idx_female[0].size / n_rows_female)
print(idx_male[0].size / n_rows_male)
print(n_rows_female)
print(n_rows_male)

0.17359779034933123
0.15869094948550047
308103
547328


In [25]:
# Number of joined rows in which each attack-type column equals 1.
for attack_column in (
    "recipient_attack",
    "third_party_attack",
    "other_attack",
    "quoting_attack",
):
    print(len(anno_attack.loc[anno_attack[attack_column] == 1]))

92682
29552
27886
6522


In [26]:
# Load the tab-separated annotated comments file, using its first column as
# the index, and display it.
comments = pd.read_csv(
    "attack_annotated_comments.tsv",
    delimiter="\t",
    index_col=0,
)
comments

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,False,article,random,train
44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,False,article,random,train
49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,False,article,random,train
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
93890,This page will need disambiguation.,2002,True,article,random,train
...,...,...,...,...,...,...
699848324,`NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENThese ...,2016,True,article,blocked,train
699851288,NEWLINE_TOKENNEWLINE_TOKENThe Institute for Hi...,2016,True,article,blocked,test
699857133,NEWLINE_TOKEN:The way you're trying to describ...,2016,True,article,blocked,train
699891012,NEWLINE_TOKENNEWLINE_TOKEN== Warning ==NEWLINE...,2016,True,user,blocked,dev


In [30]:
# Number of comments whose ns column is "article", then "user".
for namespace in ("article", "user"):
    print(len(comments.loc[comments["ns"] == namespace]))

51317
64547
