In [1]:
import pandas as pd
import numpy as np

Load data:

In [3]:
aggression_comments = pd.read_csv('../data/aggression_annotated_comments.tsv',  sep='\t') 
aggression_annotations = pd.read_csv('../data/aggression_annotations.tsv', sep='\t')
aggression_annotators = pd.read_csv('../data/aggression_worker_demographics.tsv', sep='\t')

In [5]:
toxicity_comments = pd.read_csv('../data/toxicity_annotated_comments.tsv',  sep='\t') 
toxicity_annotations = pd.read_csv('../data/toxicity_annotations.tsv', sep='\t')
toxicity_annotators = pd.read_csv('../data/toxicity_worker_demographics.tsv', sep='\t')

In [6]:
attack_comments = pd.read_csv('../data/attack_annotated_comments.tsv',  sep='\t') 
attack_annotations = pd.read_csv('../data/attack_annotations.tsv', sep='\t')
attack_annotators = pd.read_csv('../data/attack_worker_demographics.tsv', sep='\t')

In [7]:
aggression_annotations.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score
0,37675,1362,1.0,-1.0
1,37675,2408,0.0,1.0
2,37675,1493,0.0,0.0
3,37675,1439,0.0,0.0
4,37675,170,0.0,0.0


In [8]:
aggression_comments.head()

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train


In [9]:
aggression_annotators.head()

Unnamed: 0,worker_id,gender,english_first_language,age_group,education
0,833,female,0,45-60,bachelors
1,1072,male,0,30-45,bachelors
2,872,male,0,18-30,hs
3,2116,male,0,30-45,professional
4,453,male,0,30-45,hs


# Check if texts are the same

In [10]:
print(f'Number of distinct texts in attack dataset: {len(set(attack_comments.rev_id))}')
print(f'Number of distinct texts in aggression dataset: {len(set(aggression_comments.rev_id))}')
attack_vs_aggression = attack_comments.merge(aggression_comments, on='rev_id')
attack_vs_aggression['same'] = (attack_vs_aggression.comment_x == attack_vs_aggression.comment_y)
print(f'Number of distinct texts in merged dataset: {len(set(attack_vs_aggression.rev_id))}')
print(f'Number of times the texts matched: {attack_vs_aggression.same.sum()}')

Number of distinct texts in attack dataset: 115864
Number of distinct texts in aggression dataset: 115864
Number of distinct texts in merged dataset: 115864
Number of times the texts matched: 115864


In [11]:
print(f'Number of distinct texts in toxicity dataset: {len(set(toxicity_comments.rev_id))}')
print(f'Number of distinct texts in aggression dataset: {len(set(aggression_comments.rev_id))}')
toxicity_vs_aggression = toxicity_comments.merge(aggression_comments, on='rev_id')
toxicity_vs_aggression['same'] = (toxicity_vs_aggression.comment_x == toxicity_vs_aggression.comment_y)
print(f'Number of distinct texts in merged dataset: {len(set(toxicity_vs_aggression.rev_id))}')
print(f'Number of times the texts matched: {toxicity_vs_aggression.same.sum()}')

Number of distinct texts in toxicity dataset: 159686
Number of distinct texts in aggression dataset: 115864
Number of distinct texts in merged dataset: 77972
Number of times the texts matched: 77480


In [12]:
print(f'Number of distinct texts in toxicity dataset: {len(set(toxicity_comments.rev_id))}')
print(f'Number of distinct texts in attack dataset: {len(set(attack_comments.rev_id))}')
toxicity_vs_attack = toxicity_comments.merge(attack_comments, on='rev_id')
toxicity_vs_attack['same'] = (toxicity_vs_attack.comment_x == toxicity_vs_attack.comment_y)
print(f'Number of distinct texts in merged dataset: {len(set(toxicity_vs_attack.rev_id))}')
print(f'Number of times the texts matched: {toxicity_vs_attack.same.sum()}')

Number of distinct texts in toxicity dataset: 159686
Number of distinct texts in attack dataset: 115864
Number of distinct texts in merged dataset: 77972
Number of times the texts matched: 77480


In [13]:
difference = toxicity_vs_aggression.loc[toxicity_vs_aggression.same == False][['rev_id', 'comment_x', 'comment_y']]

In [14]:
print(difference.iloc[1,1])
print('--------------------')
print(difference.iloc[1,2])

`Terri_Schiavo&diff;=18616024&oldid;=18601666 here].NEWLINE_TOKENNEWLINE_TOKENin this one diff, he both denies it is a massive edit while saying it was a couple hours of editing. NEWLINE_TOKENNEWLINE_TOKENThe list of issues with SlimVirgin's edits include the following:NEWLINE_TOKENNEWLINE_TOKEN-NEWLINE_TOKENregarding this diff: The embedded note that USED to be there said:NEWLINE_TOKENNEWLINE_TOKEN< This paragraph is a direct quote from Dr. Bernat's testimony before the U.S. Senate in April 2005. Dr. Bernat's testimony was approved by the AAN Executive Committee. The two links provided document the testimony and the AAN approval.> NEWLINE_TOKENNEWLINE_TOKENSlimVirgin deleted this embedded note, and then inserts into the same paragraph, an embedded note questioning the accuracy of the quote.NEWLINE_TOKENNEWLINE_TOKEN<Is this true? I seem to recall a case in England where a man woke up after a long time in PVS.><What levels of ``prognostic certainty``?: NEWLINE_TOKENNEWLINE_TOKEN-NEWLIN

# Check if annotators are the same

In [15]:
print(f'Number of annotators in attack demographics data: {len(set(attack_annotators.worker_id))}')
print(f'Number of annotators that annotated something in attack data: {len(set(attack_annotations.worker_id))}')
print(f'Number of annotators in aggression demographics data: {len(set(aggression_annotators.worker_id))}')
print(f'Number of annotators that annotated something in aggression data: {len(set(aggression_annotations.worker_id))}')
attack_vs_aggression = attack_annotators.merge(aggression_annotators, on='worker_id')
attack_vs_aggression['same'] = (attack_vs_aggression.gender_x == attack_vs_aggression.gender_y) & \
                (attack_vs_aggression.english_first_language_x == attack_vs_aggression.english_first_language_y) & \
                ((attack_vs_aggression.age_group_x == attack_vs_aggression.age_group_y) | \
                 (attack_vs_aggression.age_group_x.isna() == attack_vs_aggression. age_group_y.isna())) & \
                (attack_vs_aggression.education_x == attack_vs_aggression.education_y)
print(f'Number of annotators with compatible demographic data: {attack_vs_aggression.same.sum()}')

Number of annotators in attack demographics data: 2190
Number of annotators that annotated something in attack data: 4053
Number of annotators in aggression demographics data: 2190
Number of annotators that annotated something in aggression data: 4053
Number of annotators with compatible demographic data: 2190


In [16]:
print(f'Number of annotators in attack demographics data: {len(set(attack_annotators.worker_id))}')
print(f'Number of annotators that annotated something in attack data: {len(set(attack_annotations.worker_id))}')
print(f'Number of annotators in toxicity demographics data: {len(set(toxicity_annotators.worker_id))}')
print(f'Number of annotators that annotated something in toxicity data: {len(set(toxicity_annotations.worker_id))}')
attack_vs_toxicity = attack_annotators.merge(toxicity_annotators, on='worker_id')
attack_vs_toxicity['same'] = (attack_vs_toxicity.gender_x == attack_vs_toxicity.gender_y) & \
                (attack_vs_toxicity.english_first_language_x == attack_vs_toxicity.english_first_language_y) & \
                ((attack_vs_toxicity.age_group_x == attack_vs_toxicity.age_group_y) | \
                 (attack_vs_toxicity.age_group_x.isna() == attack_vs_toxicity. age_group_y.isna())) & \
                (attack_vs_toxicity.education_x == attack_vs_toxicity.education_y)
print(f'Number of annotators with compatible demographic data: {attack_vs_toxicity.same.sum()}')

Number of annotators in attack demographics data: 2190
Number of annotators that annotated something in attack data: 4053
Number of annotators in toxicity demographics data: 3591
Number of annotators that annotated something in toxicity data: 4301
Number of annotators with compatible demographic data: 170


In [17]:
attack_vs_toxicity.loc[attack_vs_toxicity.same == False]

Unnamed: 0,worker_id,gender_x,english_first_language_x,age_group_x,education_x,gender_y,english_first_language_y,age_group_y,education_y,same
0,833,female,0,45-60,bachelors,male,0,18-30,hs,False
1,1072,male,0,30-45,bachelors,female,0,30-45,bachelors,False
2,872,male,0,18-30,hs,male,0,30-45,bachelors,False
3,2116,male,0,30-45,professional,male,0,Under 18,hs,False
4,453,male,0,30-45,hs,male,0,18-30,bachelors,False
...,...,...,...,...,...,...,...,...,...,...
1853,3277,male,1,45-60,hs,female,0,,bachelors,False
1854,529,female,0,30-45,hs,male,0,18-30,professional,False
1855,2036,female,0,18-30,masters,male,0,18-30,masters,False
1856,393,female,0,18-30,masters,male,0,18-30,bachelors,False


# Compare attack & aggression

In [18]:
attack_annotations.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack
0,37675,1362,0.0,0.0,0.0,0.0,0.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0


In [19]:
aggression_annotations.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score
0,37675,1362,1.0,-1.0
1,37675,2408,0.0,1.0
2,37675,1493,0.0,0.0
3,37675,1439,0.0,0.0
4,37675,170,0.0,0.0


In [20]:
print(f'Number of annotations in attack: {len(attack_annotations.drop_duplicates())}')
print(f'Number of annotations in agression: {len(aggression_annotations.drop_duplicates())}')
compare = attack_annotations.merge(aggression_annotations, on=['rev_id', 'worker_id'])
print(f'Number of annotations in merged: {len(compare.drop_duplicates())}')

Number of annotations in attack: 1365217
Number of annotations in agression: 1365217
Number of annotations in merged: 1365217


In [21]:
compare.head()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack,aggression,aggression_score
0,37675,1362,0.0,0.0,0.0,0.0,0.0,1.0,-1.0
1,37675,2408,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,37675,1493,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,37675,1439,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37675,170,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
compare.describe()

Unnamed: 0,rev_id,worker_id,quoting_attack,recipient_attack,third_party_attack,other_attack,attack,aggression,aggression_score
count,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0,1365217.0
mean,299974500.0,1339.286,0.007524811,0.1117822,0.03264756,0.03231794,0.1669595,0.1830207,-0.2002495
std,198421400.0,1053.387,0.08641871,0.3150985,0.1777125,0.1768432,0.3729399,0.3866837,0.9264773
min,37675.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.0
25%,126867700.0,445.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,269997600.0,1079.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,459291100.0,2062.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,699897200.0,4052.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [23]:
print(f'Annotations with attack = 1 & aggression = 1: {len(compare.loc[np.logical_and(compare.attack==1, compare.aggression==1)])}')
print(f'Annotations with attack = 0 & aggression = 1: {len(compare.loc[np.logical_and(compare.attack==0, compare.aggression==1)])}')
print(f'Annotations with attack = 1 & aggression = 0: {len(compare.loc[np.logical_and(compare.attack==1, compare.aggression==0)])}')
print(f'Annotations with attack = 0 & aggression = 0: {len(compare.loc[np.logical_and(compare.attack==0, compare.aggression==0)])}')

Annotations with attack = 1 & aggression = 1: 213907
Annotations with attack = 0 & aggression = 1: 35956
Annotations with attack = 1 & aggression = 0: 14029
Annotations with attack = 0 & aggression = 0: 1101325
