In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import sys
sys.path.append('../')

%load_ext autoreload
%autoreload 2

## Download data

only if it doesn't already exist

In [2]:
from data.data import download_data

download_data()

## Load data

In [2]:
aggression_comments = pd.read_csv('../data/aggression_annotated_comments.tsv',  sep='\t') 
aggression_annotations = pd.read_csv('../data/aggression_annotations.tsv', sep='\t')

aggression_df_full = aggression_annotations.merge(aggression_comments)

In [3]:
aggression_df_full

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,comment,year,logged_in,ns,sample,split
0,37675,1362,1.0,-1.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,37675,2408,0.0,1.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
2,37675,1493,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
3,37675,1439,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
4,37675,170,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
...,...,...,...,...,...,...,...,...,...,...
1365212,699897151,628,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1365213,699897151,15,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1365214,699897151,57,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1365215,699897151,1815,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train


### Remove annotators with <100 annotations in train split OR <40 annotations in dev split OR <20 annotations in test split

In [15]:
aggression_df = aggression_df_full.copy()

annotations_count_in_splits = aggression_df.groupby(['worker_id', 'split'])['rev_id'].agg(['count']).reset_index()

votes_count_train = aggression_df[aggression_df.split.isin(['train'])].groupby('worker_id')['rev_id'].agg('count')
workers_to_remove = votes_count_train[votes_count_train.values < 100].index.tolist()

votes_count_dev = aggression_df[aggression_df.split.isin(['dev'])].groupby('worker_id')['rev_id'].agg('count')
workers_to_remove.extend(votes_count_dev[votes_count_dev.values < 40].index.tolist())

votes_aggresive_count_dev = aggression_df[aggression_df.split.isin(['dev'])].groupby('worker_id')['aggression'].agg('sum')
workers_to_remove.extend(votes_aggresive_count_dev[votes_aggresive_count_dev.values < 20].index.tolist())

votes_not_aggressive_count_dev = votes_count_dev - votes_aggresive_count_dev
workers_to_remove.extend(votes_not_aggressive_count_dev[votes_not_aggressive_count_dev.values < 20].index.tolist())

votes_count_test = aggression_df[aggression_df.split.isin(['test'])].groupby('worker_id')['rev_id'].agg('count')
workers_to_remove.extend(votes_count_test[votes_count_test.values < 20].index.tolist())
workers_to_remove = set(workers_to_remove)

aggression_df = aggression_df[~aggression_df.worker_id.isin(workers_to_remove)]

print(f'Number of annotators with <100 annotations in train set OR <20 in test set = {len(workers_to_remove)}')

Number of annotators with <100 annotations in train set OR <20 in test set = 3248


In [16]:
print(f"Number of annotators left: {len(set(aggression_df.worker_id))}")

Number of annotators left: 805


Remove 5 random annotators:

In [18]:
random_5 = set(np.random.choice(list(set(aggression_df.worker_id)), 5, replace=False))
random_5

{98, 736, 944, 997, 1856}

In [20]:
aggression_df = aggression_df[~aggression_df.worker_id.isin(random_5)]
print(f"Number of annotators left: {len(set(aggression_df.worker_id))}")

Number of annotators left: 800


Create folds by annotators:

In [22]:
def split_by_annotators(df):
    worker_ids = np.random.permutation(list(set(df.worker_id)))
    
    step = 80
    fold_idx = 1
    
    df['fold'] = 0
    for i in range(0, len(worker_ids), step):
        
        if i+step > len(worker_ids):
            break
        
        df.loc[df.worker_id.isin(worker_ids[i:i+step]), 'fold'] = int(fold_idx)
        fold_idx += 1
        
    df = df[df['fold'] > 0]
    
    return df
    
aggression_df = split_by_annotators(aggression_df)

Fast check to be sure:

In [23]:
aggression_df.groupby('fold').nunique()

Unnamed: 0_level_0,rev_id,worker_id,aggression,aggression_score,comment,year,logged_in,ns,sample,split
fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,41574,80,2,7,41541,16,2,2,2,3
2,41826,80,2,7,41797,16,2,2,2,3
3,42621,80,2,7,42591,15,1,2,2,3
4,42075,80,2,7,42040,16,2,2,2,3
5,41680,80,2,7,41643,16,2,2,2,3
6,39989,80,2,7,39953,16,1,2,2,3
7,43866,80,2,7,43831,16,2,2,2,3
8,44046,80,2,7,44008,15,2,2,2,3
9,43766,80,2,7,43720,16,2,2,2,3
10,41862,80,2,7,41829,16,2,2,2,3


### Annotations of texts from train dataset without annotations from users from fold-1

In [24]:
aggression_df[(aggression_df.fold != 1) & (aggression_df.split == 'train')]

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,comment,year,logged_in,ns,sample,split,fold
3,37675,1439,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,9
6,37675,481,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,6
7,37675,487,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,10
8,37675,578,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,7
9,37675,1127,0.0,0.0,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,5
...,...,...,...,...,...,...,...,...,...,...,...
1365208,699897151,1887,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,3
1365210,699897151,1608,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,9
1365213,699897151,15,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,7
1365214,699897151,57,0.0,0.0,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,2


### Annotations of texts from test dataset with annotations only from users from fold-1

In [12]:
aggression_df[(aggression_df.fold == 1) & (aggression_df.split == 'test')]

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,comment,year,logged_in,ns,sample,split,fold
224,286174,2190,0.0,0.0,NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENNEWLINE...,2002,False,article,random,test,1
377,386473,1020,0.0,1.0,`NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENNEWLIN...,2002,True,user,random,test,1
422,421948,3337,0.0,0.0,`NEWLINE_TOKENNEWLINE_TOKEN::: Pointing out wh...,2002,True,article,random,test,1
891,916927,72,0.0,1.0,NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENThanks ...,2003,True,user,random,test,1
902,923974,964,0.0,0.0,NEWLINE_TOKENNEWLINE_TOKEN:Given that she's on...,2003,True,user,random,test,1
...,...,...,...,...,...,...,...,...,...,...,...
1364931,699661834,763,0.0,0.0,`NEWLINE_TOKENNEWLINE_TOKEN== kys ==NEWLINE_TO...,2016,True,user,blocked,test,1
1365008,699698850,2533,0.0,0.0,"NEWLINE_TOKENNEWLINE_TOKENYeah, I realized I c...",2016,True,user,blocked,test,1
1365032,699703322,3249,0.0,0.0,`NEWLINE_TOKEN:::Yeah and in the earlier sente...,2016,True,article,blocked,test,1
1365185,699851288,12,0.0,1.0,NEWLINE_TOKENNEWLINE_TOKENThe Institute for Hi...,2016,True,article,blocked,test,1


In [27]:
aggression_df.loc[:, ['rev_id', 'worker_id', 'aggression', 'aggression_score', 'fold']].to_csv('../data/folded_aggresion_40.csv', index=False)

In [28]:
folded_aggression = pd.read_csv('../data/folded_aggresion_40.csv')

In [30]:
folded_aggression#.fold.max()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold
0,37675,1439,0.0,0.0,9
1,37675,481,0.0,0.0,6
2,37675,487,0.0,0.0,10
3,37675,578,0.0,0.0,7
4,37675,1127,0.0,0.0,5
...,...,...,...,...,...
545082,699897151,1887,0.0,0.0,3
545083,699897151,1608,0.0,0.0,9
545084,699897151,15,0.0,0.0,7
545085,699897151,57,0.0,0.0,2
