In [1]:
import os
import json
from random import shuffle

import pandas as pd

In [2]:
# List the contents of the data directory to see which input files are available.
os.listdir('../data')

['aggression_annotated_comments.tsv',
 'aggression_annotations.tsv',
 'aggression_worker_demographics.tsv',
 'data.py',
 'docs_for_embeddings.json',
 'entropy.csv',
 'folded_aggresion.csv',
 'merged_folds.csv',
 '__pycache__']

In [3]:
# Load the annotated comments table (tab-separated file).
comments = pd.read_csv('../data/aggression_annotated_comments.tsv', sep='\t')

In [4]:
# Display the comments frame (the rendered output is truncated by pandas).
comments

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train
...,...,...,...,...,...,...,...
115859,699848324,`NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENThese ...,2016,True,article,blocked,train
115860,699851288,NEWLINE_TOKENNEWLINE_TOKENThe Institute for Hi...,2016,True,article,blocked,test
115861,699857133,NEWLINE_TOKEN:The way you're trying to describ...,2016,True,article,blocked,train
115862,699891012,NEWLINE_TOKENNEWLINE_TOKEN== Warning ==NEWLINE...,2016,True,user,blocked,dev


In [5]:
# Load the per-document aggregates; the rendered preview shows the columns
# rev_id, p_aggressive, entropy, num_annotations and split.
data = pd.read_csv('../data/entropy.csv')

In [7]:
# Display the per-document frame (the rendered output is truncated by pandas).
data

Unnamed: 0,rev_id,p_aggressive,entropy,num_annotations,split
0,630648413,0.5,1.0,10,train
1,23142982,0.5,1.0,10,dev
2,386891548,0.5,1.0,6,train
3,226719672,0.5,1.0,8,train
4,118177029,0.5,1.0,26,test
...,...,...,...,...,...
115856,230923672,0.0,0.0,9,train
115857,230906115,0.0,0.0,8,train
115858,230899340,0.0,0.0,10,test
115859,230898107,0.0,0.0,14,train


In [8]:
# Load the per-annotation table; the rendered preview shows the columns
# rev_id, worker_id, aggression, aggression_score and fold.
folded_annotations = pd.read_csv('../data/folded_aggresion.csv')

In [9]:
# Peek at the first five annotation rows.
folded_annotations.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold
0,37675,1362,1.0,-1.0,8
1,37675,2408,0.0,1.0,7
2,37675,1493,0.0,0.0,1
3,37675,1439,0.0,0.0,3
4,37675,170,0.0,0.0,6


In [10]:
# Count the distinct workers in each fold. Selecting `worker_id` before
# aggregating avoids computing nunique for every other column of the frame;
# the result is the same Series indexed by fold.
folded_annotations.groupby('fold')['worker_id'].nunique()

fold
1     245
2     245
3     245
4     245
5     245
6     245
7     245
8     245
9     245
10    245
Name: worker_id, dtype: int64

In [11]:
def get_random_documents(document_list: list, nbr: int=20) -> list:
    """Return up to ``nbr`` documents picked at random from ``document_list``.

    The input list is not modified: a shallow copy is shuffled and the first
    ``nbr`` entries of that copy are returned. When the list holds fewer than
    ``nbr`` items, all of them are returned in random order.
    """
    docs = document_list.copy()
    shuffle(docs)
    # Honour the requested sample size (this was hard-coded to 20 before,
    # which silently ignored the ``nbr`` argument).
    return docs[:nbr]

In [12]:
def get_most_controversial(documents: pd.DataFrame, nbr: int=20) -> list:
    """Return the rev_ids of the ``nbr`` documents with the highest entropy.

    ``documents`` must provide the columns ``rev_id`` and ``entropy``; the
    frame passed in is not modified.
    """
    ranked = documents.sort_values('entropy', ascending=False)
    return ranked['rev_id'].iloc[:nbr].tolist()


def get_least_controversial(documents: pd.DataFrame, nbr: int=20) -> list:
    """Return the rev_ids of the ``nbr`` documents with the lowest entropy.

    ``documents`` must provide the columns ``rev_id`` and ``entropy``; the
    frame passed in is not modified.
    """
    ranked = documents.sort_values('entropy', ascending=True)
    lowest = ranked['rev_id'].iloc[:nbr]
    return lowest.tolist()

In [13]:
# For every worker, pick three lists of up to 20 dev-split rev_ids among the
# documents that worker annotated: a random sample, the highest-entropy ones
# and the lowest-entropy ones.
worker_to_docs = {}

# The dev-split filter is the same for every worker, so build it once here
# instead of re-scanning the full `data` frame on each loop iteration.
dev_data = data[data['split'] == 'dev']

for worker, worker_data in folded_annotations.groupby('worker_id'):
    # rev_ids annotated by this worker
    documents = worker_data.rev_id.to_list()
    # dev-split rows of `data` restricted to those rev_ids
    dt = dev_data[dev_data['rev_id'].isin(documents)]
    random_docs = get_random_documents(dt.rev_id.to_list(), 20)
    controversial_docs = get_most_controversial(dt, 20)
    least_contr_docs = get_least_controversial(dt, 20)
    
    worker_to_docs[worker] = {
        'random': random_docs,
        'controversial': controversial_docs,
        'non_controversial': least_contr_docs
    }

In [13]:
# Write the per-worker selection to disk only when the file is not there yet;
# an existing file is left untouched.
docs_path = '../data/docs_for_embeddings.json'
if not os.path.exists(docs_path):
    with open(docs_path, 'w') as out_file:
        json.dump(worker_to_docs, out_file, indent=4)

In [14]:
# Left-join the comment columns onto every annotation row, matching on rev_id.
merged_folds = folded_annotations.merge(
    comments,
    on='rev_id',
    how='left',
    suffixes=('_fa', '_com'),
)
merged_folds

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,split
0,37675,1362,1.0,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,37675,2408,0.0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
2,37675,1493,0.0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
3,37675,1439,0.0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
4,37675,170,0.0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
...,...,...,...,...,...,...,...,...,...,...,...
1213886,699897151,628,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1213887,699897151,15,0.0,0.0,4,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1213888,699897151,57,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1213889,699897151,1815,0.0,0.0,7,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train


In [16]:
# Left-join the per-document columns from `data` onto the merged annotations.
# Both frames carry a `split` column, hence the `_mf` / `_da` suffixes.
merged_folds_aggression = merged_folds.merge(
    data,
    on='rev_id',
    how='left',
    suffixes=('_mf', '_da'),
)
merged_folds_aggression

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,split_mf,p_aggressive,entropy,num_annotations,split_da
0,37675,1362,1.0,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,train
1,37675,2408,0.0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,train
2,37675,1493,0.0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,train
3,37675,1439,0.0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,train
4,37675,170,0.0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213886,699897151,628,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,train
1213887,699897151,15,0.0,0.0,4,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,train
1213888,699897151,57,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,train
1213889,699897151,1815,0.0,0.0,7,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,train


In [18]:
# Build the export frame: keep the selected columns, rename `split_mf` back to
# `split`, drop rows with missing values, cast the id/label columns to int,
# then write the result to CSV without the index.
export_columns = [
    'rev_id', 'worker_id', 'aggression', 'fold',
    'comment', 'split_mf', 'p_aggressive', 'entropy',
]
result_df = (
    merged_folds_aggression[export_columns]
    .rename(columns={'split_mf': 'split'})
    .dropna()
    .astype({'rev_id': int, 'worker_id': int, 'aggression': int, 'fold': int})
)
result_df.to_csv('../data/merged_folds_aggression.csv', index=False)

In [28]:
# NOTE: only the last expression of a cell is displayed, so this count of
# fold-10 rows in the test split is computed but never shown.
len(merged_folds[(merged_folds.fold == 10) & (merged_folds.split == 'test')])
# Number of distinct workers in the merged frame.
len(merged_folds.worker_id.unique())

2450

In [59]:
# NOTE: this NaN count is not displayed because it is not the cell's last expression.
merged_folds['aggression'].isna().sum()
# Column dtypes of the merged frame.
merged_folds.dtypes

rev_id                int32
worker_id             int32
aggression            int32
aggression_score    float64
fold                  int32
comment              object
year                float64
logged_in           float64
ns                   object
sample               object
split                object
dtype: object

In [54]:
import numpy as np
# Largest aggression label; Series.max() is vectorized, whereas the builtin
# max() walks the whole column element by element in Python.
print(merged_folds.aggression.max())

1


In [38]:
# Collect the comments of worker_id 11111 into a list (presumably an id with
# no annotations, given the variable name — TODO confirm).
empty_list = merged_folds_aggression[merged_folds_aggression['worker_id']==11111]['comment'].tolist()
# Selecting a column from a zero-row slice yields an empty Series.
result_df[:0]['comment']

Series([], Name: comment, dtype: object)

In [41]:
import random
# result_df.sort_values(by='p_aggressive', ascending=False)
# Dev-split rows annotated by worker 3151, ordered from the highest to the
# lowest p_aggressive; show the top three.
is_worker_dev = (result_df['worker_id'] == 3151) & (result_df['split'] == 'dev')
texts = result_df[is_worker_dev].sort_values('p_aggressive', ascending=False)
texts.head(3)

Unnamed: 0,rev_id,worker_id,aggression,fold,comment,split,p_aggressive,entropy
1180901,671644617,3151,1,1,NEWLINE_TOKEN:@: nobody cares about the Fuckin...,dev,1.0,0.0
777868,370848423,3151,1,1,NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKEN@KWWWWW...,dev,0.875,0.543564
795784,381083216,3151,1,1,`tNEWLINE_TOKENNEWLINE_TOKEN:::Brutal. I shall...,dev,0.666667,0.918296


In [2]:
# Load the previously saved merged table and display it (output truncated by pandas).
df = pd.read_csv('../data/merged_folds.csv')
df

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,split,p_aggressive,entropy,num_annotations,majority,is_decision_major,agg_prc_mainstream,n_agg_prc_mainstream,overall_prc_mainstream
0,37675,1362,1.0,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0.0,0,0.853211,0.975524,0.955947
1,37675,2408,0.0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0.0,1,0.866667,0.948052,0.934783
2,37675,1493,0.0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0.0,1,0.918367,0.955521,0.952924
3,37675,1439,0.0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0.0,1,0.836364,0.943534,0.925645
4,37675,170,0.0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0.0,1,0.509091,0.975709,0.915394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1213886,699897151,628,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,0.0,1,0.593548,0.989796,0.916766
1213887,699897151,15,0.0,0.0,4,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,0.0,1,0.914474,0.920726,0.919933
1213888,699897151,57,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,0.0,1,0.994318,0.686660,0.734033
1213889,699897151,1815,0.0,0.0,7,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train,0.0,0.000000,9,0.0,1,0.939024,0.889299,0.895833


In [9]:
import numpy as np
# Rows annotated by worker 1362.
temp_df = df[df.worker_id == 1362]
# Uninitialised flat float32 buffer with 305 slots per row of temp_df
# (305 is presumably a per-document vector width — TODO confirm).
temp_arr = np.empty((len(temp_df) * 305), dtype=np.float32)
# Sanity check: the first 305-element slice has shape (305,).
temp_arr[0: 305].shape

(305,)