In [1]:
import os
import json
from random import sample, shuffle

import pandas as pd



In [2]:
os.listdir('../data')

['chude_merged_folds.7z',
 'aggression',
 'aggression_annotations.tsv',
 'entropy_text.csv',
 'data.py',
 'merged_folds.csv',
 'folded_aggresion.csv',
 'aggression_annotated_comments.tsv',
 'chude_merged_folds.csv',
 'toxicity',
 'docs_for_embeddings.json',
 'entropy_test.csv',
 'merged_folds.7z',
 'aggression_worker_demographics.tsv',
 'entropy.csv',
 'attack']

In [3]:
comments = pd.read_csv('../data/aggression_annotated_comments.tsv', sep='\t')

In [4]:
comments

Unnamed: 0,rev_id,comment,year,logged_in,ns,sample,split
0,37675,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,44816,`NEWLINE_TOKENNEWLINE_TOKEN:: the term ``stand...,2002,True,article,random,train
2,49851,"NEWLINE_TOKENNEWLINE_TOKENTrue or false, the s...",2002,True,article,random,train
3,89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev
4,93890,This page will need disambiguation.,2002,True,article,random,train
...,...,...,...,...,...,...,...
115859,699848324,`NEWLINE_TOKENNEWLINE_TOKENNEWLINE_TOKENThese ...,2016,True,article,blocked,train
115860,699851288,NEWLINE_TOKENNEWLINE_TOKENThe Institute for Hi...,2016,True,article,blocked,test
115861,699857133,NEWLINE_TOKEN:The way you're trying to describ...,2016,True,article,blocked,train
115862,699891012,NEWLINE_TOKENNEWLINE_TOKEN== Warning ==NEWLINE...,2016,True,user,blocked,dev


In [5]:
data = pd.read_csv('../data/entropy.csv')
entropy_test = pd.read_csv('../data/entropy_test.csv')

In [42]:
data

Unnamed: 0,rev_id,p_aggressive,entropy,num_annotations,split
0,630648413,0.5,1.0,10,train
1,23142982,0.5,1.0,10,dev
2,386891548,0.5,1.0,6,train
3,226719672,0.5,1.0,8,train
4,118177029,0.5,1.0,26,test
...,...,...,...,...,...
115856,230923672,0.0,0.0,9,train
115857,230906115,0.0,0.0,8,train
115858,230899340,0.0,0.0,10,test
115859,230898107,0.0,0.0,14,train


In [6]:
data

Unnamed: 0,rev_id,p_aggressive,entropy,num_annotations,split
0,630648413,0.5,1.0,10,train
1,23142982,0.5,1.0,10,dev
2,386891548,0.5,1.0,6,train
3,226719672,0.5,1.0,8,train
4,118177029,0.5,1.0,26,test
...,...,...,...,...,...
115856,230923672,0.0,0.0,9,train
115857,230906115,0.0,0.0,8,train
115858,230899340,0.0,0.0,10,test
115859,230898107,0.0,0.0,14,train


In [7]:
folded_annotations = pd.read_csv('../data/folded_aggresion.csv')

In [8]:
folded_annotations.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold
0,37675,1362,1.0,-1.0,8
1,37675,2408,0.0,1.0,7
2,37675,1493,0.0,0.0,1
3,37675,1439,0.0,0.0,3
4,37675,170,0.0,0.0,6


In [9]:
folded_annotations.groupby('fold').nunique()['worker_id']

fold
1     245
2     245
3     245
4     245
5     245
6     245
7     245
8     245
9     245
10    245
Name: worker_id, dtype: int64

In [10]:
def get_random_documents(document_list: list, nbr: int=20) -> list:
    docs = document_list.copy()
    shuffle(docs)
    return docs[:20]

In [11]:
def get_most_controversial(documents: pd.DataFrame, nbr: int=20) -> list:
    """Returns top nbr of controversial documents, needed input is 
        the DataFrame with rev_id and entropy columns
    """
    documents = documents.sort_values(by='entropy', ascending=False)
    
    return documents['rev_id'].to_list()[:nbr]


def get_least_controversial(documents: pd.DataFrame, nbr: int=20) -> list:
    """Returns top nbr of the least controversial documents, needed input is 
        the DataFrame with rev_id and entropy columns
    """
    documents = documents.sort_values(by='entropy', ascending=True)
    
    return documents['rev_id'].to_list()[:nbr]


def get_balanced(documents: pd.DataFrame, max_nbr_per_class: int=10) -> dict:
    aggressive = get_most_controversial(
        documents[documents.aggression==1], max_nbr_per_class)
    
    non_aggresive = get_most_controversial(
        documents[documents.aggression==0], max_nbr_per_class)
    
    return {'aggresive': aggressive, 'nonaggresive': non_aggresive}


def shuffle_with_return_at_the_end(doc_ids: list, nbr: int=10) -> list:
    shuffled = []
    
    while len(shuffled) < nbr:
        shuffled += sample(doc_ids, len(doc_ids))
        
    return shuffled[:nbr]
    

def get_random_balanced(documents: pd.DataFrame, max_nbr_per_class: int=10) -> dict:
    
    aggressive = shuffle_with_return_at_the_end(
        documents[documents.aggression==1].rev_id.to_list(), max_nbr_per_class)
    
    non_aggresive = shuffle_with_return_at_the_end(
        documents[documents.aggression==0].rev_id.to_list(), max_nbr_per_class)
    
    return {'aggresive': aggressive, 'nonaggresive': non_aggresive}

In [12]:
worker_to_docs = {}

if not os.path.exists('../data/docs_for_embeddings.json'):

    for worker, worker_data in folded_annotations.groupby('worker_id'):
        documents = worker_data.rev_id.to_list()
        dt = data[(data['split'] == 'dev') & (data['rev_id'].isin(documents))]
        random_docs = get_random_documents(dt.rev_id.to_list(), 20)
        controversial_docs = get_most_controversial(dt, 20)
        least_contr_docs = get_least_controversial(dt, 20)
        
        dt = dt.merge(worker_data, how='left', on='rev_id')
    
        balanced = get_balanced(dt, 10)
        random_balanced = get_random_balanced(dt, 10)

        worker_to_docs[worker] = {
            'random': random_docs,
            'controversial': controversial_docs,
            'non_controversial': least_contr_docs,
            'balanced': balanced,
            'random_balanced': random_balanced
        }

    with open('../data/docs_for_embeddings.json', 'w') as file:
        json.dump(worker_to_docs, file, indent=4)
        
else:
    with open('../data/docs_for_embeddings.json', 'r') as file:
        worker_to_docs = json.load(file)
    

In [13]:
worker_to_docs

{'0': {'random': [169545601,
   65345647,
   138162578,
   595455177,
   609528342,
   682288627,
   24956842,
   122527375,
   213154518,
   112838925,
   131671291,
   126722558,
   184282239,
   432594830,
   637494707,
   160231668,
   231429060,
   42873195,
   124300857,
   415275454],
  'controversial': [463940735,
   195600431,
   122560905,
   174622478,
   213154518,
   168927940,
   682288627,
   562784485,
   298350830,
   44528813,
   272296035,
   37912969,
   215380609,
   695825460,
   423824335,
   116095529,
   31370737,
   45455267,
   437425705,
   204196250],
  'non_controversial': [609528342,
   107408731,
   108274319,
   110603143,
   106052633,
   121526978,
   122527375,
   88930385,
   81801252,
   82243699,
   147577932,
   151178341,
   162152234,
   162313061,
   131793821,
   126722558,
   138162578,
   136609050,
   30375850,
   41818973],
  'balanced': {'aggresive': [463940735,
    195600431,
    122560905,
    213154518,
    168927940,
    174622478,
 

In [14]:
# join folded_annotations with comments on rev_id column values
merged_folds = folded_annotations.merge(comments, how='left', left_on='rev_id', right_on='rev_id', suffixes=('_fa', '_com'))
merged_folds

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,split
0,37675,1362,1.0,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
1,37675,2408,0.0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
2,37675,1493,0.0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
3,37675,1439,0.0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
4,37675,170,0.0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train
...,...,...,...,...,...,...,...,...,...,...,...
1213886,699897151,628,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1213887,699897151,15,0.0,0.0,4,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1213888,699897151,57,0.0,0.0,10,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train
1213889,699897151,1815,0.0,0.0,7,Alternate option===NEWLINE_TOKENIs there perha...,2016,True,article,blocked,train


In [15]:
# save merged_folds to csv
merged_folds = merged_folds.dropna()
merged_folds['rev_id'] = merged_folds['rev_id'].astype(int)
merged_folds['worker_id'] = merged_folds['worker_id'].astype(int)
merged_folds['aggression'] = merged_folds['aggression'].astype(int)
merged_folds['fold'] = merged_folds['fold'].astype(int)

In [16]:
len(merged_folds)

1213891

In [17]:
len(merged_folds[(merged_folds.fold == 10) & (merged_folds.split == 'test')])
len(merged_folds.worker_id.unique())

2450

In [18]:
merged_folds['aggression'].isna().sum()
merged_folds.dtypes

rev_id                int64
worker_id             int64
aggression            int64
aggression_score    float64
fold                  int64
comment              object
year                  int64
logged_in              bool
ns                   object
sample               object
split                object
dtype: object

In [19]:
import numpy as np
print(max(merged_folds.aggression))

1


In [20]:
merged_folds = merged_folds.merge(data, how='left')
len(merged_folds)



1213891

In [21]:
merged_folds.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,split,p_aggressive,entropy,num_annotations
0,37675,1362,1,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10
1,37675,2408,0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10
2,37675,1493,0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10
3,37675,1439,0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10
4,37675,170,0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10


In [22]:
def top_value_count(x: pd.Series) -> int:
    values = x.value_counts()
    
    if len(values) > 1 and values[0] == values[1]:
        return 1
    
    return values.index[0]

def get_majority(data: pd.DataFrame) -> pd.DataFrame:
    counts = data.groupby('rev_id').aggression
    majority = counts.apply(top_value_count).reset_index()
    return majority

In [23]:
#it might take a moment
majority = get_majority(merged_folds[['rev_id', 'aggression']])
majority.rename(columns={'aggression': 'majority'}, inplace=True)
majority.head()

Unnamed: 0,rev_id,majority
0,37675,0
1,44816,0
2,49851,0
3,89320,1
4,93890,0


In [24]:
majority.head()

Unnamed: 0,rev_id,majority
0,37675,0
1,44816,0
2,49851,0
3,89320,1
4,93890,0


In [25]:
merged_folds = merged_folds.merge(majority, how='left', on='rev_id')
len(merged_folds)

1213891

In [26]:
merged_folds['is_decision_major'] = (merged_folds.aggression == merged_folds.majority).astype(int)

In [27]:
merged_folds.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,split,p_aggressive,entropy,num_annotations,majority,is_decision_major
0,37675,1362,1,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0,0
1,37675,2408,0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0,1
2,37675,1493,0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0,1
3,37675,1439,0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0,1
4,37675,170,0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,train,0.1,0.468996,10,0,1


In [28]:
def count_mainstream(x: pd.DataFrame) -> dict:
    overall = len(x[x.is_decision_major == 1]) / len(x)
    aggr_data = x[x.majority == 1]
    
    try:
        aggr = len(aggr_data[aggr_data.is_decision_major == 1]) / len(aggr_data)
    except ZeroDivisionError:
        aggr = None
        
    n_aggr_data = x[x.majority == 0]
    try:
        n_aggr = len(n_aggr_data[n_aggr_data.is_decision_major == 1]) / len(n_aggr_data)
    except ZeroDivisionError:
        n_aggr = None

    return {
        'agg_prc_mainstream': aggr,
        'n_agg_prc_mainstream': n_aggr,
        'overall_prc_mainstream': overall
    }

def get_mainstream(data: pd.DataFrame) -> pd.DataFrame:
    mainstream_groups = data.groupby('worker_id')[['majority', 'is_decision_major']]
    
    mainstream = mainstream_groups.apply(count_mainstream).reset_index()
    
    mainstream = pd.concat([mainstream.drop([0], axis=1), mainstream[0].apply(pd.Series)], axis=1)
    return mainstream

In [29]:
mainstream = get_mainstream(merged_folds)
mainstream.head()

Unnamed: 0,worker_id,agg_prc_mainstream,n_agg_prc_mainstream,overall_prc_mainstream
0,0,0.975758,0.765734,0.799609
1,1,0.693548,0.991304,0.945946
2,2,0.315789,0.925754,0.9
3,3,0.666667,0.981602,0.947826
4,4,0.738854,0.989704,0.947537


In [30]:
merged_folds = merged_folds.merge(mainstream, how='left', on='worker_id')

In [31]:
def count_weighted_mainstream(x: pd.DataFrame) -> dict:
    x['agreed_with'] = x.apply(lambda x: x.p_aggressive if x.aggression else 1 - x.p_aggressive, axis=1)
    
    overall = sum(x.agreed_with) / len(x)
    
    aggr_data = x[x.aggression == 1]
    try:
        aggr = sum(aggr_data.agreed_with) / len(aggr_data)
    except ZeroDivisionError:
        aggr = None
    
    n_aggr_data = x[x.aggression == 0]
    
    try:
        n_aggr = sum(n_aggr_data.agreed_with) / len(n_aggr_data)
    except ZeroDivisionError:
        n_aggr = None
    
    return {
        'w_agg_prc_mainstream': aggr,
        'w_n_agg_prc_mainstream': n_aggr,
        'w_overall_prc_mainstream': overall
    }

def get_weighted_mainstream(data: pd.DataFrame) -> pd.DataFrame:
    mainstream_groups = data.groupby('worker_id')[['aggression', 'p_aggressive']]
    
    mainstream = mainstream_groups.apply(count_weighted_mainstream).reset_index()
    
    mainstream = pd.concat([mainstream.drop([0], axis=1), mainstream[0].apply(pd.Series)], axis=1)
    return mainstream

In [32]:
weighted_mainstream = get_weighted_mainstream(merged_folds)
weighted_mainstream.head()

Unnamed: 0,worker_id,w_agg_prc_mainstream,w_n_agg_prc_mainstream,w_overall_prc_mainstream
0,0,0.474881,0.957366,0.786633
1,1,0.823387,0.893834,0.885872
2,2,0.25912,0.932343,0.875493
3,3,0.757218,0.916672,0.902652
4,4,0.837419,0.890882,0.883784


In [33]:
merged_folds = merged_folds.merge(weighted_mainstream, how='left', on='worker_id')
merged_folds.to_csv('../data/merged_folds.csv', index=False)
merged_folds.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,...,entropy,num_annotations,majority,is_decision_major,agg_prc_mainstream,n_agg_prc_mainstream,overall_prc_mainstream,w_agg_prc_mainstream,w_n_agg_prc_mainstream,w_overall_prc_mainstream
0,37675,1362,1,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.468996,10,0,0,0.853211,0.975524,0.955947,0.75321,0.920649,0.894341
1,37675,2408,0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.468996,10,0,1,0.866667,0.948052,0.934783,0.684949,0.938019,0.891256
2,37675,1493,0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.468996,10,0,1,0.918367,0.955521,0.952924,0.609341,0.957012,0.92031
3,37675,1439,0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.468996,10,0,1,0.836364,0.943534,0.925645,0.699629,0.909334,0.870193
4,37675,170,0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.468996,10,0,1,0.509091,0.975709,0.915394,0.696506,0.858145,0.84409


In [34]:
def calculate_mainstream_for_only_top(dataset: pd.DataFrame, top: int):
    new_worker_docs = {w: d['controversial'][:top] for w, d in worker_to_docs.items()}
    
    dataset['is_top_mainstream'] = dataset.apply(lambda x: 
                                                 1 if x['rev_id'] in new_worker_docs[str(x['worker_id'])] 
                                                 else 0, axis=1)
    
    dataset = dataset[dataset.is_top_mainstream == 1]
    
    return get_mainstream(dataset), get_weighted_mainstream(dataset)

In [35]:
mn_5, w_mn_5 = calculate_mainstream_for_only_top(merged_folds[merged_folds.split=='dev'], 5)
mn_30, w_mn_30 = calculate_mainstream_for_only_top(merged_folds[merged_folds.split=='dev'], 20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['is_top_mainstream'] = dataset.apply(lambda x:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['is_top_mainstream'] = dataset.apply(lambda x:


In [36]:
mn_5 = mn_5.rename(columns = {c: f'5_{c}' for c in mn_5.columns if c != 'worker_id'})
w_mn_5 = w_mn_5.rename(columns = {c: f'5_{c}' for c in w_mn_5.columns if c != 'worker_id'})
mn_30 = mn_30.rename(columns = {c: f'30_{c}' for c in mn_30.columns if c != 'worker_id'})
w_mn_30 = w_mn_30.rename(columns = {c: f'30_{c}' for c in w_mn_30.columns if c != 'worker_id'})

In [37]:
mn_5

Unnamed: 0,worker_id,5_agg_prc_mainstream,5_n_agg_prc_mainstream,5_overall_prc_mainstream
0,0,1.000000,0.000000,0.4
1,1,1.000000,1.000000,1.0
2,2,,0.600000,0.6
3,3,0.000000,1.000000,0.6
4,4,0.500000,1.000000,0.6
...,...,...,...,...
2445,3994,0.750000,0.000000,0.6
2446,4007,0.666667,0.000000,0.4
2447,4008,0.000000,0.333333,0.2
2448,4011,0.333333,0.500000,0.4


In [38]:
merged_folds = merged_folds.merge(mn_5, how='left', on='worker_id')
merged_folds = merged_folds.merge(w_mn_5, how='left', on='worker_id')
merged_folds = merged_folds.merge(mn_30, how='left', on='worker_id')
merged_folds = merged_folds.merge(w_mn_30, how='left', on='worker_id')

In [39]:
merged_folds.columns

Index(['rev_id', 'worker_id', 'aggression', 'aggression_score', 'fold',
       'comment', 'year', 'logged_in', 'ns', 'sample', 'split', 'p_aggressive',
       'entropy', 'num_annotations', 'majority', 'is_decision_major',
       'agg_prc_mainstream', 'n_agg_prc_mainstream', 'overall_prc_mainstream',
       'w_agg_prc_mainstream', 'w_n_agg_prc_mainstream',
       'w_overall_prc_mainstream', '5_agg_prc_mainstream',
       '5_n_agg_prc_mainstream', '5_overall_prc_mainstream',
       '5_w_agg_prc_mainstream', '5_w_n_agg_prc_mainstream',
       '5_w_overall_prc_mainstream', '30_agg_prc_mainstream',
       '30_n_agg_prc_mainstream', '30_overall_prc_mainstream',
       '30_w_agg_prc_mainstream', '30_w_n_agg_prc_mainstream',
       '30_w_overall_prc_mainstream'],
      dtype='object')

In [40]:
merged_folds.to_csv('../data/merged_folds.csv', index=False)
merged_folds.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,...,5_overall_prc_mainstream,5_w_agg_prc_mainstream,5_w_n_agg_prc_mainstream,5_w_overall_prc_mainstream,30_agg_prc_mainstream,30_n_agg_prc_mainstream,30_overall_prc_mainstream,30_w_agg_prc_mainstream,30_w_n_agg_prc_mainstream,30_w_overall_prc_mainstream
0,37675,1362,1,-1.0,8,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.6,0.481481,0.555556,0.511111,0.625,0.75,0.7,0.483395,0.488657,0.486553
1,37675,2408,0,1.0,7,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.4,0.444444,0.527778,0.511111,0.75,0.875,0.8,0.493216,0.492889,0.493053
2,37675,1493,0,0.0,1,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.6,0.466667,,0.466667,1.0,0.625,0.7,0.511716,0.511389,0.511553
3,37675,1439,0,0.0,3,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.4,0.481481,0.555556,0.511111,0.666667,0.714286,0.7,0.527145,0.517824,0.521553
4,37675,170,0,0.0,6,`-NEWLINE_TOKENThis is not ``creative``. Thos...,2002,True,article,random,...,0.4,,0.533333,0.533333,0.0,0.909091,0.5,0.625,0.506407,0.512336


In [43]:
# join folded_annotations with comments on rev_id column values
test_merged_folds = folded_annotations.merge(comments, how='left', left_on='rev_id', right_on='rev_id', suffixes=('_fa', '_com'))
test_merged_folds = test_merged_folds[test_merged_folds.split=='test']
test_merged_folds.head()
len(test_merged_folds)

304378

In [45]:
test_merged_folds = test_merged_folds.merge(entropy_test, how='left')
len(test_merged_folds)

304378

In [50]:
test_majority = get_majority(test_merged_folds[['rev_id', 'aggression']])
test_majority.rename(columns={'aggression': 'majority'}, inplace=True)
test_merged_folds = test_merged_folds.merge(test_majority, how='left', on='rev_id')
test_merged_folds['is_decision_major'] = (test_merged_folds.aggression == test_merged_folds.majority).astype(int)

In [51]:
test_mainstream = get_mainstream(test_merged_folds)
test_mainstream.head()

Unnamed: 0,worker_id,agg_prc_mainstream,n_agg_prc_mainstream,overall_prc_mainstream
0,0,0.961538,0.743697,0.782759
1,1,0.69697,0.981132,0.932292
2,2,0.0,0.972603,0.934211
3,3,0.6875,0.976285,0.94386
4,4,0.652174,0.994681,0.92735


In [52]:
test_weighted_mainstream = get_weighted_mainstream(test_merged_folds)
test_weighted_mainstream.head()

Unnamed: 0,worker_id,w_agg_prc_mainstream,w_n_agg_prc_mainstream,w_overall_prc_mainstream
0,0,0.469359,0.957621,0.770735
1,1,0.793505,0.878725,0.867185
2,2,0.105556,0.929724,0.908035
3,3,0.745353,0.914715,0.898076
4,4,0.839492,0.870432,0.866333


In [55]:
test_merged_folds = test_merged_folds.merge(test_mainstream, how='left', on='worker_id')
test_merged_folds = test_merged_folds.merge(test_weighted_mainstream, how='left', on='worker_id')

test_merged_folds.head()

Unnamed: 0,rev_id,worker_id,aggression,aggression_score,fold,comment,year,logged_in,ns,sample,...,entropy,num_annotations,majority,is_decision_major,w_agg_prc_mainstream,w_n_agg_prc_mainstream,w_overall_prc_mainstream,agg_prc_mainstream,n_agg_prc_mainstream,overall_prc_mainstream
0,155243,699,0.0,0.0,2,NEWLINE_TOKENNEWLINE_TOKEN:If I may butt in I...,2002,True,user,random,...,0.811278,8,0.0,1,0.624664,0.929338,0.797023,0.962963,0.801653,0.851429
1,155243,144,0.0,0.0,6,NEWLINE_TOKENNEWLINE_TOKEN:If I may butt in I...,2002,True,user,random,...,0.811278,8,0.0,1,0.55367,0.949803,0.892608,0.944444,0.940828,0.941176
2,155243,214,1.0,-1.0,8,NEWLINE_TOKENNEWLINE_TOKEN:If I may butt in I...,2002,True,user,random,...,0.811278,8,0.0,0,0.626386,0.933698,0.85709,0.901639,0.888889,0.891117
3,155243,240,0.0,0.0,7,NEWLINE_TOKENNEWLINE_TOKEN:If I may butt in I...,2002,True,user,random,...,0.811278,8,0.0,1,0.649939,0.901663,0.830019,1.0,0.877358,0.9
4,155243,449,1.0,-1.0,9,NEWLINE_TOKENNEWLINE_TOKEN:If I may butt in I...,2002,True,user,random,...,0.811278,8,0.0,0,0.493543,0.93759,0.810431,0.885714,0.827027,0.836364


In [56]:
test_merged_folds[['rev_id', 'worker_id', 'aggression', 
                   'p_aggressive', 'entropy', 'majority', 
                   'is_decision_major', 'w_agg_prc_mainstream', 
                   'w_n_agg_prc_mainstream', 'w_overall_prc_mainstream',
                   'agg_prc_mainstream', 'n_agg_prc_mainstream', 'overall_prc_mainstream'
                  ]].to_csv('test_statistics.csv')