In [2]:
import pandas as pd
import numpy as np

In [6]:
toxicity_annotated_comments = pd.read_csv("./wiki_data/toxicity_annotated_comments.tsv", delimiter="\t")
toxicity_annotated_comments.set_index("rev_id", inplace=True)
toxicity_annotations = pd.read_csv("./wiki_data/toxicity_annotations.tsv", delimiter="\t")
toxicity_annotations["id"] = toxicity_annotations["worker_id"].astype(str) + '_' + toxicity_annotations["rev_id"].astype(str)
toxicity_annotations.set_index("id", inplace=True)
toxicity_worker_demographics = pd.read_csv("./wiki_data/toxicity_worker_demographics.tsv", delimiter="\t")
toxicity_worker_demographics.set_index("worker_id", inplace=True)

In [110]:
best_workers_index = toxicity_annotations['worker_id'].value_counts().index[:100]

In [111]:
print(best_workers_index)

Int64Index([ 43,  98,  67, 274,  85,  22, 313, 925, 118, 147, 285, 261,  90,
            157, 342, 278, 281, 348,  93, 186, 164, 494, 252, 265,  73, 502,
            152, 544, 116, 442,  95, 131, 385, 659, 108,  26, 155,  19, 328,
             54,  84, 167, 193, 429, 119, 150,  75, 181, 316, 497, 257, 124,
             99,  55, 204, 106, 234, 449,  15, 406, 583, 529,   6, 269, 685,
            176, 656, 273,  58, 180,  86,  71,   0,   5, 366, 111, 130, 276,
            545, 141,  23, 514, 396, 505, 461, 104, 210, 201, 499, 375,  80,
             70, 485, 132, 239, 466, 381, 249, 140, 138],
           dtype='int64')


In [112]:
best_workers = toxicity_worker_demographics.loc[best_workers_index]
best_workers["index"] = list(range(best_workers.shape[0]))
best_workers.head()

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

In [91]:
rated_works = toxicity_annotations[toxicity_annotations['worker_id'].isin(best_workers.index)]
rev_to_id = rated_works.groupby("rev_id").mean().drop(['worker_id', 'toxicity', "toxicity_score"], axis=1)
rev_to_id["index"] = list(range(rev_to_id.shape[0]))
rev_to_id.tail()

Unnamed: 0_level_0,index
rev_id,Unnamed: 1_level_1
699755057.0,35598
699756185.0,35599
699780538.0,35600
699820699.0,35601
699848324.0,35602


In [92]:
ratings = np.zeros((len(best_workers_index), rev_to_id.shape[0]))

In [93]:
for i, work in rated_works.iterrows(): 
    #print(work)
    worker_index = best_workers.loc[work["worker_id"]]["index"]
    rev_index = rev_to_id.loc[work["rev_id"]]["index"]
    
    ratings[worker_index, rev_index] = -(work["toxicity"] * 2 - 1)

In [94]:
ratings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# save data locally

In [95]:
from tempfile import TemporaryFile
outfile = TemporaryFile()
np.save(outfile, ratings)

In [96]:
with open('ratings_all.npy', 'wb') as f:
    np.save(f, ratings)

In [97]:
with open('ratings_all.npy', 'rb') as f:
    a = np.load(f)
    
print(a)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
