## Running Example

In [1]:
import crowded.simulate as cs
import crowded.method as cm
import crowded.make as mk
from pycm import *
import pandas as pd

#### Simulate the tasks

In [2]:
total_tasks = 1643
p_hard_tasks = 0
number_of_valid_answers = 9

In [3]:
df_tasks = cs.Tasks(number_of_valid_answers).create(total_tasks, p_hard_tasks)
df_tasks.head()

Unnamed: 0_level_0,task_id,true_answers,label_task,prob_task
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
task_E7xpdJTXNAQv,task_E7xpdJTXNAQv,bundles,easy_task,0.89
task_L6qC6KcDnBca,task_L6qC6KcDnBca,data,easy_task,0.98
task_adu4Y8gyrqKP,task_adu4Y8gyrqKP,bundles,easy_task,0.95
task_X6eJPEofGKf8,task_X6eJPEofGKf8,generations,easy_task,0.65
task_9eLzASnVhraq,task_9eLzASnVhraq,generations,easy_task,0.59


In [4]:
valid_answers = df_tasks['true_answers'].unique()
print(valid_answers)

['bundles' 'data' 'generations' 'thirty' 'limitation' 'yarn' 'detent'
 'advertisement' 'materials']


#### Simulate the workers

In [5]:
total_workers = 145

In [6]:
workers = cs.Workers().create(total_workers)
workers.head()

Unnamed: 0_level_0,worker_id,prob_worker
id,Unnamed: 1_level_1,Unnamed: 2_level_1
6shB43PdNJRu,6shB43PdNJRu,0.67141
MZ29iQSA2nei,MZ29iQSA2nei,0.71003
LAUZfierwB5s,LAUZfierwB5s,0.635493
Gx2PF995tXTr,Gx2PF995tXTr,0.847156
p7wsK846hE3p,p7wsK846hE3p,0.596849


#### Split the tasks

In [7]:
percentage_to_train = 0.3

In [8]:
tasks_train, tasks_rest = mk.tasks_split(df_tasks, percentage_to_train)

#### Assigning workers to tasks

In [9]:
workers_per_task = 3

In [10]:
df_tw = cs.AssignTasks(tasks_train, workers, workers_per_task).create()
df_tw.head()

Unnamed: 0,task_id,worker_id,true_answers,label_task,prob_task,prob_worker
0,task_3GQs5ptNSUVU,7yLqPhe7TJty,yarn,easy_task,0.99,0.69135
1,task_3GQs5ptNSUVU,MnCuNgyMqzbT,yarn,easy_task,0.99,0.805523
2,task_3GQs5ptNSUVU,2bPFmZW8Pq6h,yarn,easy_task,0.99,0.697745
3,task_JDByzAtMpLhT,Ccv2aKc9r95p,thirty,easy_task,0.92,0.597253
4,task_JDByzAtMpLhT,RuiH3eBUzcHU,thirty,easy_task,0.92,0.676876


#### Compute the probability to assess the tasks

In [11]:
cp = cm.ComputeProbability(df_tw['prob_task'], df_tw['prob_worker'], valid_answers)
df_tw['worker_answers'] = cm.WorkerAnswer(df_tw['true_answers'], cp.predict(), valid_answers).match()
df_tw['performance'] = cp.predict()
#df_tw.head()

#### Assess the performance and get the good workers

In [12]:
perf = cm.Performance(df_tw)
trained_workers = perf.trained_workers()
trained_workers.head()

Unnamed: 0_level_0,worker_id,prob_worker
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2XYfi2sDkupf,2XYfi2sDkupf,0.965502
2bPFmZW8Pq6h,2bPFmZW8Pq6h,0.984682
YXBKkNZaLPP8,YXBKkNZaLPP8,0.995149
WhEZYPp57f7m,WhEZYPp57f7m,0.990698
UwyUN7ufTkfh,UwyUN7ufTkfh,0.997898


#### Assign the trained workers to the rest of the tasks

In [13]:
df_tw_2 = cs.AssignTasks(tasks_rest, trained_workers, workers_per_task).create()
df_tw_2.head()

Unnamed: 0,task_id,worker_id,true_answers,label_task,prob_task,prob_worker
0,task_22By4uYxvA6u,hwY3hLriq7R4,materials,easy_task,0.95,0.996099
1,task_22By4uYxvA6u,d36AYErUttLF,materials,easy_task,0.95,0.952213
2,task_22By4uYxvA6u,HAiS5FFnBtQe,materials,easy_task,0.95,0.997918
3,task_24aKhL7YT7hM,mYj9wgdDcPwC,yarn,easy_task,0.82,0.982841
4,task_24aKhL7YT7hM,6YYvoDyCwga6,yarn,easy_task,0.82,0.935455


#### Compute probability to the rest of the tasks

In [14]:
cp2 = cm.ComputeProbability(df_tw_2['prob_task'], df_tw_2['prob_worker'], valid_answers)
df_tw_2['worker_answers'] = cm.WorkerAnswer(df_tw_2['true_answers'], cp2.predict(), valid_answers).match()
df_tw_2['performance'] = cp2.predict()

In [15]:
df_tw_2.head()

Unnamed: 0,task_id,worker_id,true_answers,label_task,prob_task,prob_worker,worker_answers,performance
0,task_22By4uYxvA6u,hwY3hLriq7R4,materials,easy_task,0.95,0.996099,materials,1
1,task_22By4uYxvA6u,d36AYErUttLF,materials,easy_task,0.95,0.952213,materials,1
2,task_22By4uYxvA6u,HAiS5FFnBtQe,materials,easy_task,0.95,0.997918,materials,1
3,task_24aKhL7YT7hM,mYj9wgdDcPwC,yarn,easy_task,0.82,0.982841,yarn,1
4,task_24aKhL7YT7hM,6YYvoDyCwga6,yarn,easy_task,0.82,0.935455,yarn,1


In [16]:
df_tw_2[df_tw_2['performance']==0]

Unnamed: 0,task_id,worker_id,true_answers,label_task,prob_task,prob_worker,worker_answers,performance
295,task_6iyqRhbvJjW5,39B9KgzmQX59,materials,easy_task,0.62,0.935622,materials,0
308,task_6vZtQWDoMjjR,d36AYErUttLF,detent,easy_task,0.72,0.952213,detent,0
386,task_8Qhyaq7kNMaQ,GNR9Un22dixG,materials,easy_task,0.78,0.967849,materials,0
787,task_F9LfZgu4oJBA,AA5oGKPuHfJy,materials,easy_task,0.84,0.989427,materials,0
804,task_FPWYWw9HuKxL,39B9KgzmQX59,detent,easy_task,0.58,0.935622,detent,0
879,task_HHo6SUt4swRz,otYb78RVCS2p,materials,easy_task,0.88,0.931361,materials,0
1341,task_PwFW7KwhRqDr,uUTM5sLTjGAC,materials,easy_task,0.67,0.956246,materials,0
1782,task_XtNq3w7S4xqi,DpriefaKZPHi,materials,easy_task,0.68,0.974834,materials,0
2196,task_duvNqq6AUmE5,uUTM5sLTjGAC,advertisement,easy_task,0.67,0.956246,advertisement,0
2304,task_fv5aQ3WSqU5X,6YYvoDyCwga6,generations,easy_task,0.57,0.935455,generations,0


#### Merge the data and get the overall accuracy

In [17]:
df = df_tw.append(df_tw_2)

In [18]:
mat = ConfusionMatrix(list(df['true_answers']), list(df['worker_answers']))
print('Accuracy of the simulated experiment considering 2 Stages: {}%'.format(round(mat.Overall_ACC*100,2)))

Accuracy of the simulated experiment considering 2 Stages: 98.01%


In [19]:
pd.DataFrame(mat.table)

Unnamed: 0,advertisement,bundles,data,detent,generations,limitation,materials,thirty,yarn
advertisement,510,1,1,1,2,1,0,1,0
bundles,1,511,4,2,2,1,0,0,2
data,0,0,582,0,0,1,0,2,1
detent,1,3,6,531,0,2,1,2,3
generations,1,2,0,3,500,2,1,0,1
limitation,3,0,3,2,2,571,0,2,3
materials,1,1,0,1,3,1,563,1,2
thirty,1,1,3,1,0,2,2,536,1
yarn,1,0,1,2,1,1,0,5,527


#### Compare with NO stage

In [20]:
df_tw1 = cs.AssignTasks(df_tasks, workers, workers_per_task).create()
cp1 = cm.ComputeProbability(df_tw1['prob_task'], df_tw1['prob_worker'], valid_answers)
df_tw1['worker_answers'] = cm.WorkerAnswer(df_tw1['true_answers'], cp1.predict(), valid_answers).match()
df_tw1['performance'] = cp1.predict()

In [21]:
mat1 = ConfusionMatrix(list(df_tw1['true_answers']), list(df_tw1['worker_answers']))
print('Accuracy of the simulated experiment considering NO Stages: {}%\n'.format(round(mat1.Overall_ACC*100,2)))

Accuracy of the simulated experiment considering NO Stages: 93.95%



In [22]:
pd.DataFrame(mat1.table)

Unnamed: 0,advertisement,bundles,data,detent,generations,limitation,materials,thirty,yarn
advertisement,481,4,5,9,2,5,2,3,3
bundles,2,487,3,3,1,3,5,6,2
data,7,3,563,7,4,4,7,3,6
detent,3,4,5,506,2,5,9,8,3
generations,4,0,7,4,489,3,4,4,3
limitation,6,5,2,3,2,546,2,3,3
materials,4,6,3,4,4,4,530,4,2
thirty,5,4,3,3,0,9,3,515,4
yarn,7,6,9,4,6,3,5,3,514


#####  The accuracy of 1 stage algorithm is lower than the 2 stages algorithm *