In [17]:
import pandas as pd

In [18]:
# Read the raw results table from crowd2.csv into a DataFrame
data = pd.read_csv(filepath_or_buffer='crowd2.csv')

In [19]:
# List every column of the loaded table; the selection of the columns to keep
# is done in a later cell.
data.columns

Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.No',
       'Input.Label_ID', 'Input.From', 'Input.To', 'Input.Type',
       'Input.Annotation', 'Input.Context', 'Input.DO_ID', 'Input.DB_ID',
       'Input.DrugName', 'Input.Drug_Brand_Name', 'Input.atc_label',
       'Input.ATC_LEVEL_0', 'Input.DieaseName', 'Input.WordCount',
       'Input.Set_ID', 'Input.Section', 'Answer.category.label', 'Approve',
       'Reject'],
      dtype='object')

In [20]:
# Narrow the table down to the worker id, the fields that describe an
# instance, and the answer the worker gave for it.
keep_cols = [
    'WorkerId',
    'Input.Annotation',
    'Input.Context',
    'Input.DO_ID',
    'Input.DB_ID',
    'Input.DrugName',
    'Input.Section',
    'Answer.category.label',
]
data = data[keep_cols]

In [21]:
# Collapse the table to one row per unique data instance; the new `merged`
# column holds the list of answers that the different workers gave for it.
instance_keys = [
    'Input.Annotation',
    'Input.Context',
    'Input.DO_ID',
    'Input.DB_ID',
    'Input.DrugName',
    'Input.Section',
]
answers_per_instance = data.groupby(instance_keys)['Answer.category.label'].apply(list)
df = answers_per_instance.reset_index(name='merged')

In [22]:
# Show the distinct answer categories that occur in the worker responses
answer_col = data['Answer.category.label']
answer_col.unique()

array(['Indication: treatment', 'Contraindication', 'Effect',
       'I dont know', 'Indication: Symptomatic relief'], dtype=object)

In [23]:
# Count, for every instance, how many workers chose each answer category.
# Each entry of `votes` is a list of counts ordered as
# (treatment, contraindication, relief, effect, IDK).
#
# The column is built in one assignment instead of writing cell by cell with
# df['votes'][i] = ...: that chained assignment triggers SettingWithCopyWarning
# and silently does nothing when pandas Copy-on-Write is enabled.
ANSWER_CATEGORIES = [
    'Indication: treatment',
    'Contraindication',
    'Indication: Symptomatic relief',
    'Effect',
    'I dont know',
]

df['votes'] = df['merged'].map(
    lambda answers: [answers.count(category) for category in ANSWER_CATEGORIES]
)

In [24]:
# Derive two columns from the per-category vote counts stored in `votes`:
#   - `aggrement`: inter-worker agreement rate, i.e. the share of all counted
#     votes that went to the most-voted category
#   - `label`: short name of the most-voted category (majority label)
#
# Both columns are assigned whole rather than cell by cell through
# df[col][i] = ...: that chained assignment triggers SettingWithCopyWarning and
# silently does nothing when pandas Copy-on-Write is enabled. Assigning the
# computed values directly also gives `aggrement` a numeric dtype instead of
# the object dtype left behind by a placeholder string.

# Short label for each position of a `votes` list, in the same order as the counts.
VOTE_LABELS = ['treatment', 'contraindication', 'relief', 'effect', 'IDK']


def _agreement_rate(votes):
    """Return max(votes) / sum(votes), or NaN when no vote was counted."""
    total = sum(votes)
    return max(votes) / total if total else float('nan')


def _majority_label(votes):
    """Return the label of the most-voted category, or None when no vote was counted.

    On a tie the category that comes first in VOTE_LABELS wins, because
    list.index returns the first position holding the maximum.
    """
    if not sum(votes):
        return None
    return VOTE_LABELS[votes.index(max(votes))]


df['aggrement'] = df['votes'].map(_agreement_rate)
df['label'] = df['votes'].map(_majority_label)
    

In [25]:
# Display the per-instance table, including the derived votes, aggrement and label columns
df

Unnamed: 0,Input.Annotation,Input.Context,Input.DO_ID,Input.DB_ID,Input.DrugName,Input.Section,merged,votes,aggrement,label
0,ACROMEGALY,Bromocriptine mesylate tablets and capsules ar...,DOID_2449,DB01200,BROMOCRIPTINE,Indication,"[Indication: treatment, Indication: Symptomati...","[4, 0, 1, 0, 0]",0.8,treatment
1,ACTINOMYCOSIS,"Penicillin G Potassium Injection, USP is indic...",DOID_8478,DB01053,PENICILLIN G,Indication,"[Indication: treatment, Indication: Symptomati...","[4, 0, 1, 0, 0]",0.8,treatment
2,ACUTE CYSTITIS,"Nitrofurantoin capsules, USP (monohydrate/macr...",DOID_13148,DB00698,NITROFURANTOIN,Indication,"[Indication: treatment, Indication: treatment,...","[4, 0, 1, 0, 0]",0.8,treatment
3,ACUTE LYMPHOCYTIC LEUKEMIA,Methotrexate is indicated in the treatment of ...,DOID_9952,DB00563,METHOTREXATE,Indication,"[Indication: treatment, Contraindication, Indi...","[3, 1, 0, 0, 0]",0.75,treatment
4,ACUTE MAXILLARY SINUSITIS,Cefpodoxime proxetil is indicated for the trea...,DOID_2050,DB01416,CEFPODOXIME,Indication,"[Indication: treatment, Indication: treatment,...","[4, 0, 0, 0, 0]",1,treatment
...,...,...,...,...,...,...,...,...,...,...
825,YAWS,Tetracycline is indicated in the treatment of ...,DOID_10371,DB00759,TETRACYCLINE,Indication,"[Indication: treatment, Contraindication, Indi...","[3, 2, 0, 0, 0]",0.6,treatment
826,YAWS,To reduce the development of drug-resistant ba...,DOID_10371,DB01053,PENICILLIN G,Indication,"[Indication: treatment, Indication: treatment,...","[2, 2, 1, 0, 0]",0.4,treatment
827,ZOLLINGER-ELLISON SYNDROME,Esomeprazole magnesium delayed-release capsule...,DOID_0050782,DB00736,ESOMEPRAZOLE,Indication,"[Indication: treatment, Indication: treatment,...","[5, 0, 0, 0, 0]",1,treatment
828,ZOLLINGER-ELLISON SYNDROME,Rabeprazole sodium delayed-release tablets are...,DOID_0050782,DB01129,RABEPRAZOLE,Indication,"[Indication: Symptomatic relief, Indication: t...","[3, 0, 2, 0, 0]",0.6,treatment


In [26]:
# Keep only the instances where the workers agreed clearly enough and the
# majority label is an actual category rather than 'IDK'.
# drop=True renumbers the rows without inserting the old index as extra
# 'index' / 'level_0' columns, which a plain reset_index() would add.
MIN_AGREEMENT = 0.51  # the agreement rate must be strictly above this value

has_enough_agreement = df['aggrement'] > MIN_AGREEMENT
has_known_label = df['label'] != 'IDK'
df_aggred = df[has_enough_agreement & has_known_label].reset_index(drop=True)

In [27]:
# Fix the final column set and order; any other column present in df_aggred is dropped.
output_cols = [
    'Input.Annotation',
    'Input.Context',
    'Input.DO_ID',
    'Input.DB_ID',
    'Input.DrugName',
    'Input.Section',
    'merged',
    'votes',
    'aggrement',
    'label',
]
df_aggred = df_aggred[output_cols]

In [28]:
# Display the filtered table that is about to be written to disk
df_aggred

Unnamed: 0,Input.Annotation,Input.Context,Input.DO_ID,Input.DB_ID,Input.DrugName,Input.Section,merged,votes,aggrement,label
0,ACROMEGALY,Bromocriptine mesylate tablets and capsules ar...,DOID_2449,DB01200,BROMOCRIPTINE,Indication,"[Indication: treatment, Indication: Symptomati...","[4, 0, 1, 0, 0]",0.8,treatment
1,ACTINOMYCOSIS,"Penicillin G Potassium Injection, USP is indic...",DOID_8478,DB01053,PENICILLIN G,Indication,"[Indication: treatment, Indication: Symptomati...","[4, 0, 1, 0, 0]",0.8,treatment
2,ACUTE CYSTITIS,"Nitrofurantoin capsules, USP (monohydrate/macr...",DOID_13148,DB00698,NITROFURANTOIN,Indication,"[Indication: treatment, Indication: treatment,...","[4, 0, 1, 0, 0]",0.8,treatment
3,ACUTE LYMPHOCYTIC LEUKEMIA,Methotrexate is indicated in the treatment of ...,DOID_9952,DB00563,METHOTREXATE,Indication,"[Indication: treatment, Contraindication, Indi...","[3, 1, 0, 0, 0]",0.75,treatment
4,ACUTE MAXILLARY SINUSITIS,Cefpodoxime proxetil is indicated for the trea...,DOID_2050,DB01416,CEFPODOXIME,Indication,"[Indication: treatment, Indication: treatment,...","[4, 0, 0, 0, 0]",1,treatment
...,...,...,...,...,...,...,...,...,...,...
606,VULVOVAGINITIS,Enter section text here\r\n Tindamax is a nitr...,DOID_2273,DB00911,TINIDAZOLE,Indication,"[Effect, Indication: treatment, Indication: tr...","[4, 0, 0, 1, 0]",0.8,treatment
607,YAWS,Tetracycline is indicated in the treatment of ...,DOID_10371,DB00759,TETRACYCLINE,Indication,"[Indication: treatment, Contraindication, Indi...","[3, 2, 0, 0, 0]",0.6,treatment
608,ZOLLINGER-ELLISON SYNDROME,Esomeprazole magnesium delayed-release capsule...,DOID_0050782,DB00736,ESOMEPRAZOLE,Indication,"[Indication: treatment, Indication: treatment,...","[5, 0, 0, 0, 0]",1,treatment
609,ZOLLINGER-ELLISON SYNDROME,Rabeprazole sodium delayed-release tablets are...,DOID_0050782,DB01129,RABEPRAZOLE,Indication,"[Indication: Symptomatic relief, Indication: t...","[3, 0, 2, 0, 0]",0.6,treatment


In [29]:
# Write the filtered table to crowd_2nd.csv; the row index is written as well,
# since to_csv keeps its default index=True here.
df_aggred.to_csv(path_or_buf='crowd_2nd.csv')