In [1]:
import pandas as pd
import numpy as np

QuestionNumber = 5  # number of questions per page; sizes the per-question lists built below

# Functions

In [2]:
# functions to organize answers

def arrange_answers(answers):
    """Group the answers of one submission by question.

    `answers` maps field names to answer strings. A field whose name contains
    "answer<k>-" is assigned to question k (1-based, k <= QuestionNumber).
    Every value is stripped of surrounding whitespace and lower-cased first.

    Returns a list with QuestionNumber entries; entry k-1 holds the answers
    for question k with repeats dropped and first-occurrence order kept.
    """
    grouped = [[] for _ in range(QuestionNumber)]
    for field, raw in answers.items():
        cleaned = raw.strip().lower()    # remove surrounding space & convert to lower case
        for question, bucket in enumerate(grouped, start=1):
            if "answer{}-".format(question) in field:    # field belongs to this question
                bucket.append(cleaned)

    deduplicated = []
    for bucket in grouped:
        seen = set()
        unique = []
        for answer in bucket:
            if answer not in seen:       # keep only the first occurrence
                seen.add(answer)
                unique.append(answer)
        deduplicated.append(unique)
    return deduplicated


def answer_frequency(Answers):
    """Count how often each answer occurs in `Answers`.

    Returns a dict mapping answer -> number of occurrences; keys appear in
    the order in which each answer was first seen.
    """
    tally = {}
    for answer in Answers:
        tally[answer] = tally.get(answer, 0) + 1
    return tally

In [3]:
# functions to calculate metrics

def _score_against_others(ans, other_answers, worker_count, StandAnswer=None):
    '''
    Score one non-empty answer list against the other workers' answer lists.
    Input:
    ans: the answers of the worker being scored (must be non-empty)
    other_answers: list of the other workers' answer lists
    worker_count: denominator used by Met2 for answers that are not a mode
    StandAnswer: standard answer, treated as an additional mode when given
    Output:
    (5, 5) array; rows are Met1, Met2, VQA1, VQA2, VQA3 and columns are
    the best, the first, the average of first 2, the average of first 3,
    the average of all
    '''
    # Tally every answer given by the other workers (same counting as
    # answer_frequency, kept inline so this block is self-contained).
    counts = dict()
    for other in other_answers:
        for item in other:
            counts[item] = counts.get(item, 0) + 1

    # find mode: stable ascending sort, so among ties the answer seen last wins
    modes = []
    mode_freq = None
    if counts:
        mode, mode_freq = sorted(counts.items(), key=lambda kv: kv[1])[-1]
        modes.append(mode)
    # else: nobody else answered; there is no mode and every count-based
    # score below evaluates to 0 instead of raising IndexError
    if StandAnswer is not None:  # ground truth is also a mode
        modes.append(StandAnswer)

    per_item = [
        [counts[a] / mode_freq if a in counts else 0 for a in ans],           # Met1
        [1 if a in modes else
         (counts[a] / worker_count if a in counts else 0) for a in ans],      # Met2
        [min(counts[a], 1) if a in counts else 0 for a in ans],               # VQA1
        [min(counts[a] / 2.0, 1) if a in counts else 0 for a in ans],         # VQA2
        [min(counts[a] / 3.0, 1) if a in counts else 0 for a in ans],         # VQA3
    ]

    scores = np.zeros((5, 5))
    for mi, row in enumerate(per_item):
        scores[mi, 0] = max(row)
        scores[mi, 1] = row[0]
        # slicing already falls back to the whole list when it is shorter
        scores[mi, 2] = np.average(row[:2])
        scores[mi, 3] = np.average(row[:3])
        scores[mi, 4] = np.average(row)
    return scores


def human_metric(anss,StandAnswer=None,RemoveZero=False):
    '''
    Calculate metrics for answers of one question
    Input:
    anss: All answers for one question (one list of answers per worker)
    StandAnswer: Standard answer;
        If 'None', only take the most frequent word as the "mode"
    RemoveZero: Using RemoveZero method or not. When True, workers whose
        best Met1 score is 0 in a first round are dropped and the remaining
        workers are scored again against each other only
    Output:
    A list with one row per worker (same order as anss). Each row has 25
    values: five consecutive groups of five, one group per metric
    (Met1, Met2, VQA1, VQA2, VQA3); within a group the order is the best,
    the first, the average of first 2, the average of first 3, the average
    of all. An empty answer list always scores 0 everywhere.
    '''
    worker_total = len(anss)
    if worker_total == 0:       # nothing to score
        return []
    if worker_total == 1:       # only one answer for this question
        # one row of 25 values, matching the layout of the general case:
        # all 1 when the answer is non-empty, all 0 when it is empty
        fill = 1.0 if len(anss[0]) != 0 else 0.0
        return [[fill] * 25]

    # Met1, Met2, VQA1, VQA2, VQA3
    # the best,the first,the average of first 2,the average of first 3,the average of all
    Mets = np.zeros((worker_total,5,5))
    for ind, ans in enumerate(anss):
        if len(ans) == 0:       # empty answer keeps its zero scores
            continue
        others = [anss[i] for i in range(worker_total) if i != ind]
        # Met2 denominator counts every other worker, including empty ones
        Mets[ind] = _score_against_others(ans, others, worker_total - 1, StandAnswer)

    if RemoveZero:          # remove the zero score answers in the first round
        remain = [best != 0 for best in Mets[:,0,0]]  # mark whether to keep each answer
        Mets = np.zeros((worker_total,5,5))
        for ind, ans in enumerate(anss):
            if len(ans) == 0 or not remain[ind]:
                continue
            others = [anss[i] for i in range(worker_total) if (i != ind and remain[i])]
            if others:
                # Met2 denominator is now the number of remaining other workers
                Mets[ind] = _score_against_others(ans, others, len(others), StandAnswer)
            else:     # No other answers left
                Mets[ind] = 1
    return Mets.reshape(worker_total, 25).tolist()

In [4]:
# Read the two batch result CSVs and stack them row-wise into one frame.
batch1_result, batch2_result = (
    pd.read_csv(csv_path)
    for csv_path in ("Batch_4058534_batch_results.csv",
                     "Batch_4090592_batch_results.csv")
)
original_result = pd.concat([batch1_result, batch2_result], axis=0)

# Tables used later to decide whether a (video, question) pair belongs to
# the validation or the test split.
val, test = pd.read_csv("Validation.csv"), pd.read_csv("Test.csv")

In [5]:
hit_infos = dict() # hit: workers' id, video id, question

# One result row per (HIT, worker). The video ids and questions are read from
# the first row seen for a HIT; later rows of the same HIT only add their
# worker id, so "workerid" keeps the row order of the results frame.
for _, row in original_result.iterrows():
    chit = row["HITId"]
    if chit not in hit_infos:
        hit_infos[chit] = {
            "workerid": [],
            # columns are numbered from 1: Input.video1_id ... / Input.question1 ...
            "videoid": [row["Input.video{}_id".format(k)] for k in range(1, QuestionNumber + 1)],
            "questions": [row["Input.question{}".format(k)] for k in range(1, QuestionNumber + 1)],
        }
    hit_infos[chit]["workerid"].append(row["WorkerId"])

In [6]:
hit_answers = dict() # hit: answers, standard answer, approve_sign

# Both review files are parsed by the same loop. Line kinds recognized below:
#   contains "HITId:"             -> starts a new record; the id is the text from column 7 on
#   contains "Standard answers:"  -> text from column 18 on is evaluated and stored as "stdans"
#   contains "Appr "              -> submission recorded as approved
#   contains "Reje " or "Midd "   -> submission recorded as not approved
# For submission lines the text from column 7 on is evaluated and its first
# element is passed to arrange_answers.
# NOTE(review): eval() executes whatever the review files contain. Only run
# this on trusted files; if the payloads are plain literals, ast.literal_eval
# would be the safer choice -- confirm the file format before switching.
for review_path in ("batch1_review_result.txt", "batch2_review_result.txt"):
    with open(review_path, "r") as f:
        for raw_line in f:
            # strip the line break explicitly so a final line without one
            # does not lose its last real character
            line = raw_line.rstrip("\n")
            if "HITId:" in line:
                current_hit = line[7:]
                hit_answers[current_hit] = {"anss": [], "approve": []}
            elif "Standard answers:" in line:
                hit_answers[current_hit]["stdans"] = eval(line[18:])
            elif "Appr " in line or "Reje " in line or "Midd " in line:
                # "Appr " is checked first, exactly as in a separate branch
                hit_answers[current_hit]["approve"].append("Appr " in line)
                hit_answers[current_hit]["anss"].append(arrange_answers(eval(line[7:])[0]))

In [7]:
# worker id -> list of score rows (one row per scored question)
ValWorkerScores = dict()
TestWorkerScores = dict()
AllWorkerScores = dict()

for (hit,item) in hit_answers.items():
    # positions of the approved submissions, in recorded order
    approved = [wid for wid, ok in enumerate(item["approve"]) if ok]
    # Anss[qid] = answers of every approved worker for question qid
    Anss = [[item["anss"][wid][qid] for wid in approved]
            for qid in range(QuestionNumber)]
    for qid in range(QuestionNumber):
        # filter out unavailable video: skip the question when at least two
        # of the pooled answers say so
        pooled = [ans for worker_ans in Anss[qid] for ans in worker_ans]
        if pooled.count("unavailable video") >= 2:
            continue
        scores = human_metric(Anss[qid],item["stdans"][qid],RemoveZero=True)

        info = hit_infos[hit]
        video, question = info["videoid"][qid], info["questions"][qid]
        if ((val["video_id"] == video) & (val["question"] == question)).any():
            split_scores = ValWorkerScores      # validation
        elif ((test["video_id"] == video) & (test["question"] == question)).any():
            split_scores = TestWorkerScores     # test
        else:
            raise LookupError("Can not find the video question in Val and Test!")

        # scores[sid] belongs to the sid-th approved submission.
        # NOTE(review): this pairs position wid in the review file with
        # position wid in hit_infos[hit]["workerid"] -- presumably both list
        # the workers in the same order; verify against the input files.
        for sid, wid in enumerate(approved):
            worker = info["workerid"][wid]
            split_scores.setdefault(worker, []).append(scores[sid])
            AllWorkerScores.setdefault(worker, []).append(scores[sid])     # whole

In [8]:
# average score for each workers
# Step 1: collapse each worker's score rows into one mean score vector.
AveValWorker = [np.asarray(rows).mean(axis=0) for rows in ValWorkerScores.values()]
AveTestWorker = [np.asarray(rows).mean(axis=0) for rows in TestWorkerScores.values()]
AveAllWorker = [np.asarray(rows).mean(axis=0) for rows in AllWorkerScores.values()]

# Step 2: average over workers, so every worker carries equal weight
# regardless of how many questions they answered.
AveVal = np.asarray(AveValWorker).mean(axis=0)
AveTest = np.asarray(AveTestWorker).mean(axis=0)
AveAll = np.asarray(AveAllWorker).mean(axis=0)

In [13]:
# Layout of the 25 values: five consecutive groups of five, one group per
# metric (Met1, Met2, VQA1, VQA2, VQA3); within a group the order is
# best, first, average of first 2, average of first 3, average of all.
AveTest

array([0.75527722, 0.60982668, 0.51348505, 0.48311205, 0.45561596,
       0.70895671, 0.557329  , 0.4651634 , 0.43577816, 0.41001618,
       0.86313238, 0.76650696, 0.68774768, 0.66167367, 0.63092492,
       0.82326183, 0.70957198, 0.62909345, 0.60181429, 0.57145052,
       0.7890162 , 0.66617902, 0.58143996, 0.55275951, 0.52392365])