In [1]:
import ast
import json

import numpy as np
import pandas as pd

QuestionNumber = 5  # number of questions per page; questions are numbered 1..QuestionNumber in answer keys ("answer<k>-...") and input columns

# Functions

In [2]:
# functions to organize answers

def arrange_answers(answers, question_number=None):    # organize answers in one instance
    '''
    Group the answers of one submission by question.
    Input:
    answers: dict mapping a form field name to the text entered in it;
        a field belongs to question k (1-based) when its name contains "answer<k>-"
    question_number: number of questions to collect;
        If 'None', the module-level QuestionNumber is used
    Output:
    A list with one entry per question. Each entry lists that question's
    answers, stripped and lower-cased, duplicates removed, in first-seen order.
    A blank field is kept as the empty string "".
    '''
    if question_number is None:
        question_number = QuestionNumber
    tags = ["answer" + str(i + 1) + "-" for i in range(question_number)]
    arranged_answers = [[] for _ in range(question_number)]
    seen = [set() for _ in range(question_number)]   # per-question membership test in O(1)
    for name, raw in answers.items():
        answer = raw.strip().lower()    # remove space & convert to lower
        for i, tag in enumerate(tags):
            if tag in name and answer not in seen[i]:     # belongs to question i, not recorded yet
                seen[i].add(answer)
                arranged_answers[i].append(answer)
    return arranged_answers


def answer_frequency(Answers):          # count the frequency of each answer
    '''
    Count how many times each answer occurs.
    Input:
    Answers: iterable of hashable answers
    Output:
    dict mapping each answer to its number of occurrences; keys appear
    in the order in which the answers were first seen
    '''
    counts = {}
    for item in Answers:
        counts[item] = counts.get(item, 0) + 1
    return counts

In [3]:
# functions to calculate metrics

def _summarize_scores(scores):
    # the first, the best, the average of first 3, the average of all
    return [scores[0], max(scores), np.average(scores[:3]), np.average(scores)]


def _score_worker(ans, other_answer_lists, n_other_workers, StandAnswer):
    # Score one worker's non-empty answer list against the other workers' answers;
    # returns the 4 metric1 summaries followed by the 4 metric2 summaries.
    pooled = [item for other in other_answer_lists for item in other]
    counts = answer_frequency(pooled)
    modes = []
    mode_freq = None
    if counts:
        # stable ascending sort, take the last: ties go to the answer first seen latest
        mode = sorted(counts.items(), key=lambda item: item[1])[-1][0]
        mode_freq = counts[mode]
        modes.append(mode)
    if StandAnswer is not None:  # ground truth is also a mode
        modes.append(StandAnswer)
    # When nobody else answered, `counts` is empty: metric1 is 0 everywhere and
    # metric2 is 1 only for an answer equal to StandAnswer (no division happens).
    metric1s = [counts[item] / mode_freq if item in counts else 0 for item in ans]
    metric2s = [1 if item in modes
                else counts[item] / n_other_workers if item in counts
                else 0 for item in ans]
    return _summarize_scores(metric1s) + _summarize_scores(metric2s)


def human_metric(anss,StandAnswer=None,RemoveZero=False):
    '''
    Calculate metrics for answers of one question
    Input:
    anss: All answers for one question; one list of answers per worker
    StandAnswer: Standard answer;
        If 'None', only take the most frequent word as the "mode"
    RemoveZero: Using RemoveZero method or not. When True, workers whose best
        metric1 score is 0 in the first round are scored 0 and are excluded
        from the reference answers in a second scoring round
    Output:
    A list with one row per worker. Each row holds 8 floats:
        metric1 (the first, the best, the average of first 3, the average of all)
        followed by metric2 in the same order.
        metric1 = frequency of the answer among other workers / frequency of the mode
        metric2 = 1 if the answer is the mode or StandAnswer, else
                  frequency of the answer among other workers / number of other workers
    A worker with no answers gets a row of zeros. An empty `anss` gives [].
    '''
    n_metrics = 8
    if len(anss) == 1:          # only one answer for this question
        # one row (one worker) of 8 values, matching the shape of the general case
        fill = 1.0 if len(anss[0]) != 0 else 0.0
        return [[fill] * n_metrics]

    rows = []
    for ind, ans in enumerate(anss):
        if len(ans) == 0:
            rows.append([0.0] * n_metrics)
        else:                   # answer not empty
            others = [anss[i] for i in range(len(anss)) if i != ind]
            rows.append(_score_worker(ans, others, len(anss) - 1, StandAnswer))

    if RemoveZero:          # remove the zero score answers in the first round
        remain = [row[1] != 0 for row in rows]   # best metric1 score decides who stays
        rows = []
        for ind, ans in enumerate(anss):
            if len(ans) == 0 or not remain[ind]:
                rows.append([0.0] * n_metrics)
                continue
            others = [anss[i] for i in range(len(anss)) if i != ind and remain[i]]
            if others:
                # the denominator of metric2 is the number of retained other workers
                rows.append(_score_worker(ans, others, len(others), StandAnswer))
            else:     # No other answers left
                rows.append([1.0] * n_metrics)

    return np.array(rows, dtype=float).reshape(-1, n_metrics).tolist()

In [4]:
def _parse_task_answers(raw):
    # Decode one "Answer.taskAnswers" cell and return its first element.
    # SECURITY: this text is entered by workers, so it must not be passed to eval().
    # It is parsed as JSON; text that is not valid JSON (e.g. a Python-literal
    # dump with single quotes) is parsed with ast.literal_eval, which cannot
    # execute code.
    try:
        parsed = json.loads(raw)
    except ValueError:          # json.JSONDecodeError is a ValueError
        parsed = ast.literal_eval(raw)
    return parsed[0]


def _lookup_standard_answer(original_val, original_test, video_id, question):
    # Return (standard answer, "val" | "test"); validation data is searched first.
    for flag, dataset in (("val", original_val), ("test", original_test)):
        matches = dataset[(dataset.video_id == video_id) &
                          (dataset.question == question)]["answer"]
        if len(matches) > 0:
            return matches.iloc[0], flag
    raise IndexError("no standard answer found in validation or test data for "
                     "video_id=%r, question=%r" % (video_id, question))


def _score_hit(answers, std_answers, dataset_flag, remove_zero, ValMatrics, TestMatrics):
    # Score every question of one finished HIT and append to the matching collection.
    collections = {"val": ValMatrics, "test": TestMatrics}
    for qid in range(QuestionNumber):
        humat = human_metric(answers[qid], std_answers[qid], RemoveZero=remove_zero)
        collections[dataset_flag[qid]].append(humat)


def CollectAnswerInformation(original_val,original_test,original_result,remove_zero=False):
    '''
    Collect the workers' answers of every HIT and score them question by question.
    Input:
    original_val: validation data with columns video_id, question, answer
    original_test: test data with columns video_id, question, answer
    original_result: batch results, one row per submission, with columns
        HITId, Answer.taskAnswers and Input.video<k>_id / Input.question<k>
        for k = 1..QuestionNumber. Rows of the same HIT must be adjacent:
        a change of HITId between consecutive rows starts a new HIT
    remove_zero: passed to human_metric as RemoveZero
    Output:
    (ValMatrics, TestMatrics): two lists holding one human_metric result per
        scored question, split by the dataset its standard answer was found in
    Raises:
    IndexError if a question is found in neither the validation nor the test data
    '''
    ValMatrics = []   # collect Validation answers for all HITs, all answers
    TestMatrics = []  # collect Test answers for all HITs, all answers

    last_hit_id = None
    answers = None        # answers of the HIT being collected; None before the first row
    std_answers = None    # standard answers of that HIT
    dataset_flag = None   # source dataset ("val" / "test") of each question of that HIT

    for _, row in original_result.iterrows():
        current_hit_id = row["HITId"]
        line_answer = arrange_answers(_parse_task_answers(row["Answer.taskAnswers"]))

        if answers is not None and current_hit_id == last_hit_id:   # same HIT
            for i in range(QuestionNumber):
                answers[i].append(line_answer[i])
            continue

        if answers is not None:          # a new HIT starts: score the finished one
            _score_hit(answers, std_answers, dataset_flag, remove_zero, ValMatrics, TestMatrics)

        # collect information & answers for the new HIT
        last_hit_id = current_hit_id
        std_answers = []
        dataset_flag = []
        for wid in range(1, QuestionNumber + 1):
            std_ans, dflg = _lookup_standard_answer(
                original_val, original_test,
                row["Input.video" + str(wid) + "_id"],
                row["Input.question" + str(wid)])
            std_answers.append(std_ans)
            dataset_flag.append(dflg)
        answers = [[answer] for answer in line_answer]   # answers for one HIT

    # Score the last HIT after the loop, so it is also counted when it consists
    # of a single row (it used to be dropped in that case).
    if answers is not None:
        _score_hit(answers, std_answers, dataset_flag, remove_zero, ValMatrics, TestMatrics)

    return ValMatrics, TestMatrics

In [5]:
# Result files of the two batches, stacked into one table (batch 1 rows first).
batch1_result, batch2_result = (
    pd.read_csv(csv_name)
    for csv_name in ("Batch_4058534_batch_results.csv", "Batch_4090592_batch_results.csv")
)
original_result = pd.concat([batch1_result, batch2_result], axis=0)

# Datasets searched for the standard answer of each question.
val = pd.read_csv("Validation.csv")
test = pd.read_csv("Test.csv")

In [6]:
# Score every HIT, using the RemoveZero variant of the metrics.
ValMatrics, TestMatrics = CollectAnswerInformation(
    original_val=val,
    original_test=test,
    original_result=original_result,
    remove_zero=True,
)

In [7]:
def _stack_metric_rows(per_hit_metrics, n_metrics=8):
    """Stack the per-worker rows of all scored questions into one (rows, n_metrics) array.

    Each element of `per_hit_metrics` is reshaped to (-1, n_metrics) before
    stacking, so the result is the same whether the input is the nested list
    returned by CollectAnswerInformation or an array this function already
    produced. Re-running the cell therefore does not change the data.
    """
    blocks = [np.reshape(np.asarray(rows, dtype=float), (-1, n_metrics))
              for rows in per_hit_metrics]
    return np.concatenate(blocks, axis=0) if blocks else np.empty((0, n_metrics))


# columns: metric1 then metric2, each as
# the first,the best,the average of first 3,the average of all
ValMatrics = _stack_metric_rows(ValMatrics)
TestMatrics = _stack_metric_rows(TestMatrics)

In [8]:
# Column means over all validation rows: 4 metric1 summaries, then 4 metric2 summaries.
ValMatrics.mean(axis=0)

array([0.55305544, 0.67921498, 0.4360409 , 0.41466375, 0.52282873,
       0.64403387, 0.40480417, 0.38397277])

In [9]:
# Column means over all test rows: 4 metric1 summaries, then 4 metric2 summaries.
TestMatrics.mean(axis=0)

array([0.55527347, 0.69231947, 0.43867244, 0.41911915, 0.54141623,
       0.67520415, 0.42384358, 0.40431809])