In [1]:
from evaluations import QAEvaluation, EvidenceEvaluation

In [2]:
# Functions for computing human performance

import pandas as pd
import numpy as np
from typing import Iterable, Literal, Mapping
from collections import defaultdict

def evaluate_preds(preds: Iterable, labels: Iterable, 
                   obj:str="answer", iou_threshold:float=0.5, method:str="max") -> [float]:
    """
    Evaluate scores for one person (answer / evidence) on a single question.

    Input:
        preds: [pred1,pred2...] -- the person's answers (or evidences)
        labels:[label1,label2..] -- the references to score against
        obj:"answer":calculate answer agreement;
               "evidence":calculate evidence agreement;
        iou_threshold: forwarded to EvidenceEvaluation.iou_f1 as `threshold`
               (only used when obj == "evidence")
        method:"max": return the maximum score;
               "ave": return the average score;
               "first": return score of the first answer
                      for each metric
    Output:
      For answer: 
        [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE-L]
      For evidence: 
        [iou_f1]
    Raises:
        ValueError: if `obj` or `method` is not one of the names listed above.
            Both are checked before any metric is computed, so a mistyped
            argument fails immediately instead of after the scoring pass.
    """
    # Fail fast: validate both selectors before doing any metric work.
    if obj not in ("answer", "evidence"):
        raise ValueError(f"no obj named as \"{obj}\"")
    if method not in ("max", "ave", "first"):
        raise ValueError(f"no method named as \"{method}\"")

    # One reducer collapses the per-prediction scores of a metric to one value.
    reduce_scores = {
        "max": max,
        "ave": np.average,
        "first": lambda per_pred: per_pred[0],
    }[method]

    if obj == "answer":
        # One evaluator per prediction, each scored against the full label list.
        # NOTE(review): the ["-"] first argument looks like a placeholder
        # (question?) -- confirm against the `evaluations` module.
        evls = [QAEvaluation(["-"], [pred], [labels]) for pred in preds]
        # per_metric[m][p] = score of prediction p under metric m, in the
        # order documented in the docstring.
        per_metric = [[evl.exact_match() for evl in evls]]
        per_metric += [[evl.bleu(n) for evl in evls] for n in (1, 2, 3)]
        per_metric += [[evl.rouge(k, stats="f") for evl in evls] for k in (1, 2, "l")]
        return [reduce_scores(metric_scores) for metric_scores in per_metric]

    # obj == "evidence": each prediction gets its best IoU-F1 over all labels.
    per_pred = [
        max(EvidenceEvaluation([pred], [label]).iou_f1(threshold=iou_threshold)
            for label in labels)
        for pred in preds
    ]
    return [reduce_scores(per_pred)]


def evaluate_one_question(predss: Iterable[Iterable],stdLabel=None,obj:str="answer",
                          iou_threshold:float=0.5,method:str="max") -> [[float],[float]]:
    '''
    Score every person's predictions for one question, both against the
    standard label and against the other persons (leave-one-human-out).

    Input:
        predss: humans' answers for one question (one list of predictions
                per person)
        stdLabel: standard label for this question. It is tested by
                truthiness, so None and empty values ("" / []) are both
                treated as "no standard label".
        obj, iou_threshold: forwarded unchanged to evaluate_preds
        method:"max": return the maximum score;
               "ave": return the average score;
               "first": return score of the first answer
                      for each metric
    Output:
      A pair (stdScores, leavOneScores), each with one entry per person.
      For answer: 
        based on stdLabel:
        [person 1: [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE-L]
         person 2: [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE-L]
         ...]
        based on leave-one-human-out:
        [person 1: [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE-L]
         person 2: [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE-L]
         ...]
      For evidence:
        based on stdLabel:
        [person 1: [iou_f1], person 2: [iou_f1], ...]
        based on leave-one-human-out:
        [person 1: [iou_f1], person 2: [iou_f1], ...]
      A person's entry is [] when that kind of score cannot be computed
      (no standard label, or only one person for leave-one-out).
    Raises:
        AssertionError: if there is neither a standard label nor at least
            two persons to compare against each other.
    '''
    assert stdLabel or len(predss) >= 2, "You have to provide either the standard answer " +\
                                            "or at least 2 humans' answers for evaluation."

    # Keyword arguments passed through to every evaluate_preds call.
    shared = dict(obj=obj, iou_threshold=iou_threshold, method=method)

    if len(predss) == 1: # only one person
        # No other human to compare with, so the leave-one-out entry is empty.
        return [evaluate_preds(predss[0], [stdLabel], **shared)], [[]]

    stdScores = [] # standard answer scores
    leavOneScores = [] # leave one human scores
    for i in range(len(predss)): # for each human
        # References for person i: every prediction made by the other persons.
        leavOneLabel = [pred for j in range(len(predss)) if j != i for pred in predss[j]]
        if stdLabel:
            stdScores.append(evaluate_preds(predss[i], [stdLabel], **shared))
            # The standard label also counts as a reference for leave-one-out.
            leavOneLabel.append(stdLabel)
        else:
            stdScores.append([])
        leavOneScores.append(evaluate_preds(predss[i], leavOneLabel, **shared))
    return stdScores, leavOneScores


def evaluate_persons(answerss:Iterable[Iterable[Iterable[str]]],workerss:Iterable[Iterable[str]],
                     stdLabels:Iterable[str]=None, obj:str="answer", 
                     iou_threshold:float=0.5,method:str="max") -> [[float],[float]]:
    '''
    Compute human performance: score every worker on every question, average
    each worker's scores over the questions they answered, then average
    those per-worker means over all workers.

    Input:
        answerss: human's answers / evidences for multiple questions
        workerss: IDs for these human (aligned with answerss, per question)
        stdLabels: standard answer / evidence for these question;
                   None means no standard label for any question
        obj, iou_threshold: forwarded unchanged to evaluate_one_question
        method:"max": return the maximum score;
               "ave": return the average score;
               "first": return score of the first answer
                      for each metric
    Output:
        based on stdLabel:
        [ave_exact_match, ave_BLEU1, ave_BLEU2, ave_BLEU3, ave_ROUGE1, ave_ROUGE2, ave_ROUGE-L]
        based on leave-one-human-out:
        [ave_exact_match, ave_BLEU1, ave_BLEU2, ave_BLEU3, ave_ROUGE1, ave_ROUGE2, ave_ROUGE-L]
        Either list is [] when no score of that kind was produced.
    '''

    def eva_scores(stdLeaScores:[[[float],[float]]]) -> [[float],[float]]:
        '''
        Average scores for one person / for all persons
        Input:
            stdLeaScores: [[stdScores1,leaOneScores1],[stdScores2,leaOneScores2],..]
        Output:
            [aveStdScore,aveLeaScore]
        '''
        averaged = []
        for slot in (0, 1):  # 0: standard-label scores, 1: leave-one-out scores
            # Empty entries mean "not computed" and are left out of the mean.
            kept = [pair[slot] for pair in stdLeaScores if pair[slot] != []]
            if kept != []:
                averaged.append(list(np.average(kept, axis=0)))
            else:
                averaged.append([])
        return averaged

    if stdLabels is None:
        stdLabels = [None for _ in answerss]

    n_questions = len(answerss)
    assert n_questions == len(workerss) == len(stdLabels), \
        "Length of 'answerss', 'workerss', 'stdLabels' are not the same!"

    # worker id -> list of [stdScores, leaveOneOutScores], one per question
    scores_by_worker = defaultdict(list)
    for stdLabel, answers, workers in zip(stdLabels, answerss, workerss):
        assert len(answers) == len(workers), \
            "Number of wokers and answers are not the same for record:\n" + \
            f"stdLabel: {stdLabel}\nanswers:{answers}\nworkers:{workers}"

        std_s, lvo_s = evaluate_one_question(answers, stdLabel=stdLabel, obj=obj,
                                             iou_threshold=iou_threshold, method=method)
        for stds, lvos, worker in zip(std_s, lvo_s, workers):
            scores_by_worker[worker].append([stds, lvos])

    # First average within each worker, then across workers.
    worker_means = [eva_scores(records) for records in scores_by_worker.values()]
    return eva_scores(worker_means)


## Human Performance

In [3]:
# Load the merged stage-2 annotation table. The columns listed in `converters`
# are parsed from their CSV text back into Python objects with eval().
# NOTE(review): eval() executes whatever expression is stored in the CSV cell.
# If this file can ever come from an untrusted source, ast.literal_eval is the
# safer choice -- confirm first that every cell holds a plain literal.
ann_results = pd.read_csv("../organize_stage2_annotation/merged_annotation.csv",
                          converters={cln: lambda x:eval(x) for cln in 
                                      ["evidences",'crowd_answers', 'crowd_evidences','crowd_deleted_evidences','crowd_workerids',
                                        'expert_answers', 'expert_evidences','expert_deleted_evidences','expert_workerids']})
ann_results = ann_results[ann_results["crowd_answers"].apply(lambda x: x !=[])] # remove unannotated (in phase2) questions
# Keep only the columns used by the rest of the notebook.
ann_results = ann_results[[
    'modified_question', 'modified_answer','evidences', 'domain','video_link',
    'crowd_answers', 'crowd_evidences','crowd_deleted_evidences','crowd_workerids',
    'expert_answers', 'expert_evidences','expert_deleted_evidences','expert_workerids']]
# Each entry of `evidences` is a mapping; keep only the first value of each one.
ann_results["evidences"] = ann_results["evidences"].apply(lambda x : [list(t.values())[0] for t in x])

In [4]:
# TODO: boost speed
import copy

def union_workers(x):
    '''
    Concatenate the crowd and expert annotations of one record.

    Input:
        x: a record indexable by column name (e.g. a DataFrame row) holding
           the crowd_* and expert_* answers, evidences and workerids
    Output:
        (answers, evidences, workerids), each being the crowd entries
        followed by the expert entries
    '''
    answers = x["crowd_answers"] + x["expert_answers"]
    evidences = x["crowd_evidences"] + x["expert_evidences"]
    worker_ids = x["crowd_workerids"] + x["expert_workerids"]
    return answers, evidences, worker_ids


def remove_empty_evid(x):
    '''
    Drop every worker whose evidence annotation is empty ([[]]).

    Input:
        x: a record with "combine_answers", "combine_evidences" and
           "combine_workerids" (aligned by worker position)
    Output:
        (evidences, workerids): deep copies of the two lists with the
        empty-evidence workers removed; the input record is not modified
    '''
    evidences = copy.deepcopy(x["combine_evidences"])
    worker_ids = copy.deepcopy(x["combine_workerids"])
    # Walk backwards so removing an entry never shifts an index still to visit.
    for idx in reversed(range(len(x["combine_answers"]))):
        if x["combine_evidences"][idx] != [[]]:
            continue
        evidences.pop(idx)
        worker_ids.pop(idx)
    return evidences, worker_ids


def fill_empty_evid(x):
    '''
    Replace every empty evidence annotation ([[]]) with a placeholder span.

    Input:
        x: a record with "combine_answers" and "combine_evidences"
           (aligned by worker position)
    Output:
        a deep copy of the evidences in which each [[]] has become
        [[[-2, -3]]]; the input record is not modified
    '''
    filled = copy.deepcopy(x["combine_evidences"])
    for idx in range(len(x["combine_answers"])):
        if x["combine_evidences"][idx] != [[]]:
            continue
        # Placeholder span used by the notebook to score empty evidences as 0.
        # NOTE(review): presumably [-2, -3] can never overlap a real span -- confirm.
        filled[idx][0].append([-2, -3])
    return filled

### As multiple workers
Calculate scores for each worker separately, then average across workers. \
This contrasts with the `as one worker` strategy, which treats all workers as a single worker and merges their annotations.

In [5]:
# Add three per-row columns holding the crowd entries followed by the expert
# entries; result_type='expand' spreads the returned 3-tuple over the columns.
ann_results[["combine_answers","combine_evidences","combine_workerids"]]=\
    ann_results.apply(union_workers,axis=1,result_type='expand') # union crowd and expert answers

In [6]:
# How a worker's several predictions for one question are reduced to one score.
METHOD = "first"
# method:"max": return the maximum score;
#        "ave": return the average score;
#        "first": return score of the first answer
#                 for each metric

# Each printed result is a pair of lists:
# [scores against the standard label, leave-one-human-out scores], in percent.

# Answer Agreement
AnsAgree = evaluate_persons(ann_results["combine_answers"],ann_results["combine_workerids"],
                 stdLabels=ann_results["modified_answer"],
                 obj="answer",method=METHOD)

print(f"Answer Agreement: {[[round(b*100,2) for b in A]for A in AnsAgree]}")


# Evidence Agreement

## skip empty evidences
evd_input_df = copy.deepcopy(ann_results[["evidences"]])
# remove empty evidences and corresponding workers
evd_input_df[["combine_evidences","combine_workerids"]]=\
    ann_results.apply(remove_empty_evid,axis=1,result_type='expand')
# drop questions for which no worker is left after the removal
evd_input_df = evd_input_df[evd_input_df["combine_workerids"].apply(lambda x:x!=[])]

EvdAgree = evaluate_persons(evd_input_df["combine_evidences"],evd_input_df["combine_workerids"],
                 stdLabels=evd_input_df["evidences"],
                 obj="evidence",iou_threshold=0.5, method=METHOD)
print(f"skip Evidence Agreement: {[[round(b*100,2) for b in A] for A in EvdAgree]}")


## score empty evidences as 0
evd_input_df = copy.deepcopy(ann_results[["evidences","combine_workerids"]])
evd_input_df["combine_evidences"]=\
    ann_results.apply(fill_empty_evid,axis=1) # replace empty evidences with a placeholder span; all workers are kept

EvdAgree = evaluate_persons(evd_input_df["combine_evidences"],evd_input_df["combine_workerids"],
                 stdLabels=evd_input_df["evidences"],
                 obj="evidence",method=METHOD)

print(f"zero Evidence Agreement: {[[round(b*100,2) for b in A] for A in EvdAgree]}")

Answer Agreement: [[1.2, 19.16, 6.83, 3.72, 26.32, 9.63, 23.72], [0.98, 44.17, 16.15, 7.7, 42.41, 19.14, 37.82]]
skip Evidence Agreement: [[11.77], [14.34]]
zero Evidence Agreement: [[6.31], [7.23]]


### As one worker (deprecated; we decided to use the <u>multiple workers</u> strategy)

In [7]:
# ann_results[["combine_answers","combine_evidences","combine_workerids"]]=\
#     ann_results.apply(union_workers,axis=1,result_type='expand') # union crowd and expert answers 

In [8]:
# import random
# random.seed(0)  # note: `random.seed=0` would overwrite the function instead of seeding
# def select_worker(x):
#     i = random.randint(0,len(x["combine_answers"])-1)
#     return x["combine_answers"][i:i+1],x["combine_evidences"][i:i+1],["OnePerson"]

# ann_results[["combine_answers","combine_evidences","combine_workerids"]]=\
#     ann_results.apply(select_worker,axis=1,result_type='expand')

In [9]:
# METHOD = "first"

# AnsAgree = evaluate_persons(ann_results["combine_answers"],ann_results["combine_workerids"],
#                  stdLabels=ann_results["modified_answer"],
#                  obj="answer",method=METHOD)

# print(f"Answer Agreement: {[[round(b*100,2) for b in A]for A in AnsAgree]}")

# # skip empty evidences
# evd_input_df = copy.deepcopy(ann_results[["evidences"]])
# evd_input_df[["combine_evidences","combine_workerids"]]=\
#     ann_results.apply(remove_empty_evid,axis=1,result_type='expand') # remove empty evidences and according workers
# evd_input_df = evd_input_df[evd_input_df["combine_workerids"].apply(lambda x:x!=[])]

# EvdAgree = evaluate_persons(evd_input_df["combine_evidences"],evd_input_df["combine_workerids"],
#                  stdLabels=evd_input_df["evidences"],
#                  obj="evidence",method=METHOD)

# print(f"skip Evidence Agreement: {[[round(b*100,2) for b in A] for A in EvdAgree]}")

# # score empty evidences as 0
# evd_input_df = copy.deepcopy(ann_results[["evidences","combine_workerids"]])
# evd_input_df["combine_evidences"]=\
#     ann_results.apply(fill_empty_evid,axis=1) # remove empty evidences and according workers

# EvdAgree = evaluate_persons(evd_input_df["combine_evidences"],evd_input_df["combine_workerids"],
#                  stdLabels=evd_input_df["evidences"],
#                  obj="evidence",method=METHOD)

# print(f"zero Evidence Agreement: {[[round(b*100,2) for b in A] for A in EvdAgree]}")