In [1]:
import sys
# Add the "src" directory to the module search path. The entry is relative, so it is
# resolved against the kernel's current working directory at import time.
sys.path.append("src")

In [2]:
from src.evaluations.evaluations import QAEvaluation, EvidenceEvaluation

In [3]:
# Functions for computing human performance

import pandas as pd
import numpy as np
from typing import Iterable, Literal, Mapping
from collections import defaultdict

def evaluate_preds(preds: Iterable, labels: Iterable, 
                   obj:str="answer", method:str="max") -> [float]:
    """
    Score one person's predictions (answers or evidences) for one question.

    Every prediction is scored against ``labels``; the per-prediction scores
    are then reduced to a single value per metric according to ``method``.

    Input:
        preds:  predictions of one person, [pred1, pred2, ...]. Expected to be
                non-empty; empty input is not handled specially.
        labels: references to score against, [label1, label2, ...]
        obj:    "answer":   calculate answer agreement;
                "evidence": calculate evidence agreement
        method: "max":   return the maximum score;
                "ave":   return the average score;
                "first": return the score of the first prediction
                         for each metric
    Output:
      For answer:
        [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE3]
      For evidence:
        [iou_f1]
    Raises:
        ValueError: if ``obj`` or ``method`` is not one of the values above.
    """
    # Validate both selectors before doing any metric work, so a typo in
    # `method` fails immediately instead of after all scores are computed.
    if obj not in ("answer", "evidence"):
        raise ValueError(f"no obj named as \"{obj}\"")

    if method == "max":
        aggregate = max
    elif method == "ave":
        aggregate = np.average
    elif method == "first":
        aggregate = lambda per_pred: per_pred[0]
    else:
        raise ValueError(f"no method named as \"{method}\"")

    if obj == "answer":
        # One evaluator per prediction; each one sees the full list of labels.
        evls = [QAEvaluation(["-"], [pred], [labels]) for pred in preds]
        per_metric = [
            [evl.exact_match() for evl in evls],
            *[[evl.bleu(order) for evl in evls] for order in (1, 2, 3)],
            *[[evl.rouge(order) for evl in evls] for order in (1, 2, 3)],
        ]
    else:
        # Each prediction is credited with its best IoU-F1 over all labels.
        per_metric = [[max([EvidenceEvaluation([pred], [label]).iou_f1()
                            for label in labels]) for pred in preds]]

    # `per_metric` always holds one list of per-prediction scores per metric,
    # so answers (7 metrics) and evidences (1 metric) reduce the same way.
    return [aggregate(metric_scores) for metric_scores in per_metric]


def evaluate_one_question(predss: Iterable[Iterable],stdLabel=None,obj:str="answer",
                          method:str="max") -> [[float],[float]]:
    '''
    Score every person's predictions for a single question in two ways:
    against the standard label, and against everybody else's predictions
    (leave-one-human-out).

    Input:
        predss: one list of predictions per person, all for the same question
        stdLabel: standard label for this question; a falsy value (None, "",
                  []) is treated as "no standard label available"
        obj: "answer" or "evidence", forwarded to evaluate_preds
        method: "max" / "ave" / "first", forwarded to evaluate_preds
    Output:
        (stdScores, leaveOneScores), both with one entry per person.
        Each entry is the list returned by evaluate_preds:
          answer:   [exact_match, BLEU1, BLEU2, BLEU3, ROUGE1, ROUGE2, ROUGE3]
          evidence: [iou_f1]
        An entry is [] when that score cannot be computed: the standard score
        without a stdLabel. With a single person the leave-one-out part is [[]].
    '''
    n_humans = len(predss)
    assert stdLabel or n_humans >= 2, (
        "You have to provide either the standard answer "
        "or more then 2 humans' answers for evaluation."
    )

    # A lone annotator can only be compared with the standard label.
    if n_humans == 1:
        lone_scores = evaluate_preds(predss[0], [stdLabel], obj=obj, method=method)
        return [lone_scores], [[]]

    std_scores = []
    loo_scores = []
    for held_out in range(n_humans):
        # References for the held-out person: everyone else's predictions ...
        references = []
        for other in range(n_humans):
            if other != held_out:
                references.extend(predss[other])

        if stdLabel:
            std_scores.append(
                evaluate_preds(predss[held_out], [stdLabel], obj=obj, method=method)
            )
            # ... plus the standard label when one is available.
            references.append(stdLabel)
        else:
            std_scores.append([])

        loo_scores.append(
            evaluate_preds(predss[held_out], references, obj=obj, method=method)
        )
    return std_scores, loo_scores


def evaluate_persons(answerss:Iterable[Iterable[Iterable[str]]],workerss:Iterable[Iterable[str]],
                     stdLabels:Iterable[str]=None, obj:str="answer", method:str="max") -> [[float],[float]]:
    '''
    Compute human performance over many questions.

    Scores are first averaged per worker over the questions that worker
    handled, and the per-worker averages are then averaged over all workers.

    Input:
        answerss: for each question, one list of answers / evidences per worker
        workerss: for each question, the worker IDs, aligned with answerss
        stdLabels: standard answer / evidence for each question
                   (None: no standard labels at all)
        obj: "answer" or "evidence", forwarded to evaluate_one_question
        method:"max": use the maximum score;
               "ave": use the average score;
               "first": use the score of the first answer
                      for each metric
    Output:
        [scores based on stdLabel, scores based on leave-one-human-out]
        Each part holds one averaged value per metric, i.e. for answers
        [ave_exact_match, ave_BLEU1, ave_BLEU2, ave_BLEU3, ave_ROUGE1, ave_ROUGE2, ave_ROUGE3]
        and is [] when no such score was available for any worker.
    '''

    def eva_scores(stdLeaScores:[[[float],[float]]]) -> [[float],[float]]:
        '''
        Average scores for one person / for all persons
        Input:
            stdLeaScores: [[stdScores1,leaOneScores1],[stdScores2,leaOneScores2],..]
        Output:
            [aveStdScore,aveLeaScore]
        '''
        averaged = []
        for slot in (0, 1):  # 0: standard-label scores, 1: leave-one-out scores
            # Empty entries mark "score not available" and are left out of the mean.
            available = [pair[slot] for pair in stdLeaScores if pair[slot] != []]
            if available != []:
                averaged.append(list(np.average(available, axis=0)))
            else:
                averaged.append([])
        return averaged

    if stdLabels is None:
        stdLabels = [None] * len(answerss)

    assert len(answerss) == len(workerss) == len(stdLabels), \
        "Length of 'answerss', 'workerss', 'stdLabels' are not the same!"

    # worker id -> [[stdScores, leaveOneScores], ...], one pair per question
    scores_by_worker = defaultdict(list)
    for gold, answers, workers in zip(stdLabels, answerss, workerss):
        assert len(answers) == len(workers), (
            "Number of wokers and answers are not the same for record:\n"
            f"stdLabel: {gold}\nanswers:{answers}\nworkers:{workers}"
        )

        std_part, loo_part = evaluate_one_question(answers, stdLabel=gold, obj=obj, method=method)
        for std_score, loo_score, worker in zip(std_part, loo_part, workers):
            scores_by_worker[worker].append([std_score, loo_score])

    # Average within each worker first, then across workers.
    worker_means = [eva_scores(history) for history in scores_by_worker.values()]
    return eva_scores(worker_means)


In [8]:
# Columns whose CSV cells hold the text form of a Python list.
LIST_COLUMNS = ["answers", "workerids", "stdEvidences", "evidences"]


def _flatten_once(nested):
    """Concatenate the inner lists of one cell: [[a, b], [c]] -> [a, b, c]."""
    return [item for inner in nested for item in inner]


# NOTE(security): eval() executes whatever expression is stored in the CSV cell.
# Only load files you trust; if the cells are plain Python literals,
# ast.literal_eval would be a safer parser.
ann_results = pd.read_csv(
    "ans-exp-5-6-crowd.csv",
    converters={col: (lambda x: eval(x)) for col in LIST_COLUMNS},
)

# The same (question, video, stdAnswer) can occur in several HITs, so first
# collect the per-HIT lists of every group ...
org_ann = ann_results.groupby(["question","video","stdAnswer"]).\
    agg({col: list for col in LIST_COLUMNS})

# ... then flatten them cell by cell. Series.apply works element-wise, whereas
# DataFrame.apply hands the function a whole column, which would merge the
# lists of all groups together instead of flattening each group separately.
for col in LIST_COLUMNS:
    org_ann[col] = org_ann[col].apply(_flatten_once)
# TODO: handle a person who answered the same question more than once

# Each standard evidence is a dict; keep only its first value.
org_ann["stdEvidences"] = org_ann["stdEvidences"].apply(lambda x : [list(t.values())[0] for t in x])

In [9]:
METHOD = "first"

# The standard answers are not a column: "stdAnswer" is one of the groupby
# keys and therefore lives in the index of org_ann.
AnsAgree = evaluate_persons(
    org_ann["answers"],
    org_ann["workerids"],
    stdLabels=list(org_ann.index.get_level_values("stdAnswer")),
    obj="answer",
    method=METHOD,
)

EvdAgree = evaluate_persons(
    org_ann["evidences"],
    org_ann["workerids"],
    stdLabels=org_ann["stdEvidences"],
    obj="evidence",
    method=METHOD,
)

print(f"Answer Agreement: {AnsAgree}\n\nEvidence Agreement: {EvdAgree}")

1it [00:00, 19691.57it/s]
1it [00:00, 18477.11it/s]
1it [00:00, 20360.70it/s]
1it [00:00, 25731.93it/s]
1it [00:00, 11554.56it/s]
1it [00:00, 12122.27it/s]
1it [00:00, 12595.51it/s]
1it [00:00, 23967.45it/s]
1it [00:00, 9915.61it/s]
1it [00:00, 10837.99it/s]
1it [00:00, 24244.53it/s]
1it [00:00, 24818.37it/s]
1it [00:00, 9939.11it/s]
1it [00:00, 23831.27it/s]
1it [00:00, 11554.56it/s]
1it [00:00, 6307.22it/s]
1it [00:00, 13486.51it/s]
1it [00:00, 5882.61it/s]
1it [00:00, 8355.19it/s]
1it [00:00, 11155.06it/s]
1it [00:00, 11397.57it/s]
1it [00:00, 9962.72it/s]
1it [00:00, 8004.40it/s]
1it [00:00, 9532.51it/s]
1it [00:00, 11814.94it/s]
1it [00:00, 12264.05it/s]
1it [00:00, 10082.46it/s]
1it [00:00, 13617.87it/s]
1it [00:00, 11781.75it/s]
1it [00:00, 11459.85it/s]
1it [00:00, 7738.57it/s]
1it [00:00, 5216.80it/s]
1it [00:00, 8192.00it/s]
1it [00:00, 6594.82it/s]
1it [00:00, 10330.80it/s]
1it [00:00, 11125.47it/s]
1it [00:00, 11096.04it/s]
1it [00:00, 14122.24it/s]
1it [00:00, 10951.19it/s]

Answer Agreement: [[0.0, 0.11552273887539768, 0.012810131195534084, 0.005890236416116417, 0.1854206532392807, 0.027396611496535217, 0.006891133557800224], []]

Evidence Agreement: [[0.14740740740740743], []]



