In [2]:
"""
AUTO EVALUATION
"""

"""
Distractor Evaluator
Author: AndyChiangSH
Time: 2022/10/19
"""

import json
import os
import numpy as np
import csv


def auto_matric_evaluate(results):
    # evaluating
    print("Evaluating...")
    avg_eval = {"P@1": 0.0, "R@1": 0.0, "F1@1": 0.0, "P@3": 0.0, "R@3": 0.0, "F1@3": 0.0, "P@5": 0.0, "R@5": 0.0, "F1@5": 0.0,
            "P@10": 0.0, "R@10": 0.0, "R@30":0.0, "R@50":0.0, "F1@10": 0.0, "MRR@5": 0.0, "MAP@5": 0.0, "NDCG@1": 0.0, "NDCG@3": 0.0, "NDCG@5": 0.0, "NDCG@10": 0.0, "NDCG@30":0.0,
           "NDCG@50":0.0, "NDCG@100":0.0,
           }
    for result in results:
        eval = evaluate(result)
        for k in avg_eval.keys():
            avg_eval[k] += eval[k]

    # calculate average
    for k in avg_eval.keys():
        avg_eval[k] /= len(results)
    # print(avg_eval)

    # save evaluation to csv
    print("Save to csv file...")
    with open(output_file_name, "w", newline="", encoding="utf-8-sig") as csvfile:
        print(output_file_name)
        writer = csv.writer(csvfile)
        key_list = list()
        value_list = list()
        for k in avg_eval.keys():
            key_list.append(k)
            value_list.append(avg_eval[k]*100)

        writer.writerow(key_list)
        writer.writerow(value_list)

    # show evaluation
    for k in avg_eval.keys():
        print(f"{k}: {avg_eval[k]*100}%")

    print("Done!")


def evaluate(result):
    eval = {"P@1": 0.0, "R@1": 0.0, "F1@1": 0.0, "P@3": 0.0, "R@3": 0.0, "F1@3": 0.0, "P@5": 0.0, "R@5": 0.0, "F1@5": 0.0,
            "P@10": 0.0, "R@10": 0.0, "R@30":0.0, "R@50":0.0, "F1@10": 0.0, "MRR@5": 0.0, "MAP@5": 0.0, "NDCG@1": 0.0, "NDCG@3": 0.0, "NDCG@5": 0.0, "NDCG@10": 0.0, "NDCG@30":0.0,
           "NDCG@50":0.0, "NDCG@100":0.0,
           }
    distractors = [d.lower() for d in result["distractors"]]
    generations = [d.lower() for d in result["generated"]]

    relevants = [int(generation in distractors) for generation in generations]
    # print(relevants)
    if len(relevants) < 1:
        return eval
    # P@1
    if relevants[0] == 1:
        eval["P@1"] = 1
    else:
        eval["P@1"] = 0

    # R@1
    eval["R@1"] = relevants[:1].count(1) / len(distractors)

    # F1@1
    try:
        eval["F1@1"] = (2 * eval["P@1"] * eval["R@1"]) / \
            (eval["P@1"] + eval["R@1"])
    except ZeroDivisionError:
        eval["F1@1"] = 0
    
    # P@3
    eval["P@3"] = relevants[:3].count(1) / 3

    # R@3
    eval["R@3"] = relevants[:3].count(1) / len(distractors)
    
    # F1@3
    try:
        eval["F1@3"] = (2 * eval["P@3"] * eval["R@3"]) / \
            (eval["P@3"] + eval["R@3"])
    except ZeroDivisionError:
        eval["F1@3"] = 0
    
    # P@5
    eval["P@5"] = relevants[:5].count(1) / 5

    # R@5
    eval["R@5"] = relevants[:5].count(1) / len(distractors)

    # F1@5
    try:
        eval["F1@5"] = (2 * eval["P@5"] * eval["R@5"]) / \
            (eval["P@5"] + eval["R@5"])
    except ZeroDivisionError:
        eval["F1@5"] = 0
    
    # P@10
    eval["P@10"] = relevants[:10].count(1) / 10

    # R@10
    eval["R@10"] = relevants[:10].count(1) / len(distractors)


    # R@30
    eval["R@30"] = relevants[:30].count(1) / len(distractors)

    # R@50
    eval["R@50"] = relevants[:50].count(1) / len(distractors)
    
    # F1@10
    try:
        eval["F1@10"] = (2 * eval["P@10"] * eval["R@10"]) / \
            (eval["P@10"] + eval["R@10"])
    except ZeroDivisionError:
        eval["F1@10"] = 0

    # MRR@5
    # for i in range(5):
    #     if relevants[i] == 1:
    #         eval["MRR@5"] = 1 / (i+1)
    #         break

    # MAP@5
    # rel_num = 0
    # for i in range(5):
    #     if relevants[i] == 1:
    #         rel_num += 1
    #         eval["MAP@5"] += rel_num / (i+1)
    eval["MAP@5"] = eval["MAP@5"] / len(distractors)
    
    # NDCG@1
    eval["NDCG@1"] = ndcg_at_k(relevants, 1)

    # NDCG@3
    eval["NDCG@3"] = ndcg_at_k(relevants, 3)
    
    # NDCG@5
    eval["NDCG@5"] = ndcg_at_k(relevants, 5)

    # NDCG@10
    eval["NDCG@10"] = ndcg_at_k(relevants, 10)

    # NDCG@30
    eval["NDCG@30"] = ndcg_at_k(relevants, 30)

    # NDCG@50
    eval["NDCG@50"] = ndcg_at_k(relevants, 50)

    # NDCG@100
    eval["NDCG@100"] = ndcg_at_k(relevants, 100)
    return eval

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg



In [4]:
'''
Note: 分數最多只比到@30, @50與@30分數一樣是正常的，@50的分數不可參考
'''


# test_file_name = "llama3_CDGP_fewshot_selfanswer.json"
test_file_name = "./[Main result] Performance of Distractor Generation/ISSR_without_self-review.json"
output_file_name = "ISSR_without_self-review_metric.csv"
with open(test_file_name, "r") as f:
    data = json.load(f)

auto_matric_evaluate(data)

Evaluating...
Save to csv file...
ISSR_without_self-review_metric.csv
P@1: 1.0362694300518136%
R@1: 0.3454231433506045%
F1@1: 0.5181347150259068%
P@3: 1.0362694300518134%
R@3: 1.0362694300518134%
F1@3: 1.0362694300518134%
P@5: 1.4507772020725391%
R@5: 2.4179620034542313%
F1@5: 1.8134715025906734%
P@10: 1.2435233160621766%
R@10: 4.145077720207252%
R@30: 5.008635578583765%
R@50: 5.008635578583765%
F1@10: 1.9131127939418087%
MRR@5: 0.0%
MAP@5: 0.0%
NDCG@1: 1.0362694300518136%
NDCG@3: 3.1088082901554404%
NDCG@5: 5.109509386604552%
NDCG@10: 6.7838278267151235%
NDCG@30: 7.442079679787779%
NDCG@50: 7.442079679787779%
NDCG@100: 7.442079679787779%
Done!
