In [None]:
# config: base locations used to build the input file paths in the cells below
# root_path is prefixed to the task data directory when locating the sample submission and the gold file
root_path = '../../'
# out_dir is the directory the RQ3 submission files are read from
out_dir = './data-out/setup2'

In [None]:
from clef.utils.scoring import eval_run_custom
from clef.utils.data_loading import task5_dir

import pandas as pd
from IPython.core.display import display_html


# Sample submission located under the task directory's `submission_samples` folder.
# In the visible cells it is only referenced by a commented-out 'baseline' evaluation row.
sample_submission_file = f'{root_path}/{task5_dir}/submission_samples/KGAT_zeroShot_verification_English_dev.json'

# Prediction files of the two RQ3 variants (NLI and OpenAI), both read from `out_dir`.
rq3_nli_submission_file = f'{out_dir}/zeroshot-ver-rq3-nli.jsonl'
rq3_openai_submission_file = f'{out_dir}/zeroshot-ver-rq3-openai.jsonl'

# File passed as the gold file to the evaluation helpers in this notebook.
ground_truth_file = f'{root_path}/{task5_dir}/data/English_dev.json'

In [None]:
import numpy as np

def strict_f1(actual, predicted, actual_evidence, predicted_evidence, label):
    """Compute the evidence-aware ("strict") F1 score for a single label.

    Counting rules, per instance ``i``:

    * ``predicted[i] == label`` and ``actual[i] != label``: false positive.
    * ``predicted[i] == label`` and ``actual[i] == label``:
        - if the gold label is ``"NOT ENOUGH INFO"``: true positive (the
          evidence is not inspected);
        - otherwise true positive only when the predicted and gold evidence
          share at least one item, else false positive.
    * ``predicted[i] != label`` and ``actual[i] == label``: false negative.

    Args:
        actual: Sequence of gold labels.
        predicted: Sequence of predicted labels, index-aligned with ``actual``.
        actual_evidence: Per-instance iterables of hashable gold evidence items.
        predicted_evidence: Per-instance iterables of hashable predicted
            evidence items.
        label: The label the score is computed for.

    Returns:
        The F1 score as a float; ``0.0`` whenever precision, recall or their
        sum has a zero denominator (including empty input).

    Raises:
        IndexError: If ``predicted`` (or an evidence sequence that has to be
            inspected) is shorter than ``actual``.
    """
    tp = fp = fn = 0
    for i, gold in enumerate(actual):
        guess = predicted[i]
        if guess == label:
            if gold != label:
                fp += 1
            elif gold == "NOT ENOUGH INFO":
                # The evidence is not inspected for this gold label.
                tp += 1
            elif set(predicted_evidence[i]) & set(actual_evidence[i]):
                tp += 1
            else:
                # Right label but no shared evidence: scored as a false positive.
                fp += 1
        elif gold == label:
            fn += 1

    # Explicit zero-denominator guards instead of a bare `except:` so that
    # unrelated errors are no longer silently turned into a score of 0.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def f1(actual, predicted, label):
    """Compute the label-only F1 score for a single label.

    Args:
        actual: Sequence of gold labels.
        predicted: Sequence of predicted labels, index-aligned with ``actual``.
        label: The label the score is computed for.

    Returns:
        The F1 score as a float; ``0.0`` whenever precision, recall or their
        sum has a zero denominator (including empty input).

    Raises:
        IndexError: If ``predicted`` is shorter than ``actual``.
    """
    tp = fp = fn = 0
    for i, gold in enumerate(actual):
        guess = predicted[i]
        if guess == label:
            if gold == label:
                tp += 1
            else:
                fp += 1
        elif gold == label:
            fn += 1

    # Explicit zero-denominator guards instead of a bare `except:` so that
    # unrelated errors are no longer silently turned into a score of 0.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall == 0:
        return 0.0
    # Named `score` so the local no longer shadows the function's own name.
    score = 2 * (precision * recall) / (precision + recall)
    return score


def f1_macro(actual, predicted):
    """Macro F1: the unweighted mean of the per-label F1 scores.

    The label set is taken from ``actual`` only, so a label that occurs
    solely in ``predicted`` gets no score of its own.

    Args:
        actual: Sequence of gold labels.
        predicted: Sequence of predicted labels, index-aligned with ``actual``.

    Returns:
        The mean of ``f1(actual, predicted, label)`` over the unique gold labels.
    """
    labels = np.unique(actual)
    return np.mean([f1(actual, predicted, label) for label in labels])


def f1_macro_strict(actual, predicted, actual_evidence, predicted_evidence):
    """Strict macro F1: the unweighted mean of ``strict_f1`` over the gold labels.

    The label set is taken from the unique values of ``actual``.
    """
    per_label_scores = []
    for gold_label in np.unique(actual):
        per_label_scores.append(
            strict_f1(actual, predicted, actual_evidence, predicted_evidence, gold_label)
        )
    return np.mean(per_label_scores)

In [None]:
import jsonlines, json

def eval_run_custom(pred_file, gold_file, out_file=None):
    """Score a prediction file against a gold file without saving anything.

    NOTE(review): this definition replaces the `eval_run_custom` name imported
    from `clef.utils.scoring` earlier in the notebook; it is presumably the
    same computation minus the file output -- confirm against that module.

    Args:
        pred_file: Path of a JSON-lines file whose records provide "id",
            "predicted_label" and "predicted_evidence".
        gold_file: Path of a JSON-lines file whose records provide "id",
            "label" and "evidence".
        out_file: Unused; accepted only so existing three-argument calls keep
            working.

    Returns:
        A tuple ``(macro_F1, strict_macro_F1)``.

    Raises:
        KeyError: If a prediction's "id" does not occur in the gold file.
    """
    gold_dict_labels = {}
    gold_dict_evidence = {}
    # Context managers make sure both readers are closed after use.
    with jsonlines.open(gold_file) as gold_reader:
        for record in gold_reader:
            gold_dict_labels[record["id"]] = record["label"]
            # Only element 1 of every evidence entry is compared, as a string
            # (presumably the evidence item's id -- TODO confirm).
            gold_dict_evidence[record["id"]] = [str(ev[1]) for ev in record["evidence"]]

    with jsonlines.open(pred_file) as pred_reader:
        pred = list(pred_reader)

    pred_labels = [record["predicted_label"] for record in pred]
    pred_evidence = [
        [str(ev[1]) for ev in record["predicted_evidence"]] for record in pred
    ]

    # Gold labels/evidence are looked up in prediction order so that all four
    # lists are index-aligned.
    actual_labels = [gold_dict_labels[record["id"]] for record in pred]
    actual_evidence = [list(gold_dict_evidence[record["id"]]) for record in pred]

    # compute macro-F1 and strict macro-F1
    macro_F1 = f1_macro(actual_labels, pred_labels)
    strict_macro_F1 = f1_macro_strict(
        actual_labels, pred_labels, actual_evidence, pred_evidence
    )
    return (macro_F1, strict_macro_F1)

In [29]:
import jsonlines

def get_actual_pred_labels(pred_file, gold_file):
    """Load index-aligned gold and predicted labels.

    Args:
        pred_file: Path of a JSON-lines file whose records provide "id" and
            "predicted_label".
        gold_file: Path of a JSON-lines file whose records provide "id" and
            "label".

    Returns:
        A tuple ``(actual_labels, pred_labels)`` of two lists in the order of
        the records in ``pred_file``.

    Raises:
        KeyError: If a prediction's "id" does not occur in the gold file.
    """
    # Context managers make sure both readers are closed after use.
    with jsonlines.open(gold_file) as gold_reader:
        gold_dict_labels = {record["id"]: record["label"] for record in gold_reader}

    with jsonlines.open(pred_file) as pred_reader:
        pred = list(pred_reader)

    pred_labels = [record["predicted_label"] for record in pred]
    # Gold labels are looked up in prediction order to keep both lists aligned.
    actual_labels = [gold_dict_labels[record["id"]] for record in pred]

    return (actual_labels, pred_labels)

In [None]:
# Show each (gold label, predicted label) pair of the OpenAI run, in prediction order.
act, pred = get_actual_pred_labels(
    pred_file=rq3_openai_submission_file,
    gold_file=ground_truth_file,
)
[(gold_label, predicted_label) for gold_label, predicted_label in zip(act, pred)]

[('REFUTES', 'REFUTES'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'NOT ENOUGH INFO'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'NOT ENOUGH INFO'),
 ('REFUTES', 'REFUTES'),
 ('REFUTES', 'NOT ENOUGH INFO'),
 ('REFUTES', 'NOT ENOUGH INFO'),
 ('SUPPORTS', 'SUPPORTS'),
 ('SUPPORTS', 'SUPPORTS'),
 ('SUPPORTS', 'SUPPORTS'),
 ('SUPPORTS', 'SUPPORTS'),
 ('SUPPORTS', 'SUPPORTS'),
 ('SUPPORTS', 'SUPPORTS'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOUGH INFO'),
 ('NOT ENOUGH INFO', 'NOT ENOU

In [33]:
def get_actual_pred_labels_evidence(pred_file, gold_file):
    """Load index-aligned gold/predicted labels and evidence.

    Args:
        pred_file: Path of a JSON-lines file whose records provide "id",
            "predicted_label" and "predicted_evidence".
        gold_file: Path of a JSON-lines file whose records provide "id",
            "label" and "evidence".

    Returns:
        A tuple ``(actual_labels, pred_labels, actual_evidence, pred_evidence)``
        of four lists in the order of the records in ``pred_file``. Each
        evidence entry is a list holding element 1 of every evidence item,
        converted to ``str``.

    Raises:
        KeyError: If a prediction's "id" does not occur in the gold file.
    """
    gold_dict_labels = {}
    gold_dict_evidence = {}
    # Context managers make sure both readers are closed after use.
    with jsonlines.open(gold_file) as gold_reader:
        for record in gold_reader:
            gold_dict_labels[record["id"]] = record["label"]
            gold_dict_evidence[record["id"]] = [str(ev[1]) for ev in record["evidence"]]

    with jsonlines.open(pred_file) as pred_reader:
        pred = list(pred_reader)

    pred_labels = [record["predicted_label"] for record in pred]
    pred_evidence = [
        [str(ev[1]) for ev in record["predicted_evidence"]] for record in pred
    ]

    # Gold data is looked up in prediction order so all four lists are aligned;
    # each evidence list is copied so callers never share the lookup's lists.
    actual_labels = [gold_dict_labels[record["id"]] for record in pred]
    actual_evidence = [list(gold_dict_evidence[record["id"]]) for record in pred]
    return (actual_labels, pred_labels, actual_evidence, pred_evidence)

In [39]:
# Print every gold evidence item that is absent from the predicted evidence of
# the same instance (OpenAI run).
actual_labels, pred_labels, actual_evidence, pred_evidence = get_actual_pred_labels_evidence(
    rq3_openai_submission_file, ground_truth_file
)
# Dedicated loop names: `act` / `pred` hold the label lists built in an earlier
# cell and must not be rebound to evidence lists here.
for gold_ids, predicted_ids in zip(actual_evidence, pred_evidence):
    predicted_lookup = set(predicted_ids)  # constant-time membership test
    for evidence_id in gold_ids:
        if evidence_id not in predicted_lookup:
            print(evidence_id)

1234715165767147523
1233784722238705670
1304111096949866497
1341840863358554115
1341782311889723401
1341779925326581761
1341429691417305091
1340648347502342145
1340641178920873985
1340280471306027009
1341324720164261888
1339883857374539778
1342093406395314178
1342067208856489985
1341764926612844546
1341669834329903104
1341504694603157506
1341478274845839362
1340680064506204162
1340259651171065856
1339948400154963970
1339895685752135684
1339878931177086976
1436952831215476739
1403716782276464643
1403741413817438210
1403795269855191040
1608880491989700608


In [31]:
# basic score eval: score the selected run(s) and render the macro-F1 and
# strict-macro-F1 tables next to each other.

# Optional CSV export, currently disabled:
# time_now  = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S') 
# out_file = f'{out_dir}/eval/RQ3-{time_now}.csv'

# One row per evaluated run: [method, macro-F1, strict-macro-F1].
eval_data = [
    # ['baseline',    *eval_run_custom(sample_submission_file, ground_truth_file, '')],
    # ['RQ3-nli',     *eval_run_custom(rq3_nli_submission_file, ground_truth_file, '')],
    ['RQ3-openai', *eval_run_custom(rq3_openai_submission_file, ground_truth_file, '')],
]

eval_df = pd.DataFrame(eval_data, columns=['method', 'macro-F1', 'strict-macro-F1'])

# df_r5 holds the macro-F1 view and df_map the strict-macro-F1 view, each
# sorted best-first.
df_r5 = eval_df.loc[:, ['method', 'macro-F1']].sort_values(
    by='macro-F1', axis=0, ascending=False
)
df_map = eval_df.loc[:, ['method', 'strict-macro-F1']].sort_values(
    by='strict-macro-F1', axis=0, ascending=False
)

# Inline display lets the two tables sit on one row of the output.
df1_styler = (
    df_r5.style
    .set_table_attributes("style='display:inline'")
    .set_caption('macro-F1')
)
df2_styler = (
    df_map.style
    .set_table_attributes("style='display:inline'")
    .set_caption('strict-macro-F1')
)

# eval_df.to_csv(out_file)

display_html(
    df1_styler._repr_html_() + df2_styler._repr_html_(),
    raw=True,
)


Unnamed: 0,method,macro-F1
0,RQ3-openai,0.894949

Unnamed: 0,method,strict-macro-F1
0,RQ3-openai,0.894949
