Libraries

In [None]:
# Libraries
import numpy as np
import sys
import os
import json
from pathlib import Path
import re
import pandas as pd

# add path to the dataset entities
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

from datasetProcessing import Entity, recursive_fix
from performance import Prediction, Performance

Evaluate the reflection results for each topic

In [None]:
# Per-topic setting. The value is the N used to locate the baseline results
# folder of that topic ("results/demo_type/<topic>/in_context_top<N>").
# Topics are grouped by their N; insertion order is preserved.
all_configs = {
    **dict.fromkeys(("ai", "literature", "music"), 10),
    **dict.fromkeys(
        (
            "politics",
            "science",
            "multinerd_en",
            "multinerd_pt",
            "ener",
            "lener",
            "neuralshift",
        ),
        20,
    ),
}

In [None]:
# Sub-folders of "results/error_reflection/<topic>/" whose reflection outputs are evaluated below.
reflection_folders = ["false_negatives/adapted"]

In [None]:
def _load_lenient_json(path):
    """Read a UTF-8 JSON file, removing a trailing comma at the very end of the text first.

    Args:
        path: location of the file (str or Path).

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: if the file does not exist.
        json.JSONDecodeError: if the text is still not valid JSON after the fix.
    """
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    # Fix JSON extra comma
    content = re.sub(r',\s*$', '', content)
    return json.loads(content)


def _merge_unique_entities(entities):
    """Return the entities in their original order without repeated (span, entity) pairs.

    Two entities count as duplicates when both their "span" and their "entity"
    values are equal ignoring case; the first occurrence is the one kept.
    """
    unique = []
    for entity in entities:
        already_added = any(
            entity["span"].lower() == kept["span"].lower()
            and entity["entity"].lower() == kept["entity"].lower()
            for kept in unique
        )
        if not already_added:
            unique.append(entity)
    return unique


def _result_file_sort_key(path):
    """Sort key for result files: purely numeric stems first (by value), then the rest by name."""
    stem = path.stem
    if stem.isdecimal():
        return (0, int(stem), stem)
    return (1, 0, stem)


def _trend_mark(before, after, higher_is_better):
    """Return the report symbol for a metric that moved from ``before`` to ``after``."""
    if before == after:
        return "➖"
    improved = before < after if higher_is_better else before > after
    return "✅" if improved else "❌"


# Make sure the report folder exists before any report is opened for writing;
# otherwise open(..., "w") fails on a fresh checkout.
output_dir = Path("performance_reflection")
output_dir.mkdir(parents=True, exist_ok=True)

for topic, n in all_configs.items():

    # baseline results folder of this topic (independent of the file being processed)
    base_results_folder = f"results/demo_type/{topic}/in_context_top{n}"

    for reflection_folder in reflection_folders:

        reflections_done_path = Path(f"results/error_reflection/{topic}/{reflection_folder}")

        total_reflections = 0
        predicted_new_entities = []
        performance_list = []

        # Process all examples in the folder. glob() gives no ordering guarantee,
        # so sort to keep the per-file lines of the report stable between runs.
        reflection_files = sorted(reflections_done_path.glob("*.json"), key=_result_file_sort_key)
        for file_path in reflection_files:

            total_reflections += 1

            # the reflection file and its baseline result share the same file name
            file_index = file_path.stem

            # read base result file and new (reflection) file
            old_data = _load_lenient_json(f"{base_results_folder}/{file_index}.json")
            new_data = _load_lenient_json(file_path)

            # Apply encoding fix
            old_data = recursive_fix(old_data)
            new_data = recursive_fix(new_data)

            # extract entities
            true_entities = old_data.get("true_entities", [])
            llm_entities = old_data.get("entities", [])
            new_entities = new_data.get("reflected_entities", [])

            # purge empty entities
            new_entities = [entity for entity in new_entities if entity["entity"]]

            # baseline entities followed by the reflected ones, without duplicates
            predicted_entities = _merge_unique_entities(llm_entities + new_entities)

            # number of reflected entities proposed for this file (before de-duplication)
            predicted_new_entities.append((file_index, len(new_entities)))

            # previous performance
            true_entities = [Entity.from_dict(entity) for entity in true_entities]

            # create old prediction object
            # NOTE(review): the baseline is scored on the raw LLM entities while the new
            # prediction is scored on the de-duplicated list; if the baseline can contain
            # duplicates, part of the difference may come from de-duplication alone.
            old_prediction = Prediction(0, "")
            old_prediction.set_results(true_entities, llm_entities)
            old_prediction.compute_performance()

            # create new prediction object
            new_prediction = Prediction(0, "")
            new_prediction.set_results(true_entities, predicted_entities)
            new_prediction.compute_performance()

            performance_list.append((file_index, old_prediction.performance, new_prediction.performance))

        # write to file
        report_path = output_dir / f"{topic}_{reflection_folder.replace('/', '_')}.txt"
        with open(report_path, "w", encoding="utf-8") as f:

            f.write(f"total reflections: {total_reflections} \n")
            f.write("new predicted entities: \n")
            series = pd.Series([p[1] for p in predicted_new_entities]).value_counts().sort_index()
            f.write(series.to_string())

            f.write("\n\n")
            tp_diff_list = []
            fp_diff_list = []
            fn_diff_list = []
            f1_diff_list = []

            up_from_0 = 0
            down_from_100 = 0

            # one report line per file: before > after for every metric
            for idx, old_perf, new_perf in performance_list:
                old_f1 = old_perf.f1()
                new_f1 = new_perf.f1()

                tp_diff_list.append(new_perf.tp - old_perf.tp)
                fp_diff_list.append(new_perf.fp - old_perf.fp)
                fn_diff_list.append(new_perf.fn - old_perf.fn)
                f1_diff_list.append(new_f1 - old_f1)

                # more TP / higher F1 is an improvement, fewer FP / FN is an improvement
                tp_check = _trend_mark(old_perf.tp, new_perf.tp, higher_is_better=True)
                fp_check = _trend_mark(old_perf.fp, new_perf.fp, higher_is_better=False)
                fn_check = _trend_mark(old_perf.fn, new_perf.fn, higher_is_better=False)
                f1_check = _trend_mark(old_f1, new_f1, higher_is_better=True)

                if old_f1 == 0 and old_f1 < new_f1:
                    up_from_0 += 1

                # presumably f1() is on a 0-100 scale — TODO confirm against Performance
                if old_f1 == 100 and old_f1 > new_f1:
                    down_from_100 += 1

                f.write(
                    f"{idx:>3} \t tp {old_perf.tp} > {new_perf.tp} {tp_check} "
                    f"\t fp {old_perf.fp} > {new_perf.fp} {fp_check} "
                    f"\t fn {old_perf.fn} > {new_perf.fn} {fn_check} "
                    f"\t f1 {round(old_f1, 2)} > {round(new_f1, 2)} {f1_check} \n"
                )

            f.write("\n\n")
            f.write("tp summary\n")
            f.write(pd.Series(tp_diff_list).value_counts().sort_index().to_string())

            f.write("\n\n")
            f.write("fp summary\n")
            f.write(pd.Series(fp_diff_list).value_counts().sort_index().to_string())

            f.write("\n\n")
            f.write("fn summary\n")
            f.write(pd.Series(fn_diff_list).value_counts().sort_index().to_string())

            f.write("\n\n")
            f.write("f1 summary\n")
            f.write(pd.Series(f1_diff_list).describe().to_string())

            # micro-averaged F1: pool the counts of all files, then compute one F1
            f.write("\n\n")
            f.write("before vs after\n")
            old_total_tp = sum(p[1].tp for p in performance_list)
            old_total_fp = sum(p[1].fp for p in performance_list)
            old_total_fn = sum(p[1].fn for p in performance_list)
            old_micro_f1 = Performance(old_total_tp, old_total_fp, old_total_fn).f1()

            new_total_tp = sum(p[2].tp for p in performance_list)
            new_total_fp = sum(p[2].fp for p in performance_list)
            new_total_fn = sum(p[2].fn for p in performance_list)
            new_micro_f1 = Performance(new_total_tp, new_total_fp, new_total_fn).f1()

            f.write(f"TP: {old_total_tp} > {new_total_tp} \n")
            f.write(f"FP: {old_total_fp} > {new_total_fp} \n")
            f.write(f"FN: {old_total_fn} > {new_total_fn} \n")
            f.write(f"Micro F1: {old_micro_f1} > {new_micro_f1} \n")

            f.write("\n\n")
            f.write("f1 sentence-level\n")
            f.write(f"better F1: {sum(1 for x in f1_diff_list if x > 0)}\n")
            f.write(f"same F1: {sum(1 for x in f1_diff_list if x == 0)}\n")
            f.write(f"worse F1: {sum(1 for x in f1_diff_list if x < 0)}\n")
            f.write(f"up from 0: {up_from_0}\n")
            f.write(f"down from 100: {down_from_100}\n")

            print(f"{topic}, {reflection_folder}: Micro F1: {old_micro_f1} > {new_micro_f1}")

    print("")