Libraries

In [None]:
# Libraries
import numpy as np
import sys
import os
import json
from pathlib import Path
import re

# add path to the dataset entities
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

from datasetProcessing import Entity, recursive_fix
from performance import Prediction, Performance

In [None]:
def _parse_lenient_json(raw_text):
    """Parse JSON text that may end with a stray comma and repair its encoding.

    Only a comma (plus optional whitespace) at the very end of the text is
    removed before parsing. The parsed object is passed through
    ``recursive_fix`` and the result is returned.
    """
    return recursive_fix(json.loads(re.sub(r',\s*$', '', raw_text)))


def _merge_unique_entities(entities):
    """Return ``entities`` without repeats, keeping the first occurrence of each.

    Two entities are considered the same when both their "span" and their
    "entity" values are equal after stripping whitespace and lower-casing.
    """
    def same(candidate, kept):
        return (
            candidate["span"].strip().lower() == kept["span"].strip().lower()
            and candidate["entity"].strip().lower() == kept["entity"].strip().lower()
        )

    unique = []
    for candidate in entities:
        if not any(same(candidate, kept) for kept in unique):
            unique.append(candidate)
    return unique


def process_instance(file_path, topic_reflection_folder, ignore_empty=False):
    """Score one prediction file, merged with its reflection output when present.

    Parameters
    ----------
    file_path : pathlib.Path
        JSON file holding "true_entities" and the model's "entities". Its
        ``.name`` is used to locate the matching reflection file.
    topic_reflection_folder : str
        Sub-path below ``results/error_reflection/`` in which a reflection
        file with the same name is looked up.
    ignore_empty : bool, default False
        When True, instances without any true entity are skipped.

    Returns
    -------
    Prediction or None
        The prediction with exact and relaxed performance computed, or None
        when the file is empty, is skipped, or cannot be read or parsed.
        Failures are printed rather than raised.
    """
    try:
        with open(file_path, mode='r', encoding="utf-8") as f:
            content = f.read()

        if not content.strip():
            # NOTE: the message says "deleting", but deletion is disabled on purpose.
            print(f"üóëÔ∏è Empty file detected, deleting: {file_path}")
            # file_path.unlink()
            return None

        old_data = _parse_lenient_json(content)

        # extract entities
        true_entities = old_data.get("true_entities", [])
        llm_entities = old_data.get("entities", [])

        # check for reflection
        reflection_file_path = f"results/error_reflection/{topic_reflection_folder}/{file_path.name}"
        new_entities = []
        if os.path.exists(reflection_file_path):
            try:
                with open(reflection_file_path, mode='r', encoding="utf-8") as rf:
                    new_data = _parse_lenient_json(rf.read())
                new_entities = new_data.get("reflected_entities", [])
            except Exception as e:
                # Name the reflection file: it is the one that failed, not file_path.
                print(f"‚ùå Error reading {reflection_file_path}: {e}")
                return None

        # reflected entities only add to the original predictions
        predicted_entities = _merge_unique_entities(llm_entities + new_entities)

        if not true_entities and ignore_empty:
            print(f"üóëÔ∏è No entities found in {file_path}, ignoring performance")
            return None

        # process true entities from dict to Entity objects
        true_entities = [Entity.from_dict(entity) for entity in true_entities]

        # create prediction object
        # NOTE(review): the meaning of the (0, "") arguments is not visible here.
        prediction = Prediction(0, "")

        # compute predictions
        prediction.set_results(true_entities, predicted_entities)
        prediction.compute_performance()
        prediction.compute_relaxed_performance()

        return prediction

    except Exception as e:
        print(f"‚ùå Error reading {file_path}: {e}")
        # file_path.unlink()
        return None

Evaluate each model

In [None]:
# Topics to evaluate, mapped to the numeric suffix of their base result folder
# (the folder is looked up as f"{base_folder}{n}"). Insertion order is the
# order in which topics are processed.
all_configs = {
    **dict.fromkeys(("ai", "literature", "music"), 10),
    **dict.fromkeys(
        (
            "politics",
            "science",
            "multinerd_en",
            "multinerd_pt",
            "ener",
            "lener",
            "neuralshift",
        ),
        20,
    ),
}

In [None]:
# Prefix of the performance file names written and read by the cells below.
results_category = "error_reflection"

# Base predictions are read from results/{base_category}/{topic}/{base_folder}{n}.
base_category = "demo_type"
base_folder = "in_context_top"

# Reflection runs to merge with the base predictions; each entry is a sub-path
# of results/error_reflection/{topic}/.
reflection_folders = ["unseen/adapted", "unseen/adapted_noCoT"]

def custom_sort_key(name):
    """Return the display rank of a performance-file name.

    The two known reflection runs rank 2 and 3; every other name ranks 0 and
    therefore sorts first.
    """
    known_ranks = (
        ("error_reflection_unseen_adapted", 2),
        ("error_reflection_unseen_adapted_noCoT", 3),
    )
    for run_name, rank in known_ranks:
        if name == run_name:
            return rank
    return 0
    
# Display label for each performance-file name, padded to a common width.
# NOTE(review): not referenced by the visible cells — presumably consumed when
# building LaTeX tables; confirm before removing.
latex_mapping = {
    "error_reflection_unseen_adapted":        "Adapted       ",
    "error_reflection_unseen_adapted_noCoT":  "Adapted no CoT",
}

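# Alternative configuration for the false-negatives reflection runs: to use it,
# comment out the definitions above and uncomment this block.
# reflection_folders = ["false_negatives/adapted", "false_negatives/adapted_noCoT"]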

# def custom_sort_key(name):
#     if   name == "error_reflection_false_negatives_adapted":
#         return 2
#     elif name == "error_reflection_false_negatives_adapted_noCoT":
#         return 3
#     else:
#         return 0
    
# latex_mapping = {
#     "error_reflection_false_negatives_adapted":        r"FN & \checkmark  ",
#     "error_reflection_false_negatives_adapted_noCoT":  "FN & -           ",
# }

In [None]:
# Evaluate every topic against each reflection run and save the aggregate scores
# to performance/{topic}/{results_category}_{reflection run}.json.
for topic, n in all_configs.items():
    print(f"Processing topic: {topic}")
    config_folder = Path(f"results/{base_category}/{topic}/{base_folder}{n}")

    if not config_folder.exists():
        print(f"‚ùå Topic folder {topic} does not exist.")
        continue

    for reflection_folder in reflection_folders:

        # A path that exists but is not a directory holds nothing to evaluate.
        if not config_folder.is_dir():
            continue

        print(f"Processing configuration: {topic} - {config_folder.name} - {reflection_folder}")

        # Score every instance, keeping only the ones that could be evaluated.
        dataset_performance = []
        for file_path in config_folder.glob("*.json"):
            result = process_instance(file_path, f"{topic}/{reflection_folder}")
            if result is not None:
                dataset_performance.append(result)

        if not dataset_performance:
            print(f"‚ö†Ô∏è No valid instances found in {config_folder.name}. Skipping...")
            continue
        print(f"‚úÖ Processed {len(dataset_performance)} valid instances in {config_folder.name}.")

        # Per-instance counts, for both the exact and the relaxed matching.
        metrics_dict = {
            "total_samples": len(dataset_performance),
            "true_0": sum(len(instance.true_entities) == 0 for instance in dataset_performance),
            "llm_0": sum(len(instance.llm_entities) == 0 for instance in dataset_performance),
            **{
                section: {
                    count: [getattr(getattr(instance, section), count) for instance in dataset_performance]
                    for count in ("tp", "fp", "fn")
                }
                for section in ("performance", "relaxed_performance")
            },
        }

        # Micro-averaged totals: sum the counts over all instances.
        overall_performance = Performance(
            **{count: sum(values) for count, values in metrics_dict["performance"].items()}
        )
        overall_relaxed_performance = Performance(
            **{count: sum(values) for count, values in metrics_dict["relaxed_performance"].items()}
        )

        # Payload written to disk; key order is kept stable for readable diffs.
        overall_json = {
            **{key: metrics_dict[key] for key in ("total_samples", "true_0", "llm_0")},
            **{
                section: {
                    "tp": totals.tp,
                    "fp": totals.fp,
                    "fn": totals.fn,
                    "precision": totals.precision(),
                    "recall": totals.recall(),
                    "f1": totals.f1(),
                }
                for section, totals in (
                    ("performance", overall_performance),
                    ("relaxed_performance", overall_relaxed_performance),
                )
            },
        }

        print("  exact f1 ", round(overall_json["performance"]["f1"], 3), round(overall_json["performance"]["f1"], 1))
        print("relaxed f1 ", round(overall_json["relaxed_performance"]["f1"], 3), round(overall_json["relaxed_performance"]["f1"], 1))

        # Persist the aggregate scores, one file per reflection run.
        save_path = f"performance/{topic}"
        os.makedirs(save_path, exist_ok=True)
        performance_file = f"{save_path}/{results_category}_{reflection_folder.replace('/', '_')}.json"

        with open(performance_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(overall_json, ensure_ascii=False, indent=4))

        print(f"‚úÖ Performance saved to {performance_file}")
        print("\n\n")

In [None]:
# Collect the exact-match F1 of every saved run, per topic, into topic_all_f1
# as a list of (file name without ".json", f1) tuples in display order.
# include_categories = ["demo_type", "demo_criterium"]
include_categories = [f.replace("/", "_") for f in reflection_folders]
topic_all_f1 = {}

for topic in all_configs.keys():

    all_f1 = []
    performance_folder = Path(f"performance/{topic}")
    # glob() order depends on the file system; sort so that runs sharing a
    # custom_sort_key value always appear in the same order.
    for file in sorted(performance_folder.glob("*.json")):
        if results_category in file.name and any(cat in file.name for cat in include_categories):
            # Read with the encoding the performance files are written with.
            with open(file, "r", encoding="utf-8") as f:
                overall_json = json.load(f)
            all_f1.append((file.name.removesuffix('.json'), overall_json["performance"]["f1"]))

    # all_f1.sort(key=lambda x: x[1], reverse=True)
    all_f1.sort(key = lambda x : custom_sort_key(x[0]))
    
    # add to topic all f1
    topic_all_f1[topic] = all_f1

    print(f"Topic: {topic}")
    # print every collected run, in display order
    for name, f1 in all_f1:
        print(f"{name}: {f1:.2f}")

    print()