Libraries

In [None]:
# Libraries
import numpy as np
import sys
import os
import json
from pathlib import Path
import re

# add path to the dataset entities
sys.path.append(os.path.abspath("../0. Helpers"))
sys.path.append(os.path.abspath("../2. Data Processing/_dataset_entities"))

from datasetProcessing import Entity, recursive_fix
from performance import Prediction, Performance

In [None]:
def _is_same_entity(first, second):
    """Return True when two entity dicts have the same span and label.

    Both fields are compared after stripping surrounding whitespace and
    lower-casing. The label is only inspected when the spans already match.
    """
    return (
        first["span"].strip().lower() == second["span"].strip().lower()
        and first["entity"].strip().lower() == second["entity"].strip().lower()
    )


def _performance_to_dict(performance):
    """Collect the counts and derived scores of a performance object in a dict."""
    return {
        "tp": performance.tp,
        "fp": performance.fp,
        "fn": performance.fn,
        "precision": performance.precision(),
        "recall": performance.recall(),
        "f1": performance.f1(),
    }


def process_instance(file_path, ignore_empty=False):
    """Score one result file and store the scores back into that file.

    The file is read as JSON (a single trailing comma at the very end of the
    file is tolerated), passed through ``recursive_fix``, and its predicted
    entities (key ``"entities"``) are compared against its gold entities
    (key ``"true_entities"``). Strict and relaxed scores are written back under
    the ``"performance"`` and ``"relaxed_performance"`` keys.

    Args:
        file_path: Path of the JSON result file. It is overwritten in place on
            success.
        ignore_empty: When True, instances without gold entities are skipped
            (``None`` is returned and the file is left untouched).

    Returns:
        The populated ``Prediction`` object, or ``None`` when the file is
        empty, is skipped because of ``ignore_empty``, or any error occurs.
        Errors are reported with ``print`` and never propagate to the caller.
    """
    try:
        with open(file_path, mode='r', encoding="utf-8") as f:
            content = f.read()

        if not content.strip():
            # NOTE: the message mentions deletion, but the unlink call is
            # disabled, so the empty file is only reported and stays on disk.
            print(f"üóëÔ∏è Empty file detected, deleting: {file_path}")
            # file_path.unlink()
            return None

        # Fix JSON extra comma (only a comma that ends the whole file)
        content = re.sub(r',\s*$', '', content)
        data = json.loads(content)

        # Apply encoding fix
        data = recursive_fix(data)

        # Extract entities; a missing key or an explicit null both count as
        # "no entities" so that such an instance can still be scored.
        true_entities = data.get("true_entities") or []
        llm_entities = data.get("entities") or []

        # Remove duplicates from the LLM entities, keeping the first occurrence.
        predicted_entities = []
        for entity in llm_entities:
            if not any(_is_same_entity(entity, kept) for kept in predicted_entities):
                predicted_entities.append(entity)

        if not true_entities and ignore_empty:
            print(f"üóëÔ∏è No entities found in {file_path}, ignoring performance")
            return None

        # Convert the gold entities from dicts to Entity objects.
        true_entities = [Entity.from_dict(entity) for entity in true_entities]

        # Create the prediction object.
        # NOTE(review): the meaning of the (0, "") arguments is defined by
        # Prediction and is not visible here.
        prediction = Prediction(0, "")

        # Compute strict and relaxed scores.
        prediction.set_results(true_entities, predicted_entities)
        prediction.compute_performance()
        prediction.compute_relaxed_performance()

        data["performance"] = _performance_to_dict(prediction.performance)
        data["relaxed_performance"] = _performance_to_dict(prediction.relaxed_performance)

        # Serialise BEFORE opening the file for writing: mode 'w' truncates the
        # file immediately, so a serialisation error raised afterwards would
        # leave an empty file behind and destroy the stored results.
        serialized = json.dumps(data, ensure_ascii=False, indent=4)
        with open(file_path, mode='w', encoding="utf-8") as f:
            f.write(serialized)

        return prediction

    except Exception as e:
        # Deliberate best effort: one broken file must not abort a whole run.
        print(f"‚ùå Error reading {file_path}: {e}")
        # file_path.unlink()
        return None

Evaluate each model

In [None]:
# Result folders follow the layout
#   {folder_prefix}/{topic}/{folder_suffix}{n}
# where n is the number configured for the topic in `all_configs`.
folder_prefix = "results/demo_type"
folder_suffix = "in_context_top"

# Topic name -> number appended to `folder_suffix` for that topic.
# Topics are grouped by that number; insertion order is preserved.
all_configs = {
    **dict.fromkeys(("ai", "literature", "music"), 10),
    **dict.fromkeys(
        (
            "politics",
            "science",
            "multinerd_en",
            "multinerd_pt",
            "ener",
            "lener",
            "neuralshift",
        ),
        20,
    ),
}

In [None]:
# Aggregated strict-match Performance per topic, keyed by topic name.
topic_performances = {}

for topic, n in all_configs.items():

    print(f"Processing topic: {topic}")
    topic_path = Path(f"{folder_prefix}/{topic}/{folder_suffix}{n}")
    print(topic_path)

    if not topic_path.exists():
        print(f"‚ùå Topic folder {topic} does not exist.")
        continue

    # Process all instances in the folder. The files are sorted because the
    # order produced by glob depends on the filesystem; sorting keeps the
    # per-instance lists below (and the log output) reproducible across runs.
    # process_instance returns None for files it could not score; those are
    # dropped right away.
    dataset_performance = []
    for file_path in sorted(topic_path.glob("*.json")):
        result = process_instance(file_path)
        if result is not None:
            dataset_performance.append(result)

    if not dataset_performance:
        print(f"‚ö†Ô∏è No valid instances found in {topic_path.name}. Skipping...")
        continue

    print(f"‚úÖ Processed {len(dataset_performance)} valid instances in {topic_path.name}.")

    # Per-instance counts: one list entry per scored file, in file-name order.
    # "true_0" / "llm_0" count the instances with no gold / no predicted entities.
    metrics_dict = {
        "total_samples": len(dataset_performance),
        "true_0": sum(1 for instance in dataset_performance if len(instance.true_entities) == 0),
        "llm_0": sum(1 for instance in dataset_performance if len(instance.llm_entities) == 0),
        "performance": {
            "tp": [instance.performance.tp for instance in dataset_performance],
            "fp": [instance.performance.fp for instance in dataset_performance],
            "fn": [instance.performance.fn for instance in dataset_performance],
        },
        "relaxed_performance": {
            "tp": [instance.relaxed_performance.tp for instance in dataset_performance],
            "fp": [instance.relaxed_performance.fp for instance in dataset_performance],
            "fn": [instance.relaxed_performance.fn for instance in dataset_performance],
        }
    }

    # Topic-level score: sum the strict counts over all instances. The relaxed
    # counts are only reported per instance in metrics_dict.
    overall_performance = Performance(
        tp=sum(metrics_dict["performance"]["tp"]),
        fp=sum(metrics_dict["performance"]["fp"]),
        fn=sum(metrics_dict["performance"]["fn"])
    )

    print("topic:", topic)
    print("metrics:", metrics_dict)
    print("\n\n")

    topic_performances[topic] = overall_performance