In [1]:
import json
from pathlib import Path
from tqdm import tqdm
import re
from datetime import datetime
import pandas as pd
from nervaluate import Evaluator

from pie import Pie

# Test model

### Load data

In [2]:
def load_test_data(file="data/gold.json"):
    """Load the test examples from a JSON file.

    Args:
        file: Path of the JSON file to read. Defaults to "data/gold.json".

    Returns:
        The deserialised JSON content, exactly as produced by `json.load`.
    """
    # Decode explicitly as UTF-8 (the same encoding the predictions are saved
    # with) instead of relying on the platform's locale default.
    with open(file, encoding="utf8") as f:
        return json.load(f)

# Load the examples from the default path ("data/gold.json").
data = load_test_data()

### Run predictions

In [3]:
# Instantiate the Pie object whose `bake` method produces the predictions below.
pie = Pie()

In [20]:
# Restrict the run to the first two examples; widen the slice to evaluate more.
test_data = data[:2]

In [None]:

# Run the model on each example's text and store the output under "predicted".
# The example dicts are updated in place, so the same objects inside `data`
# gain the new key as well.
for example in tqdm(test_data):
    response = pie.bake(example["text"])
    example["predicted"] = response

### Format predictions

In [22]:
# Pattern to capture placeholders such as "{NAME}" or "{ZIP CODE}"
ENTITY_PATTERN = re.compile(r"\{[A-Z]+\s?[A-Z]*\}")

def extract_entities_from_text(example: dict) -> list[dict]:
    """
    Recover the masked entities by aligning the prediction with the original text.

    The predicted text is split on its placeholders. Each literal chunk between
    placeholders is then located, left to right, in the original text. Whatever
    lies between two consecutive chunks in the original text is the substring
    that the placeholder replaced.

    Args:
        example: Dict holding the original string under "text" and the masked
            string under "predicted".

    Returns:
        One dict per aligned placeholder, in order of appearance, with keys
        "label" (the placeholder as written, braces included), "start" and
        "end" (character offsets into the original text, end exclusive) and
        "text" (the original substring). Placeholders whose neighbouring
        chunks cannot be found in the original text are skipped.
    """
    text = example["text"]
    predicted = example["predicted"]

    # Literal text around the placeholders, and the placeholders themselves.
    # The pattern has no capture groups, so split() yields exactly one more
    # chunk than findall() yields labels.
    chunks = ENTITY_PATTERN.split(predicted)
    labels = ENTITY_PATTERN.findall(predicted)

    # Locate every chunk once, scanning the original text left to right.
    # A chunk that cannot be found is recorded as None and the search position
    # is left untouched so that later chunks can still be aligned.
    last_chunk = len(chunks) - 1
    spans = []
    cursor = 0
    for i, chunk in enumerate(chunks):
        if i == last_chunk and i > 0 and chunk == "":
            # The prediction ends with a placeholder, so the final entity runs
            # to the end of the original text. find("") would return `cursor`
            # here and produce a zero-length entity.
            start = len(text)
        else:
            start = text.find(chunk, cursor)
        if start == -1:
            spans.append(None)
            continue
        cursor = start + len(chunk)
        spans.append((start, cursor))

    extracted_entities = []
    for i, label in enumerate(labels):
        before, after = spans[i], spans[i + 1]
        if before is None or after is None:
            # Without both neighbouring chunks the entity boundaries are
            # unknown; skipping keeps str.find's -1 out of the offsets.
            continue
        start, end = before[1], after[0]
        extracted_entities.append({
            "label": label,
            "start": start,
            "end": end,
            "text": text[start:end],
        })

    return extracted_entities


In [None]:
# Attach the extracted entity spans to each example under "predicted_entities".
for example in tqdm(test_data):
    example["predicted_entities"] = extract_entities_from_text(example)

### Save predictions

In [24]:
# Timestamp the file name so runs started at different times get distinct files.
str_date = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

save_path = f"tests/performance_tests/test_models/{pie.llm.model_name}-{str_date}.json"
p = Path(save_path)

# Create the output directory on first use. This also covers model names that
# contain "/" and therefore introduce extra path components.
p.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters as-is rather than \u escapes.
with p.open("w", encoding="utf8") as f:
    json.dump(test_data, f, ensure_ascii=False)

### Run evaluation

See documentation on nervaluate and how to interpret the results here:
https://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/ 

****

Use the in-memory `test_data` object from above, or load previously saved model predictions into `test_data` with the cell below.

In [25]:
# Read the predictions back from `p`. Point `p` at an earlier results file
# to evaluate a previous run instead of the one saved above.
with p.open("r", encoding="utf8") as f:
    test_data = json.load(f)

In [None]:
# Display the loaded examples as a quick sanity check before scoring.
test_data

In [27]:
# Per-example entity lists in matching order: extracted predictions and the
# annotations stored under "entities".
pred = [example["predicted_entities"] for example in test_data]
true = [example["entities"] for example in test_data]

In [28]:
# Labels to score. Predicted labels are the placeholders captured by
# ENTITY_PATTERN, which always carry exactly one pair of braces ("{NAME}").
# In a plain (non-f) string "{{NAME}}" is a literal double brace and could
# never equal a predicted label, so the tags use the single-brace form.
# NOTE(review): the gold "entities" labels must use the same single-brace
# spelling for any match to be counted — confirm against the gold file.
evaluator = Evaluator(
    true, pred,
    tags=["{NAME}", "{EMAIL}", "{PHONE}", "{CPR NUMBER}", "{ORGANIZATION}", "{LOCATION}", "{ZIP CODE}"]
)
# Returns overall metrics and metrics for each tag
results, results_per_tag = evaluator.evaluate()

In [29]:
# Flatten the nested per-tag results into a single dict keyed by
# (tag, sub_key) tuples.
reform = {
    (tag, sub_key): metrics
    for tag, per_tag in results_per_tag.items()
    for sub_key, metrics in per_tag.items()
}

In [None]:
# Tabulate the flattened results, rounded to two decimals, and display them.
dataf = pd.DataFrame(reform).round(2)
dataf