# Evaluating the spaCy NER model

In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.scorer import Scorer
from spacy.training import Example

In [2]:
# Load the trained pipeline under evaluation
nlp = spacy.load("./output/model-best")

# Load the annotated test documents, deserialized against the pipeline's vocab
test_data = DocBin().from_disk("./data/test.spacy")
test_docs = list(test_data.get_docs(nlp.vocab))

# Hand the loaded pipeline to the scorer directly. When no pipeline is given,
# Scorer builds a fresh blank one from default_lang/default_pipeline by calling
# add_pipe() on every name, which is redundant here and can fail for components
# whose name differs from their factory name.
scorer = Scorer(nlp)

# Re-annotate the raw text of every test document and pair each prediction
# with its annotated reference. nlp.pipe processes the texts in batches and
# yields the docs in input order, so zipping against test_docs keeps the
# prediction/reference pairs aligned.
predictions = [
    Example(predicted, reference)
    for predicted, reference in zip(
        nlp.pipe(doc.text for doc in test_docs), test_docs
    )
]

# Score the predictions against the annotations in the test dataset
scores = scorer.score(predictions)

In [3]:
# Overall entity-level metrics, one per line
print(
    f"F1-score: {scores['ents_f']}",
    f"Precision: {scores['ents_p']}",
    f"Recall: {scores['ents_r']}",
    sep="\n",
)

# Per-label breakdown: precision, recall and F1 for each entity type
print("By PII type:")
for label in scores["ents_per_type"]:
    score = scores["ents_per_type"][label]
    print(f"\t'{label}': {score['p']} precision, {score['r']} recall, {score['f']} F1")

F1-score: 0.5884297520661158
Precision: 0.6202090592334495
Recall: 0.559748427672956
By PII type:
	'NAME_STUDENT': 0.6179775280898876 precision, 0.6297709923664122 recall, 0.6238185255198486 F1
	'ID_NUM': 0.0 precision, 0.0 recall, 0.0 F1
	'EMAIL': 0.0 precision, 0.0 recall, 0.0 F1
	'URL_PERSONAL': 0.65 precision, 0.5416666666666666 recall, 0.5909090909090908 F1
	'STREET_ADDRESS': 0.0 precision, 0.0 recall, 0.0 F1
	'USERNAME': 0.0 precision, 0.0 recall, 0.0 F1
