# Evaluating the spaCy NER model

In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.scorer import Scorer
from spacy.training import Example

In [2]:
# Load the trained model
nlp = spacy.load("./output/model-best")

# Load the test dataset (gold-annotated docs, deserialized with the model's vocab)
test_data = DocBin().from_disk("./data/test.spacy")
test_docs = list(test_data.get_docs(nlp.vocab))
# Fail loudly on an empty test set instead of reporting meaningless scores.
assert test_docs, "./data/test.spacy contains no documents; nothing to evaluate"

# Initialize the scorer with the loaded pipeline itself, so the scoring methods
# of its own components are used. Building the Scorer from `default_pipeline`
# names instead makes spaCy assemble a second, blank pipeline via `add_pipe`,
# which is wasteful and fails for names that are not registered factories.
scorer = Scorer(nlp)

# Use the model to predict the document entities.
# Only the raw text is fed to the model, so the gold annotations cannot leak
# into the predictions. `nlp.pipe` processes the texts in batches and yields
# results in input order, so they can be zipped back with the gold docs.
predicted_docs = nlp.pipe(doc.text for doc in test_docs)
predictions = [
    Example(predicted, reference)
    for predicted, reference in zip(predicted_docs, test_docs)
]

# Score the predictions against the annotations in the test dataset
scores = scorer.score(predictions)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Assemble the report line by line, then emit it with a single print.
# Overall entity-level metrics come first, followed by one line per entity label.
report_lines = [
    f"F1-score: {scores['ents_f']}",
    f"Precision: {scores['ents_p']}",
    f"Recall: {scores['ents_r']}",
    "By PII type:",
]
report_lines.extend(
    f"\t'{label}': {score['p']} precision, {score['r']} recall, {score['f']} F1"
    for label, score in scores["ents_per_type"].items()
)
print("\n".join(report_lines))

F1-score: 0.7548209366391184
Precision: 0.8203592814371258
Recall: 0.6989795918367347
By PII type:
	'NAME_STUDENT': 0.851063829787234 precision, 0.7100591715976331 recall, 0.7741935483870968 F1
	'USERNAME': 0.0 precision, 0.0 recall, 0.0 F1
	'ID_NUM': 0.875 precision, 0.4375 recall, 0.5833333333333334 F1
	'URL_PERSONAL': 0.6153846153846154 precision, 0.8888888888888888 recall, 0.7272727272727274 F1
	'EMAIL': 0.6666666666666666 precision, 1.0 recall, 0.8 F1
