# Evaluate Presidio Analyzer using the Presidio Evaluator framework

In [1]:
# install presidio via pip if not yet installed

#!pip install presidio-evaluator
#!pip install "presidio-analyzer[transformers]"
#!pip install presidio-evaluator

In [2]:
from pathlib import Path
from copy import deepcopy
from pprint import pprint
from collections import Counter
from typing import List

import warnings
warnings.filterwarnings('ignore')

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

stanza and spacy_stanza are not installed
Flair is not installed by default
Flair is not installed


Select data for evaluation:

In [3]:
dataset_name = "synth_dataset_v2.json"
dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, "data", dataset_name))

dataset = dataset[:300] # top 300 samples

print(len(dataset))

tokenizing input:   0%|          | 0/1500 [00:00<?, ?it/s]

loading model en_core_web_sm


tokenizing input: 100%|██████████| 1500/1500 [00:37<00:00, 39.64it/s]

300





In [4]:
def get_entity_counts(dataset:List[InputSample]):
    entity_counter = Counter()
    for sample in dataset:
        for tag in sample.tags:
            entity_counter[tag] += 1
    return entity_counter


In [5]:
print("Count per entity:")
pprint(get_entity_counts(dataset).most_common())

print("\nExample sentence:")
print(dataset[1])

print("\nMin and max number of tokens in dataset:")
print(
    f"Min: {min([len(sample.tokens) for sample in dataset])}, "
    f"Max: {max([len(sample.tokens) for sample in dataset])}"
)

print("\nMin and max sentence length in dataset:")
print(
    f"Min: {min([len(sample.full_text) for sample in dataset])}, "
    f"Max: {max([len(sample.full_text) for sample in dataset])}"
)

Count per entity:
[('O', 3798),
 ('STREET_ADDRESS', 611),
 ('PERSON', 285),
 ('GPE', 113),
 ('ORGANIZATION', 105),
 ('PHONE_NUMBER', 68),
 ('DATE_TIME', 45),
 ('TITLE', 30),
 ('CREDIT_CARD', 23),
 ('US_SSN', 20),
 ('AGE', 16),
 ('ZIP_CODE', 13),
 ('DOMAIN_NAME', 10),
 ('EMAIL_ADDRESS', 8),
 ('US_DRIVER_LICENSE', 5),
 ('IBAN_CODE', 4),
 ('NRP', 4),
 ('IP_ADDRESS', 1)]

Example sentence:
Full text: What are my options?
Spans: []


Min and max number of tokens in dataset:
Min: 3, Max: 77

Min and max sentence length in dataset:
Min: 9, Max: 359


### Define the AnalyzerEngine object 
In this case, using a huggingface model: obi/deid_roberta_i2b2

In [6]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import TransformersNlpEngine, NerModelConfiguration


# Here we define a transformers based NLP engine, 
# but you can use this cell to customize your Presidio Analyzer instance

# Define which model to use
model_config = [{"lang_code": "en", "model_name": {
    "spacy": "en_core_web_sm",  # use a small spaCy model for lemmas, tokens etc.
    "transformers": "obi/deid_roberta_i2b2"
    }
}]

# Map transformers model labels to Presidio's
model_to_presidio_entity_mapping = dict(
    PER="PERSON",
    PERSON="PERSON",
    LOC= "LOCATION",
    LOCATION= "LOCATION",
    GPE="LOCATION",
    ORG="ORGANIZATION",
    ORGANIZATION="ORGANIZATION",
    NORP="NRP",
    AGE="AGE",
    ID="ID",
    EMAIL="EMAIL",
    PATIENT="PERSON",
    STAFF="PERSON",
    HOSP="ORGANIZATION",
    PATORG="ORGANIZATION",
    DATE="DATE_TIME",
    TIME="DATE_TIME",
    PHONE="PHONE_NUMBER",
    HCW="PERSON",
    HOSPITAL="ORGANIZATION",
    FACILITY="LOCATION",
)

ner_model_configuration = NerModelConfiguration(labels_to_ignore = ["O"], 
                                                model_to_presidio_entity_mapping=model_to_presidio_entity_mapping)

nlp_engine = TransformersNlpEngine(models=model_config,
                                   ner_model_configuration=ner_model_configuration)

# Set up the engine, loads the NLP module (spaCy model by default) 
# and other PII recognizers
analyzer_engine = AnalyzerEngine(nlp_engine=nlp_engine)

### Run evaluation

#### Define experiment

In [7]:
experiment = get_experiment_tracker()
model = PresidioAnalyzerWrapper(analyzer_engine)

# Define evaluator and experiment tracking

evaluator = Evaluator(model=model)
dataset = Evaluator.align_entity_types(
    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map
)

print("Count per entity after alignment:")
pprint(get_entity_counts(dataset).most_common())

# Track model and dataset params
params = {"dataset_name": dataset_name, "model_name": model.name}
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset)

--------
Entities supported by this Presidio Analyzer instance:
ID, EMAIL, DATE_TIME, EMAIL_ADDRESS, AU_MEDICARE, IN_VEHICLE_REGISTRATION, AGE, MEDICAL_LICENSE, CRYPTO, US_PASSPORT, US_ITIN, IP_ADDRESS, IN_VOTER, IN_AADHAAR, CREDIT_CARD, URL, PHONE_NUMBER, NRP, AU_TFN, UK_NHS, US_BANK_NUMBER, PERSON, US_SSN, US_DRIVER_LICENSE, IBAN_CODE, IN_PAN, SG_NRIC_FIN, ORGANIZATION, AU_ABN, AU_ACN, LOCATION, IN_PASSPORT
Count per entity after alignment:
[('O', 3798),
 ('LOCATION', 724),
 ('PERSON', 285),
 ('ORGANIZATION', 105),
 ('PHONE_NUMBER', 68),
 ('DATE_TIME', 45),
 ('TITLE', 30),
 ('CREDIT_CARD', 23),
 ('US_SSN', 20),
 ('AGE', 16),
 ('ZIP_CODE', 13),
 ('URL', 10),
 ('EMAIL_ADDRESS', 8),
 ('US_DRIVER_LICENSE', 5),
 ('IBAN_CODE', 4),
 ('NRP', 4),
 ('IP_ADDRESS', 1)]


#### Run experiment

In [8]:
# Run experiment
evaluation_results = evaluator.evaluate_all(dataset)
results = evaluator.calculate_score(evaluation_results)

# Track experiment results
experiment.log_metrics(results.to_log())
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, 
                                labels=entities)

# Plot output
plotter = evaluator.Plotter(model=model, 
                            results=results, 
                            output_folder = ".", 
                            model_name = model.name, 
                            beta = 2)

# end experiment
experiment.end()

Running model PresidioAnalyzerWrapper on dataset...
Finished running model on dataset
saving experiment data to experiment_20241115-134323.json


In [9]:
plotter.plot_scores()

### Results analysis

In [10]:
sent = "I am taiwanese but I live in Cambodia."
# sent = input("Enter sentence: ")
model.predict(InputSample(full_text=sent))

['O', 'O', 'LOCATION', 'O', 'O', 'O', 'O', 'O', 'O']

In [11]:
errors = results.model_errors

#### False positives

1. Most common false positive tokens:

In [12]:
ModelError.most_common_fp_tokens(errors)

Most common false positive tokens:
[('Southern', 5),
 ('Entertainment', 2),
 ('Weekly', 2),
 ('53650', 1),
 ('3520', 1),
 ('7188', 1),
 ('Office\\,07700', 1),
 ('8592', 1),
 ('IBAN', 1),
 ('9100', 1)]
---------------
Example sentence with each FP token:
	- The Exversion Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Tunisia (`Southern` pred as LOCATION)
	- Krisztián Szöllösy listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Szabina J Gelencsér ג€“ go figure) (`Entertainment` pred as ORGANIZATION)
	- Krisztián Szöllösy listed his top 20 songs for Entertainment Weekly and had the balls to list this song at #15. (What did he put at #1 you ask? Answer:"Tube Snake Boogie" by Szabina J Gelencsér ג€“ go figure) (`Weekly` pred as ORGANIZATION)
	- Billing address: Sara Schwarz
    28245 Puruntie 

[('Southern', 5),
 ('Entertainment', 2),
 ('Weekly', 2),
 ('53650', 1),
 ('3520', 1),
 ('7188', 1),
 ('Office\\,07700', 1),
 ('8592', 1),
 ('IBAN', 1),
 ('9100', 1)]

In [13]:
fps_df = ModelError.get_fps_dataframe(errors, entity=["LOCATION"])
fps_df[["full_text", "token", "annotation", "prediction"]]

Unnamed: 0,full_text,token,annotation,prediction
0,"The Exversion Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Tunisia",Southern,O,LOCATION
1,"The Davis, Reynolds and Williamson Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Italy",Southern,O,LOCATION
2,Mrs. Barbara Yudina Apt. 675 62314 Mellemvej 32\nAalborg NO 9100,9100,O,LOCATION
3,"The Social Health Insights Orchestra was founded in 2002. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Austria",Southern,O,LOCATION
4,"""The big three"" of The Big Three Killed My Baby are the car manufacturers that dominate the economy of the White Stripes' home city ΜΟΝΗ ΑΓΙΩΝ ΑΝΑΡΓΥΡΩΝ: Ezxbrl, Henderson, Hicks and Brown and Abt Associates. ""Don't feed me planned obsolescence,"" says Hannah McConnan in an uncharacteristically political song, lamenting the demise of the unions in the 60s.",White,O,LOCATION
5,"The Clark, Romero and Hall Orchestra was founded in 1973. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Poland",Southern,O,LOCATION
6,"The Shepherd Ltd. Orchestra was founded in 1997. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Canada",Southern,O,LOCATION


2. Most common false negative examples

In [14]:
ModelError.most_common_fn_tokens(errors, n=50)

Most common false negative tokens:
[('u.', 4),
 ('1977', 2),
 ('15', 2),
 ('APO', 2),
 ('nan', 2),
 ('Cyprus', 2),
 ('Greek', 2),
 ('52', 2),
 ('255', 2),
 ('Cite', 2),
 ('K.', 2),
 ('Godina', 2),
 ('AA', 2),
 ('Szczepańska', 2),
 ('Koskikatu', 1),
 ('25', 1),
 ('Exversion', 1),
 ('D.', 1),
 ('Yefremova', 1),
 ('Christiansen', 1),
 ('Schwarz', 1),
 ('82', 1),
 ('595', 1),
 ('LAPPEENRANTA', 1),
 ('Ubul', 1),
 ('Nicole', 1),
 ('Mary', 1),
 ('John', 1),
 ('Fanucci', 1),
 ('Szöllössy', 1),
 ('Allika', 1),
 ('46', 1),
 ('501', 1),
 ('Vasquez', 1),
 ('1634', 1),
 ('Hodge', 1),
 ('Propublica', 1),
 ('Király', 1),
 ('0413', 1),
 ('8144', 1),
 ('680', 1),
 ('ΛΕΥΚΩΣΙΑ', 1),
 ('11:34:35', 1),
 ('Ryan', 1),
 ('Canada', 1),
 ('Meza', 1),
 ('Mette', 1),
 ('Katrine', 1),
 ('Estonia', 1),
 ('M.', 1)]
---------------
Example sentence with each FN token:
	- The Propublica office is at Růžena and Király u. 15. (`u.` annotated as LOCATION)
	- The Exversion Orchestra was founded in 1977. Since then, it has

[('u.', 4),
 ('1977', 2),
 ('15', 2),
 ('APO', 2),
 ('nan', 2),
 ('Cyprus', 2),
 ('Greek', 2),
 ('52', 2),
 ('255', 2),
 ('Cite', 2),
 ('K.', 2),
 ('Godina', 2),
 ('AA', 2),
 ('Szczepańska', 2),
 ('Koskikatu', 1),
 ('25', 1),
 ('Exversion', 1),
 ('D.', 1),
 ('Yefremova', 1),
 ('Christiansen', 1),
 ('Schwarz', 1),
 ('82', 1),
 ('595', 1),
 ('LAPPEENRANTA', 1),
 ('Ubul', 1),
 ('Nicole', 1),
 ('Mary', 1),
 ('John', 1),
 ('Fanucci', 1),
 ('Szöllössy', 1),
 ('Allika', 1),
 ('46', 1),
 ('501', 1),
 ('Vasquez', 1),
 ('1634', 1),
 ('Hodge', 1),
 ('Propublica', 1),
 ('Király', 1),
 ('0413', 1),
 ('8144', 1),
 ('680', 1),
 ('ΛΕΥΚΩΣΙΑ', 1),
 ('11:34:35', 1),
 ('Ryan', 1),
 ('Canada', 1),
 ('Meza', 1),
 ('Mette', 1),
 ('Katrine', 1),
 ('Estonia', 1),
 ('M.', 1)]

More FN analysis

In [15]:
fns_df = ModelError.get_fns_dataframe(errors, entity=["IP_ADDRESS"])

No errors of type FN and entity ['IP_ADDRESS'] were found


In [16]:
fns_df[["full_text", "token", "annotation", "prediction"]]

TypeError: 'NoneType' object is not subscriptable

In [None]:
print("All errors:\n")
[print(error, "\n") for error in errors]