In [13]:
from pathlib import Path
from pprint import pprint
from collections import Counter
from typing import Dict, List
import json
from copy import deepcopy

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [14]:
dataset_name = "synth_dataset_v2.json"
dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, "data", dataset_name))
dataset = Evaluator.align_entity_types(
    deepcopy(dataset), entities_mapping=PresidioAnalyzerWrapper.presidio_entities_map
)
print(len(dataset))

tokenizing input: 100%|██████████| 1500/1500 [00:30<00:00, 49.56it/s]


1500


In [15]:

presidio_entities_map = dict(
  FIRSTNAME=  "PERSON",
  LASTNAME = "PERSON",
  MIDDLENAME="PERSON",
  FULLNAME = "PERSON",
  PATIENT="PERSON",
  STAFF="PERSON",
  PER="PERSON",
  HCW="PERSON",
  PERSON = "PERSON",



  DATE="DATE_TIME",
  TIME="DATE_TIME",
  DOB="DATE_TIME" ,
  DATE_TIME = "DATE_TIME",

  EMAIL="EMAIL_ADDRESS",
  EMAIL_ADDRESS="EMAIL_ADDRESS",

  PREFIX="TITLE",
  TITLE = "TITLE",

  URL="URL",

  STREET="LOCATION",
  STATE="LOCATION" , 
  CITY="LOCATION" , 
  COUNTY="LOCATION",
  SECONDARYADDRESS="LOCATION" ,
  STREETADDRESS = "LOCATION",
  STREET_ADDRESS ="LOCATION",
  FACILITY="LOCATION",
  LOC= "LOCATION",
  GPE="LOCATION",

  LOCATION = "LOCATION",

  PHONEIMEI="PHONE_NUMBER",
  PHONENUMBER="PHONE_NUMBER",
  PHONE="PHONE_NUMBER",
  PHONE_NUMBER = "PHONE_NUMBER",

  IPV4="IP_ADDRESS",
  IPV6="IP_ADDRESS",
  IP="IP_ADDRESS",
  IP_ADDRESS = "IP_ADDRESS",

  CREDITCARDNUMBER="CREDIT_CARD",
  CREDIT_CARD = "CREDIT_CARD",

  ZIPCODE="ZIP_CODE",
  ZIP_CODE ="ZIP_CODE",

  COMPANYNAME="ORGANIZATION",
  HOSP="ORGANIZATION",
  PATORG="ORGANIZATION",
  HOSPITAL="ORGANIZATION",
  ORG="ORGANIZATION",
  ORGANIZATION= "ORGANIZATION",

  IBAN="IBAN_CODE",
  IBAN_CODE = "IBAN_CODE",

  SSN="US_SSN",
  US_SSN = "US_SSN",

  AGE="AGE",

  NORP="NRP",

  ID="ID",

  

  


  AMOUNT="O",
  USERNAME="O",
  JOBTITLE="O",
  JOBAREA="O",
  ACCOUNTNAME="O",
  ACCOUNTNUMBER="O",
  JOBTYPE="O",
  BUILDINGNUMBER="O" ,
  CURRENCYSYMBOL="O" ,
  PASSWORD="O",
  SEX="O",
  GENDER="O",
  BITCOINADDRESS="O",
  MASKEDNUMBER="O",
  USERAGENT="O",
  CURRENCY="O",
  ETHEREUMADDRESS="O",
  NEARBYGPSCOORDINATE="O",
  CREDITCARDISSUER="O",
  ORDINALDIRECTION="O",
  MAC="O" ,
  VEHICLEVRM="O",
  EYECOLOR="O",
  CREDITCARDCVV="O",
  HEIGHT="O" ,
  LITECOINADDRESS="O",
  VEHICLEVIN="O" ,
  CURRENCYCODE="O",
  CURRENCYNAME="O" ,
  BIC="O",
  PIN="O",
  O= "O",

)



entities_mapping = presidio_entities_map



In [16]:
from presidio_evaluator.models import transformers_model
#  Set up the experiment tracker to log the experiment for reproducibility
experiment = get_experiment_tracker()
 
model = transformers_model.transformers_model(entity_mapping=entities_mapping,model_name="lakshyakh93/deberta_finetuned_pii")
# Create the evaluator object
evaluator = Evaluator(model=model)


# Track model and dataset params
params = {"dataset_name": dataset_name, "model_name": model.name}
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset)
#experiment.log_parameter("entity_mappings", json.dumps(entities_mapping))

In [17]:
## Run experiment

evaluation_results = evaluator.evaluate_all(dataset)
results = evaluator.calculate_score(evaluation_results)

# Track experiment results
experiment.log_metrics(results.to_log())
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, 
                                labels=entities)

# Plot output
plotter = evaluator.Plotter(model=model, 
                            results=results, 
                            output_folder = ".", 
                            model_name = model.name, 
                            beta = 2)


# end experiment
experiment.end()

Mapping entity values using this dictionary: {'FIRSTNAME': 'PERSON', 'LASTNAME': 'PERSON', 'MIDDLENAME': 'PERSON', 'FULLNAME': 'PERSON', 'PATIENT': 'PERSON', 'STAFF': 'PERSON', 'PER': 'PERSON', 'HCW': 'PERSON', 'PERSON': 'PERSON', 'DATE': 'DATE_TIME', 'TIME': 'DATE_TIME', 'DOB': 'DATE_TIME', 'DATE_TIME': 'DATE_TIME', 'EMAIL': 'EMAIL_ADDRESS', 'EMAIL_ADDRESS': 'EMAIL_ADDRESS', 'PREFIX': 'TITLE', 'TITLE': 'TITLE', 'URL': 'URL', 'STREET': 'LOCATION', 'STATE': 'LOCATION', 'CITY': 'LOCATION', 'COUNTY': 'LOCATION', 'SECONDARYADDRESS': 'LOCATION', 'STREETADDRESS': 'LOCATION', 'STREET_ADDRESS': 'LOCATION', 'FACILITY': 'LOCATION', 'LOC': 'LOCATION', 'GPE': 'LOCATION', 'LOCATION': 'LOCATION', 'PHONEIMEI': 'PHONE_NUMBER', 'PHONENUMBER': 'PHONE_NUMBER', 'PHONE': 'PHONE_NUMBER', 'PHONE_NUMBER': 'PHONE_NUMBER', 'IPV4': 'IP_ADDRESS', 'IPV6': 'IP_ADDRESS', 'IP': 'IP_ADDRESS', 'IP_ADDRESS': 'IP_ADDRESS', 'CREDITCARDNUMBER': 'CREDIT_CARD', 'CREDIT_CARD': 'CREDIT_CARD', 'ZIPCODE': 'ZIP_CODE', 'ZIP_CODE':

In [18]:
plotter.plot_scores()

In [19]:
errors = results.model_errors

In [20]:
ModelError.most_common_fp_tokens(errors)

Most common false positive tokens:
[('...', 27),
 ('Southern', 12),
 ('mail', 10),
 ('3', 9),
 ('2', 9),
 ('1', 9),
 ('liftoff', 9),
 ('Sir', 9),
 ('bot', 9),
 ('Phone', 9)]
---------------
Example sentence with each FP token:
	- 3... 2... 1... liftoff! (`...` pred as NUMBER)
	- The Exversion Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Tunisia (`Southern` pred as LOCATION)
	- Julieta S Almeida

Staff development specialist

Personal Info:
Phone:
467 3395

E-mail:
JulietaSouzaAlmeida@cuvox.de

Website:
https://www.InvestmentInstructor.be/

Address:
230 107 Igias Street Suite 503 Armenomonastiro Cyprus. (`mail` pred as PHONE_NUMBER)
	- 3... 2... 1... liftoff! (`3` pred as NUMBER)
	- 3... 2... 1... liftoff! (`2` pred as NUMBER)
	- 3... 2... 1... liftoff! (`1` pred as NUMBER)
	- 3... 2... 1... liftoff! (`liftoff` pred as NUMBER)
	- Excuse me, Sir bot, but I really don't like this tone (`Sir`

[('...', 27),
 ('Southern', 12),
 ('mail', 10),
 ('3', 9),
 ('2', 9),
 ('1', 9),
 ('liftoff', 9),
 ('Sir', 9),
 ('bot', 9),
 ('Phone', 9)]

In [21]:
ModelError.most_common_fn_tokens(errors, n=50)

Most common false negative tokens:
[('United', 11),
 ('Greek', 9),
 ('nan', 8),
 ('DPO', 8),
 ('Kingdom', 7),
 ('AP', 7),
 ('Cyprus', 7),
 ('States', 7),
 ('France', 7),
 ('33', 7),
 ('Sweden', 7),
 ('Canada', 6),
 ('Norway', 6),
 ('American', 6),
 ('Republic', 5),
 ('46', 5),
 ('53', 5),
 ('Corporation', 4),
 ('Estonia', 4),
 ('Austria', 4),
 ('Finland', 4),
 ('Tuesday', 4),
 ('ΠΑΦΟΣ', 4),
 ('2004', 4),
 ('21', 4),
 ('78', 4),
 ('Technologies', 4),
 ('32', 4),
 ('Greenlander', 4),
 ('Germany', 4),
 ('Russian', 4),
 ('61', 4),
 ('Data', 4),
 ('80', 4),
 ('LLC', 4),
 ('65', 4),
 ('64', 4),
 ('AE', 4),
 ('Czech', 4),
 ('Research', 4),
 ('Thursday', 4),
 ('Consulting', 4),
 ('1977', 3),
 ('79', 3),
 ('74', 3),
 ('60', 3),
 ('85', 3),
 ('1971', 3),
 ('63', 3),
 ('Matthew', 3)]
---------------
Example sentence with each FN token:
	- card number 347415977307943 is lost, can you please send a new one to 14 Crown Street Kishiev Squares
 Suite 321
 LONDON
 United Kingdom 75419? I am in Sutri fo

[('United', 11),
 ('Greek', 9),
 ('nan', 8),
 ('DPO', 8),
 ('Kingdom', 7),
 ('AP', 7),
 ('Cyprus', 7),
 ('States', 7),
 ('France', 7),
 ('33', 7),
 ('Sweden', 7),
 ('Canada', 6),
 ('Norway', 6),
 ('American', 6),
 ('Republic', 5),
 ('46', 5),
 ('53', 5),
 ('Corporation', 4),
 ('Estonia', 4),
 ('Austria', 4),
 ('Finland', 4),
 ('Tuesday', 4),
 ('ΠΑΦΟΣ', 4),
 ('2004', 4),
 ('21', 4),
 ('78', 4),
 ('Technologies', 4),
 ('32', 4),
 ('Greenlander', 4),
 ('Germany', 4),
 ('Russian', 4),
 ('61', 4),
 ('Data', 4),
 ('80', 4),
 ('LLC', 4),
 ('65', 4),
 ('64', 4),
 ('AE', 4),
 ('Czech', 4),
 ('Research', 4),
 ('Thursday', 4),
 ('Consulting', 4),
 ('1977', 3),
 ('79', 3),
 ('74', 3),
 ('60', 3),
 ('85', 3),
 ('1971', 3),
 ('63', 3),
 ('Matthew', 3)]

In [22]:
print("All errors:\n")
[print(error, "\n") for error in errors]

All errors:

type: Wrong entity, Annotation = ORGANIZATION, prediction = PERSON, Token = Persint, Full text = The address of Persint is 6750 Koskikatu 25 Apt. 864
Artilleros
, CO
 Uruguay 64677, Metadata = None 

type: FN, Annotation = LOCATION, prediction = O, Token = CO, Full text = The address of Persint is 6750 Koskikatu 25 Apt. 864
Artilleros
, CO
 Uruguay 64677, Metadata = None 

type: Wrong entity, Annotation = LOCATION, prediction = ZIP_CODE, Token = 64677, Full text = The address of Persint is 6750 Koskikatu 25 Apt. 864
Artilleros
, CO
 Uruguay 64677, Metadata = None 

type: FN, Annotation = ORGANIZATION, prediction = O, Token = Exversion, Full text = The Exversion Orchestra was founded in 1977. Since then, it has grown from a volunteer community orchestra to a fully professional orchestra serving Southern Tunisia, Metadata = None 

type: FN, Annotation = DATE_TIME, prediction = O, Token = 1977, Full text = The Exversion Orchestra was founded in 1977. Since then, it has grown 

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [23]:
plotter.plot_confusion_matrix(entities=entities, confmatrix=confmatrix)

In [25]:
errors = results.model_errors

In [27]:
fns_df = ModelError.get_fns_dataframe(errors, entity=["LOCATION"])
fns_df[["full_text", "token", "annotation", "prediction"]][:20]

Unnamed: 0,full_text,token,annotation,prediction
0,"The address of Persint is 6750 Koskikatu 25 Apt. 864\nArtilleros\n, CO\n Uruguay 64677",CO,LOCATION,O
1,"I will be travelling to Canada next week, so I need my passport to be ready by then",Canada,LOCATION,O
2,William Hughes\n\n20789 Allika 46\n Suite 501\n Riisa\n\n Estonia 62488,Riisa,LOCATION,O
3,"Tomomi Nishiyama lives at 86036 Rua do Arenque 1634, Goiânia",1634,LOCATION,O
4,"The Avalara office is at PSC 0413, Box 8144\nAPO AA 42323",42323,LOCATION,O
5,"card number 630427373398 is lost, can you please send a new one to 1987 74 Diakou Street\n Suite 680\n Kissousa\n Cyprus 36903? I am in ΛΕΥΚΩΣΙΑ for a business trip",1987,LOCATION,O
6,"card number 630427373398 is lost, can you please send a new one to 1987 74 Diakou Street\n Suite 680\n Kissousa\n Cyprus 36903? I am in ΛΕΥΚΩΣΙΑ for a business trip",ΛΕΥΚΩΣΙΑ,LOCATION,O
7,"I'm Naomi Ryan, originally from ΕΓΚΩΜΗ, and i'm 31 y/o.",ΕΓΚΩΜΗ,LOCATION,O
8,"It may be too that Kaczmarek was influenced by an earlier song, ""Carry Me Back To Canada,"" which was arranged and sung by Bonifacy Kaczmarek in 1977 (though Meza's song was actually about a boat!).",Canada,LOCATION,O
9,Estonia was super fun to visit!,Estonia,LOCATION,O


In [31]:
from presidio_evaluator.models import transformers_model
#  Set up the experiment tracker to log the experiment for reproducibility
experiment = get_experiment_tracker()
 
model = transformers_model.transformers_model(entity_mapping=entities_mapping,model_name="lakshyakh93/deberta_finetuned_pii" , aggregation_strategy='max')
# Create the evaluator object
evaluator = Evaluator(model=model)


# Track model and dataset params
params = {"dataset_name": dataset_name, "model_name": model.name}
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset)
#experiment.log_parameter("entity_mappings", json.dumps(entities_mapping))


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues relate

In [32]:
## Run experiment

evaluation_results = evaluator.evaluate_all(dataset)
results = evaluator.calculate_score(evaluation_results)

# Track experiment results
experiment.log_metrics(results.to_log())
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, 
                                labels=entities)

# Plot output
plotter = evaluator.Plotter(model=model, 
                            results=results, 
                            output_folder = ".", 
                            model_name = model.name, 
                            beta = 2)


# end experiment
experiment.end()

Mapping entity values using this dictionary: {'FIRSTNAME': 'PERSON', 'LASTNAME': 'PERSON', 'MIDDLENAME': 'PERSON', 'FULLNAME': 'PERSON', 'PATIENT': 'PERSON', 'STAFF': 'PERSON', 'PER': 'PERSON', 'HCW': 'PERSON', 'PERSON': 'PERSON', 'DATE': 'DATE_TIME', 'TIME': 'DATE_TIME', 'DOB': 'DATE_TIME', 'DATE_TIME': 'DATE_TIME', 'EMAIL': 'EMAIL_ADDRESS', 'EMAIL_ADDRESS': 'EMAIL_ADDRESS', 'PREFIX': 'TITLE', 'TITLE': 'TITLE', 'URL': 'URL', 'STREET': 'LOCATION', 'STATE': 'LOCATION', 'CITY': 'LOCATION', 'COUNTY': 'LOCATION', 'SECONDARYADDRESS': 'LOCATION', 'STREETADDRESS': 'LOCATION', 'STREET_ADDRESS': 'LOCATION', 'FACILITY': 'LOCATION', 'LOC': 'LOCATION', 'GPE': 'LOCATION', 'LOCATION': 'LOCATION', 'PHONEIMEI': 'PHONE_NUMBER', 'PHONENUMBER': 'PHONE_NUMBER', 'PHONE': 'PHONE_NUMBER', 'PHONE_NUMBER': 'PHONE_NUMBER', 'IPV4': 'IP_ADDRESS', 'IPV6': 'IP_ADDRESS', 'IP': 'IP_ADDRESS', 'IP_ADDRESS': 'IP_ADDRESS', 'CREDITCARDNUMBER': 'CREDIT_CARD', 'CREDIT_CARD': 'CREDIT_CARD', 'ZIPCODE': 'ZIP_CODE', 'ZIP_CODE':


Tokenizer does not support real words, using fallback heuristic



Finished running model on dataset
saving experiment data to experiment_20241119-225354.json


In [30]:
plotter.plot_scores()