In [3]:
import json
from collections import Counter
from pathlib import Path
from pprint import pprint
from typing import Dict, List

import pandas as pd

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.experiment_tracking import get_experiment_tracker
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.models import transformers_model

# Lift pandas' display limits (column count, row count, cell width) so that
# frames shown in this notebook are not truncated.
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth"):
    pd.set_option(display_option, None)

# Load (or reload, if already loaded) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell executes, so edits to
# imported source files are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Read the evaluation samples from the "data modification" folder that sits
# next to the notebook's working directory.
dataset_name = "test_data.json"
dataset_path = Path.cwd().parent / "data modification" / dataset_name
dataset = InputSample.read_dataset_json(dataset_path)

# Report how many samples were loaded.
print(len(dataset))

tokenizing input:   0%|          | 0/8701 [00:00<?, ?it/s]

loading model en_core_web_sm


tokenizing input: 100%|██████████| 8701/8701 [01:34<00:00, 91.96it/s] 


8701


In [5]:
# Display the first loaded sample (its full text and annotated spans) as a sanity check.
dataset[0]

Full text: 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly!
Spans: [Span(type: ZIPCODE, value: 89200-3325, char_span: [0: 10]), Span(type: JOBAREA, value: Mobility, char_span: [72: 80])]

In [6]:
# Hugging Face model id of the PII model under evaluation.
MODEL_ID = "lakshyakh93/deberta_finetuned_pii"

# Set up the experiment tracker to log the experiment for reproducibility.
experiment = get_experiment_tracker()

# Wrap the model and hand it to the evaluator.
# `transformers_model` is imported in the notebook's top import cell.
model = transformers_model.transformers_model(MODEL_ID)
evaluator = Evaluator(model=model)

# Track model and dataset params: the dataset file name, the model name, and
# whatever additional parameters the model exposes through `to_log()`.
params = {"dataset_name": dataset_name, "model_name": model.name}
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset)
# NOTE: entity mappings are not logged here; no mapping dictionary is defined
# in the visible cells of this notebook.
  

  return torch.load(checkpoint_file, map_location=map_location)


In [7]:
# --- Run the experiment ---
# Evaluate the model over the whole dataset, then score the collected results.
evaluation_results = evaluator.evaluate_all(dataset)
results = evaluator.calculate_score(evaluation_results)

# Record the metrics and the confusion matrix with the experiment tracker.
experiment.log_metrics(results.to_log())
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, labels=entities)

# Build the plotter that the later plotting cells use.
plotter = evaluator.Plotter(
    model=model,
    results=results,
    output_folder=".",
    model_name=model.name,
    beta=2,
)

# Close the tracked experiment.
experiment.end()

Mapping entity values using this dictionary: lakshyakh93/deberta_finetuned_pii
Running model transformers_model on dataset...




Finished running model on dataset
saving experiment data to experiment_20241119-123335.json


In [8]:
# Plot the scores using the plotter built from the evaluation results.
plotter.plot_scores()

In [9]:
# Keep a handle on the model errors attached to the results; the error-analysis
# cells below all read from this variable.
errors = results.model_errors

In [10]:
# Report the most common false-positive tokens with an example sentence each.
# The call prints its own report and also returns the token list; the trailing
# semicolon stops the notebook from echoing that returned list a second time.
ModelError.most_common_fp_tokens(errors);

Most common false positive tokens:
[('account', 25),
 ('administrator', 11),
 ('based', 10),
 ('Dr.', 9),
 ('female', 8),
 ('wing', 8),
 ('web', 7),
 ('...', 7),
 ('group', 6),
 ('BIC', 6)]
---------------
Example sentence with each FP token:
	- Dear Buford, we observe multiple sign-in attempts for your distance learning account from this IP 150.109.6.243. (`account` pred as ACCOUNTNAME)
	- Reminder for our administrator Randal29: Please ensure that all the IT systems for the new educational centres are prepared. Send system reports to the 6ac2:e55d:6004:7f5e:c22e:1d42:be32:d2d4 for security checks. (`administrator` pred as JOBTYPE)
	- Policy updates are needed on the Liberian Dollar-based grants for the Creative higher education program. Please email a summary of the latest regulations to Dax.Hoeger72@yahoo.com. (`based` pred as CURRENCYNAME)
	- Dr. Tromp, the VIN J1PYWYG3M0KK21974 you provided for your commuter benefit is incorrect. We need the correct information in order to process

[('account', 25),
 ('administrator', 11),
 ('based', 10),
 ('Dr.', 9),
 ('female', 8),
 ('wing', 8),
 ('web', 7),
 ('...', 7),
 ('group', 6),
 ('BIC', 6)]

In [11]:
# Report the 50 most common false-negative tokens with an example sentence each.
# The call prints its own report and also returns the token list; the trailing
# semicolon stops the notebook from echoing that returned list a second time.
ModelError.most_common_fn_tokens(errors, n=50);

Most common false negative tokens:
[('County', 194),
 ('Eye', 64),
 ('color', 64),
 ('Dr.', 51),
 ('Northwest', 37),
 ('Northeast', 32),
 ('Consultant', 30),
 ('Interactions', 27),
 ('Southeast', 26),
 ('Orchestrator', 25),
 ('Assurance', 25),
 ('Facilitator', 22),
 ('LLC', 22),
 ('Intranet', 22),
 ('Grey', 22),
 ('person', 22),
 ('Southwest', 20),
 ('Liaison', 20),
 ('Female', 19),
 ('Brown', 19),
 ('Green', 18),
 ('inches', 18),
 ('Paradigm', 18),
 ('feet', 17),
 ('Functionality', 17),
 ('visa', 16),
 ('Gender', 16),
 ('Planner', 15),
 ('Mozilla/5.0', 15),
 ('Transexual', 14),
 ('1', 12),
 ('Research', 12),
 ('Marketing', 11),
 ('Cisgender', 11),
 ('Optimization', 11),
 ('Branding', 11),
 ('cm', 11),
 ('Communications', 11),
 ('Usability', 10),
 ('Account', 10),
 ('Blue', 10),
 ('compatible', 10),
 ('m', 10),
 ('8', 10),
 ('Amber', 10),
 ('4', 10),
 ('Officer', 10),
 ('Program', 10),
 ('Operations', 9),
 ('5', 9)]
---------------
Example sentence with each FN token:
	- I've arranged 

[('County', 194),
 ('Eye', 64),
 ('color', 64),
 ('Dr.', 51),
 ('Northwest', 37),
 ('Northeast', 32),
 ('Consultant', 30),
 ('Interactions', 27),
 ('Southeast', 26),
 ('Orchestrator', 25),
 ('Assurance', 25),
 ('Facilitator', 22),
 ('LLC', 22),
 ('Intranet', 22),
 ('Grey', 22),
 ('person', 22),
 ('Southwest', 20),
 ('Liaison', 20),
 ('Female', 19),
 ('Brown', 19),
 ('Green', 18),
 ('inches', 18),
 ('Paradigm', 18),
 ('feet', 17),
 ('Functionality', 17),
 ('visa', 16),
 ('Gender', 16),
 ('Planner', 15),
 ('Mozilla/5.0', 15),
 ('Transexual', 14),
 ('1', 12),
 ('Research', 12),
 ('Marketing', 11),
 ('Cisgender', 11),
 ('Optimization', 11),
 ('Branding', 11),
 ('cm', 11),
 ('Communications', 11),
 ('Usability', 10),
 ('Account', 10),
 ('Blue', 10),
 ('compatible', 10),
 ('m', 10),
 ('8', 10),
 ('Amber', 10),
 ('4', 10),
 ('Officer', 10),
 ('Program', 10),
 ('Operations', 9),
 ('5', 9)]

In [17]:
# Collect the false negatives for the CREDITCARDNUMBER entity into a DataFrame.
fns_df = ModelError.get_fns_dataframe(errors, entity=["CREDITCARDNUMBER"])

In [18]:
# Show only the columns needed to read each miss: the sentence, the token,
# its annotation and the model's prediction.
fns_df.loc[:, ["full_text", "token", "annotation", "prediction"]]

Unnamed: 0,full_text,token,annotation,prediction
0,"I need to order a new EKG machine. The invoice should be sent to our building number, 841 and make sure they charge it to the company card (6036994285533224).",6036994285533224,CREDITCARDNUMBER,O
1,"Having paid for your mindfulness therapy sessions via your credit card (7545941043284555), your receipt will be emailed to Dawson_Hyatt63@yahoo.com.",7545941043284555,CREDITCARDNUMBER,O
2,"In need of bankruptcy assistance. Name is Malika24, used to work as a International Directives Consultant. Unable to clear my credit card (4169787547142829) debt, now contemplating bankruptcy. Please guide.",4169787547142829,CREDITCARDNUMBER,O
3,"In Health Informatics, we need to carefully handle all unique identifiers, such as vehicle details (GX66RMK, 0SNZZ81FWMKZ56406) or financial identifiers (1308363720032284, diners_club). It's our responsibility to secure and anonymize this data.",1308363720032284,CREDITCARDNUMBER,O
4,"Ms.. Rolfson, we appreciate your valuable work in the Research of Cardiology. To process the payment for your services, we need your american_express credit card number (9481981196784543) and CVV (443). You can reach me at 56-071529-222898-3 for any queries.",9481981196784543,CREDITCARDNUMBER,O
5,Purchase resources required for the Hadleyland project using the company card (8542404393455399). The budget allocated is 441.39k Taka.,8542404393455399,CREDITCARDNUMBER,O
6,"Rest assured Cisgender woman, all the details you've shared (3512698105233176, 1ebd:bbde:6b34:3cf5:c0ee:2fab:a3ce:c978) are stored securely to ensure your privacy.",3512698105233176,CREDITCARDNUMBER,O
7,"We plan to allow users to make donations towards our Global Education fund. They can use credit cards (2148279638648395, 3113140070520773), Crypto wallets (3TVAAwZmUbgzxCj2qJrkYBUFZd1boG3, 0xeadabf7ede9ddd2db97da3d12b7abafde2d5e2aa) or other digital methods.",2148279638648395,CREDITCARDNUMBER,O
8,"Imani.Nolan16 here, an Global Intranet Orchestrator residing in Quebec. Unfortunately, I landed in strong financial crosswinds and am unable to pay off my credit card (5912797576245347) debt. Looking to declare bankruptcy.",5912797576245347,CREDITCARDNUMBER,O
9,"I am Delaney_Hills96, used to work as an Legacy Response Specialist in Bavaria. Sadly, lost my job and struggling job and have been struggling with credit card (5497785126603439) debt. Needing to file bankruptcy, please advise.",5497785126603439,CREDITCARDNUMBER,O


In [1]:
# Print every model error, separated by blank lines.
# A plain loop is used instead of a list comprehension so the cell does not
# build (and echo) a throwaway list of None values.
# Requires `errors` from the earlier cell (errors = results.model_errors).
print("All errors:\n")
for error in errors:
    print(error, "\n")

All errors:



NameError: name 'errors' is not defined

In [15]:
# Plot the confusion matrix; `entities` and `confmatrix` come from
# results.to_confusion_matrix() in the evaluation cell.
plotter.plot_confusion_matrix(entities=entities, confmatrix=confmatrix)