In [21]:
from pathlib import Path
from pprint import pprint
from collections import Counter
from typing import Dict, List
import json

from presidio_evaluator import InputSample
from presidio_evaluator.evaluation import Evaluator, ModelError
from presidio_evaluator.models import PresidioAnalyzerWrapper
from presidio_evaluator.experiment_tracking import get_experiment_tracker

import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [22]:
dataset_name = "data2.json"
dataset = InputSample.read_dataset_json(Path(Path.cwd().parent, "data modification", dataset_name))

print(len(dataset))

tokenizing input: 100%|██████████| 15/15 [00:00<00:00, 40.69it/s]

15





In [23]:
dataset[0]

Full text: 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly!
Spans: [Span(type: ZIPCODE, value: 89200-3325, char_span: [0: 10]), Span(type: JOBAREA, value: Mobility, char_span: [72: 80])]

In [10]:
from presidio_evaluator.models import transformers_model
#  Set up the experiment tracker to log the experiment for reproducibility
experiment = get_experiment_tracker()
 
model = transformers_model.transformers_deberta_finetuned_pii()
# Create the evaluator object
evaluator = Evaluator(model=model)


# Track model and dataset params
params = {"dataset_name": dataset_name, "model_name": model.name}
params.update(model.to_log())
experiment.log_parameters(params)
experiment.log_dataset_hash(dataset)
#experiment.log_parameter("entity_mappings", json.dumps(entities_mapping))
  

In [11]:
## Run experiment

evaluation_results = evaluator.evaluate_all(dataset)
results = evaluator.calculate_score(evaluation_results)

# Track experiment results
experiment.log_metrics(results.to_log())
entities, confmatrix = results.to_confusion_matrix()
experiment.log_confusion_matrix(matrix=confmatrix, 
                                labels=entities)

# Plot output
plotter = evaluator.Plotter(model=model, 
                            results=results, 
                            output_folder = ".", 
                            model_name = model.name, 
                            beta = 2)


# end experiment
experiment.end()

Running model transformers_deberta_finetuned_pii on dataset...




Finished running model on dataset
saving experiment data to experiment_20241109-180619.json


In [12]:
plotter.plot_scores()

In [13]:
errors = results.model_errors

In [14]:
ModelError.most_common_fp_tokens(errors)

Most common false positive tokens:
[('Mobility', 1),
 ('BND', 1),
 ('Turkish', 1),
 ('Lira', 1),
 ('FJD', 1),
 ('257k', 1),
 ('a4O0ARjeKE9', 1),
 ('G', 1),
 ('0544', 1),
 ('fees', 1)]
---------------
Example sentence with each FP token:
	- 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly! (`Mobility` pred as JOBAREA)
	- We need funding for new mathematics textbooks. Donations in BND or Turkish Lira are highly appreciated. Contact c2cb:5d6b:0e1e:2afc:de76:64a6:3ea5:eeab for more details. (`BND` pred as CURRENCYCODE)
	- We need funding for new mathematics textbooks. Donations in BND or Turkish Lira are highly appreciated. Contact c2cb:5d6b:0e1e:2afc:de76:64a6:3ea5:eeab for more details. (`Turkish` pred as CURRENCYNAME)
	- We need funding for new mathematics textbooks. Donations in BND or Turkish Lira are highly appreciated. Contact c2cb:5d6b:0e1e:2afc:de76:64a6:3ea5:eeab for more details. (`Lira` pred as CURRENCYNAME)
	- "To guara

[('Mobility', 1),
 ('BND', 1),
 ('Turkish', 1),
 ('Lira', 1),
 ('FJD', 1),
 ('257k', 1),
 ('a4O0ARjeKE9', 1),
 ('G', 1),
 ('0544', 1),
 ('fees', 1)]

In [15]:
ModelError.most_common_fn_tokens(errors, n=50)

Most common false negative tokens:
[('Cronin', 1), ('Course', 1), ('6036994285533224', 1)]
---------------
Example sentence with each FN token:
	- Our records mention that you have recently changed your Cronin Course and Suite 737. As we send weekly alerts and therapy materials to your location, we ask you to confirm this change. (`Cronin` annotated as LOCATION)
	- Our records mention that you have recently changed your Cronin Course and Suite 737. As we send weekly alerts and therapy materials to your location, we ask you to confirm this change. (`Course` annotated as LOCATION)
	- I need to order a new EKG machine. The invoice should be sent to our building number, 841 and make sure they charge it to the company card (6036994285533224). (`6036994285533224` annotated as CREDIT_CARD)


[('Cronin', 1), ('Course', 1), ('6036994285533224', 1)]

In [17]:
fns_df = ModelError.get_fns_dataframe(errors, entity=["IP"])

No errors of type FN and entity ['IP'] were found


In [18]:
fns_df[["full_text", "token", "annotation", "prediction"]]

TypeError: 'NoneType' object is not subscriptable

In [19]:
print("All errors:\n")
[print(error, "\n") for error in errors]

All errors:

type: Wrong entity, Annotation = ZIP_CODE, prediction = ZIPCODE, Token = 89200, Full text = 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly!, Metadata = None 

type: Wrong entity, Annotation = ZIP_CODE, prediction = ZIPCODE, Token = 3325, Full text = 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly!, Metadata = None 

type: FP, Annotation = O, prediction = JOBAREA, Token = Mobility, Full text = 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly!, Metadata = None 

type: Wrong entity, Annotation = PERSON, prediction = FIRSTNAME, Token = Jessyca, Full text = Jessyca, you should compare our performance to the industry averages. This includes leads, conversion rates, bounce rates, page views, average spend per customer, and customer acquisition costs. Send a report to Roosevelt_Kshlerin@yahoo.com., Metadata

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

[Full text: 89200-3325 schools are next in line for education reform pilot program. Mobility team, prepare accordingly!
 Spans: [Span(type: ZIP_CODE, value: 89200-3325, char_span: [0: 10]), Span(type: O, value: Mobility, char_span: [72: 80])],
 Full text: Jessyca, you should compare our performance to the industry averages. This includes leads, conversion rates, bounce rates, page views, average spend per customer, and customer acquisition costs. Send a report to Roosevelt_Kshlerin@yahoo.com.
 Spans: [Span(type: PERSON, value: Jessyca, char_span: [0: 7]), Span(type: EMAIL_ADDRESS, value: Roosevelt_Kshlerin@yahoo.com, char_span: [212: 240])],
 Full text: We need funding for new mathematics textbooks. Donations in BND or Turkish Lira are highly appreciated. Contact c2cb:5d6b:0e1e:2afc:de76:64a6:3ea5:eeab for more details.
 Spans: [Span(type: O, value: BND, char_span: [60: 63]), Span(type: O, value: Turkish Lira, char_span: [67: 79]), Span(type: IP_ADDRESS, value: c2cb:5d6b:0e1e:2afc:de76