In [25]:
# Install Presidio via pip if it is not yet installed.
# Uncomment the lines below to run them; `%pip` (rather than `!pip`) installs
# the packages into the environment of the running kernel.

#%pip install presidio-analyzer
#%pip install presidio-evaluator

In [1]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import pandas as pd
import numpy as np

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioSentenceFaker
from faker.providers import BaseProvider

  from .autonotebook import tqdm as notebook_tqdm


# Generate fake PII data using the Presidio Sentence Faker

The Presidio Sentence Faker enables you to generate a synthetic dataset from sentence templates.
Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray

### Simple example for German sentences

Steps:
1) Preprocess dataset
2) Initialize dependencies
3) Generate sample sentences

#### 1) Preprocess dataset
Renames the columns of custom entities so that their names are lower case.

Example:
- `Vehicle` → `vehicle`


In [2]:
# Lower-cases the name of one selected column (needed for custom entities)
def column_to_lower(column_name:str, records: pd.DataFrame):
    """Rename a single column of ``records`` to its lower-case spelling.

    The DataFrame is modified in place and nothing is returned.

    Args:
        column_name: Current label of the column to rename, e.g. ``"Vehicle"``.
        records: DataFrame whose column label is rewritten.
    """
    lowered_name = column_name.lower()
    records.rename(columns={column_name: lowered_name}, inplace=True)

#### 2) Initialize the needed dependencies

In [3]:
# German sentence templates: one per line, surrounding whitespace removed,
# blank lines dropped.
with open("templates/german_templates_v2.txt", "r", encoding="utf-8") as f:
    german_templates = list(filter(None, map(str.strip, f)))

# Start from the default entity mapping and add our custom entity on top
german_entity_mapping = {
    **PresidioSentenceFaker.ENTITY_TYPE_MAPPING,
    "vehicle": "AUT_LICENSE_PLATE",  # Map to Presidio Entity Type
}

# Load the German PII records; the custom "Vehicle" column title is lower-cased
# in place by the helper.
german_records = pd.read_csv("data/austrian_pii_dataset.csv")
column_to_lower("Vehicle", german_records)

german_records.head()

Unnamed: 0,NameSet,Number,GivenName,MiddleInitial,Surname,StreetAddress,City,StateFull,ZipCode,Country,...,Birthday,Age,NationalID,Company,vehicle,Domain,GUID,Latitude,Longitude,Title
0,Austrian,0,Joshua,E,Hosp,Schandlring 6/9,Köflach,Steiermark,5319,AT,...,01/26/1944,80,240-05-3938,Böhler Ltd,O 37660 GZ,hinteregger-stocker.co.at,627900e1-7982-4807-9917-2d162f937d5b,81.463266,-59.257238,Dr.
1,Austrian,1,Ina,F,Ofner,Liliana-Salzmann-Platz 0,Bischofshofen,Oberösterreich,3463,AT,...,01/18/1942,82,347-89-5908,Strohmeier Group,L 68011 LJ,wallner-thurner.com,c630a082-3553-445b-a0d2-8e9c94e6adda,-84.80337,-77.110354,Ing.
2,Austrian,2,Ajna,A,Schrempf,Aaron-Zechner-Weg 366,Lilienfeld,Oberösterreich,4923,AT,...,06/05/1943,81,243-56-7343,Jäger-Gstrein,B 22003 QF,rabitsch.co.at,f483e0da-5442-47d2-a2b3-5db050b624dd,-63.573471,68.746492,Frau
3,Austrian,3,Lukas,U,Brandner,Larissa-Fasching-Ring 5/3,Bad Ischl,Oberösterreich,4565,AT,...,05/30/1942,82,050-84-4562,"Loibl, Graf and Schweitzer",T 4601 QS,kirchner-zoehrer.org,61eff7f4-58fc-458e-b3ef-8956841aeb3d,57.89946,123.404391,Ing.
4,Austrian,4,Lara-Sophie,L,Windisch,Reinischstr. 0,Leoben,Niederösterreich,9365,AT,...,06/18/2000,24,159-24-6956,Eberhard LLC,V 45528 EW,kahr-frank.co.at,b6461253-7eb1-4f85-a241-e5cc8a6ca6d0,47.395934,-95.218662,Herr


#### 3) Generate some fake sentences

In [4]:
# Build a sentence faker from the German templates, the base records and the
# extended entity mapping.
sentence_faker = PresidioSentenceFaker(
    locale="de_DE",
    lower_case_ratio=0.5,
    base_records=german_records,
    sentence_templates=german_templates,
    entity_type_mapping=german_entity_mapping,
)

# Generate a small batch of fake sentences
fake_german_sentence_results = sentence_faker.generate_new_fake_sentences(10)

# Show the first sample: its masked template, then its spans (one per line)
print(
    fake_german_sentence_results[0].masked,
    fake_german_sentence_results[0].spans,
    sep="\n",
)

Using default entity providers
Using default provider aliases


Sampling: 100%|██████████| 10/10 [00:00<00:00, 2204.16it/s]

Das neu angeschaffte Hybridfahrzeug mit dem amtlichen Kennzeichen {{AUT_LICENSE_PLATE}} ist ab sofort über das digitale Buchungssystem für dienstliche Fahrten reservierbar.
[Span(type: AUT_LICENSE_PLATE, value: k 51592 aw, char_span: [66: 76])]





### Generate a full German dataset

In this example we generate a large dataset with multiple entity types and save it in JSON and CoNLL-2003 formats.

Steps: 
1) Prepare the output files and number of samples
2) Generate the dataset
3) Analyze
4) Write to output

#### 1) Prepare the output files, the number of samples, and the faker

In [5]:
# Number of sentences to generate for the full dataset
number_of_samples = 1500

# NOTE(review): `lower_case_ratio` and `locale` are defined here, but the faker
# constructed in the next cell passes its own literals (0.5 and "de_DE")
# instead of these names - confirm which values are intended.
lower_case_ratio = 0.05
locale = "de"

# Today's date as <month name>_<day>_<year>; used to stamp the output file names
cur_time = format(datetime.date.today(), "%B_%d_%Y")

# Target paths for the JSON and the CoNLL (tab-separated) exports
output_file = f"data/generated_size_{number_of_samples}_date_{cur_time}.json"
output_conll = f"data/generated_size_{number_of_samples}_date_{cur_time}.tsv"

In [6]:
# Adjusted final faker
# NOTE(review): this call passes the same arguments as the faker created in the
# simple example above, and it hard-codes locale / lower_case_ratio rather than
# using the `locale` and `lower_case_ratio` values set in the previous cell -
# confirm which values are intended.
sentence_faker = PresidioSentenceFaker(
    locale="de_DE",
    lower_case_ratio=0.5,
    sentence_templates=german_templates,
    base_records=german_records,
    entity_type_mapping=german_entity_mapping)

Using default entity providers
Using default provider aliases


In [7]:
# Preview the first rows of the records held by the faker.
# NOTE(review): `_sentence_faker` is an underscore-prefixed attribute, so this
# presumably relies on library internals - verify it is stable across versions.
pd.DataFrame(sentence_faker._sentence_faker.records).head()

Unnamed: 0,NameSet,Number,GivenName,MiddleInitial,Surname,StreetAddress,City,StateFull,ZipCode,Country,...,Birthday,Age,NationalID,Company,vehicle,Domain,GUID,Latitude,Longitude,Title
0,Austrian,0,Joshua,E,Hosp,Schandlring 6/9,Köflach,Steiermark,5319,AT,...,01/26/1944,80,240-05-3938,Böhler Ltd,O 37660 GZ,hinteregger-stocker.co.at,627900e1-7982-4807-9917-2d162f937d5b,81.463266,-59.257238,Dr.
1,Austrian,1,Ina,F,Ofner,Liliana-Salzmann-Platz 0,Bischofshofen,Oberösterreich,3463,AT,...,01/18/1942,82,347-89-5908,Strohmeier Group,L 68011 LJ,wallner-thurner.com,c630a082-3553-445b-a0d2-8e9c94e6adda,-84.80337,-77.110354,Ing.
2,Austrian,2,Ajna,A,Schrempf,Aaron-Zechner-Weg 366,Lilienfeld,Oberösterreich,4923,AT,...,06/05/1943,81,243-56-7343,Jäger-Gstrein,B 22003 QF,rabitsch.co.at,f483e0da-5442-47d2-a2b3-5db050b624dd,-63.573471,68.746492,Frau
3,Austrian,3,Lukas,U,Brandner,Larissa-Fasching-Ring 5/3,Bad Ischl,Oberösterreich,4565,AT,...,05/30/1942,82,050-84-4562,"Loibl, Graf and Schweitzer",T 4601 QS,kirchner-zoehrer.org,61eff7f4-58fc-458e-b3ef-8956841aeb3d,57.89946,123.404391,Ing.
4,Austrian,4,Lara-Sophie,L,Windisch,Reinischstr. 0,Leoben,Niederösterreich,9365,AT,...,06/18/2000,24,159-24-6956,Eberhard LLC,V 45528 EW,kahr-frank.co.at,b6461253-7eb1-4f85-a241-e5cc8a6ca6d0,47.395934,-95.218662,Herr


#### 2) Generate the dataset

In [8]:
# Generate the full dataset with the configured number of samples
fake_records = sentence_faker.generate_new_fake_sentences(num_samples=number_of_samples)

# Pretty-print only the first ten samples to keep the output short
pprint.pprint(fake_records[:10])

Sampling: 100%|██████████| 1500/1500 [00:00<00:00, 3172.87it/s]

[Full text: Bei Rückfragen zur neuen Datenschutzrichtlinie steht Ihnen Olena Wilmsen als zertifizierter Datenschutzbeauftragter jederzeit telefonisch oder per E-Mail zur Verfügung.
Spans: [Span(type: PERSON, value: Olena Wilmsen, char_span: [59: 72])]
,
 Full text: Der Techniker Rebekka Hettner-Koch wird mit dem Servicefahrzeug L 44458 EC morgen zwischen 13:00 und 15:00 Uhr in Unit 8541 Box 5249
DPO AA 66549 eintreffen und ist vorab unter +41 28 982 75 85 erreichbar.
Spans: [Span(type: PHONE_NUMBER, value: +41 28 982 75 85, char_span: [177: 193]), Span(type: STREET_ADDRESS, value: Unit 8541 Box 5249
DPO AA 66549, char_span: [114: 145]), Span(type: AUT_LICENSE_PLATE, value: L 44458 EC, char_span: [64: 74]), Span(type: PERSON, value: Rebekka Hettner-Koch, char_span: [14: 34])]
,
 Full text: Zum Abholen der Messematerialien steht Ihnen morgen zwischen 8:00 und 12:00 Uhr unser Servicefahrzeug mit dem Kennzeichen K 26653 YQ vor dem Haupteingang zur Verfügung.
Spans: [Span(type: AUT_LICENSE_




#### 3) Analyze

In [9]:
# Number of generated samples per sentence template
count_per_template_id = Counter(sample.template_id for sample in fake_records)

# Summary statistics over the per-template counts, one line each
print(
    f"Total: {sum(count_per_template_id.values())}",
    f"Avg # of records per template: {np.mean(list(count_per_template_id.values()))}",
    f"Median # of records per template: {np.median(list(count_per_template_id.values()))}",
    f"Std: {np.std(list(count_per_template_id.values()))}",
    sep="\n",
)

Total: 1500
Avg # of records per template: 14.150943396226415
Median # of records per template: 14.0
Std: 4.032398164747644


##### Which entities did we generate?

In [10]:
# Tally how often each entity type occurs across all generated samples
count_per_entity = Counter()
for record in fake_records:
    # Counter.update counts every element of the iterable it is given
    count_per_entity.update(span.entity_type for span in record.spans)

count_per_entity

Counter({'PERSON': 623,
         'STREET_ADDRESS': 589,
         'AUT_LICENSE_PLATE': 422,
         'EMAIL_ADDRESS': 325,
         'PHONE_NUMBER': 302})

In [11]:
# Print the first ten generated samples, one after another
for record in fake_records[:10]:
    print(record)

Full text: Bei Rückfragen zur neuen Datenschutzrichtlinie steht Ihnen Olena Wilmsen als zertifizierter Datenschutzbeauftragter jederzeit telefonisch oder per E-Mail zur Verfügung.
Spans: [Span(type: PERSON, value: Olena Wilmsen, char_span: [59: 72])]

Full text: Der Techniker Rebekka Hettner-Koch wird mit dem Servicefahrzeug L 44458 EC morgen zwischen 13:00 und 15:00 Uhr in Unit 8541 Box 5249
DPO AA 66549 eintreffen und ist vorab unter +41 28 982 75 85 erreichbar.
Spans: [Span(type: PHONE_NUMBER, value: +41 28 982 75 85, char_span: [177: 193]), Span(type: STREET_ADDRESS, value: Unit 8541 Box 5249
DPO AA 66549, char_span: [114: 145]), Span(type: AUT_LICENSE_PLATE, value: L 44458 EC, char_span: [64: 74]), Span(type: PERSON, value: Rebekka Hettner-Koch, char_span: [14: 34])]

Full text: Zum Abholen der Messematerialien steht Ihnen morgen zwischen 8:00 und 12:00 Uhr unser Servicefahrzeug mit dem Kennzeichen K 26653 YQ vor dem Haupteingang zur Verfügung.
Spans: [Span(type: AUT_LICENSE_PLATE

#### 4) Write to output

Both:
- JSON
- CONLL (Todo)

##### JSON

In [12]:
# Write the generated samples to the JSON path configured in `output_file`
InputSample.to_json(dataset=fake_records, output_file=output_file)

In [13]:
# Display the path that was passed to the JSON export
output_file

'data/generated_size_1500_date_April_11_2025.json'

##### CONLL

In [39]:
#conll = InputSample.create_conll_dataset(dataset=fake_records)
#conll.head(10)

In [40]:
#conll.to_csv(output_conll, sep="\t")
#print(f"CoNLL2003 dataset structure output location: {output_conll}")

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.