In [1]:
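# Install Presidio via pip if not yet installed (uncomment the lines below to run them).
# %pip (rather than !pip) installs into the environment of the running kernel.

# %pip install presidio-analyzer
# %pip install presidio-evaluator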

In [2]:
import datetime
import pprint
from collections import Counter
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
import tqdm

from presidio_evaluator import InputSample
from presidio_evaluator.data_generator import PresidioDataGenerator
from presidio_evaluator.data_generator.faker_extensions import (
    FakerSpansResult,
    RecordsFaker,
    IpAddressProvider,
    NationalityProvider,
    OrganizationProvider,
    UsDriverLicenseProvider,
    AgeProvider,
    AddressProviderNew,
    PhoneNumberProviderNew,
)

# Generate fake PII data using Presidio's data generator

Presidio's data generator is based on the [Python Faker tool](https://faker.readthedocs.io/en/master/)
and allows you to generate a synthetic dataset from sentence templates.
It features wrappers for Faker which allow you to sample from existing sources of fake data.

Example templates:

> I live at {{address}}

> You can email me at {{email}}. Thanks, {{first_name}}

> What's your last name? It's {{last_name}}

> Every time I see you falling I get down on my knees and pray


### Simple example
This uses all the default values to generate 10 samples based on three templates

In [3]:
# Three sentence templates; each {{placeholder}} is filled with a generated fake value
sentence_templates = [
    "My name is {{name}}",
    "Please send it to {{address}}",
    "I just moved to {{city}} from {{country}}",
]

# Default generator settings; materialize the generated samples into a list
data_generator = PresidioDataGenerator()
fake_records = list(
    data_generator.generate_fake_data(templates=sentence_templates, n_samples=10)
)

# Show the generated text of the first sample, followed by its entity spans
first_record = fake_records[0]
print(first_record.fake)
print(first_record.spans)

Sampling: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 9149.88it/s]

My name is Joshua Jackson
[{"value": "Joshua Jackson", "start": 11, "end": 25, "type": "name"}]





## Generate a full dataset

In this example we customize the data generator to:
1. Accept more types of entities by adding more providers to Faker (see [Faker's documentation](https://faker.readthedocs.io/en/master/index.html#how-to-create-a-provider))
2. Handle records of multiple PII entities per fake person for a more realistic dataset

We then translate the generated entity types to match Presidio's, and save the new dataset in json and CONLL03 formats.

a. Specify parameters:

In [4]:
# Generation settings
number_of_samples = 1500
lower_case_ratio = 0.05  # passed to PresidioDataGenerator as `lower_case_ratio`

# Date stamp embedded in the output file names (month name, day, year)
cur_time = datetime.date.today().strftime("%B_%d_%Y")

# Input files: sentence templates and the FakeNameGenerator CSV
raw_data_path = Path("../presidio_evaluator/data_generator/raw_data")
templates_file_path = (raw_data_path / "templates.txt").resolve()
fake_name_generator_file = (raw_data_path / "FakeNameGenerator.com_3000.csv").resolve()

# Output files: the JSON dataset and the tab-separated CoNLL-like table
output_file = f"../data/generated_size_{number_of_samples}_date_{cur_time}.json"
output_conll = f"../data/generated_size_{number_of_samples}_date_{cur_time}.tsv"

b. Read [FakeNameGenerator](https://www.fakenamegenerator.com/) data (optional, extends the set of fake values)
and create a `RecordsFaker` which returns a fake person record (with multiple values) instead of one value,
allowing dependencies between values belonging to the same fake person
(e.g. name = Michael Smith with the email michael.smith@gmail.com).

The `fake_name_generator_file` can be downloaded from https://www.fakenamegenerator.com/order.php

> Note that you can create fake records for multiple name sets, allowing you to adapt the fake data to the real data if needed. 

In [5]:
# Load the FakeNameGenerator CSV and adapt it so that it matches the existing templates
fake_name_generator_df = pd.read_csv(fake_name_generator_file).pipe(
    PresidioDataGenerator.update_fake_name_generator_df
)

# Preview the first rows
fake_name_generator_df.head()

Unnamed: 0,number,gender,nationality,prefix,first_name,middle_initial,last_name,street_name,city,state_abbr,...,company,domain_name,person,name,first_name_female,first_name_male,prefix_female,prefix_male,last_name_female,last_name_male
0,1,female,Czech,Mrs.,Marie,J,Hamanová,P.O. Box 255,Kangerlussuaq,QE,...,Simple Solutions,MarathonDancing.gl,Marie J Hamanová,Marie J Hamanová,Marie,,Mrs.,,Hamanová,
1,2,female,French,Ms.,Patricia,G,Desrosiers,Avenida Noruega 42,Vila Real,VR,...,Formula Gray,LostMillions.com.pt,Patricia Desrosiers,Patricia Desrosiers,Patricia,,Ms.,,Desrosiers,
2,3,female,American,Ms.,Debra,O,Neal,1659 Hoog St,Brakpan,GA,...,Dahlkemper's,MediumTube.co.za,Debra O Neal,Debra O Neal,Debra,,Ms.,,Neal,
3,4,male,French,Mr.,Peverell,C,Racine,183 Epimenidou Street,Limassol,LI,...,Quickbiz,ImproveLook.com.cy,Peverell Racine,Peverell Racine,,Peverell,,Mr.,,Racine
4,5,female,Slovenian,Mrs.,Iolanda,S,Tratnik,Karu põik 61,Pärnu,PR,...,Dubrow's Cafeteria,PostTan.com.ee,Iolanda Tratnik,Iolanda Tratnik,Iolanda,,Mrs.,,Tratnik,


c. Create a Faker object (in this case, a `RecordsFaker`)

In [6]:
# RecordsFaker is a Faker extension that handles records instead of independent values;
# it is fed with the FakeNameGenerator records loaded earlier.
fake = RecordsFaker(
    records=fake_name_generator_df,
    locale="en_US",
)

d. Add more providers, not part of the original Faker package

In [7]:
# Providers to register on the Faker object, in registration order
extra_providers = (
    IpAddressProvider,  # Both Ipv4 and IPv6 IP addresses
    NationalityProvider,  # Read countries + nationalities from file
    OrganizationProvider,  # Read organization names from file
    UsDriverLicenseProvider,  # Read US driver license numbers from file
    AgeProvider,  # Age values (unavailable on Faker)
    AddressProviderNew,  # Extend the default address formats
    PhoneNumberProviderNew,  # Extend the default phone number formats
)

for provider in extra_providers:
    fake.add_provider(provider)

e. Create the Presidio Data Generator object and add provider aliases if the templates have a different entity name than the Faker object

In [8]:
# Wrap the customized Faker object in a Presidio data generator
data_generator = PresidioDataGenerator(
    custom_faker=fake,
    lower_case_ratio=lower_case_ratio,
)

# Entity aliases, provider name -> additional name
# (e.g. if faker supports "name" but templates contain "person").
provider_aliases = {
    "name": "person",
    "credit_card_number": "credit_card",
    "date_of_birth": "birthday",
}

for provider_name, new_name in provider_aliases.items():
    data_generator.add_provider_alias(provider_name=provider_name, new_name=new_name)

f. Generate data

In [9]:
# Read the sentence templates from disk and generate `number_of_samples` fake records
sentence_templates = PresidioDataGenerator.read_template_file(templates_file_path)
fake_records = list(
    data_generator.generate_fake_data(
        templates=sentence_templates,
        n_samples=number_of_samples,
    )
)

# Inspect the first generated record
pprint.pprint(fake_records[0])

Sampling: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 17987.56it/s]

{"fake": "Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\n", "spans": [{"value": "2005", "start": 37, "end": 41, "type": "year"}], "template": "Title VII of the Civil Rights Act of {{year}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\n", "template_id": 190}





#### Verify randomness of dataset

In [10]:
# Number of generated records for each template id
count_per_template_id = Counter(sample.template_id for sample in fake_records)

# Extract the per-template counts once and reuse them for every statistic
records_per_template = list(count_per_template_id.values())

print(f"Total: {sum(records_per_template)}")
print(f"Avg # of records per template: {np.mean(records_per_template)}")
print(f"Median # of records per template: {np.median(records_per_template)}")
print(f"Std: {np.std(records_per_template)}")

Total: 1500
Avg # of records per template: 7.142857142857143
Median # of records per template: 7.0
Std: 2.5872528966106905


#### Which entities did we generate?

In [11]:
# Tally the entity type of every span across all generated records
count_per_entity = Counter(
    span.type for record in fake_records for span in record.spans
)

count_per_entity

Counter({'organization': 257,
         'first_name': 244,
         'person': 238,
         'city': 235,
         'address': 209,
         'street_name': 164,
         'name': 162,
         'country': 154,
         'credit_card_number': 152,
         'phone_number': 121,
         'last_name': 119,
         'building_number': 110,
         'age': 72,
         'secondary_address': 64,
         'year': 58,
         'nationality': 55,
         'postcode': 49,
         'zipcode': 45,
         'url': 39,
         'email': 39,
         'name_female': 37,
         'job': 33,
         'first_name_male': 31,
         'name_male': 29,
         'prefix_male': 28,
         'date_of_birth': 24,
         'iban': 22,
         'date_time': 21,
         'prefix_female': 21,
         'day_of_week': 16,
         'state_abbr': 15,
         'last_name_male': 15,
         'prefix': 12,
         'ip_address': 11,
         'ssn': 11,
         'nation_plural': 9,
         'nation_woman': 8,
         'first_name_

#### Translate tags from Faker's to Presidio's (optional)

In [12]:
# Mapping from the entity type names produced by the data generator (keys)
# to the entity type names used by Presidio (values).
# Several source types share one target, e.g. every name variant maps to "PERSON".
translator = {
    "person": "PERSON",
    "ip_address": "IP_ADDRESS",
    "us_driver_license": "US_DRIVER_LICENSE",
    "organization": "ORGANIZATION",
    "name_female": "PERSON",
    "address": "STREET_ADDRESS",
    "country": "GPE",
    "state": "GPE",
    "credit_card_number": "CREDIT_CARD",
    "city": "GPE",
    "street_name": "STREET_ADDRESS",
    "building_number": "STREET_ADDRESS",
    "name": "PERSON",
    "iban": "IBAN_CODE",
    "last_name": "PERSON",
    "last_name_male": "PERSON",
    "last_name_female": "PERSON",
    "first_name": "PERSON",
    "first_name_male": "PERSON",
    "first_name_female": "PERSON",
    "phone_number": "PHONE_NUMBER",
    "url": "DOMAIN_NAME",
    "ssn": "US_SSN",
    "email": "EMAIL_ADDRESS",
    "date_time": "DATE_TIME",
    "date_of_birth": "DATE_TIME",
    "day_of_week": "DATE_TIME",
    "year": "DATE_TIME",
    "name_male": "PERSON",
    "prefix_male": "TITLE",
    "prefix_female": "TITLE",
    "prefix": "TITLE",
    "nationality": "NRP",
    "nation_woman": "NRP",
    "nation_man": "NRP",
    "nation_plural": "NRP",
    "first_name_nonbinary": "PERSON",
    "postcode": "STREET_ADDRESS",
    "secondary_address": "STREET_ADDRESS",
    "job": "TITLE",
    "zipcode": "ZIP_CODE",
    "state_abbr": "GPE",
    "age": "AGE",
}

def update_entity_types(dataset: "List[FakerSpansResult]", entity_mapping: Dict[str, str]) -> None:
    """Replace entity types in place using a translator dictionary.

    Both the ``type`` of every span and the ``{{placeholder}}`` names inside every
    sample's template are rewritten. A span whose type is already one of the
    mapping's target values is left untouched, so the function can be applied
    again to a dataset that was fully or partially translated before
    (e.g. when the notebook cell is re-run).

    :param dataset: Samples exposing ``spans`` (objects with a ``type`` attribute)
        and a ``template`` string. The samples are modified in place.
    :param entity_mapping: Source entity type -> target entity type.
    :raises KeyError: If a span type is neither a key nor a value of ``entity_mapping``.
    """
    # Types that are valid outputs of the mapping; seeing one means "already translated"
    already_translated = set(entity_mapping.values())

    for sample in dataset:
        # update entity types on spans
        for span in sample.spans:
            if span.type in entity_mapping:
                span.type = entity_mapping[span.type]
            elif span.type not in already_translated:
                # Unknown type: fail loudly instead of silently mixing label sets
                raise KeyError(
                    f"No mapping defined for entity type {span.type!r}"
                )
        # update entity types on the template string
        for key, value in entity_mapping.items():
            sample.template = sample.template.replace("{{" + key + "}}", "{{" + value + "}}")

# Translate the generated records in place (span types and template placeholders)
update_entity_types(fake_records, entity_mapping=translator)

In [13]:
fake_records[0]

{"fake": "Title VII of the Civil Rights Act of 2005 protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\n", "spans": [{"value": "2005", "start": 37, "end": 41, "type": "DATE_TIME"}], "template": "Title VII of the Civil Rights Act of {{DATE_TIME}} protects individuals against employment discrimination on the basis of race and color as well as national origin, sex, or religion.\n", "template_id": 190}

Frequency of new entity types after mapping

In [14]:

# Tally the span types again, now that they carry the translated names
count_per_entity_new = Counter(
    span.type for record in fake_records for span in record.spans
)

count_per_entity_new.most_common()

[('PERSON', 887),
 ('STREET_ADDRESS', 596),
 ('GPE', 404),
 ('ORGANIZATION', 257),
 ('CREDIT_CARD', 152),
 ('PHONE_NUMBER', 121),
 ('DATE_TIME', 119),
 ('TITLE', 94),
 ('NRP', 72),
 ('AGE', 72),
 ('ZIP_CODE', 45),
 ('DOMAIN_NAME', 39),
 ('EMAIL_ADDRESS', 39),
 ('IBAN_CODE', 22),
 ('IP_ADDRESS', 11),
 ('US_SSN', 11),
 ('US_DRIVER_LICENSE', 6)]

#### Tokenize and transform the fake samples to a list of `InputSample` objects (Common data structure for this package)

Download the spaCy tokenizer model if missing

In [None]:
!python -m spacy download en_core_web_sm

In [15]:
%%time
# Convert every generated record into an InputSample; tqdm displays a progress bar
input_samples = [
    InputSample.from_faker_spans_result(faker_spans_result=fake_record)
    for fake_record in tqdm.tqdm(fake_records)
]

  0%|                                                                                                                   | 0/1500 [00:00<?, ?it/s]

loading model en_core_web_sm


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:06<00:00, 215.70it/s]

CPU times: user 6.76 s, sys: 33.8 ms, total: 6.8 s
Wall time: 6.96 s





#### Save as json

In [16]:
# Write the list of InputSample objects to the JSON path held in `output_file`
InputSample.to_json(dataset=input_samples, output_file=output_file)

#### Create a CONLL like data frame

In [17]:
# Build the CoNLL-like table from the input samples
conll = InputSample.create_conll_dataset(input_samples)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:00<00:00, 76888.23it/s]


In [18]:
# Save the CoNLL-like table as a tab-separated file at `output_conll`
conll.to_csv(output_conll, sep="\t")

### Next steps

- Evaluate Presidio using this fake data. [Sample](4_Evaluate_Presidio_Analyzer.ipynb)
- Split into train/test/validation sets while ensuring sentences originating from the same template are all in the same subset. [Sample](3_Split_by_pattern_number.ipynb)
- Conduct a small exploratory data analysis on the generated data. [Sample](2_PII_EDA.ipynb)

#### Copyright notice:


Data generated for evaluation was created using Fake Name Generator.

Fake Name Generator identities by the [Fake Name Generator](https://www.fakenamegenerator.com/) 
are licensed under a [Creative Commons Attribution-Share Alike 3.0 United States License](http://creativecommons.org/licenses/by-sa/3.0/us/). Fake Name Generator and the Fake Name Generator logo are trademarks of Corban Works, LLC.