In [3]:
# One-time environment setup, kept commented out so that running the notebook
# does not reinstall anything. Uncomment and run once if the imports below fail.
# NOTE(review): the second line fetches a spaCy English model; presumably the
# default AnalyzerEngine configuration needs it -- confirm against the Presidio docs.
# !pip install presidio-analyzer presidio-anonymizer
# !python -m spacy download en_core_web_lg

- Presidio samples (reference): https://microsoft.github.io/presidio/samples/

In [4]:
import re

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

In [5]:
# Create the Presidio analyzer with its default configuration (no arguments are
# passed). This single instance is reused by every analysis cell below.
analyzer = AnalyzerEngine()

In [6]:
def print_colored_pii(string):
    """Print ``string`` with every angle-bracketed placeholder shown in red.

    A placeholder is any span that starts at ``<`` and runs up to the next
    ``>`` (for example ``<PERSON>``). Each one is wrapped in the ANSI "red"
    and "reset" escape codes; the rest of the text is printed unchanged.

    Args:
        string: The text to print.

    Returns:
        None. The highlighted text is written to standard output.
    """
    ansi_red, ansi_reset = "\033[31m", "\033[0m"
    placeholder = re.compile(r"(<[^>]*>)")

    def highlight(match):
        # Re-emit the matched placeholder between the colour codes.
        return ansi_red + match.group(1) + ansi_reset

    print(placeholder.sub(highlight, string))

In [7]:
# Sample witness statement used as the input text for the analyzer and anonymizer.
# It deliberately mixes several kinds of personal data: names, dates, a time,
# phone numbers, e-mail addresses, a street address, a Social Security Number and
# an identity-card number.
# NOTE: every line after the first starts with a single space that is part of the
# string; changing any character shifts the start/end offsets the analyzer reports.
document_content = """Date: October 19, 2021
 Witness: John Doe
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is John Doe and on October 19, 2021.

 Dr. Emily Johnson recently visited our clinic. Her contact number is (555) 123-4567, and her email is emily.johnson@example.com.

 She lives at 456 E m Street, Springfield, IL 62704 and also houses my Social Security Number, 602-76-4532.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at 9:30 AM.

 In case any information, please reach out to me on my phone number, 999-888-7777, or through my personal email, johndoe@example.com.

 My representative at the clinic is Victoria Cherry (her business phone: 987-654-3210).

 Thank you for your assistance,

 John Doe"""

In [9]:
# Scan the sample document for PII; each result carries an entity type,
# start/end character offsets and a confidence score.
results = analyzer.analyze(text=document_content, language="en")

# Print one detection per line.
for detection in results:
    print(detection)

type: EMAIL_ADDRESS, start: 280, end: 305, score: 1.0
type: EMAIL_ADDRESS, start: 720, end: 739, score: 1.0
type: UK_NHS, start: 815, end: 827, score: 1.0
type: DATE_TIME, start: 6, end: 22, score: 0.85
type: PERSON, start: 33, end: 41, score: 0.85
type: PERSON, start: 142, end: 150, score: 0.85
type: DATE_TIME, start: 158, end: 174, score: 0.85
type: PERSON, start: 182, end: 195, score: 0.85
type: LOCATION, start: 338, end: 349, score: 0.85
type: US_SSN, start: 403, end: 414, score: 0.85
type: DATE_TIME, start: 597, end: 604, score: 0.85
type: PERSON, start: 778, end: 793, score: 0.85
type: PERSON, start: 865, end: 873, score: 0.85
type: PHONE_NUMBER, start: 247, end: 261, score: 0.75
type: PHONE_NUMBER, start: 676, end: 688, score: 0.75
type: PHONE_NUMBER, start: 815, end: 827, score: 0.75
type: URL, start: 280, end: 288, score: 0.5
type: URL, start: 294, end: 305, score: 0.5
type: URL, start: 728, end: 739, score: 0.5


In [10]:
# Hand the analyzer's detections to an AnonymizerEngine, which returns the text
# with the detected spans replaced (shown below as <ENTITY_TYPE> placeholders).
anonymizer = AnonymizerEngine()
anonymized_text = anonymizer.anonymize(
    text=document_content,
    analyzer_results=results,
)

# Show the redacted text with the placeholders highlighted.
print_colored_pii(anonymized_text.text)

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m.

 Dr. [31m<PERSON>[0m recently visited our clinic. Her contact number is [31m<PHONE_NUMBER>[0m, and her email is [31m<EMAIL_ADDRESS>[0m.

 She lives at 456 E m Street, [31m<LOCATION>[0m, IL 62704 and also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number ABC123456.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<DATE_TIME>[0m.

 In case any information, please reach out to me on my phone number, [31m<PHONE_NUMBER>[0m, or through my personal email, [31m<EMAIL_ADDRESS>[0m.

 My representative at the clinic is [31m<PERSON>[0m (her business phone: [31m<UK_NHS>[0m).

 Thank you for your assistance,

 [31m<PERSON>[0m


**Add new recognizers**
- Presidio documentation on adding recognizers: https://microsoft.github.io/presidio/analyzer/adding_recognizers/

In [23]:
from presidio_analyzer import Pattern, PatternRecognizer

# The regexes are written as raw strings so that backslash sequences such as
# `\d` reach the regex engine untouched. In a normal string literal `\d` is an
# invalid Python escape, which newer interpreters warn about.

# Identity-card number as it appears in the sample text: three A-Z letters
# followed by six digits (e.g. ABC123456).
polish_id_pattern = Pattern(
    name="polish_id_pattern",
    regex=r"[A-Z]{3}\d{6}",
    score=1,
)

# 12-hour clock time: hour 1-12 (optional leading zero), minutes 00-59, one
# space, then AM or PM (e.g. 9:30 AM).
time_pattern = Pattern(
    name="time_pattern",
    regex=r"(1[0-2]|0?[1-9]):[0-5][0-9] (AM|PM)",
    score=1,
)

# Wrap each pattern in a recognizer that reports matches under a custom entity
# name; a recognizer can hold one or more patterns.
polish_id_recognizer = PatternRecognizer(
    supported_entity="POLISH_ID", patterns=[polish_id_pattern]
)
time_recognizer = PatternRecognizer(supported_entity="TIME", patterns=[time_pattern])

In [24]:
# Register the custom recognizers on the existing analyzer so that later
# analyze() calls also report POLISH_ID and TIME entities.
for custom_recognizer in (polish_id_recognizer, time_recognizer):
    analyzer.registry.add_recognizer(custom_recognizer)

In [25]:
# Re-run detection now that the custom recognizers are registered, then redact
# the same document again using the refreshed results.
results = analyzer.analyze(text=document_content, language="en")
anonymized_text = anonymizer.anonymize(
    text=document_content,
    analyzer_results=results,
)

# Show the redacted text with the placeholders highlighted.
print_colored_pii(anonymized_text.text)

Date: [31m<DATE_TIME>[0m
 Witness: [31m<PERSON>[0m
 Subject: Testimony Regarding the Loss of Wallet

 Testimony Content:

 Hello Officer,

 My name is [31m<PERSON>[0m and on [31m<DATE_TIME>[0m.

 Dr. [31m<PERSON>[0m recently visited our clinic. Her contact number is [31m<PHONE_NUMBER>[0m, and her email is [31m<EMAIL_ADDRESS>[0m.

 She lives at 456 E m Street, [31m<LOCATION>[0m, IL 62704 and also houses my Social Security Number, [31m<US_SSN>[0m.

 What's more, I had my polish identity card there, with the number [31m<POLISH_ID>[0m.

 I would like this data to be secured and protected in all possible ways. I believe It was stolen at [31m<TIME>[0m.

 In case any information, please reach out to me on my phone number, [31m<PHONE_NUMBER>[0m, or through my personal email, [31m<EMAIL_ADDRESS>[0m.

 My representative at the clinic is [31m<PERSON>[0m (her business phone: [31m<UK_NHS>[0m).

 Thank you for your assistance,

 [31m<PERSON>[0m
