# Microsoft Presidio Demo

In [None]:
# download presidio
!pip install presidio_analyzer presidio_anonymizer
!python -m spacy download en_core_web_lg

In [None]:
# If you encounter issues loading the spaCy model, try pinning these versions:

#pip install typing-inspect==0.8.0 typing_extensions==4.4.0
#pip install spacy==3.1.2

In [1]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

## 1. Detect sensitive data in text

In [2]:
# Sample sentence containing a name, a phone number, an email address and a
# six-digit ID. Adjacent literals are concatenated into one string.
text_to_anonymize = (
    "His name is Mr. Bill Jones "
    "and his phone number is 212-777-6666 "
    "his email address is bill22@gmail.com "
    "and his ID is 123456"
)

In [3]:
# Create the analyzer with no arguments, then scan the sample text for
# only the three requested entity types.
analyzer = AnalyzerEngine()
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON", "EMAIL_ADDRESS"],
    language="en",
)

print(analyzer_results)

[type: EMAIL_ADDRESS, start: 85, end: 101, score: 1.0, type: PERSON, start: 16, end: 26, score: 0.85, type: PHONE_NUMBER, start: 51, end: 63, score: 0.75]


In [4]:
# Show each detected span of the text next to its entity type.
for detection in analyzer_results:
    span_text = text_to_anonymize[detection.start:detection.end]
    print(span_text, detection.entity_type)

bill22@gmail.com EMAIL_ADDRESS
Bill Jones PERSON
212-777-6666 PHONE_NUMBER


## 2. Add custom entity

In [5]:
# Custom "ID" entity: matches any run of six consecutive digits
# (the pattern has no boundaries, so it also matches inside longer digit runs).

from presidio_analyzer import Pattern, PatternRecognizer

# Define the regex pattern in a Presidio `Pattern` object.
# A raw string is used so that `\d` reaches the regex engine unchanged instead
# of being parsed as an (invalid) Python string escape sequence.
id_pattern = Pattern(name="id_pattern", regex=r"\d{6}", score=0.5)

# Define the recognizer with one or more patterns
id_recognizer = PatternRecognizer(
    supported_entity="ID", patterns=[id_pattern]
)

# Add the custom recognizer to the registry of the analyzer created earlier.
# NOTE(review): re-running this cell calls `add_recognizer` again on the same
# registry, presumably registering a second copy — restart the kernel to reset.
analyzer.registry.add_recognizer(id_recognizer)

In [6]:
# Run the analysis again, this time also requesting the custom "ID" entity.
analyzer_results = analyzer.analyze(
    text=text_to_anonymize,
    entities=["PHONE_NUMBER", "PERSON", "EMAIL_ADDRESS", "ID"],
    language="en",
)

print(analyzer_results)

[type: EMAIL_ADDRESS, start: 85, end: 101, score: 1.0, type: PERSON, start: 16, end: 26, score: 0.85, type: PHONE_NUMBER, start: 51, end: 63, score: 0.75, type: ID, start: 116, end: 122, score: 0.5]


In [7]:
# List every detected span of the text together with its entity type.
for hit in analyzer_results:
    detected_text = text_to_anonymize[hit.start:hit.end]
    print(detected_text, hit.entity_type)

bill22@gmail.com EMAIL_ADDRESS
Bill Jones PERSON
212-777-6666 PHONE_NUMBER
123456 ID


## 3. Anonymize sensitive data

In [8]:
# Anonymize the spans found by the analyzer. No operators are passed here,
# so the anonymizer applies its default handling to every entity.
anonymizer = AnonymizerEngine()
anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize, analyzer_results=analyzer_results
)

print(f"text: {anonymized_results.text}")


text: His name is Mr. <PERSON> and his phone number is <PHONE_NUMBER> his email address is <EMAIL_ADDRESS> and his ID is <ID>


## 4. Custom anonymization

In [9]:
from presidio_anonymizer.entities import OperatorConfig

# Per-entity operators: phone numbers are masked with "*", and the "DEFAULT"
# entry replaces every other detected entity with a fixed placeholder.
operators = {
    "PHONE_NUMBER": OperatorConfig(
        "mask",
        {
            "type": "mask",
            "masking_char": "*",
            "chars_to_mask": 12,  # length of the sample phone number "212-777-6666"
            "from_end": True,
        },
    ),
    "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
}

# Same text and analyzer results as before, now with the custom operators.
custom_anonymized_results = anonymizer.anonymize(
    text=text_to_anonymize,
    analyzer_results=analyzer_results,
    operators=operators,
)

print(f"text: {custom_anonymized_results.text}")

text: His name is Mr. <ANONYMIZED> and his phone number is ************ his email address is <ANONYMIZED> and his ID is <ANONYMIZED>
