### PII Detection In Semi-Structured Data

In [6]:
import pandas as pd
from presidio_structured import StructuredEngine, PandasAnalysisBuilder
from presidio_anonymizer.entities import OperatorConfig
from faker import Faker # optionally using faker as an example

In [7]:
# Initialize the engine with a Pandas data processor (default)
pandas_engine = StructuredEngine()

# Create a sample DataFrame
sample_df = pd.DataFrame({'name': ['John Doe', 'Jane Smith'], 'email': ['john.doe@example.com', 'jane.smith@example.com']})

In [9]:
# Generate a tabular analysis which detects the PII entities in the DataFrame.
tabular_analysis = PandasAnalysisBuilder().generate_analysis(sample_df)
tabular_analysis

StructuredAnalysis(entity_mapping={'name': 'PERSON', 'email': 'URL'})

In [10]:
# Define anonymization operators
fake = Faker()
operators = {
    "PERSON": OperatorConfig("replace", {"new_value": "REDACTED"}),
    "EMAIL_ADDRESS": OperatorConfig("custom", {"lambda": lambda x: fake.safe_email()})
}

In [11]:
operators

{'PERSON': operator_name: replace, params: {'new_value': 'REDACTED'},
 'EMAIL_ADDRESS': operator_name: custom, params: {'lambda': <function <lambda> at 0x728776403520>}}

In [12]:
# Anonymize DataFrame
anonymized_df = pandas_engine.anonymize(sample_df, tabular_analysis, operators=operators)
print(anonymized_df)

       name   email
0  REDACTED  <None>
1  REDACTED  <None>
