<a href="https://colab.research.google.com/github/Lynneice/hello-world/blob/master/piimasker2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
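# Uncomment on a fresh runtime (e.g. Colab) to install the dependencies.
# Versions are pinned to the ones shown in the recorded install output below.
# %pip install -q presidio-analyzer==2.2.358 presidio-anonymizer==2.2.358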

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.358-py3-none-any.whl.metadata (3.2 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.358-py3-none-any.whl.metadata (8.1 kB)
Collecting phonenumbers<9.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-8.13.55-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.358-py3-none-any.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading presidio_anonymizer-2.2.358-py3-none-any.whl (31 kB)
Downloading phonenumbers-8.13.55-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m8

In [4]:
import pandas as pd
from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine

In [5]:

import pandas as pd
data = {'name': ['Alice Smith', 'Bob Johnson', 'Charlie Brown'],
        'address': ['123 Main St, Anytown, CA 91234', '456 Oak Ave, Somewhere, NY 10001', '789 Pine Ln, Nowhere, TX 75001'],
        'phone number': ['(123) 456-7890', '987-654-3210', '555.123.4567'],
        'fav ice cream': ['Vanilla', 'Chocolate Chip', 'Strawberry'],
        'fav animal': ['Dog', 'Cat', 'Elephant'],
        'fav color': ['Blue', 'Green', 'Red'],
        'fav character': ['Batman', 'Wonder Woman', 'Spider-Man'],
        'birthdate': ['1990-05-15', '1985-11-30', '2000-01-01']}
df = pd.DataFrame(data)
df

Unnamed: 0,name,address,phone number,fav ice cream,fav animal,fav color,fav character,birthdate
0,Alice Smith,"123 Main St, Anytown, CA 91234",(123) 456-7890,Vanilla,Dog,Blue,Batman,1990-05-15
1,Bob Johnson,"456 Oak Ave, Somewhere, NY 10001",987-654-3210,Chocolate Chip,Cat,Green,Wonder Woman,1985-11-30
2,Charlie Brown,"789 Pine Ln, Nowhere, TX 75001",555.123.4567,Strawberry,Elephant,Red,Spider-Man,2000-01-01


In [6]:


def mask_pii_in_dataframe(
    df: pd.DataFrame,
    pii_columns: list,
    language: str = "en",
    batch_analyzer: "BatchAnalyzerEngine | None" = None,
    batch_anonymizer: "BatchAnonymizerEngine | None" = None,
) -> pd.DataFrame:
    """
    Identify and mask PII in specified columns of a DataFrame using Presidio.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing data.
        pii_columns (list of str): Column names to scan and anonymize. A single
            column name given as a plain string is treated as a one-item list.
        language (str): Language code passed to the Presidio analyzer.
            Defaults to "en".
        batch_analyzer (BatchAnalyzerEngine, optional): Pre-built batch analyzer
            to reuse. When omitted, one is created around a default
            AnalyzerEngine (this loads the NLP model, which is slow).
        batch_anonymizer (BatchAnonymizerEngine, optional): Pre-built batch
            anonymizer to reuse. When omitted, a default one is created.

    Returns:
        pd.DataFrame: The same DataFrame object, with PII in the string cells
        of the specified columns masked.

    Raises:
        ValueError: If the anonymizer returns a different number of values
            for a column than were submitted for it.

    Note:
        - Only string cells are sent to Presidio. Non-string values (numbers,
          None, NaN, ...) are never submitted and are written back untouched.
        - If a column from pii_columns is not in the DataFrame, or holds no
          strings at all, it is skipped.
        - This function modifies the DataFrame in place and also returns it.
          Pass `df.copy()` to keep the original.
    """
    # A bare string would otherwise be iterated character by character and
    # silently match no column.
    if isinstance(pii_columns, str):
        pii_columns = [pii_columns]

    # Per column: the full original value list, the positions that hold strings,
    # and the strings themselves (the only thing Presidio gets to see).
    original_values = {}
    string_positions = {}
    data_to_process = {}
    for col in pii_columns:
        if col not in df.columns:
            continue  # Skip columns that are not present to be robust.
        values = df[col].tolist()
        positions = [pos for pos, value in enumerate(values) if isinstance(value, str)]
        if not positions:
            continue  # Nothing maskable in this column.
        original_values[col] = values
        string_positions[col] = positions
        data_to_process[col] = [values[pos] for pos in positions]

    if not data_to_process:
        return df  # No valid columns to process; engines are never built.

    # Build the engines only now that there is real work, and only if the
    # caller did not supply their own.
    if batch_analyzer is None:
        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=AnalyzerEngine())
    if batch_anonymizer is None:
        batch_anonymizer = BatchAnonymizerEngine()

    # PII detection. analyze_dict yields lazily; materialize it so the results
    # can be handed to the anonymizer as a concrete list.
    analyzer_results = list(
        batch_analyzer.analyze_dict(input_dict=data_to_process, language=language)
    )

    # PII masking: returns a dict keyed like data_to_process.
    anonymized_data = batch_anonymizer.anonymize_dict(analyzer_results=analyzer_results)

    for col, masked_values in anonymized_data.items():
        positions = string_positions[col]
        if len(masked_values) != len(positions):
            # A silent zip() truncation here would leave unmasked PII behind.
            raise ValueError(
                f"Anonymizer returned {len(masked_values)} values for column "
                f"{col!r}, expected {len(positions)}."
            )
        # Scatter masked strings back into their original slots; every
        # non-string cell keeps its original value.
        merged = original_values[col]
        for pos, masked in zip(positions, masked_values):
            merged[pos] = masked
        # Re-attach the original index so rows stay aligned.
        df[col] = pd.Series(merged, index=df.index)

    return df


In [7]:
# mask_pii_in_dataframe modifies the frame it receives in place, so hand it a
# copy: the original `df` stays intact and this cell can be re-run without
# masking already-masked text.
df_masked = mask_pii_in_dataframe(df.copy(), ['name', 'address', 'phone number'])
df_masked



[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




Unnamed: 0,name,address,phone number,fav ice cream,fav animal,fav color,fav character,birthdate
0,<PERSON>,"123 <LOCATION>, <LOCATION>, CA 91234",<PHONE_NUMBER>,Vanilla,Dog,Blue,Batman,1990-05-15
1,<PERSON>,"456 Oak Ave, <LOCATION>, <LOCATION> 10001",<UK_NHS>,Chocolate Chip,Cat,Green,Wonder Woman,1985-11-30
2,<PERSON>,"789 <PERSON>, Nowhere, <LOCATION> 75001",<PHONE_NUMBER>,Strawberry,Elephant,Red,Spider-Man,2000-01-01
