# PII Masking

In [1]:
%pip install presidio-analyzer presidio-anonymizer
%pip install spacy spacy-transformers
%pip install pandas
%pip install transformers torch
%python -m spacy download sv_core_news_lg

Collecting sv-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/sv_core_news_lg-3.8.0/sv_core_news_lg-3.8.0-py3-none-any.whl (228.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.9/228.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('sv_core_news_lg')


In [2]:
import pandas as pd

# Load the JSON dataset
df = pd.read_json('datasets/6_dataset_full.json')

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,id,text,to_mask
0,1,Hej! Jag heter Anna Svensson och vill flytta m...,"[{'label': '[NAMN]', 'value': 'Anna Svensson'}..."
1,2,"Jag har nyligen flyttat till Storgatan 12, 123...","[{'label': '[ADRESS]', 'value': 'Storgatan 12,..."
2,3,Jag vill ändra mina kontaktuppgifter. Mitt nya...,"[{'label': '[TELEFONNUMMER]', 'value': '070-12..."
3,4,Jag försökte logga in med mitt passnummer AB12...,"[{'label': '[PASSNUMMER]', 'value': 'AB1234567..."
4,5,Jag behöver uppdatera mitt bankkontonummer. De...,"[{'label': '[BANKKONTONUMMER]', 'value': 'SE98..."


In [3]:
# prettify
from pprint import pprint
import textwrap

In [4]:
# Display the first example with text and to_mask columns
first_example = df[['text', 'to_mask']].iloc[0]

# Split the text into multiple lines for better readability
first_example['text'] = '\n'.join(textwrap.wrap(first_example['text'], width=80))
print("Text:")
print(first_example['text'])
print("\nTo Mask:")
pprint(first_example['to_mask'])

Text:
Hej! Jag heter Anna Svensson och vill flytta min pension till er. Behöver jag
uppge mitt personnummer 19921212-5678 redan nu eller räcker det med
organisationsnumret 556677-8899 som jag fått från min arbetsgivare?

To Mask:
[{'label': '[NAMN]', 'value': 'Anna Svensson'},
 {'label': '[PERSONNUMMER]', 'value': '19921212-5678'},
 {'label': '[ORG-NUMMER]', 'value': '556677-8899'}]


#### NLP: presidio with Spacy


##### Test

In [5]:
from presidio_analyzer import AnalyzerEngine,  PatternRecognizer

# Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers
analyzer = AnalyzerEngine()

# Call analyzer to get results
results = analyzer.analyze(text="My phone number is 212-555-5555",
                           entities=["PHONE_NUMBER"],
                           language='en')
print(results)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


[type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]


#### Swedish installation of Spacy

In [6]:
# Load the Swedish model
import spacy
from presidio_analyzer.nlp_engine import SpacyNlpEngine

nlp = spacy.load("sv_core_news_lg")

# Create a custom SpacyNlpEngine with the loaded model
class LoadedSpacyNlpEngine(SpacyNlpEngine):
    def __init__(self, loaded_spacy_model):
        super().__init__()
        self.nlp = {"sv": loaded_spacy_model}

# Initialize the custom NLP engine
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)

# Initialize the AnalyzerEngine with the custom NLP engine
analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine, supported_languages=["sv"])

#### Test in swedish simple sample

In [7]:
sample = "Mitt telefonnummer är 0768888888"
# Analyze text
results = analyzer.analyze(
    text=sample,
     entities=["PHONE_NUMBER"],
    language="sv"
)

# Print results
for result in results:
    print(f"Entity: {result.entity_type}, Text: {result.start}-{result.end}, Score: {result.score}")


Entity: PHONE_NUMBER, Text: 22-32, Score: 0.4


In [8]:
for result in results:
    if result.score > 0.3:
        masked_sample = sample[:result.start] + f"[ {result.entity_type} ]" + sample[result.end:]
print(masked_sample)

Mitt telefonnummer är [ PHONE_NUMBER ]


#### Test with Sample from dataset

In [9]:
from presidio_analyzer import Pattern

swedish_org_pattern = Pattern(
    name="SWEDISH_ORG_NUMBER",
    regex=r"\b\d{6}[-]?\d{4}\b",
    score=0.5
)
swedish_personnr_pattern = Pattern(
    name="SWEDISH_PERSONAL_NUMBER",
    regex=r"\b(?:[12]\d{3}|\d{2})(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])[-]?\d{4}\b",
    score=0.5
)

swedish_phone_number_pattern = Pattern(
    name="TELEFONNUMMER",
    regex=r"\b07[0-9]{1,2}[-]?[0-9]{3}[-]?[0-9]{3}\b",
    score=0.5
)

swedish_phone_number_recognizer = PatternRecognizer(
    supported_entity="TELEFONNUMMER",
    patterns=[swedish_phone_number_pattern],
    supported_language="sv"
)


swedish_org_recognizer = PatternRecognizer(
    supported_entity="SWEDISH_ORG_NUMBER",
    patterns=[swedish_org_pattern],
    supported_language="sv"
)
swedish_person_recognizer = PatternRecognizer(
    supported_entity="SWEDISH_PERSONAL_NUMBER",
    patterns=[swedish_personnr_pattern],
    supported_language="sv"
)



#### Add the recognizers to the analyzer

In [10]:
remove_recognizer_list = ["PhoneRecognizer"]

for rec in list(analyzer.registry.get_recognizers(language="sv", all_fields=True)):
    if rec.name in remove_recognizer_list:
        analyzer.registry.remove_recognizer(rec.name, language="sv")


add_recognizer_list = [
    swedish_org_recognizer, 
    swedish_person_recognizer, 
    swedish_phone_number_recognizer
]

# Get all recognizers for the Swedish language.
existing_recognizers = analyzer.registry.get_recognizers(language="sv", all_fields=True)

# Loop over the custom recognizers.
for custom_rec in add_recognizer_list:
    # Check if a recognizer with the same name already exists in the registry.
    if any(existing_rec.name == custom_rec.name for existing_rec in existing_recognizers):
        print(f"{custom_rec.name} already exists in the registry, skipping addition.")
    else:
        analyzer.registry.add_recognizer(custom_rec)
        print(f"Added {custom_rec.name} to the registry.")

Added PatternRecognizer to the registry.
Added PatternRecognizer to the registry.
Added PatternRecognizer to the registry.


In [11]:
first_exe_text = first_example['text']
print(first_exe_text + "\n")

# Analyze text for PHONE_NUMBER, SWEDISH_ORG_NUMBER, and SWEDISH_PERSONAL_NUMBER
results = analyzer.analyze(
    text=first_exe_text,
    entities=["PHONE_NUMBER", "SWEDISH_ORG_NUMBER", "SWEDISH_PERSONAL_NUMBER"],
    language="sv"
)

# Print detected entities
for result in results:
    detected_text = first_exe_text[result.start:result.end]
    print(f"Entity: {result.entity_type}, Text: '{detected_text}', Score: {result.score}, Start: {result.start}, End: {result.end}")

# Mask detected PII in the text by processing in reverse order
first_exe_text_masked = first_exe_text
for result in sorted(results, key=lambda r: r.start, reverse=True):
    if result.score > 0.3:
        first_exe_text_masked = (
            first_exe_text_masked[:result.start]
            + f"[ {result.entity_type} ]"
            + first_exe_text_masked[result.end:]
        )

print("=" * 80)
print("\n" + first_exe_text_masked)


Hej! Jag heter Anna Svensson och vill flytta min pension till er. Behöver jag
uppge mitt personnummer 19921212-5678 redan nu eller räcker det med
organisationsnumret 556677-8899 som jag fått från min arbetsgivare?

Entity: SWEDISH_PERSONAL_NUMBER, Text: '19921212-5678', Score: 0.5, Start: 102, End: 115
Entity: SWEDISH_ORG_NUMBER, Text: '556677-8899', Score: 0.5, Start: 166, End: 177

Hej! Jag heter Anna Svensson och vill flytta min pension till er. Behöver jag
uppge mitt personnummer [ SWEDISH_PERSONAL_NUMBER ] redan nu eller räcker det med
organisationsnumret [ SWEDISH_ORG_NUMBER ] som jag fått från min arbetsgivare?


In [12]:
# Iterate through all registered recognizers
for recognizer in analyzer.registry.recognizers:
    # Access the supported entities
    supported_entities = recognizer.supported_entities
    print(f"Recognizer supports entities: {supported_entities}")
    print(f"Type: {recognizer.__class__.__name__}")
    
    # Check if the recognizer has patterns
    if hasattr(recognizer, 'patterns') and recognizer.patterns:
        print("Patterns:")
        for pattern in recognizer.patterns:
            # Ensure pattern is a Pattern object
            if isinstance(pattern, Pattern):
                print(f"  - Name: {pattern.name}")
                print(f"    Regex: {pattern.regex}")
                print(f"    Score: {pattern.score}")
            else:
                print("  - Pattern is not a Pattern object.")
    else:
        print("No patterns defined.")
    print("-" * 40)


Recognizer supports entities: ['CRYPTO']
Type: CryptoRecognizer
Patterns:
  - Name: Crypto (Medium)
    Regex: (bc1|[13])[a-zA-HJ-NP-Z0-9]{25,59}
    Score: 0.5
----------------------------------------
Recognizer supports entities: ['DATE_TIME']
Type: DateRecognizer
Patterns:
  - Name: mm/dd/yyyy or mm/dd/yy
    Regex: \b(([1-9]|0[1-9]|1[0-2])/([1-9]|0[1-9]|[1-2][0-9]|3[0-1])/(\d{4}|\d{2}))\b
    Score: 0.6
  - Name: dd/mm/yyyy or dd/mm/yy
    Regex: \b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])/([1-9]|0[1-9]|1[0-2])/(\d{4}|\d{2}))\b
    Score: 0.6
  - Name: yyyy/mm/dd
    Regex: \b(\d{4}/([1-9]|0[1-9]|1[0-2])/([1-9]|0[1-9]|[1-2][0-9]|3[0-1]))\b
    Score: 0.6
  - Name: mm-dd-yyyy
    Regex: \b(([1-9]|0[1-9]|1[0-2])-([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-\d{4})\b
    Score: 0.6
  - Name: dd-mm-yyyy
    Regex: \b(([1-9]|0[1-9]|[1-2][0-9]|3[0-1])-([1-9]|0[1-9]|1[0-2])-\d{4})\b
    Score: 0.6
  - Name: yyyy-mm-dd
    Regex: \b(\d{4}-([1-9]|0[1-9]|1[0-2])-([1-9]|0[1-9]|[1-2][0-9]|3[0-1]))\b
    Score: 0.6