# PII Masking

In [1]:
!pip install presidio-analyzer presidio-anonymizer
!pip install spacy spacy-transformers
!pip install pandas
!pip install transformers torch



In [2]:
!python3 -m spacy init fill-config base_config.cfg config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [3]:
Labels = [
    # **Identifikationsnummer**
    "[ORG-NUMMER]",
    "[PERSONNUMMER]",
    "[PASSNUMMER]",             # Passport Number
    "[MEDARBETARID]",           # Employee ID

    # **Kontaktinformation**
    "[TELEFONNUMMER]",
    "[E-POST]",                 
    "[SOCIALA_MEDIER_PROFILER]",

    # **Adressinformation**
    "[ADRESS]",
    "[ADRESSHISTORIA]",         
    "[POSTNUMMER]",
    "[STAD]",                   
    "[LÄN]",                    

    # **Demografisk Information**
    "[NAMN]",
    "[FÖDELSEDAG]",
    "[FÖDELSEPLATS]",
    "[ÅLDER]",
    "[KÖN]",
    "[NATIONALITET]",           
    "[CIVILSTAND]",             

    # **Övrig Information**
    "[IP-ADRESS]",              
    "[RELIGION]",
    "[ETNICITET]",              

    # **Finansiell Information**
    "[BANKKONTONUMMER]",        # Bank Account Number
    "[KREDITKORTNUMMER]",       # Credit Card Number
    "[BANKGIRONUMMER]",         # Bankgironummer
    "[CLEARINGNUMMER]",         # Clearingnummer
    "[IBAN]",                   # International Bank Account Number
    "[BIC]",                    # Bank Identifier Code


    # **Yrkesinformation**
    "[ARBETSPLATS]",            # Arbetsplats
    "[TITEL]",                  # Jobbtitel
    "[ARBETSGIVENHET]",         # Arbetsgivar-ID
    "[ANSTÄLLNINGSTID]",        # Anställningstid
]


In [4]:
import pandas as pd

# Load the JSON dataset
df = pd.read_json('datasets/6_dataset_full.json')

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,id,text,to_mask
0,1,Hej! Jag heter Anna Svensson och vill flytta m...,"[{'label': '[NAMN]', 'value': 'Anna Svensson'}..."
1,2,"Jag har nyligen flyttat till Storgatan 12, 123...","[{'label': '[ADRESS]', 'value': 'Storgatan 12,..."
2,3,Jag vill ändra mina kontaktuppgifter. Mitt nya...,"[{'label': '[TELEFONNUMMER]', 'value': '070-12..."
3,4,Jag försökte logga in med mitt passnummer AB12...,"[{'label': '[PASSNUMMER]', 'value': 'AB1234567..."
4,5,Jag behöver uppdatera mitt bankkontonummer. De...,"[{'label': '[BANKKONTONUMMER]', 'value': 'SE98..."


In [5]:
# prettify
from pprint import pprint
import textwrap

In [6]:
# Display the first example with text and to_mask columns
first_example = df[['text', 'to_mask']].iloc[0]

# Split the text into multiple lines for better readability
first_example['text'] = '\n'.join(textwrap.wrap(first_example['text'], width=80))
print("Text:")
print(first_example['text'])
print("\nTo Mask:")
pprint(first_example['to_mask'])

Text:
Hej! Jag heter Anna Svensson och vill flytta min pension till er. Behöver jag
uppge mitt personnummer 19921212-5678 redan nu eller räcker det med
organisationsnumret 556677-8899 som jag fått från min arbetsgivare?

To Mask:
[{'label': '[NAMN]', 'value': 'Anna Svensson'},
 {'label': '[PERSONNUMMER]', 'value': '19921212-5678'},
 {'label': '[ORG-NUMMER]', 'value': '556677-8899'}]


#### NLP: presidio with Spacy


##### Test

In [7]:
from presidio_analyzer import AnalyzerEngine

# Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers
analyzer = AnalyzerEngine()

# Call analyzer to get results
results = analyzer.analyze(text="My phone number is 212-555-5555",
                           entities=["PHONE_NUMBER"],
                           language='en')
print(results)

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


[type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]


#### Swedish installation of Spacy

In [8]:
!python -m spacy download sv_core_news_lg

Collecting sv-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/sv_core_news_lg-3.8.0/sv_core_news_lg-3.8.0-py3-none-any.whl (228.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.9/228.9 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('sv_core_news_lg')


In [12]:
# Load the Swedish model
import spacy
from presidio_analyzer.nlp_engine import SpacyNlpEngine

nlp = spacy.load("sv_core_news_lg")

# Create a custom SpacyNlpEngine with the loaded model
class LoadedSpacyNlpEngine(SpacyNlpEngine):
    def __init__(self, loaded_spacy_model):
        super().__init__()
        self.nlp = {"sv": loaded_spacy_model}

# Initialize the custom NLP engine
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model=nlp)

# Initialize the AnalyzerEngine with the custom NLP engine
analyzer = AnalyzerEngine(nlp_engine=loaded_nlp_engine, supported_languages=["sv"])

#### Test in swedish

In [None]:
sample = "Mitt telefonnummer är 0768888888"
# Analyze text
results = analyzer.analyze(
    text=sample,
    entities=["[ PHONE_NUMBER ]"],
    language="sv"
)

# Print results
for result in results:
    print(f"Entity: {result.entity_type}, Text: {result.start}-{result.end}, Score: {result.score}")


ValueError: No matching recognizers were found to serve the request.

In [41]:
def mask_phone_numbers(text, results):
    for result in results:
        if result.score > 0.3 and result.entity_type == "PHONE_NUMBER":
            text = text[:result.start] + result.entity_type + text[result.end:]
    return text

results = analyzer.analyze(text=sample, entities=["PHONE_NUMBER"], language="sv")
masked_text = mask_phone_numbers(sample, results)
print(masked_text)

NameError: name 'sample' is not defined