In [1]:
# Setup (run once): uncomment to install the dependencies and the Swedish spaCy model.
# %pip install presidio-analyzer
# %pip install presidio-anonymizer
# %pip install sentencepiece transformers
# !python -m spacy download sv_core_news_lg

- https://github.com/abderrahmane-mhd/text-anonymization/blob/main/TextAnonymization.ipynb
- https://www.philschmid.de/pii-huggingface-sagemaker

# Swedish Text Anonymization

In [2]:
# Sample Swedish letter used as the input text throughout the notebook. It
# contains a name, a personnummer, two street addresses, a phone number and
# an e-mail address for the recognizers to find.
document_content = """Ämne: Förfrågan om Byte av Adressuppgifter

Hej,

Mitt namn är Anna Svensson och jag skriver till er för att uppdatera
mina personliga uppgifter i era register.
Nyligen har jag flyttat till en ny adress och det är viktigt för mig
att mina uppgifter är aktuella hos er.

Här är mina uppdaterade uppgifter:

Namn: Anna Svensson
Personnummer: 860711-1234
Gammal adress: Storgatan 12, 123 45 Gamlastaden
Ny adress: Lillgatan 34, 543 21 Nystaden
Telefonnummer: 070-1234567
E-post: anna.svensson@email.com

Tack på förhand för er hjälp med denna ärende.

Med vänliga hälsningar,
Anna Svensson"""

In [3]:
import re
import spacy
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult, Pattern, PatternRecognizer

from presidio_analyzer.nlp_engine import NlpArtifacts,NlpEngineProvider

from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

2024-01-27 21:04:13.948106: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-27 21:04:16.042460: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-01-27 21:04:16.042722: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2024-01-27 21:04:17.885822: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcu

In [4]:
def print_colored_pii(string):
    """Print ``string`` with every ``<...>`` span rendered in red.

    Each angle-bracketed span (for example an anonymizer placeholder such as
    ``<PERSON>``) is wrapped in the ANSI escape codes for red text and reset;
    everything else is printed unchanged.
    """
    def _paint(match):
        # Surround the matched span with "red on" / "reset" ANSI codes.
        return "\033[31m" + match.group(1) + "\033[0m"

    highlighted = re.sub(r"(<[^>]*>)", _paint, string)
    print(highlighted)

## 1. NER SpaCy Model

We start by trying the NER model provided by spaCy.

In [5]:
# Language code and spaCy model name reused by the Presidio configuration cells.
lang_code = "sv"
model_name = "sv_core_news_lg"
# Load the spaCy model directly.
# NOTE(review): `nlp` is rebound to a Hugging Face pipeline in the transformers section.
nlp = spacy.load(model_name)

In [6]:
# Describe the spaCy-backed NLP engine Presidio should use for Swedish.
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": lang_code, "model_name": model_name}],
}

# Build the NLP engine from that configuration.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# Analyzer restricted to the configured language.
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=[lang_code])

In [7]:
# Run the analyzer over the sample letter.
results = analyzer.analyze(text=document_content, language=lang_code)

# Print every detection (entity type, character span and score).
for res in results:
    print(res)

type: EMAIL_ADDRESS, start: 476, end: 499, score: 1.0
type: LOCATION, start: 367, end: 376, score: 0.85
type: LOCATION, start: 411, end: 420, score: 0.85
type: URL, start: 476, end: 483, score: 0.5
type: URL, start: 490, end: 499, score: 0.5
type: PHONE_NUMBER, start: 340, end: 351, score: 0.4


In [8]:
# Slice the matched text out of the letter for every analyzer result.
found_entities = [
    document_content[span['start']:span['end']]
    for span in (hit.to_dict() for hit in results)
]

# Collapse repeated matches into a set of distinct strings.
spacy_detected_entities = set(found_entities)

print(spacy_detected_entities)

{'anna.sv', 'anna.svensson@email.com', '860711-1234', 'Storgatan', 'email.com', 'Lillgatan'}


In [9]:
# Pass the analyzer results to the AnonymizerEngine, which redacts the
# detected spans (they appear as placeholders such as <LOCATION> in the output).
anonymizer = AnonymizerEngine()
anonymized_text = anonymizer.anonymize(text=document_content, analyzer_results=results)

# Print the redacted letter with the placeholders highlighted in red.
print_colored_pii(anonymized_text.text)

Ämne: Förfrågan om Byte av Adressuppgifter

Hej,

Mitt namn är Anna Svensson och jag skriver till er för att uppdatera
mina personliga uppgifter i era register.
Nyligen har jag flyttat till en ny adress och det är viktigt för mig
att mina uppgifter är aktuella hos er.

Här är mina uppdaterade uppgifter:

Namn: Anna Svensson
Personnummer: [31m<PHONE_NUMBER>[0m
Gammal adress: [31m<LOCATION>[0m 12, 123 45 Gamlastaden
Ny adress: [31m<LOCATION>[0m 34, 543 21 Nystaden
Telefonnummer: 070-1234567
E-post: [31m<EMAIL_ADDRESS>[0m

Tack på förhand för er hjälp med denna ärende.

Med vänliga hälsningar,
Anna Svensson


## 2. NER Transformers Model

In this section we try out a Named Entity Recognition (NER) model for Swedish text, based on a BERT transformer.

In [10]:
# Loading both tokenizer and NER model
model_path = 'KBLab/bert-base-swedish-cased-ner'
tokenizer = AutoTokenizer.from_pretrained(model_path)
ner_model = AutoModelForTokenClassification.from_pretrained(model_path)

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# Wrap the model and tokenizer in a Hugging Face NER pipeline; with
# aggregation_strategy="simple" the output is reported per entity group
# (see the 'entity_group' keys in the result).
# NOTE(review): this rebinds `nlp`, which previously held the spaCy model.
nlp = pipeline('ner', model=ner_model, tokenizer=tokenizer, aggregation_strategy="simple")

# Run the pipeline on the sample letter.
transformer_res = nlp(document_content)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [21]:
# Inspect the raw pipeline output: entity group, score, matched text and character offsets.
transformer_res

[{'entity_group': 'PER',
  'score': 0.999498,
  'word': 'Anna Svensson',
  'start': 63,
  'end': 76},
 {'entity_group': 'TME',
  'score': 0.99970055,
  'word': 'Nyligen',
  'start': 161,
  'end': 168},
 {'entity_group': 'PER',
  'score': 0.9997503,
  'word': 'Anna Svensson',
  'start': 312,
  'end': 325},
 {'entity_group': 'LOC',
  'score': 0.9978662,
  'word': 'Storgatan 12',
  'start': 367,
  'end': 379},
 {'entity_group': 'LOC',
  'score': 0.9884695,
  'word': 'Gamlastaden',
  'start': 388,
  'end': 399},
 {'entity_group': 'LOC',
  'score': 0.9973336,
  'word': 'Lillgatan 34',
  'start': 411,
  'end': 423},
 {'entity_group': 'LOC',
  'score': 0.5679079,
  'word': '543',
  'start': 425,
  'end': 428},
 {'entity_group': 'LOC',
  'score': 0.9952892,
  'word': 'Nystaden',
  'start': 432,
  'end': 440},
 {'entity_group': 'MSR',
  'score': 0.8682774,
  'word': '070',
  'start': 456,
  'end': 459},
 {'entity_group': 'MSR',
  'score': 0.5759755,
  'word': '##45',
  'start': 463,
  'end': 46

In [12]:
# Keep only the matched text of each entity returned by the pipeline.
bert_detected_entities = [res['word'] for res in transformer_res]

In [13]:
# Distinct entity strings; entries such as '##45' look like word-piece fragments, not whole words.
set(bert_detected_entities)

{'##45',
 '070',
 '543',
 'Anna Svensson',
 'Gamlastaden',
 'Lillgatan 34',
 'Nyligen',
 'Nystaden',
 'Storgatan 12'}

## 3. Mixed Pipeline development (Transformers + SpaCy)

- https://microsoft.github.io/presidio/samples/python/transformers_recognizer/
- https://microsoft.github.io/presidio/analyzer/adding_recognizers/#extending-the-analyzer-for-additional-pii-entities

In [14]:
from transformers import pipeline
# Translation table from the NER model's entity groups to Presidio entity names.
mapping_labels = {
    "PER": "PERSON",
    "LOC": "LOCATION",
    "ORG": "ORGANIZATION",
    "PHONE_NUMBER": "PHONE_NUMBER",
}

# list of entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities
# Entity types requested from the analyzer when anonymizing.
DEFAULT_ANOYNM_ENTITIES = [
    "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS",
    "IBAN_CODE", "IP_ADDRESS", "NRP", "LOCATION",
    "PERSON", "PHONE_NUMBER", "MEDICAL_LICENSE", "URL",
    "ORGANIZATION", "NUMBER",
]

In [15]:
class HFTransformerRecognizer(EntityRecognizer):
    """Presidio recognizer backed by a Hugging Face token-classification pipeline.

    Entity groups predicted by the model are translated to Presidio entity
    names through ``mapping_labels``; groups without a mapping are skipped.
    """

    def __init__(
        self,
        model_id_or_path,
        mapping_labels,
        aggregation_strategy="simple",
        supported_language=lang_code,
        ignore_labels=["O", "MISC"],
    ):
        """Create the pipeline and register the supported entities.

        Args:
            model_id_or_path: Model identifier or path handed to ``pipeline``.
            mapping_labels: Dict mapping model entity groups (e.g. ``"PER"``)
                to Presidio entity names (e.g. ``"PERSON"``).
            aggregation_strategy: Forwarded unchanged to the pipeline.
            supported_language: Language code this recognizer is registered for.
            ignore_labels: Model labels forwarded to the pipeline as labels to
                ignore. The default list is only read here, never mutated.
        """
        # inits transformers pipeline for given model or path
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=ignore_labels,
        )
        # map labels to presidio labels
        self.label2presidio = mapping_labels

        # passes entities from model into parent class
        super().__init__(
            supported_entities=list(self.label2presidio.values()),
            supported_language=supported_language,
        )

    def load(self) -> None:
        """No loading is required; the pipeline is built in ``__init__``."""
        pass

    def analyze(
        self, text: str, entities=None, nlp_artifacts: NlpArtifacts = None
    ):
        """Extract entities from ``text`` using the Transformers pipeline.

        Args:
            text: Text to scan.
            entities: Presidio entity names to return. ``None`` disables the
                filter, so every mapped entity is returned.
            nlp_artifacts: Not used by this recognizer.

        Returns:
            A list of ``RecognizerResult`` objects, one per predicted entity
            whose group has a mapping and passes the ``entities`` filter.
        """
        results = []

        for prediction in self.pipeline(text):
            group = prediction["entity_group"]
            if group not in self.label2presidio:
                # The model produced a group we have no Presidio name for.
                continue
            converted_entity = self.label2presidio[group]
            # Test for None first: evaluating `converted_entity in None`
            # raises TypeError, which made the default entities=None unusable.
            if entities is not None and converted_entity not in entities:
                continue
            results.append(
                RecognizerResult(
                    entity_type=converted_entity,
                    start=prediction["start"],
                    end=prediction["end"],
                    # Cast to a built-in float; the pipeline appears to return
                    # numpy floats, which are awkward to serialize.
                    score=float(prediction["score"]),
                )
            )
        return results

In [16]:
# Allow-list later passed to analyzer.analyze(); left empty here.
to_keep = []
# Same spaCy engine configuration as in section 1.
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": lang_code, "model_name": model_name}],
}

# Build the NLP engine from the configuration.
provider = NlpEngineProvider(nlp_configuration=configuration)

nlp_engine = provider.create_engine()

# Pass the created NLP engine and supported_languages to a new AnalyzerEngine;
# the transformer recognizer is registered on this instance in the next cell.
analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine,
    supported_languages = [lang_code]
)

In [17]:
# Build the transformer-backed recognizer for the Swedish BERT NER checkpoint
# and add it to the analyzer's recognizer registry.
transformers_recognizer = HFTransformerRecognizer(model_path, mapping_labels)
analyzer.registry.add_recognizer(transformers_recognizer)

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased-ner were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Text Analyzer: look for the requested entity types in the sample letter
analyzer_results = analyzer.analyze(
    text=document_content,
    entities=DEFAULT_ANOYNM_ENTITIES,
    allow_list=to_keep,
    language=lang_code,
)

# Text Anonymizer: redact every detected span
anonymizer = AnonymizerEngine()
anonymized_text = anonymizer.anonymize(
    text=document_content,
    analyzer_results=analyzer_results,
)

# Restructuring anonymizer results: redacted text plus the raw detections as dicts
anonymization_results = {
    "anonymized": anonymized_text.text,
    "found": [detection.to_dict() for detection in analyzer_results],
}

# One record per detection: matched text, entity type and character offsets
words = [
    {
        'word': document_content[hit['start']:hit['end']],
        'entity_type': hit['entity_type'],
        'start': hit['start'],
        'end': hit['end'],
    }
    for hit in anonymization_results['found']
]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [19]:
# Show each detection with its matched text, entity type and character offsets.
words

[{'word': 'anna.svensson@email.com',
  'entity_type': 'EMAIL_ADDRESS',
  'start': 476,
  'end': 499},
 {'word': 'Anna Svensson', 'entity_type': 'PERSON', 'start': 573, 'end': 586},
 {'word': 'Anna Svensson', 'entity_type': 'PERSON', 'start': 312, 'end': 325},
 {'word': 'Anna Svensson', 'entity_type': 'PERSON', 'start': 63, 'end': 76},
 {'word': 'Storgatan 12', 'entity_type': 'LOCATION', 'start': 367, 'end': 379},
 {'word': 'Lillgatan 34', 'entity_type': 'LOCATION', 'start': 411, 'end': 423},
 {'word': 'Nystaden', 'entity_type': 'LOCATION', 'start': 432, 'end': 440},
 {'word': 'Gamlastaden', 'entity_type': 'LOCATION', 'start': 388, 'end': 399},
 {'word': '543', 'entity_type': 'LOCATION', 'start': 425, 'end': 428},
 {'word': 'anna.sv', 'entity_type': 'URL', 'start': 476, 'end': 483},
 {'word': 'email.com', 'entity_type': 'URL', 'start': 490, 'end': 499},
 {'word': '860711-1234',
  'entity_type': 'PHONE_NUMBER',
  'start': 340,
  'end': 351}]

In [20]:
# Print the redacted letter with the placeholders highlighted in red.
# NOTE(review): in the recorded output the personnummer is tagged <PHONE_NUMBER>
# and the number on the "Telefonnummer" line is left unredacted.
print_colored_pii(anonymized_text.text)

Ämne: Förfrågan om Byte av Adressuppgifter

Hej,

Mitt namn är [31m<PERSON>[0m och jag skriver till er för att uppdatera
mina personliga uppgifter i era register.
Nyligen har jag flyttat till en ny adress och det är viktigt för mig
att mina uppgifter är aktuella hos er.

Här är mina uppdaterade uppgifter:

Namn: [31m<PERSON>[0m
Personnummer: [31m<PHONE_NUMBER>[0m
Gammal adress: [31m<LOCATION>[0m, 123 45 [31m<LOCATION>[0m
Ny adress: [31m<LOCATION>[0m, [31m<LOCATION>[0m 21 [31m<LOCATION>[0m
Telefonnummer: 070-1234567
E-post: [31m<EMAIL_ADDRESS>[0m

Tack på förhand för er hjälp med denna ärende.

Med vänliga hälsningar,
[31m<PERSON>[0m
