Install dependencies

In [1]:
!pip install presidio-analyzer presidio-anonymizer flask nest-asyncio transformers sentencepiece

Collecting presidio-analyzer
  Downloading presidio_analyzer-2.2.360-py3-none-any.whl.metadata (3.4 kB)
Collecting presidio-anonymizer
  Downloading presidio_anonymizer-2.2.360-py3-none-any.whl.metadata (8.9 kB)
Collecting phonenumbers<10.0.0,>=8.12 (from presidio-analyzer)
  Downloading phonenumbers-9.0.14-py2.py3-none-any.whl.metadata (11 kB)
Collecting tldextract (from presidio-analyzer)
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract->presidio-analyzer)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading presidio_analyzer-2.2.360-py3-none-any.whl (128 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.7/128.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading presidio_anonymizer-2.2.360-py3-none-any.whl (35 kB)
Downloading phonenumbers-9.0.14-py2.py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m18

Set environment variables

In [2]:
import os

# Presidio endpoints for local Flask apps
os.environ["PRESIDIO_ANALYZER_API_BASE"] = "http://127.0.0.1:5002"
os.environ["PRESIDIO_ANONYMIZER_API_BASE"] = "http://127.0.0.1:5001"

print("Environment variables set!")


Environment variables set!


Run Presidio Analyzer and Anonymizer servers in background

In [8]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from flask import Flask, request, jsonify
import threading
import nest_asyncio

nest_asyncio.apply()

# Analyzer setup
analyzer_app = Flask("analyzer")
analyzer = AnalyzerEngine()

@analyzer_app.route('/analyze', methods=['POST'])
def analyze():
    data = request.get_json()
    text = data.get("text", "")
    results = analyzer.analyze(text=text, entities=["PERSON", "PHONE_NUMBER"], language="en")
    # Convert RecognizerResult objects to dictionaries
    results_dict = [result.to_dict() for result in results]
    return jsonify(results_dict)

def run_analyzer():
    analyzer_app.run(host="127.0.0.1", port=5002)

# Anonymizer setup
anonymizer_app = Flask("anonymizer")
anonymizer = AnonymizerEngine()

@anonymizer_app.route('/anonymize', methods=['POST'])
def anonymize():
    data = request.get_json()
    text = data.get("text", "")
    analyzer_results = data.get("analyzer_results", [])
    anonymized_result = anonymizer.anonymize(text=text, analyzer_results=analyzer_results)
    return jsonify(anonymized_result)

def run_anonymizer():
    anonymizer_app.run(host="127.0.0.1", port=5001)

# Start servers
threading.Thread(target=run_analyzer).start()
threading.Thread(target=run_anonymizer).start()



 * Serving Flask app 'analyzer'
 * Debug mode: off
 * Serving Flask app 'anonymizer'


Address already in use
Port 5002 is in use by another program. Either identify and stop that program, or start the server with a different port.


 * Debug mode: off


Address already in use
Port 5001 is in use by another program. Either identify and stop that program, or start the server with a different port.


Masking Logic

In [9]:
import requests

ANALYZER_URL = os.environ["PRESIDIO_ANALYZER_API_BASE"] + "/analyze"
ANONYMIZER_URL = os.environ["PRESIDIO_ANONYMIZER_API_BASE"] + "/anonymize"

def presidio_masking(text):
    # Analyze text
    resp = requests.post(ANALYZER_URL, json={"text": text})
    entities = resp.json()

    # Anonymize text
    resp2 = requests.post(ANONYMIZER_URL, json={
        "text": text,
        "analyzer_results": entities
    })
    anonymized = resp2.json()

    return anonymized.get("text", text)


Hugging Face mode

In [17]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the model and tokenizer
model_name = "facebook/blenderbot-400M-distill"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def chat_with_llm(input_text):
    # Apply masking
    masked_input = presidio_masking(input_text)
    print("Input after masking:", masked_input)

    # Encode and generate response
    inputs = tokenizer(masked_input, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=50)
    reply = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("LLM Output:", reply)

    # Restore masked values (for demonstration, hardcoded)
    final_reply = reply.replace("[PERSON]", "Jane Doe").replace("[PHONE_NUMBER]", "034453334")

    return final_reply


Test

In [21]:
user_input = "Hello world, my name is Jane Doe and my phone number is 034453334."
result = chat_with_llm(user_input)

print("\nFinal Response:")
print(result)


ERROR:analyzer:Exception on /analyze [POST]
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/flask/app.py", line 1511, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/flask/app.py", line 919, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/flask/app.py", line 917, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/flask/app.py", line 902, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)  # type: ignore[no-any-return]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipython-input-4125686609.py", line 18, in analyze
    return jsonify(results)
           ^^^^^^^^^^^^^^^^
  File "/u

JSONDecodeError: Expecting value: line 1 column 1 (char 0)