## Basic usage

In [1]:
from presidio_analyzer import AnalyzerEngine, recognizer_result

# Set up the engine; this loads the NLP module (spaCy model by default) and
# the other PII recognizers.
analyzer = AnalyzerEngine()

# Entity types to be detected.
# NOTE(review): this list is not passed to the analyze() call below, which
# supplies its own two-entity list.
entities = [
    "PERSON",
    "PHONE_NUMBER",
    "NRP",
    "EMAIL_ADDRESS",
    "IBAN_CODE",
    "CREDIT_CARD",
    "LOCATION",
    "IP_ADDRESS",
    "URL"
]


# Analyze one sample sentence, restricted to phone numbers and person names.
results = analyzer.analyze(text="My name is Kyle and my phone number is 212-555-5555",
                           entities=["PHONE_NUMBER", "PERSON"],
                           language='en')


In [2]:
def format_results(text: str, results: "list[recognizer_result.RecognizerResult]"):
    """Extract the substring of ``text`` covered by each recognizer result.

    The previous version sliced the text for every result but discarded the
    slice, so the function had no observable effect. The findings are now
    returned so callers (or the notebook's cell output) can show them.

    Args:
        text: The string that was analyzed.
        results: Recognizer results whose ``start``/``end`` offsets index
            into ``text``. The annotation is quoted so it is not evaluated
            when the function is defined.

    Returns:
        A list of ``(entity_type, matched_text)`` tuples, one per result and
        in the same order as ``results``. Empty when ``results`` is empty.
    """
    return [
        (result.entity_type, text[result.start:result.end])
        for result in results
    ]

# Run the formatter on the sentence analyzed in the first cell and its results.
format_results("My name is Kyle and my phone number is 212-555-5555", results)

## Multithreading Implementation

In [3]:
from multiprocessing.dummy import Pool as ThreadPool

# Three sample sentences, repeated ten times to form a 30-item batch.
text = 10 * [
    "My name is Andrew and my phone number is 212-555-5555",
    "I'm located at 71 Makati Avenue",
    "Here is my credit card number 5555-5537-5304-8194",
]

# Entity types the analyzer is asked to look for.
entities = [
    "PERSON", "PHONE_NUMBER", "NRP",
    "EMAIL_ADDRESS", "IBAN_CODE", "CREDIT_CARD",
    "LOCATION", "IP_ADDRESS", "URL",
]

In [4]:
def detect_pii(text: str):
    """Analyze one string for PII and pass the findings to ``format_results``.

    Uses the notebook-level ``analyzer`` and ``entities`` and always analyzes
    in English (``language='en'``).

    Args:
        text: The string to analyze.

    Returns:
        Whatever ``format_results`` returns for this text. The previous
        version dropped that value, so ``pool.map(detect_pii, ...)`` could
        only ever collect ``None`` for each input.
    """
    findings = analyzer.analyze(text=text,
                                entities=entities,
                                language='en')
    return format_results(text, findings)

### SAMPLE POC

In [100]:
# Make the pool of worker threads
pool = ThreadPool(4)

try:
    # Run detect_pii on every text across the worker threads
    # and collect the per-text return values in input order
    results = pool.map(detect_pii, text)
finally:
    # Close the pool and wait for the workers to finish,
    # even if map() raised, so no threads are left behind
    pool.close()
    pool.join()

In [9]:
import multiprocessing
# NOTE(review): the output recorded for this cell shows the SpawnPoolWorker
# processes failing inside _ForkingPickler.loads and the run ending in a
# KeyboardInterrupt. Presumably the spawned workers cannot import detect_pii
# because it is defined in the notebook itself — confirm, and if so move it
# into an importable module before using a process pool.
with multiprocessing.Pool() as pool:
    result = pool.map(detect_pii, text)

Process SpawnPoolWorker-25:
Process SpawnPoolWorker-27:
Process SpawnPoolWorker-26:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/kachan/anaconda3/envs/csog/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kachan/anaconda3/envs/csog/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/kachan/anaconda3/envs/csog/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/kachan/anaconda3/envs/csog/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
  File "/Users/kachan/anaconda3/envs/csog/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/kachan/anaconda3/envs/csog/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  F

KeyboardInterrupt: 

### Testing speed

In [5]:
from timeit import timeit

# Build the benchmark corpus as a new list instead of growing `text` in place,
# so re-running this cell always starts from the same input rather than
# multiplying the data again on every execution.
text_repeated = text * 100

# Only the first 1000 entries are used by the timing functions.
texts = text_repeated[:1000]

print("Text length:", len(text_repeated))

def run_threaded(threads:int):
    """Run ``detect_pii`` over ``texts`` using a pool of worker threads.

    Args:
        threads: Number of worker threads in the pool.

    Returns:
        The list produced by ``pool.map`` — one ``detect_pii`` return value
        per entry of ``texts``, in input order.
    """
    pool = ThreadPool(threads)
    try:
        # Fan the texts out across the worker threads and gather the results.
        return pool.map(detect_pii, texts)
    finally:
        # Always shut the pool down, even if a worker raised, so threads are
        # not left behind between timing runs.
        pool.close()
        pool.join()
        pool.terminate()

def run_single():
    """Sequential baseline: call ``detect_pii`` on each entry of ``texts`` in turn."""
    for sample in texts:
        detect_pii(sample)

Text length: 3000


In [106]:

# Time the 4-thread run on its own.
%timeit run_threaded(4)

5.69 s ± 248 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# Compare the sequential baseline against thread pools of 4 and 8 workers.
print('Single')
%timeit run_single()
print("4 threads")
%timeit run_threaded(4)
print("8 threads")
%timeit run_threaded(8)

Single
6.09 s ± 278 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
4 threads
5.63 s ± 159 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
8 threads
5.84 s ± 249 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [110]:
# One untimed sequential pass over `texts`.
run_single()

In [10]:
import os
# CPU count reported by the OS, for context on the thread counts timed above.
os.cpu_count() 

8