## INSTALL DEPENDENCIES

In [6]:
# Install the third-party packages this notebook imports.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# NOTE(review): versions are not pinned; the recorded output below shows
# torch 2.0.0 being installed - consider pinning for reproducibility.
%pip install presidio_analyzer
%pip install presidio_anonymizer
%pip install transformers
%pip install pandas
%pip install spacy
%pip install torch

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting torch
  Using cached torch-2.0.0-cp39-none-macosx_11_0_arm64.whl (55.8 MB)
Collecting sympy
  Using cached sympy-1.11.1-py3-none-any.whl (6.5 MB)
Collecting networkx
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting mpmath>=0.19
  Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, networkx, torch
Successfully installed mpmath-1.3.0 networkx-3.1 sympy-1.11.1 torch-2.0.0
Note: you may need to restart the kernel to use updated packages.


### INSTALL SIMPLE SPACY MODEL

In [2]:
# Download the small English spaCy model; analyzer_engine() loads
# "en_core_web_sm" whenever model_path is not "en_core_web_lg".
# NOTE(review): `!python` may resolve to a different interpreter than the
# kernel's - confirm the model lands in the kernel environment.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## TEST PRESIDIO

### IMPORTS

In [7]:
from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider
import pandas as pd
from transformers_rec import (
    TransformersRecognizer,
    BERT_DEID_CONFIGURATION,
)
import logging
from presidio_anonymizer.entities import OperatorConfig
from typing import List
from spacy import displacy
import csv
import json
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### FUNCTIONS

In [8]:
def analyzer_engine(model_path):
    """Build a Presidio AnalyzerEngine for the requested NER model.

    :param model_path: Which model to use for NER:
        "obi/deid_roberta_i2b2",
        "en_core_web_lg"
    :return: AnalyzerEngine backed by a spaCy NLP engine and a registry
        holding the predefined recognizers; for any model_path other than
        "en_core_web_lg" the registry also holds a TransformersRecognizer
        loaded from model_path.
    """
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    use_large_spacy = model_path == "en_core_web_lg"
    if not use_large_spacy:
        # The transformers model acts as a recognizer, not as the NlpEngine,
        # so the small spaCy model is enough alongside it.
        hf_recognizer = TransformersRecognizer(model_path=model_path)
        hf_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
        registry.add_recognizer(hf_recognizer)

    spacy_model = "en_core_web_lg" if use_large_spacy else "en_core_web_sm"
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": spacy_model}],
    }
    provider = NlpEngineProvider(nlp_configuration=nlp_configuration)
    return AnalyzerEngine(nlp_engine=provider.create_engine(), registry=registry)


In [9]:
def analyze(analyzer, **kwargs):
    """Analyze input using Analyzer engine and input arguments (kwargs).

    :param analyzer: object exposing ``analyze(**kwargs)``.
    :param kwargs: forwarded to ``analyzer.analyze``. When ``entities`` is
        missing, is ``None``, or contains "All", it is forwarded as ``None``.
    :return: whatever ``analyzer.analyze`` returns.
    """
    requested = kwargs.get("entities")
    # Check for None first: `"All" in None` would raise TypeError.
    if requested is None or "All" in requested:
        kwargs["entities"] = None
    return analyzer.analyze(**kwargs)

In [10]:
def anonymize(text: str, analyze_results: List[RecognizerResult]):
    """Run the Presidio Anonymizer over the identified entities.

    The default operator is a "custom" operator whose lambda returns its
    input unchanged.

    :param text: Full text
    :param analyze_results: list of results from presidio analyzer engine
    :return: the value returned by ``AnonymizerEngine.anonymize``
    """
    engine = AnonymizerEngine()
    passthrough = OperatorConfig("custom", {"lambda": lambda x: x})
    return engine.anonymize(
        text,
        analyze_results,
        operators={"DEFAULT": passthrough},
    )

In [11]:
def annotate(text: str, analyze_results: List[RecognizerResult]):
    """
    Build the entity spans used to highlight every identified entity.

    :param text: full text
    :param analyze_results: list of analyzer results.
    :return: one-element list holding a dict with the full ``text`` and its
        ``ents`` (start, end, label, text), ordered by start index.
    """
    # Use the anonymizer to resolve overlaps
    resolved = anonymize(text, analyze_results)
    ordered = sorted(resolved.items, key=lambda item: item.start)
    ents = [
        {
            "start": item.start,
            "end": item.end,
            "label": item.entity_type,
            "text": item.text,
        }
        for item in ordered
    ]
    return [{"text": text, "ents": ents}]

In [12]:
def show_results(an_r, text, return_analyzer_results=False):
    """Summarise the results of analyze() as a table plus highlight spans.

    :param an_r: analyzer results; each must provide ``to_dict()`` with the
        keys ``entity_type``, ``start``, ``end`` and ``score``.
    :param text: the text that was analyzed.
    :param return_analyzer_results: accepted for backward compatibility; it
        does not change the returned values.
    :return: tuple ``(table, spans)`` where ``table`` is a DataFrame with the
        columns "Entity type", "Text", "Start", "End", "Confidence" and
        ``spans`` is the value returned by ``annotate(text, an_r)``.
    """
    columns = ["Entity type", "Text", "Start", "End", "Confidence"]
    rows = []
    for res in an_r:
        info = res.to_dict()
        rows.append(
            {
                "Entity type": info["entity_type"],
                "Text": text[info["start"]: info["end"]],
                "Start": info["start"],
                "End": info["end"],
                "Confidence": info["score"],
            }
        )
    # Explicit columns keep the table well-formed when nothing was detected;
    # selecting columns from an empty, column-less frame raises KeyError.
    df_subset = pd.DataFrame(rows, columns=columns)
    result = annotate(text, an_r)
    return df_subset, result
  

## CREATE ANALYZER

In [13]:
# Build the analyzer around the "obi/deid_roberta_i2b2" model; for this
# model_path analyzer_engine() registers a TransformersRecognizer and uses
# the small spaCy model as the NLP engine.
analyzer = analyzer_engine("obi/deid_roberta_i2b2")

### TESTING ON SIMPLE DATA

In [14]:
# Sample review text containing a name, phone number, e-mail address,
# street address and credit card number for the detector to find.
text = (
    "Hi, I'm Romain. Absolutely loved this smartwatch. The sound quality amazing. "
    "Text me at (313) 247 1997 and you can email me at romain_thompson848@outlook.com. "
    "I live at 505 Black Gore dr, Vail, Colorado. "
    "My credit card number is 4539 1488 0343 6467."
)
# Value passed as score_threshold to the analyzer.
threshold = 0.50
# Entity types requested from the analyzer.
entities = [
    "PERSON",
    "LOCATION",
    "PHONE_NUMBER",
    "EMAIL_ADDRESS",
    "CREDIT_CARD",
    "US_SSN",
    "US_BANK_NUMBER",
]

In [15]:
# Analyze the sample text; analyze() forwards text, entities, language and
# score_threshold as keyword arguments to analyzer.analyze().
analyze_results = analyze(
    analyzer=analyzer,
    text=text,
    entities= entities,
    language="en",
    score_threshold=threshold,
)



In [16]:
# show_results() returns the summary table and the entity spans produced by
# annotate(); render the highlighted text first, then the table.
frame, sentence = show_results(analyze_results, text)
displacy.render(sentence, style="ent", manual=True)
display(frame)

Unnamed: 0,Entity type,Text,Start,End,Confidence
0,PHONE_NUMBER,(313) 247,88,97,1.0
1,EMAIL_ADDRESS,romain_thompson848@outlook.com,127,157,1.0
2,PERSON,thompson,134,142,1.0
3,LOCATION,505 Black Gore,169,183,1.0
4,LOCATION,dr,184,186,1.0
5,LOCATION,V,188,189,1.0
6,LOCATION,Colorado,194,202,1.0
7,CREDIT_CARD,4539 1488 0343 6467,229,248,1.0
8,PHONE_NUMBER,1997,98,102,0.99
9,O,45,229,231,0.98


### TESTING ON A CSV FILE

In [17]:
# Settings for the CSV run: the threshold is raised to 0.70 (the simple-data
# test used 0.50); the entity list is the same one used there.
threshold = 0.70
entities = ["PERSON", "LOCATION", "PHONE_NUMBER", "EMAIL_ADDRESS","CREDIT_CARD", "US_SSN", "US_BANK_NUMBER"]

In [18]:
def create_obj(an_r, text):
    """Convert analyzer results into a list of plain dictionaries.

    :param an_r: iterable of results, each providing ``to_dict()`` with the
        keys ``start``, ``end``, ``score`` and ``entity_type``.
    :param text: the analyzed text, used to extract each matched substring.
    :return: list of dicts with the keys ``start``, ``end``, ``confidence``,
        ``entity`` and ``text``, in the same order as ``an_r``.
    """

    def _as_entity(result):
        # Flatten one result into the record shape described above.
        info = result.to_dict()
        begin, stop = info["start"], info["end"]
        return {
            "start": begin,
            "end": stop,
            "confidence": info["score"],
            "entity": info["entity_type"],
            "text": text[begin:stop],
        }

    return [_as_entity(result) for result in an_r]


def test_model(csv_path, json_path, entities, threshold, analizer):
    """Run PII detection over every row of a CSV file and save the findings as JSON.

    :param csv_path: CSV file, read as ISO-8859-1. Its header row is replaced
        by the column names PVID and CONTENT.
    :param json_path: output file; receives a JSON list with one object per
        row holding the keys PVID, TEXT and ENTITIES.
    :param entities: entity types passed to analyze().
    :param threshold: value passed to analyze() as score_threshold.
    :param analizer: analyzer engine used for every row. The parameter name
        (including its spelling) is kept so existing callers keep working.
    :return: None; the results are only written to ``json_path``.
    """
    df = pd.read_csv(csv_path, encoding="ISO-8859-1", header=0, names=["PVID", "CONTENT"])
    results = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        row_id = row.PVID
        text = row.CONTENT
        # Use the engine passed in by the caller, not a notebook-level global.
        analyze_results = analyze(
            analyzer=analizer,
            text=text,
            entities=entities,
            language="en",
            score_threshold=threshold,
        )
        found = create_obj(analyze_results, text)
        results.append({"PVID": row_id, "TEXT": text, "ENTITIES": found})
    # The context manager flushes and closes the output file even on error.
    with open(json_path, "w", encoding="ISO-8859-1") as fp:
        json.dump(results, fp)

#### CHANGE DIRECTORY

In [11]:
import os

# Folder that holds input.csv. Change this; leave it empty to stay in the
# current working directory (os.chdir("") raises FileNotFoundError).
WORK_DIR = ""
if WORK_DIR:
    os.chdir(WORK_DIR)
# Last expression of the cell: shows the directory the relative paths resolve against.
os.getcwd()

'/Users/gonzalo.zelinka/Desktop/POC-PII'

In [42]:
# Process input.csv and write the detected entities to output.json; both
# paths are relative to the current working directory.
test_model("input.csv", "output.json", entities, threshold, analyzer)

100%|██████████| 515/515 [01:33<00:00,  5.50it/s]
