![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/4.Clinical_DeIdentification.ipynb)


# Clinical Deidentification

## Colab Setup

In [1]:
import json
import os

from google.colab import files

# Open the Colab upload widget. The result is keyed by uploaded file name;
# the first key is used below as the path of the license JSON.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No license file was uploaded.")

license_path = next(iter(uploaded))
with open(license_path) as f:
    license_keys = json.load(f)

# Expose every license key/value pair as a notebook-level variable.
# globals() is explicit about the target namespace, whereas mutating the
# dict returned by locals() is only reliable at module (cell) scope.
globals().update(license_keys)

# Mirror the same key/value pairs into the process environment.
os.environ.update(license_keys)

Saving 3.3.0.spark_nlp_for_healthcare.txt to 3.3.0.spark_nlp_for_healthcare.txt


In [2]:
# Install PySpark plus the open-source Spark NLP release named by PUBLIC_VERSION.
# NOTE(review): $PUBLIC_VERSION / $JSL_VERSION / $SECRET are presumably the names
# injected from the uploaded license JSON by the previous cell — confirm.
! pip install --upgrade -q pyspark spark-nlp==$PUBLIC_VERSION

# Install the licensed Spark NLP Healthcare package from the private package index.
# The secret is embedded in the index URL, so check this cell's output before sharing it.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

[K     |████████████████████████████████| 212.4 MB 69 kB/s 
[K     |████████████████████████████████| 120 kB 22.7 MB/s 
[K     |████████████████████████████████| 198 kB 53.7 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 133 kB 1.8 MB/s 
[?25h

In [3]:
import json
import os
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

from sparknlp.util import *
from sparknlp.pretrained import ResourceDownloader
from pyspark.sql import functions as F

import string

import numpy as np
import pandas as pd

# pandas display: show every column, keep rows on one line, never truncate cell text.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("max_colwidth", None)

# Custom Spark settings handed to the licensed session starter.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

spark = sparknlp_jsl.start(secret=SECRET, params=params)

# Report the library versions in use.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.3.0
Spark NLP_JSL Version : 3.3.0


In [None]:
# if you want to start the session with custom params as in start function above
from pyspark.sql import SparkSession

def start(SECRET):
    """Build (or reuse) a SparkSession configured for licensed Spark NLP.

    Manual counterpart of ``sparknlp_jsl.start(secret=SECRET, params=params)``
    carrying the same three custom settings used there: driver memory,
    Kryo serializer buffer size and driver max result size.

    Args:
        SECRET: License secret; used to build the URL of the licensed jar.

    Returns:
        The session returned by ``builder.getOrCreate()``.

    Note:
        Reads the notebook-level names ``PUBLIC_VERSION`` and ``JSL_VERSION``,
        so the license-key cell must have been executed first.
    """
    builder = (
        SparkSession.builder
        .appName("Spark NLP Licensed")
        .master("local[*]")
        .config("spark.driver.memory", "16G")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.kryoserializer.buffer.max", "2000M")
        # Matches the maxResultSize passed to sparknlp_jsl.start() via `params`.
        .config("spark.driver.maxResultSize", "2000M")
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:" + PUBLIC_VERSION)
        .config("spark.jars", "https://pypi.johnsnowlabs.com/" + SECRET + "/spark-nlp-jsl-" + JSL_VERSION + ".jar")
    )

    return builder.getOrCreate()

#spark = start(SECRET)

In [None]:
# Echo the `spark` object created by the session-start cell to confirm it exists.
spark

# Deidentification Model

Protected Health Information: 
- individual’s past, present, or future physical or mental health or condition
- provision of health care to the individual
- past, present, or future payment for the health care 

Protected health information includes many common identifiers (e.g., name, address, birth date, Social Security Number) when they can be associated with the health information.

Load NER pipeline to identify protected entities:

In [None]:
# Wrap the raw "text" column into Spark NLP "document" annotations.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Sentence Detector annotator, processes various sentences per line.
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenizer splits words in a relevant format for NLP.
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on the PubMed dataset.
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on the n2c2 (de-identification and Heart Disease Risk Factors Challenge) datasets.
clinical_ner = (
    MedicalNerModel.pretrained("ner_deid_generic_augmented", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Turns the token-level tags in "ner" into the entity chunks exposed as "ner_chunk".
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
    ]
)

# Fit on a single empty-text row to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_generic_augmented download started this may take some time.
Approximate size to download 13.8 MB
[OK!]


### The pretrained NER model extracts:

- Name
- Profession
- Age
- Date
- Contact(Telephone numbers, FAX numbers, Email addresses)
- Location (Address, City, Postal code, Hospital Name, Employment information)
- Id (Social Security numbers, Medical record numbers, Internet protocol addresses)

In [None]:
# Sample clinical note used as pipeline input; it contains dates, names,
# a record number, an age, a hospital name and a street address.
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

In [None]:
# Run the fitted NER pipeline on a one-row DataFrame holding the sample note.
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
# Zip tokens with their predicted tags and explode to one row per (token, tag) pair.
result_df = (
    result
    .select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
)

In [None]:
# Count tokens per predicted tag, most frequent first.
(
    result_df
    .select("token", "ner_label")
    .groupBy('ner_label')
    .count()
    .orderBy('count', ascending=False)
    .show(truncate=False)
)

+----------+-----+
|ner_label |count|
+----------+-----+
|O         |27   |
|I-LOCATION|5    |
|I-NAME    |3    |
|B-DATE    |3    |
|B-NAME    |3    |
|B-LOCATION|2    |
|B-ID      |1    |
|B-AGE     |1    |
+----------+-----+



### Check extracted sensitive entities

In [None]:
# List every extracted chunk next to the entity label stored in its metadata.
(
    result
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|7194334                      |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|LOCATION |
|0295 Keats Street            |LOCATION |
+-----------------------------+---------+



### Excluding entities from deidentification

Sometimes we need to leave some entities in the text, for example, if we want to analyze the frequency of the disease by the hospital. In this case, we need to use the **`setWhiteList()`** parameter to modify the `ner_chunk` output. This parameter takes as input the list of entity types to deidentify. So, if we want to keep the location in the text, we need to leave its tag out of this list:

In [None]:
# Only the whitelisted entity types are kept as chunks; LOCATION and CONTACT are left out.
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(['NAME', 'PROFESSION', 'ID', 'AGE', 'DATE'])
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
    ]
)

# Fit on a single empty-text row to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model_with_white_list = nlpPipeline.fit(empty_data)

In [None]:
# Same sample note as before, extended with a trailing phone number.
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street. Phone +1 (302) 786-5227.
'''

# Run the whitelist-restricted pipeline on the note.
result_with_white_list = model_with_white_list.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
# Chunks from the unrestricted converter (`result` comes from the earlier transform cell).
print("All Labels :")
(
    result
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

# Chunks that survive the whitelist.
print("WhiteListed Labels: ")
(
    result_with_white_list
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

All Labels :
+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|7194334                      |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|LOCATION |
|0295 Keats Street            |LOCATION |
+-----------------------------+---------+

WhiteListed Labels: 
+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|2093-01-13       |DATE     |
|David Hale       |NAME     |
|Hendrickson , Ora|NAME     |
|7194334          |ID       |
|01/13/93         |DATE     |
|Oliveira         |NAME     |
|25               |AGE      |
|2079-11-09       |DATE     |
+-----------------+---------+



## Masking and Obfuscation

### Replace these entities with Tags

In [None]:
# Converter without a whitelist: every entity type becomes a chunk again.
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

deidentification = (
    DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    # Return a new column to save the mappings between the mask/obfuscated entities and original entities.
    .setReturnEntityMappings(True)
    # .setMappingsColumn("MappingCol")  # change the name of the column, 'aux' is default
)

deidPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
        deidentification,
    ]
)

# Fit on a single empty-text row to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model_deid = deidPipeline.fit(empty_data)

In [None]:
# Sample note ending with a phone number, used as input for the masking pipeline.
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street. Phone (302) 786-5227.
'''
# Rebinds `result` to the output of the masking pipeline.
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
# Preview all pipeline output columns (values are truncated by show()).
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
A . Record date ...|[{document, 0, 24...|[{document, 1, 3,...|[{token, 1, 1, A,...|[{word_embeddings...|[{named_entity, 1...|[{chunk, 19, 28, ...|[{document, 0, 2,...|[{chunk, 17, 22, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
# List every extracted chunk next to the entity label stored in its metadata.
(
    result
    .select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|7194334                      |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|LOCATION |
|0295 Keats Street            |LOCATION |
|(302) 786-5227               |CONTACT  |
+-----------------------------+---------+



In [None]:
# Side-by-side view of each original sentence and its masked version.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : <DATE> , <NAME> , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : <NAME> MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : <DATE> PCP : <NAME> , <AGE> years-old , Record date : <DATE> ."
4,Cocke County Baptist Hospital .,<LOCATION> .
5,0295 Keats Street.,<LOCATION>.
6,Phone (302) 786-5227.,Phone <CONTACT>.


### Mapping Column

In [None]:
# Dump the raw mapping column ('aux') untruncated.
result.select("aux").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Flatten the mapping column: original chunk and its offsets next to the
# replacement label and the label's offsets.
(
    result
    .select(F.explode(F.arrays_zip('aux.metadata', 'aux.result', 'aux.begin', 'aux.end')).alias("cols"))
    .select(
        F.expr("cols['0']['originalChunk']").alias("chunk"),
        F.expr("cols['0']['beginOriginalChunk']").alias("beginChunk"),
        F.expr("cols['0']['endOriginalChunk']").alias("endChunk"),
        F.expr("cols['1']").alias("label"),
        F.expr("cols['2']").alias("beginLabel"),
        F.expr("cols['3']").alias("endLabel"),
    )
    .show(truncate=False)
)

+-----------------------------+----------+--------+----------+----------+--------+
|chunk                        |beginChunk|endChunk|label     |beginLabel|endLabel|
+-----------------------------+----------+--------+----------+----------+--------+
|2093-01-13                   |19        |28      |<DATE>    |17        |22      |
|David Hale                   |32        |41      |<NAME>    |26        |31      |
|Hendrickson , Ora            |60        |76      |<NAME>    |49        |54      |
|7194334                      |85        |91      |<ID>      |62        |65      |
|01/13/93                     |100       |107     |<DATE>    |74        |79      |
|Oliveira                     |115       |122     |<NAME>    |87        |92      |
|25                           |126       |127     |<AGE>     |96        |100     |
|2079-11-09                   |155       |164     |<DATE>    |128       |133     |
|Cocke County Baptist Hospital|168       |196     |<LOCATION>|136       |145     |
|029

## Reidentification

We can use `ReIdentification` annotator to go back to the original sentence.

In [None]:
# Consumes the mapping column and the masked text; writes the restored text to "original".
reIdentification = (
    ReIdentification()
    .setInputCols(["aux","deidentified"])
    .setOutputCol("original")
)

In [None]:
# Apply re-identification to the masked result (needs its 'aux' and 'deidentified' columns).
reid_result = reIdentification.transform(result)

In [None]:
# Preview all columns, now including 'original' (values are truncated by show()).
reid_result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|            original|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
A . Record date ...|[{document, 0, 24...|[{document, 1, 3,...|[{token, 1, 1, A,...|[{word_embeddings...|[{named_entity, 1...|[{chunk, 19, 28, ...|[{document, 0, 2,...|[{chunk, 17, 22, ...|[{document, 1, 3,...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [None]:
# Print the input note so it can be compared with the re-identified text below.
print(text)

# Sentences restored by ReIdentification.
reid_result.select('original.result').show(truncate=False)


A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street. Phone (302) 786-5227.

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[A ., R

## Using multiple NER in the same pipeline

In [None]:
from sparknlp_jsl.annotator import *

documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# First NER: generic de-identification labels.
deid_ner = (
    MedicalNerModel.pretrained("ner_deid_generic_augmented", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_generic")
)

# CONTACT and LOCATION are removed from the generic model's chunks.
ner_converter = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner_generic"])
    .setOutputCol("ner_generic_chunk")
    .setWhiteList(['ID', 'DATE', 'AGE', 'NAME', 'PROFESSION'])
)

# Second NER: finer-grained sub-entity labels.
deid_ner_enriched = (
    MedicalNerModel.pretrained("ner_deid_subentity_augmented", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_subentity")
)

# We can also add PATIENT and DOCTOR entities and remove NAME entity from the other NER model.
ner_converter_enriched = (
    NerConverter()
    .setInputCols(["sentence", "token", "ner_subentity"])
    .setOutputCol("ner_subentity_chunk")
    .setWhiteList(['COUNTRY', 'CITY', 'HOSPITAL', 'STATE', 'STREET', 'ZIP'])
)

# Combine the chunks of both converters into one column.
chunk_merge = (
    ChunkMergeApproach()
    .setInputCols("ner_subentity_chunk","ner_generic_chunk")
    .setOutputCol("deid_merged_chunk")
)

deidentification = (
    DeIdentification()
    .setInputCols(["sentence", "token", "deid_merged_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setIgnoreRegex(True)
)

nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        deid_ner,
        ner_converter,
        deid_ner_enriched,
        ner_converter_enriched,
        chunk_merge,
        deidentification,
    ]
)

# Fit on a single empty-text row to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_generic_augmented download started this may take some time.
Approximate size to download 13.8 MB
[OK!]
ner_deid_subentity_augmented download started this may take some time.
Approximate size to download 14.1 MB
[OK!]


In [None]:
# Sample note for the multi-NER pipeline; the phone number is written without parentheses.
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 719435 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street. Phone 302-786-5227.
'''

In [None]:
# Run the multi-NER pipeline on the sample note.
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

# ner_deid_generic_augmented
(
    result
    .select(F.explode(F.arrays_zip('ner_generic_chunk.result', 'ner_generic_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|2093-01-13       |DATE     |
|David Hale       |NAME     |
|Hendrickson , Ora|NAME     |
|719435           |ID       |
|01/13/93         |DATE     |
|Oliveira         |NAME     |
|25               |AGE      |
|2079-11-09       |DATE     |
+-----------------+---------+



In [None]:
# ner_deid_subentity_augmented
(
    result
    .select(F.explode(F.arrays_zip('ner_subentity_chunk.result', 'ner_subentity_chunk.metadata')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|Cocke County Baptist Hospital|HOSPITAL |
|0295 Keats Street.           |STREET   |
+-----------------------------+---------+



In [None]:
# merged chunk
result.select(F.explode(F.arrays_zip('deid_merged_chunk.result', 'deid_merged_chunk.metadata')).alias("cols")) \
      .select(F.expr("cols['0']").alias("chunk"),
              F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|719435                       |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|HOSPITAL |
|0295 Keats Street.           |STREET   |
+-----------------------------+---------+



In [None]:
# Side-by-side view of each original sentence and its masked version.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : <DATE> , <NAME> , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : <NAME> MR ."
3,"# 719435 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : <DATE> PCP : <NAME> , <AGE> years-old , Record date : <DATE> ."
4,Cocke County Baptist Hospital .,<HOSPITAL> .
5,0295 Keats Street.,<STREET>
6,Phone 302-786-5227.,Phone 302-786-5227.


### Enriching with Regex and Override NER




In [None]:
# Masking stage with regex matching enabled (no setIgnoreRegex) and
# setRegexOverride(True) switched on.
deidentification = (
    DeIdentification()
    .setInputCols(["sentence", "token", "deid_merged_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setRegexOverride(True)
)

# List the stages explicitly instead of nesting `nlpPipeline`: that pipeline
# already ends with a DeIdentification stage writing "deidentified", so nesting
# it would run de-identification twice and overwrite the first result.
pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        deid_ner,
        ner_converter,
        deid_ner_enriched,
        ner_converter_enriched,
        chunk_merge,
        deidentification,
    ]
)

model_default_rgx = pipeline.fit(empty_data)

In [None]:
# Run the regex-enabled pipeline on the same note.
result = model_default_rgx.transform(spark.createDataFrame([[text]]).toDF("text"))

# Side-by-side view of each original sentence and its masked version.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : <DATE> , <NAME> , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : <NAME> MR ."
3,"# 719435 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : <DATE> PCP : <NAME> , <AGE> years-old , Record date : <DATE> ."
4,Cocke County Baptist Hospital .,<HOSPITAL> .
5,0295 Keats Street.,<STREET>
6,Phone 302-786-5227.,Phone <PHONE>.


In [None]:
# Custom regex dictionary: one "<LABEL> <pattern>" entry per line.
# Raw string so that `\d` is kept literally instead of being treated as an
# (invalid) string escape sequence.
rgx = r'''NUMBER ([0-9]{2})
NUMBER (\d{7})'''

with open("./custom_regex.txt", "w") as f:
    f.write(rgx)

# Read the file back inside a context manager so the handle is closed.
with open("./custom_regex.txt", "r") as f:
    print(f.read())

NUMBER ([0-9]{2})
NUMBER (\d{7})


We see that two entities have a conflict between the regex and the NER. NER has priority by default. We can change this with the `setRegexOverride` param.

In [None]:
# Masking stage that loads the custom regex dictionary and lets regex matches override NER.
deidentification_rgx = (
    DeIdentification()
    .setInputCols(["sentence", "token", "deid_merged_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setRegexPatternsDictionary("./custom_regex.txt")
    .setRegexOverride(True)
)

nlpPipeline_rgx = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        deid_ner,
        ner_converter,
        deid_ner_enriched,
        ner_converter_enriched,
        chunk_merge,
        deidentification_rgx,
    ]
)

# Fit on a single empty-text row to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model_rgx = nlpPipeline_rgx.fit(empty_data)

In [None]:
# Sample note whose 7-digit record number and 2-digit age match the custom NUMBER patterns.
text = '''A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street.'''

result = model_rgx.transform(spark.createDataFrame([[text]]).toDF("text"))

# Side-by-side view of each original sentence and its masked version.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : <DATE> , <NAME> , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : <NAME> MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <NUMBER> Date : <DATE> PCP : <NAME> , <NUMBER> years-old , Record date : <DATE> ."
4,Cocke County Baptist Hospital .,<HOSPITAL> .
5,0295 Keats Street.,<STREET>


## Obfuscation mode

In obfuscation mode, **DeIdentificationModel** replaces sensitive entities with random values of the same type.


In [None]:
# Replacement values for obfuscation, one "<fake value>#<ENTITY>" entry per line.
obs_lines = "\n".join([
    "Marvin MARSHALL#PATIENT",
    "Hubert GROGAN#PATIENT",
    "ALTHEA COLBURN#PATIENT",
    "Kalil AMIN#PATIENT",
    "Inci FOUNTAIN#PATIENT",
    "Ekaterina Rosa#DOCTOR",
    "Rudiger Chao#DOCTOR",
    "COLLETTE KOHLER#DOCTOR",
    "Mufi HIGGS#DOCTOR",
])

# Persist the entries so DeIdentification can load them as its reference file.
with open('obfuscation.txt', 'w') as ref_file:
    ref_file.write(obs_lines)

In [None]:
# Sub-entity NER, rebound to `deid_ner` and writing to the plain "ner" column.
deid_ner = (
    MedicalNerModel.pretrained("ner_deid_subentity_augmented", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Obfuscation stage. The chain is parenthesized so that no line-continuation
# backslash is left dangling in front of the commented-out option.
obfuscation = (
    DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setObfuscateRefFile('obfuscation.txt')
    # .setObfuscateRefSource("both")
)

pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        deid_ner,
        ner_converter,
        obfuscation,
    ]
)

obfuscation_model = pipeline.fit(empty_data)

ner_deid_subentity_augmented download started this may take some time.
Approximate size to download 14.1 MB
[OK!]


In [None]:
# Sample note in which the patient is introduced explicitly ("Patient : ...").
text = '''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR # 7194334 Date : 01/13/93 . Patient : Oliveira, 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

result = obfuscation_model.transform(spark.createDataFrame([[text]]).toDF("text"))

# Side-by-side view of each original sentence and its obfuscated version.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-02-14 , Dr Milda Bard , M.D ."
2,", Name : Hendrickson , Ora MR # 7194334 Date : 01/13/93 .",", Name : Alban Barlow MR # A8388982 Date : 02-06-1986 ."
3,"Patient : Oliveira, 25 years-old , Record date : 2079-11-09 .","Patient : Lanetta Gear, 82 years-old , Record date : 2079-11-14 ."
4,Cocke County Baptist Hospital .,SISTERS OF CHARITY HOSPITAL .
5,0295 Keats Street,HAIMINGERBERG


In [None]:
# Pull the raw chunk annotations to the driver to inspect their fields and metadata.
result.select("ner_chunk").collect()

[Row(ner_chunk=[Row(annotatorType='chunk', begin=19, end=28, result='2093-01-13', metadata={'sentence': '1', 'chunk': '0', 'ner_source': 'ner_chunk', 'entity': 'DATE', 'confidence': '1.0'}, embeddings=[]), Row(annotatorType='chunk', begin=32, end=41, result='David Hale', metadata={'sentence': '1', 'chunk': '1', 'ner_source': 'ner_chunk', 'entity': 'NAME', 'confidence': '0.99665'}, embeddings=[]), Row(annotatorType='chunk', begin=60, end=76, result='Hendrickson , Ora', metadata={'sentence': '2', 'chunk': '2', 'ner_source': 'ner_chunk', 'entity': 'NAME', 'confidence': '0.8545'}, embeddings=[]), Row(annotatorType='chunk', begin=83, end=89, result='7194334', metadata={'sentence': '2', 'chunk': '3', 'ner_source': 'ner_chunk', 'entity': 'ID', 'confidence': '1.0'}, embeddings=[]), Row(annotatorType='chunk', begin=98, end=105, result='01/13/93', metadata={'sentence': '2', 'chunk': '4', 'ner_source': 'ner_chunk', 'entity': 'DATE', 'confidence': '1.0'}, embeddings=[]), Row(annotatorType='chunk',

In [None]:
# Same obfuscation stage, now with the reference source set to "file".
obfuscation = (
    DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setObfuscateRefFile('obfuscation.txt')
    .setObfuscateRefSource("file")
)

pipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        deid_ner,
        ner_converter,
        obfuscation,
    ]
)

obfuscation_model = pipeline.fit(empty_data)

result = obfuscation_model.transform(spark.createDataFrame([[text]]).toDF("text"))

# Side-by-side view of each original sentence and its obfuscated version.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-01-23 , Ekaterina Rosa , M.D ."
2,", Name : Hendrickson , Ora MR # 7194334 Date : 01/13/93 .",", Name : ALTHEA COLBURN MR # <MEDICALRECORD> Date : 01-27-1975 ."
3,"Patient : Oliveira, 25 years-old , Record date : 2079-11-09 .","Patient : Marvin MARSHALL, <AGE> years-old , Record date : 2079-11-30 ."
4,Cocke County Baptist Hospital .,<HOSPITAL> .
5,0295 Keats Street,<STREET>


## Faker mode

The faker module lets the user draw replacements from a set of fake entities held in memory by spark-nlp-internal. You can enable it with the property `setObfuscateRefSource('faker')`.

If we set `setObfuscateRefSource('both')`, the replacement entities are chosen randomly from both the faker and the fake entities in the obfuscateRefFile.


The entities currently supported are the following:

* Location
* Location-other
* Hospital
* City
* State
* Zip
* Country
* Contact
* Username
* Phone
* Fax
* Url
* Email
* Profession
* Name
* Doctor
* Patient
* Id
* Idnum
* Bioid
* Age
* Organization
* Healthplan
* Medicalrecord
* Ssn
* Passport
* DLN
* NPI
* C_card
* IBAN
* DEA
* Device




In [None]:
# De-identification stage in "obfuscate" mode that takes its replacement
# values from the built-in faker source instead of a reference file.
# The chain is parenthesized rather than continued with backslashes: the
# original ended with a dangling "\" that only parsed because a blank line
# happened to follow it.
obfuscation = (
    DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setObfuscateRefSource("faker")
)

# Same upstream stages as the previous pipelines, with the faker-based
# obfuscation stage appended last.
pipeline = Pipeline(stages=[
      documentAssembler,
      sentenceDetector,
      tokenizer,
      word_embeddings,
      deid_ner,
      ner_converter,
      obfuscation
])

# Fit on the empty frame to obtain a PipelineModel; every stage is either
# pretrained or rule-based, so no training data is needed here.
obfuscation_model = pipeline.fit(empty_data)

In [None]:
# Sample clinical note; the leading and trailing newlines are part of the input.
text = """
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR # 7194334 Date : 01/13/93 . Patient : Oliveira, 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
"""

result = obfuscation_model.transform(spark.createDataFrame([[text]]).toDF("text"))

# Pair every original sentence with its obfuscated counterpart, one row per sentence.
(
    result
    .select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols"))
    .select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified"))
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-02-15 , Dr Starlene Settle , M.D ."
2,", Name : Hendrickson , Ora MR # 7194334 Date : 01/13/93 .",", Name : Durrell Fling MR # Y1542240 Date : 06-17-2003 ."
3,"Patient : Oliveira, 25 years-old , Record date : 2079-11-09 .","Patient : Antonio Chute, 31 years-old , Record date : 2079-12-28 ."
4,Cocke County Baptist Hospital .,IBERIA MEDICAL CENTER .
5,0295 Keats Street,Carlisle


## Using the full pipeline with LightPipeline

In [None]:
# Sample clinical note reused by the LightPipeline cells below; the leading
# and trailing newlines are part of the input.
text = """
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR # 7194334 Date : 01/13/93 . Patient : Oliveira, 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
"""

In [None]:
# Wrap the fitted pipeline in a LightPipeline so a plain Python string can be annotated.
# NOTE(review): `model` is defined earlier in the notebook, outside this section.
light_model = LightPipeline(model)
annotated_text = light_model.annotate(text)
# Display the 'deidentified' entry of the annotation result.
annotated_text['deidentified']

['A .',
 'Record date : <DATE> , <NAME> , M.D .',
 ', Name : <NAME> MR # <ID> Date : <DATE> .',
 'Patient : <NAME>, <AGE> years-old , Record date : <DATE> .',
 '<HOSPITAL> .',
 '<STREET>']

In [None]:
# Same as above, but wrapping the obfuscation pipeline model.
# NOTE(review): `obfuscation_model` is whichever obfuscation pipeline was fitted last.
obf_light_model = LightPipeline(obfuscation_model)
# Overwrites `annotated_text` from the previous cell.
annotated_text = obf_light_model.annotate(text)
# Display the 'deidentified' entry of the annotation result.
annotated_text['deidentified']

['A .',
 'Record date : 2093-02-10 , Dr Starlene Settle , M.D .',
 ', Name : Durrell Fling MR # Y1542240 Date : 06-17-2003 .',
 'Patient : Antonio Chute, 31 years-old , Record date : 2079-12-23 .',
 'IBERIA MEDICAL CENTER .',
 'Carlisle']

# Structured Deidentification

In [None]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/hipaa-table-001.txt

df = spark.read.format("csv") \
    .option("sep", "\t") \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .load("hipaa-table-001.txt")

df = df.withColumnRenamed("PATIENT","NAME")
df.show(truncate=False)

+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|NAME           |DOB       |AGE|ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|Cecilia Chapman|04/02/1935|83 |711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|Iris Watson    |03/10/2009|9  |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|Bryar Pitts    |11/01/1921|98 |5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|Theodore Lowe  |13/02/2002|16 |Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|Calista Wise   |20/08/1942|76 |7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|Kyla Olsen     |12/05/1973|45 |Ap #651-8679 Sodales Av. Tamunin

In [None]:
from sparknlp_jsl.structured_deidentification import StructuredDeidentification

In [None]:
# Obfuscate the NAME and AGE columns, using the faker as the source of
# replacement values. The dict presumably maps column name -> entity label.
obfuscator = StructuredDeidentification(
    spark,
    {"NAME": "PATIENT", "AGE": "AGE"},
    obfuscateRefSource="faker",
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.show(truncate=False)

+--------------------+----------+----+----------------------------------------------------+-------+--------------+---+---+
|NAME                |DOB       |AGE |ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+--------------------+----------+----+----------------------------------------------------+-------+--------------+---+---+
|[Jonnie Aurora]     |04/02/1935|[46]|711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|[Darin Muff]        |03/10/2009|[33]|P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|[Shoshana Greenland]|11/01/1921|[88]|5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|[Sheryll Clink]     |13/02/2002|[16]|Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|[Georgi Sermons]    |20/08/1942|[7] |7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|[Ulyess Loma]  

In [None]:
# Reference entries for file-based obfuscation, one "<value>#<ENTITY>" pair per line.
obfuscator_unique_ref_test = """Will Perry#PATIENT
John Smith#PATIENT
Marvin MARSHALL#PATIENT
Hubert GROGAN#PATIENT
ALTHEA COLBURN#PATIENT
Kalil AMIN#PATIENT
Inci FOUNTAIN#PATIENT
Jackson WILLE#PATIENT
Jack SANTOS#PATIENT
Mahmood ALBURN#PATIENT
Marnie MELINGTON#PATIENT
Aysha GHAZI#PATIENT
Maryland CODER#PATIENT
Darene GEORGIOUS#PATIENT
Shelly WELLBECK#PATIENT
Min Kun JAE#PATIENT
Thomson THOMAS#PATIENT
Christian SUDDINBURG#PATIENT
Aberdeen#CITY
Louisburg St#STREET
France#LOC
Nick Riviera#DOCTOR
5552312#PHONE
St James Hospital#HOSPITAL
Calle del Libertador#ADDRESS
111#ID
Will#DOCTOR
20#AGE
30#AGE
40#AGE
50#AGE
60#AGE
"""

# Write the entries to a file in the current working directory.
with open('obfuscator_unique_ref_test.txt', 'w') as ref_file:
    ref_file.write(obfuscator_unique_ref_test)

In [None]:
# obfuscateRefSource = "file"

# Resolve the reference file against the current working directory, which is
# where the notebook wrote it, instead of hard-coding Colab's "/content"
# prefix. In Colab this yields the same absolute path as before; elsewhere it
# keeps the cell runnable.
obfuscator = StructuredDeidentification(
    spark,
    {"NAME": "PATIENT", "AGE": "AGE"},
    obfuscateRefFile=os.path.abspath("obfuscator_unique_ref_test.txt"),
    obfuscateRefSource="file",
    # Fixed per-column seeds, presumably so the replacements are reproducible.
    columnsSeed={"NAME": 23, "AGE": 23},
)
obfuscator_df = obfuscator.obfuscateColumns(df)
# Show only the two obfuscated columns.
obfuscator_df.select("NAME", "AGE").show(truncate=False)

+------------------+----+
|NAME              |AGE |
+------------------+----+
|[Inci FOUNTAIN]   |[60]|
|[Jack SANTOS]     |[30]|
|[Darene GEORGIOUS]|[30]|
|[Shelly WELLBECK] |[40]|
|[Hubert GROGAN]   |[40]|
|[Kalil AMIN]      |[40]|
|[ALTHEA COLBURN]  |[60]|
|[Thomson THOMAS]  |[60]|
|[Jack SANTOS]     |[60]|
|[Will Perry]      |[20]|
|[Jackson WILLE]   |[60]|
|[Shelly WELLBECK] |[40]|
|[Kalil AMIN]      |[30]|
|[Marnie MELINGTON]|[30]|
|[Min Kun JAE]     |[30]|
|[Marvin MARSHALL] |[60]|
|[Marvin MARSHALL] |[50]|
|[Min Kun JAE]     |[30]|
|[Maryland CODER]  |[20]|
|[Marnie MELINGTON]|[20]|
+------------------+----+
only showing top 20 rows



# Pretrained Deidentification Pipeline

This pipeline can be used to deidentify PHI in medical texts. The PHI will be masked and obfuscated in the resulting text. The pipeline can mask and obfuscate `AGE`, `CONTACT`, `DATE`, `ID`, `LOCATION`, `NAME`, `PROFESSION`, `CITY`, `COUNTRY`, `DOCTOR`, `HOSPITAL`, `IDNUM`, `MEDICALRECORD`, `ORGANIZATION`, `PATIENT`, `PHONE`, `STREET`, `USERNAME`, `ZIP`, `ACCOUNT`, `LICENSE`, `VIN`, `SSN`, `DLN`, `PLATE`, `IPADDR` entities.

In [4]:
from sparknlp.pretrained import PretrainedPipeline

# Load the pretrained clinical de-identification pipeline (English) from the
# "clinical/models" location; it is downloaded if not already cached.
deid_pipeline = PretrainedPipeline(
    "clinical_deidentification",
    lang="en",
    remote_loc="clinical/models",
)

clinical_deidentification download started this may take some time.
Approx size to download 1.6 GB
[OK!]


In [6]:
# Inspect the stages of the pipeline model wrapped by the pretrained pipeline.
deid_pipeline.model.stages

[DocumentAssembler_4138763a0971,
 SentenceDetector_86e931e7c8f8,
 REGEX_TOKENIZER_d598143a4ab3,
 WORD_EMBEDDINGS_MODEL_9004b1d00302,
 MedicalNerModel_e8178a1262cc,
 NerConverter_c65ef30dd048,
 MedicalNerModel_342e883f0c40,
 NerConverter_57d05987bf2d,
 MERGE_2e8463f42c79,
 CONTEXTUAL-PARSER_51466b721eb0,
 CONTEXTUAL-PARSER_a3de95f1471b,
 CONTEXTUAL-PARSER_c58ac84f8e17,
 CONTEXTUAL-PARSER_7c8367ecb966,
 CONTEXTUAL-PARSER_0a5a264bbd63,
 MERGE_af6b7886587f,
 MERGE_362e4fb9ec92,
 MERGE_7ca0817daa8c,
 MERGE_3887fcbb1b25,
 MERGE_3222ccbd2835,
 DE-IDENTIFICATION_63d7afedd4fa,
 DE-IDENTIFICATION_43da675b6e68,
 Finisher_6c199c7afe3f]

In [7]:
# Run the pretrained pipeline on one sample note containing several kinds of identifiers
# (dates, names, IP address, licence number, SSN, e-mail, record number, VIN).
deid_res = deid_pipeline.annotate("Record date : 2093-01-13, David Hale, M.D. IP: 203.120.223.13. The driver's license no:A334455B. the SSN:324598674 and e-mail: hale@gmail.com. Name : Hendrickson, Ora MR. 25 years-old # 719435 Date : 01/13/93. Signed by Oliveira Sander, . Record date : 2079-11-09, Patient's VIN : 1HGBH41JXMN109286.")

In [8]:
# List the output fields returned by annotate().
deid_res.keys()

dict_keys(['masked', 'obfuscated', 'ner_chunk', 'sentence'])

In [9]:
# Cap displayed cell width so the three text columns fit side by side.
pd.set_option("display.max_colwidth", 100)

# One row per sentence: original text, masked version, obfuscated version.
# NOTE: this rebinds `df`, which previously held the Spark table.
df = pd.DataFrame(
    [*zip(deid_res['sentence'], deid_res['masked'], deid_res['obfuscated'])],
    columns=['Sentence', 'Masked', 'Obfuscated'],
)
df

Unnamed: 0,Sentence,Masked,Obfuscated
0,"Record date : 2093-01-13, David Hale, M.D.","Record date : <DATE>, <DOCTOR>, M.D.","Record date : 2093-01-16, Dr Hilma Nancy, M.D."
1,IP: 203.120.223.13.,IP: <IPADDR>.,IP: 003.003.003.003.
2,The driver's license no:A334455B.,The driver's license <DLN>.,The driver's license S99921801.
3,the SSN:324598674 and e-mail: hale@gmail.com.,the <SSN> and e-mail: <EMAIL>.,the 999-36-5441 and e-mail: Loria@google.com.
4,"Name : Hendrickson, Ora MR. 25 years-old # 719435 Date : 01/13/93.",Name : <PATIENT> MR. <AGE> years-old # <MEDICALRECORD> Date : <DATE>.,Name : Eda Buddle MR. 5 years-old # Z5145513 Date : 03-18-1986.
5,"Signed by Oliveira Sander, .","Signed by <DOCTOR>, .","Signed by Dr Joaquim Orris, ."
6,"Record date : 2079-11-09, Patient's VIN : 1HGBH41JXMN109286.","Record date : <DATE>, Patient's VIN : <VIN>.","Record date : 2079-11-30, Patient's VIN : 5eeee44ffff555666."
