![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/Certification_Trainings/Healthcare/4.1.Pretrained_Clinical_DeIdentificiation.ipynb)

# Pretrained Clinical Deidentification 

In [None]:
import os

# The license secret is read from the environment so it never has to be
# written into the notebook itself.
jsl_secret = os.getenv('SECRET')

import sparknlp
sparknlp_version = sparknlp.version()
import sparknlp_jsl
jsl_version = sparknlp_jsl.version()

# Report only whether the secret was found. Printing its value would store the
# credential in the saved cell output of every shared copy of this notebook.
print("SECRET is set:", jsl_secret is not None)

In [None]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

# Spark settings handed to sparknlp_jsl.start() through its `params` argument.
params = {"spark.driver.memory":"16G",
"spark.kryoserializer.buffer.max":"2000M",
"spark.driver.maxResultSize":"2000M"}

# Start the licensed session using the secret read from the environment.
spark = sparknlp_jsl.start(jsl_secret, params=params)


# Print the installed library versions for reference.
print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.0.1
Spark NLP_JSL Version : 3.0.0


In [None]:
# Use this instead of sparknlp_jsl.start() when you need to build the Spark
# session by hand with custom settings.
def start(secret):
    """Create (or reuse) a local SparkSession set up for licensed Spark NLP.

    Args:
        secret: License secret; it is inserted into the URL from which the
            licensed ``spark-nlp-jsl`` jar is fetched.

    Returns:
        The session returned by ``builder.getOrCreate()``.
    """
    # Resolve both versions from the installed packages. The original
    # referenced an undefined name ``version`` and raised NameError.
    public_version = sparknlp.version()
    licensed_version = sparknlp_jsl.version()

    # NOTE(review): the artifact id below pins Scala 2.11 -- confirm it matches
    # the Scala build of the Spark NLP release in use.
    builder = SparkSession.builder \
        .appName("Spark NLP Licensed") \
        .master("local[*]") \
        .config("spark.driver.memory", "16G") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.kryoserializer.buffer.max", "2000M") \
        .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:"+public_version) \
        .config("spark.jars", "https://pypi.johnsnowlabs.com/"+secret+"/spark-nlp-jsl-"+licensed_version+".jar")

    return builder.getOrCreate()

# Usage: spark = start(jsl_secret)

In [None]:
spark

# Deidentification Model

Protected Health Information: 
- individual’s past, present, or future physical or mental health or condition
- provision of health care to the individual
- past, present, or future payment for the health care 

Protected health information includes many common identifiers (e.g., name, address, birth date, Social Security Number) when they can be associated with the health information.

Load the NER pipeline to identify protected entities:

In [None]:
import pandas as pd

# Show complete frames when results are converted with toPandas(): no limit on
# the number of columns, no wrapping of wide frames, no truncation of cells.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# Use the fully qualified option name, like the two calls above. pandas warns
# that abbreviated names can break when similarly named options are added.
pd.set_option('display.max_colwidth', None)


import pyspark.sql.functions as F
import string
import numpy as np
from sparknlp.util import *
from sparknlp.pretrained import ResourceDownloader

In [None]:
# Document assembler: turns the raw "text" column into a "document" annotation
documentAssembler = DocumentAssembler()\
  .setInputCol("text")\
  .setOutputCol("document")

# Sentence Detector annotator: splits each document into sentences

sentenceDetector = SentenceDetector()\
  .setInputCols(["document"])\
  .setOutputCol("sentence")

# Tokenizer splits each sentence into tokens

tokenizer = Tokenizer()\
  .setInputCols(["sentence"])\
  .setOutputCol("token")

# Clinical word embeddings trained on the PubMed dataset

word_embeddings = WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")\
  .setInputCols(["sentence", "token"])\
  .setOutputCol("embeddings")

# NER model trained on the n2c2 (de-identification and Heart Disease Risk Factors Challenge) datasets

clinical_ner = MedicalNerModel.pretrained("ner_deid_large", "en", "clinical/models") \
  .setInputCols(["sentence", "token", "embeddings"]) \
  .setOutputCol("ner")

# Chunk converter: merges the token-level NER tags into entity chunks
ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner"])\
  .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    ner_converter])

# The models are loaded with .pretrained() and nothing is trained here, so an
# empty frame with a "text" column is enough to obtain a PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 14.1 MB
[OK!]


### Pretrained NER models extract:

- Name
- Profession
- Age
- Date
- Contact(Telephone numbers, FAX numbers, Email addresses)
- Location (Address, City, Postal code, Hospital Name, Employment information)
- Id (Social Security numbers, Medical record numbers, Internet protocol addresses)

In [None]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

In [None]:
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
result_df = result.select(F.explode(F.arrays_zip('token.result', 'ner.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

In [None]:
result_df.select("token", "ner_label").groupBy('ner_label').count().orderBy('count', ascending=False).show(truncate=False)

+----------+-----+
|ner_label |count|
+----------+-----+
|O         |27   |
|I-LOCATION|5    |
|I-NAME    |3    |
|B-DATE    |3    |
|B-NAME    |3    |
|B-LOCATION|2    |
|B-ID      |1    |
|B-AGE     |1    |
+----------+-----+



### Check extracted sensitive entities

In [None]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+-----------------------------+---------+
|chunk                        |ner_label|
+-----------------------------+---------+
|2093-01-13                   |DATE     |
|David Hale                   |NAME     |
|Hendrickson , Ora            |NAME     |
|7194334                      |ID       |
|01/13/93                     |DATE     |
|Oliveira                     |NAME     |
|25                           |AGE      |
|2079-11-09                   |DATE     |
|Cocke County Baptist Hospital|LOCATION |
|0295 Keats Street            |LOCATION |
+-----------------------------+---------+



There are cases where the model will miss some important entities, for example:

In [None]:
text ='''
Patient Aiqing, 25 years-old , born in Beijing, was transfered to the The Johns Hopkins Hospital. Phone number: (541) 754-3010. MSW 100009632582
'''

In [None]:
result2 = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
result2.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+----------------------+---------+
|chunk                 |ner_label|
+----------------------+---------+
|25                    |AGE      |
|Beijing               |LOCATION |
|Johns Hopkins Hospital|LOCATION |
|(541) 754-3010        |CONTACT  |
|100009632582          |ID       |
+----------------------+---------+



For these entities we can add a dictionary to the pipeline, by using **NerOverwriter()**:

In [None]:
# Force the tag "I-NAME" onto the listed tokens that the NER model missed.
# The corrected tags are written to a separate column, "ner_overwrited".
neroverwriter = NerOverwriter() \
    .setInputCols(["ner"]) \
    .setOutputCol("ner_overwrited") \
    .setStopWords(['Aiqing']) \
    .setNewResult("I-NAME")

# Build the chunks from the corrected tags instead of the raw "ner" column.
ner_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner_overwrited"])\
    .setOutputCol("ner_chunk")

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    neroverwriter,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

# Replaces the earlier `model` with the pipeline that includes the overwriter.
model = nlpPipeline.fit(empty_data)

Let's test the model after modification:

In [None]:
result2 = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
# Read the tags from "ner_overwrited" (the NerOverwriter output column), not
# from the raw "ner" column; otherwise the overwritten label for "Aiqing" is
# not visible in this check.
result_df2 = result2.select(F.explode(F.arrays_zip('token.result', 'ner_overwrited.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"))

result_df2.show()

+----------+----------+
|     token| ner_label|
+----------+----------+
|   Patient|         O|
|    Aiqing|         O|
|         ,|         O|
|        25|     B-AGE|
| years-old|         O|
|         ,|         O|
|      born|         O|
|        in|         O|
|   Beijing|B-LOCATION|
|         ,|         O|
|       was|         O|
|transfered|         O|
|        to|         O|
|       the|         O|
|       The|         O|
|     Johns|B-LOCATION|
|   Hopkins|I-LOCATION|
|  Hospital|I-LOCATION|
|         .|         O|
|     Phone|         O|
+----------+----------+
only showing top 20 rows



In [None]:
result2.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+----------------------+---------+
|chunk                 |ner_label|
+----------------------+---------+
|Aiqing                |NAME     |
|25                    |AGE      |
|Beijing               |LOCATION |
|Johns Hopkins Hospital|LOCATION |
|(541) 754-3010        |CONTACT  |
|100009632582          |ID       |
+----------------------+---------+



As we can see, now name **Aiqing** was identified correctly

### Excluding entities from deidentification

Sometimes we need to leave some entities in the text, for example, if we want to analyze the frequency of a disease by hospital. In this case, we can use the **setWhiteList()** parameter to modify the NerChunk output. This parameter takes as input a list of the entity types to deidentify. So, if we want to leave the location in the text, we need to remove this tag from the list:

In [None]:
# Keep only chunks whose entity type is in the whitelist. LOCATION is left
# out on purpose, so location chunks are dropped from "ner_chunk" and are not
# passed on to de-identification.
# NOTE: this rebinds the global `ner_converter`; later cells that reuse the
# name get this whitelisted converter until it is redefined.
ner_converter = NerConverterInternal()\
    .setInputCols(["sentence", "token", "ner_overwrited"])\
    .setOutputCol("ner_chunk") \
    .setWhiteList(['NAME', 'PROFESSION', 'ID', 'AGE',
               'DATE', 'CONTACT'])

nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    neroverwriter,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

# Fitted under a separate name so the unrestricted `model` stays available.
model_with_white_list = nlpPipeline.fit(empty_data)

In [None]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . The Johns Hopkins Hospital . 0295 Keats Street
'''

result_with_white_list = model_with_white_list.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
result_with_white_list.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|2093-01-13       |DATE     |
|David Hale       |NAME     |
|Hendrickson , Ora|NAME     |
|7194334          |ID       |
|01/13/93         |DATE     |
|Oliveira         |NAME     |
|25               |AGE      |
|2079-11-09       |DATE     |
+-----------------+---------+



## Masking and Obfuscation

### Replace these entities with tags - Masking

In [None]:
deidentification = DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("mask")

deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


In [None]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . The Johns Hopkins Hospital. 0295 Keats Street
'''
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+----------------------+---------+
|chunk                 |ner_label|
+----------------------+---------+
|2093-01-13            |DATE     |
|David Hale            |NAME     |
|Hendrickson , Ora     |NAME     |
|7194334               |ID       |
|01/13/93              |DATE     |
|Oliveira              |NAME     |
|25                    |AGE      |
|2079-11-09            |DATE     |
|Johns Hopkins Hospital|LOCATION |
|0295 Keats Street     |LOCATION |
+----------------------+---------+



In [None]:
deid_text = deidentification.transform(result)

In [None]:
deid_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : <DATE> , <NAME> , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : <NAME> MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : <DATE> PCP : <NAME> , <AGE> years-old , Record date : <DATE> ."
4,The Johns Hopkins Hospital.,The <LOCATION>.
5,0295 Keats Street,<LOCATION>


### Use **Obfuscate** mode

In the **Obfuscate** mode, DeIdentificationModel will replace sensitive entities with random values of the same type. 

Will be replaced: 
- Name
- Location
- Contacts
- Profession

Will be tagged:
- Age
- Date
- ID

In [None]:
obfuscation = DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("obfuscate")\
      .setObfuscateDate(True)

deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


In [None]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Johns Hopkins Hospital. 0295 Keats Street
'''
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|      ner_overwrited|           ner_chunk|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
A . Record date ...|[{document, 0, 20...|[{document, 1, 3,...|[{token, 1, 1, A,...|[{word_embeddings...|[{named_entity, 1...|[{named_entity, 1...|[{chunk, 19, 28, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
obfusated_text = obfuscation.transform(result)

In [None]:
obfusated_text.select('ner.result').take(1)

[Row(result=['O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'B-NAME', 'I-NAME', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME', 'I-NAME', 'I-NAME', 'O', 'O', 'O', 'B-ID', 'O', 'O', 'B-DATE', 'O', 'O', 'B-NAME', 'O', 'B-AGE', 'O', 'O', 'O', 'O', 'O', 'B-DATE', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION', 'O', 'B-LOCATION', 'I-LOCATION', 'I-LOCATION'])]

In [None]:
obfusated_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-02-24 , Kia , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : PAUL MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# <ID> Date : 03-13-1964 PCP : Seana , <AGE> years-old , Record date : 2079-12-26 ."
4,Johns Hopkins Hospital.,313 Asselsestraat 7.
5,0295 Keats Street,229 South Dalton


### Obfuscation with **"faker"** parameter in `.setObfuscateRefSource()`
- Dates will be changed according to the given parameter in `.setDays()` 

In [None]:
# NER model with the finer-grained "ner_deid_enriched" label set.
deid_ner = MedicalNerModel.pretrained("ner_deid_enriched", "en", "clinical/models") \
      .setInputCols(["sentence", "token", "embeddings"]) \
      .setOutputCol("ner")

# Use a dedicated chunk converter without a whitelist. The global
# `ner_converter` still carries the whitelist set in the "Excluding entities"
# section, which filters out location chunks and any label of this model that
# is not on that list, leaving those entities un-obfuscated.
faker_ner_converter = NerConverterInternal()\
      .setInputCols(["sentence", "token", "ner_overwrited"])\
      .setOutputCol("ner_chunk")

# Obfuscate using the built-in "faker" source; dates tagged "DATE" are
# shifted by the number of days given to setDays().
obfuscation = DeIdentification()\
      .setInputCols(["sentence", "token", "ner_chunk"])\
      .setOutputCol("deidentified")\
      .setMode("obfuscate") \
      .setObfuscateRefSource("faker") \
      .setObfuscateDate(True) \
      .setDateTag("DATE")\
      .setDays(5)


pipeline = Pipeline(stages=[
      documentAssembler, 
      sentenceDetector,
      tokenizer,
      word_embeddings,
      deid_ner,
      neroverwriter,
      faker_ner_converter,
      obfuscation
])


empty_data = spark.createDataFrame([[""]]).toDF("text")

model = pipeline.fit(empty_data)

ner_deid_enriched download started this may take some time.
Approximate size to download 14.2 MB
[OK!]


In [None]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Johns Hopkins Hospital. 0295 Keats Street
'''
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

result.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,A .,A .
1,"Record date : 2093-01-13 , David Hale , M.D .","Record date : 2093-01-18 , David Hale , M.D ."
2,", Name : Hendrickson , Ora MR .",", Name : Hendrickson , Ora MR ."
3,"# 7194334 Date : 01/13/93 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 .","# 03.27.52.01.78 Date : 11-11-2015 PCP : Oliveira , 56 years-old , Record date : 2079-11-14 ."
4,Johns Hopkins Hospital.,Johns Hopkins Hospital.
5,0295 Keats Street,0295 Keats Street


## Use full pipeline in the Light model

In [None]:
# NOTE(review): `finisher` is not added to the pipeline assembled in the next
# cell; confirm whether it is still needed.
finisher = Finisher() \
    .setInputCols("deidentified")

# Rebuild the chunk converter without a whitelist, so that every entity type
# found by the NER model reaches the de-identification stage.
ner_converter = NerConverterInternal()\
  .setInputCols(["sentence", "token", "ner_overwrited"])\
  .setOutputCol("ner_chunk") 

In [None]:
pipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    word_embeddings,
    clinical_ner,
    neroverwriter,
    ner_converter,
    obfuscation])

In [None]:
empty_data = spark.createDataFrame([[""]]).toDF("text")

model = pipeline.fit(empty_data)

In [None]:
light_model = LightPipeline(model)

In [None]:
text ='''
A . Record date : 2093-01-13 , David Hale , M.D . , Name : Hendrickson , Ora MR . # 7194334 Date : 01-13-1993 PCP : Oliveira , 25 years-old , Record date : 2079-11-09 . Cocke County Baptist Hospital . 0295 Keats Street
'''

In [None]:
annotated_text = light_model.annotate(text)
annotated_text['deidentified']

['A .',
 'Record date : 2093-01-18 , Mohmad Gordon , M.D .',
 ', Name : Anna Lawson MR .',
 '# D8839800 Date : 01-18-1993 PCP : Ranko Carroll , 58 years-old , Record date : 2079-11-14 .',
 '682 Guerrerostad .',
 '395 Tværgyden 40']

In [None]:
list(zip(annotated_text['token'], annotated_text['ner']))

[('A', 'O'),
 ('.', 'O'),
 ('Record', 'O'),
 ('date', 'O'),
 (':', 'O'),
 ('2093-01-13', 'B-DATE'),
 (',', 'O'),
 ('David', 'B-NAME'),
 ('Hale', 'I-NAME'),
 (',', 'O'),
 ('M.D', 'O'),
 ('.', 'O'),
 (',', 'O'),
 ('Name', 'O'),
 (':', 'O'),
 ('Hendrickson', 'B-NAME'),
 (',', 'I-NAME'),
 ('Ora', 'I-NAME'),
 ('MR', 'O'),
 ('.', 'O'),
 ('#', 'O'),
 ('7194334', 'B-ID'),
 ('Date', 'O'),
 (':', 'O'),
 ('01-13-1993', 'B-DATE'),
 ('PCP', 'O'),
 (':', 'O'),
 ('Oliveira', 'B-NAME'),
 (',', 'O'),
 ('25', 'B-AGE'),
 ('years-old', 'O'),
 (',', 'O'),
 ('Record', 'O'),
 ('date', 'O'),
 (':', 'O'),
 ('2079-11-09', 'B-DATE'),
 ('.', 'O'),
 ('Cocke', 'B-LOCATION'),
 ('County', 'I-LOCATION'),
 ('Baptist', 'I-LOCATION'),
 ('Hospital', 'I-LOCATION'),
 ('.', 'O'),
 ('0295', 'B-LOCATION'),
 ('Keats', 'I-LOCATION'),
 ('Street', 'I-LOCATION')]

In [None]:
annotated_text['ner_chunk']

['2093-01-13',
 'David Hale',
 'Hendrickson , Ora',
 '7194334',
 '01-13-1993',
 'Oliveira',
 '25',
 '2079-11-09',
 'Cocke County Baptist Hospital',
 '0295 Keats Street']

In [None]:
source_text = '''Record date : 2093-01-13, David Hale, M.D. is manager, 
Name: Hendrickson, Ora MR. # 7194334 Date: 01-13-1993 PCP: Oliveira.
Record date: 2079-11-09. Cocke County Baptist Hospital. 0295 Keats Street.
This 17-yr-old male, presented with chest heaviness that started during a pick-up basketball game. His past medical history was unremarkable. He denied prior cardiac symptoms and suffered no chest trauma during the game. His father had suffered an acute myocardial infarction at age 38. The patient was a nonsmoker, did not drink alcohol, and denied recreational drug use. He swallowed a tablet of aspirin before coming to the emergency room. His blood pressure was 160/90 mm Hg, and his heart rate was 80 bpm. Physical examination revealed no stigmata of Marfan syndrome. The rest of his physical examination was normal.'''

annotated_text = light_model.annotate(source_text)
annotated_text['deidentified']

['Record date : 2093-01-18, Mohmad Gordon, M.D. is Museum/gallery conservator, \nName: Anna Lawson MR. # D8839800 Date: 01-18-1993 PCP: Ranko Carroll.',
 'Record date: 2079-11-14.',
 '682 Guerrerostad.',
 '395 Tværgyden 40.',
 'This 17-yr-old male, presented with chest heaviness that started during a pick-up basketball game.',
 'His past medical history was unremarkable.',
 'He denied prior cardiac symptoms and suffered no chest trauma during the game.',
 'His father had suffered an acute myocardial infarction at age',
 '38. The patient was a nonsmoker, did not drink alcohol, and denied recreational drug use.',
 'He swallowed a tablet of aspirin before coming to the emergency room.',
 'His blood pressure was 160/90 mm Hg, and his heart rate was 80 bpm.',
 'Physical examination revealed no stigmata of Marfan syndrome.',
 'The rest of his physical examination was normal.']

## Train custom Model


In [None]:
glove_embeddings = WordEmbeddingsModel.pretrained('glove_100d')\
          .setInputCols(["document", "token"])\
          .setOutputCol("embeddings")

public_ner = NerDLModel.pretrained("ner_dl", 'en') \
          .setInputCols(["document", "token", "embeddings"]) \
          .setOutputCol("ner")

ner_converter = NerConverterInternal()\
          .setInputCols(["sentence", "token", "ner"])\
          .setOutputCol("ner_chunk")


glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]
ner_dl download started this may take some time.
Approximate size to download 13.6 MB
[OK!]


In [None]:

# Pipeline built on the public models (GloVe embeddings + ner_dl).
nlpPipeline = Pipeline(stages=[
    documentAssembler, 
    sentenceDetector,
    tokenizer,
    glove_embeddings,
    public_ner,
    ner_converter])

empty_data = spark.createDataFrame([[""]]).toDF("text")

# Fit on the empty frame, as the other pipelines in this notebook do. The
# original fitted on `result`, the already-annotated output of an earlier
# pipeline, and left `empty_data` unused.
custom_model = nlpPipeline.fit(empty_data)

In [None]:
source_text = '''Record date : 2093-01-13, David Hale lives in New York'''

In [None]:
result = custom_model.transform(spark.createDataFrame([[source_text]]).toDF("text"))

#### The public NER model returns different entity labels, such as 'PER', 'LOC' and 'ORG'

In [None]:
result.select(F.explode(F.arrays_zip('ner_chunk.result', 'ner_chunk.metadata')).alias("cols")) \
.select(F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label")).show(truncate=False)

+----------+---------+
|chunk     |ner_label|
+----------+---------+
|David Hale|PER      |
|New York  |LOC      |
+----------+---------+



### Mask custom entities

In [None]:
deidentification = DeIdentification() \
      .setInputCols(["sentence", "token", "ner_chunk"]) \
      .setOutputCol("deidentified") \
      .setMode("mask")

In [None]:
deid_model = deidentification.fit(result)
deid_text = deid_model.transform(result)

In [None]:
deid_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,"Record date : 2093-01-13, David Hale lives in New York","Record date : <DATE>, <PER> lives in <LOC>"


In [None]:
deid_model.write().overwrite().save('custom_deid_masker_model')

In order to obfuscate the new entities, we have to create a custom Deidentification model with a new dictionary

In [None]:
# Replacement dictionary for the custom entity labels: one "value#LABEL" entry
# per line, with '#' as the separator between value and label.
obfuscation_list = ['Marvin MARSHALL#PER',
      'Hubert GROGAN#PER',
      'ALTHEA COLBURN#PER',
      'Kalil AMIN#PER',
      'Inci FOUNTAIN#PER',
      'Surrey#ORG',
      'Warwickshire#ORG',
      'Derbyshire#ORG',
      'Leicestershire#ORG',
      'Glamorgan#ORG',
      'Durham#ORG',
      'Los Angeles#LOC']

# Open with 'w', not 'a': in append mode every re-run of this cell adds the
# whole list again, so the file fills up with duplicate entries.
with open('obfuscation.txt', 'w', encoding='utf-8') as the_file:
    the_file.writelines(line + '\n' for line in obfuscation_list)

In [None]:
obfuscator = DeIdentification() \
    .setInputCols(["sentence", "token", "ner_chunk"]) \
    .setOutputCol("deidentified") \
    .setMode('obfuscate') \
    .setRefSep('#') \
    .setObfuscateRefFile('obfuscation.txt') \
    .setObfuscateDate(True)


In [None]:
obfuscator_model = obfuscator.fit(result)

In [None]:
obfuscator_model.write().overwrite().save('custom_obfuscator_model')

In [None]:
obfusc_text = obfuscator_model.transform(result)

In [None]:
obfusc_text.select(F.explode(F.arrays_zip('sentence.result', 'deidentified.result')).alias("cols")) \
.select(F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")).toPandas()

Unnamed: 0,sentence,deidentified
0,"Record date : 2093-01-13, David Hale lives in New York","Record date : 2093-03-14, ALTHEA COLBURN lives in Los Angeles"


# Structured Deidentification

In [None]:
!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/tutorials/Certification_Trainings/Healthcare/data/hipaa-table-001.txt

In [None]:
df = spark.read.format("csv") \
    .option("sep", "\t") \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .load("hipaa-table-001.txt")

df = df.withColumnRenamed("PATIENT","NAME")
df.show(truncate=False)

+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|NAME           |DOB       |AGE|ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|Cecilia Chapman|04/02/1935|83 |711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|Iris Watson    |03/10/2009|9  |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|Bryar Pitts    |11/01/1921|98 |5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|Theodore Lowe  |13/02/2002|16 |Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|Calista Wise   |20/08/1942|76 |7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|Kyla Olsen     |12/05/1973|45 |Ap #651-8679 Sodales Av. Tamunin

In [None]:
from sparknlp_jsl.structured_deidentification import StructuredDeidentification


In [None]:
# Reference values for structured de-identification, one "value#LABEL" entry
# per line; the last entry has no trailing newline.
obfuscator_unique_ref_test = (
    'Will Perry#PATIENT\n'
    'John Smith#PATIENT\n'
    'Marvin MARSHALL#PATIENT\n'
    'Hubert GROGAN#PATIENT\n'
    'ALTHEA COLBURN#PATIENT\n'
    'Kalil AMIN#PATIENT\n'
    'Inci FOUNTAIN#PATIENT\n'
    'Aberdeen#CITY\n'
    'Louisburg St#STREET\n'
    'France#LOC\n'
    'Nick Riviera#DOCTOR\n'
    '5552312#PHONE\n'
    'St James Hospital#HOSPITAL\n'
    'Calle del Libertador#ADDRESS\n'
    '111#ID\n'
    'Will#DOCTOR'
)

# Overwrite the reference file in the working directory with these entries.
with open('obfuscator_unique_ref_test.txt', 'w') as ref_file:
    ref_file.write(obfuscator_unique_ref_test)

In [None]:
# Point at the reference file written in the previous cell. Resolve it against
# the current working directory instead of hard-coding Colab's "/content/"
# folder, so the cell also runs outside Colab (in Colab the result is the same).
obfuscator_ref_path = os.path.abspath("obfuscator_unique_ref_test.txt")

# Column-to-entity mapping: the NAME column is treated as PATIENT, AGE as AGE.
obfuscator = StructuredDeidentification(spark, {"NAME": "PATIENT", "AGE": "AGE"}, obfuscator_ref_path)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.select("NAME","AGE").show(truncate=False)

+-----------------------+----+
|NAME                   |AGE |
+-----------------------+----+
|[Oliver Sandgreen]     |[34]|
|[Moustafa Kristiansen] |[93]|
|[Antti Alatalo]        |[19]|
|[Genghis Rose]         |[21]|
|[Mathias Rustad]       |[80]|
|[Gauthier Dias]        |[22]|
|[Tomás Čechová]        |[61]|
|[Concordio Villanueva] |[29]|
|[John Henriksen]       |[45]|
|[Anastasio Ferreira]   |[76]|
|[Mohsin Miller]        |[33]|
|[Kayla Brkić]          |[45]|
|[Ricky Geisler]        |[93]|
|[Rasmus Takaki]        |[19]|
|[Annemette Christensen]|[59]|
|[William Johansen]     |[29]|
|[Christophe Fuchs]     |[19]|
|[Jaroslava Woźniak]    |[21]|
|[Nathaniel Wheeler]    |[8] |
|[Marisa Schiavone]     |[75]|
+-----------------------+----+
only showing top 20 rows

