![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/healthcare/DEID_EHR_DATA.ipynb)

# **De-identify Structured Data**

To run this yourself, you will need your Spark NLP for Healthcare license keys. Run the cell below and upload your license file (for example `license_keys.json`) when prompted; the notebook saves it as `spark_jsl.json`. Alternatively, open the file explorer on the left side of the screen and upload the file there under the name `spark_jsl.json`.
Otherwise, you can look at the example outputs at the bottom of the notebook.



## 1. Colab Setup


In [None]:
import json, os
from google.colab import files

# Reuse a license file that is already in the working directory; otherwise ask
# for an upload and store it under the file name the rest of this cell reads.
if 'spark_jsl.json' not in os.listdir():
    uploaded = files.upload()
    if not uploaded:
        # A cancelled/empty upload would otherwise surface as an opaque IndexError.
        raise FileNotFoundError(
            "No license file was uploaded; upload your license keys JSON and re-run this cell."
        )
    # Only the first uploaded file is used.
    os.rename(next(iter(uploaded)), 'spark_jsl.json')

with open('spark_jsl.json') as f:
    license_keys = json.load(f)

# Expose every license key/value pair both as a notebook-level variable and as
# an environment variable (the `!pip install` lines reference them as $NAME).
# At cell top level `globals()` is the same namespace the original `locals()`
# call updated; it just states the intent explicitly.
globals().update(license_keys)
os.environ.update(license_keys)

In [None]:
# Installing pyspark and spark-nlp
# ($PUBLIC_VERSION is expected to come from the license keys loaded in the
# setup cell, which exports every key as a variable and an environment variable.)
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# (pulled from the John Snow Labs package index; $JSL_VERSION and $SECRET are
# likewise expected to be license keys -- SECRET forms part of the index URL.)
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): unlike the packages above, this install is not version-pinned.
! pip install -q spark-nlp-display

In [3]:
import sparknlp
import sparknlp_jsl

from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql.types import StringType, IntegerType

import pandas as pd
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

# Spark configuration handed to the licensed session: driver memory plus the
# Kryo buffer and driver result-size limits.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Start the Spark session through the healthcare library, passing the SECRET
# entry of the license keys loaded in the setup cell.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Report the versions of both libraries actually in use.
print("Spark NLP Version :", sparknlp.version())
print("Spark NLP_JSL Version :", sparknlp_jsl.version())

# Last expression of the cell: render the session object as the cell output.
spark

Spark NLP Version : 4.2.8
Spark NLP_JSL Version : 4.2.8


**Create a `DataFrame`**

In [4]:
from collections import OrderedDict

In [5]:
# Demographics table: one list entry per patient, one key per column.
# This is the dict that is turned into `df` and de-identified below.
demo = {
    "ID": ["1002301", "4052191", "7017021"],
    "SSN": ["547-46-9390", "433-10-5021", "322-21-1197"],
    "NAME": ["Alex Williams", "David Smith", "Mary Johnson"],
    "DOB": ["1970-01-01", "1955-10-13", "1965-04-03"],
    "POB": [
        "Salt Lake City, Utah",
        "Charleston, SC",
        "Fayetteville, New York",
    ],
    "ADDRESS": [
        "615  Walton Street, Salt Lake City, UT, 84111",
        "3261  Broadway Street, Charleston, SC, 29424",
        "4116  Confederate Drive, Fayetteville, NY, 13066",
    ],
}

# Encounter records with a free-text SUMMARY column and an integer MED_UNITS
# column. Not referenced by the cells visible in this notebook.
enc = {
    "DATE": ["1999-10-05", "1999-05-25"],
    "PATIENT": ["Alex Williams", "David Smith"],
    "SUMMARY": [
        "Mr. Williams said he has cut his alcohol back to 6 pack once a week. He has cut back his cigarettes to one time per week. His PCP was M.D William who had suggested some tests.",
        "Mr. Smith is seen today.  I have not seen him since November. About three weeks ago he stopped his Prednisone on his own becausehe was gaining weight. ",
    ],
    "MED_CODE": ["99748", "99171"],
    "MED_UNITS": [5, 10],
}

# Records that contain several person-name columns.
# Not referenced by the cells visible in this notebook.
other = {
    "ID": ["1265343", "1243987"],
    "PATIENT": ["David Jones", "Jason Martinez"],
    "PROVIDER": ["Robert Smith", "Shawn Johnson"],
    "PAYER": ["Mark Daniel", "Charles Wilson"],
}


In [6]:
# Build the demographics frame from `demo`; the OrderedDict keeps the columns
# in the order the keys were written.
df = pd.DataFrame(data=OrderedDict(demo.items()))
df

Unnamed: 0,ID,SSN,NAME,DOB,POB,ADDRESS
0,1002301,547-46-9390,Alex Williams,1970-01-01,"Salt Lake City, Utah","615 Walton Street, Salt Lake City, UT, 84111"
1,4052191,433-10-5021,David Smith,1955-10-13,"Charleston, SC","3261 Broadway Street, Charleston, SC, 29424"
2,7017021,322-21-1197,Mary Johnson,1965-04-03,"Fayetteville, New York","4116 Confederate Drive, Fayetteville, NY, 13066"


**Build pipeline**

In [7]:
# Wrap the raw "text" column into the "document" annotation column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained healthcare sentence detector: "document" -> "sentence".
sentenceDetector = (
    SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare", "en", "clinical/models")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenize each sentence: "sentence" -> "token".
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on n2c2 datasets
clinical_ner = (
    MedicalNerModel.pretrained("ner_deid_enriched", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Turn the token-level "ner" tags into "ner_chunk" entities.
# NOTE(review): 'ID' is passed to setBlackList -- presumably to leave ID-labelled
# entities out of the chunks; confirm against the NerConverterInternal docs.
ner_converter = (
    NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setGreedyMode(True)
    .setIgnoreStopWords([","])
    .setBlackList(['ID'])
    .setPreservePosition(True)
)

# Entity-detection pipeline; the de-identification stages below are kept out
# of it and applied separately to its output.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        ner_converter,
    ]
)

# Fit on a one-row placeholder frame that only supplies the "text" column,
# then wrap the fitted pipeline in a LightPipeline.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = LightPipeline(nlpPipeline.fit(empty_data))

# Same pretrained de-identification model configured twice: once in "mask"
# mode (output column "masked") ...
deidentification = (
    DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("masked")
    .setMode("mask")
)

# ... and once in "obfuscate" mode with date obfuscation switched on
# (output column "obfuscated").
obfuscation = (
    DeIdentificationModel.pretrained("deidentify_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("obfuscated")
    .setMode("obfuscate")
    .setObfuscateDate(True)
)


sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 367.3 KB
[OK!]
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_enriched download started this may take some time.
[OK!]
deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]
deidentify_large download started this may take some time.
Approximate size to download 188.1 KB
[OK!]


In [8]:
def _annotations_to_text(annotations):
    """Join the text of each annotation (field index 3) with single spaces."""
    # NOTE(review): index 3 is presumably the `result` field of the Spark NLP
    # annotation struct -- confirm against the Annotation schema.
    return ' '.join(annotation[3] for annotation in annotations)


def _apply_deid_stage(stage, annotated, output_col):
    """Run one de-identification stage and return its first row as plain text."""
    annotations = stage.transform(annotated).toPandas()[output_col].iloc[0]
    return _annotations_to_text(annotations)


def deid_row(df):
    """De-identify every column of a single-row pandas DataFrame.

    Each cell of the first row is sent on its own through the NER pipeline
    (`model`) and then through both the masking (`deidentification`) and the
    obfuscating (`obfuscation`) stage.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first row is processed; any further rows are ignored.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Two single-row frames with the same columns as `df`: the masked values
        and the obfuscated values. Sentence-level outputs are joined with a
        single space, without a leading space.
    """
    res_m = OrderedDict()
    res_o = OrderedDict()

    for col in df.columns:
        value = df[col].values[0]
        # The pipeline reads a text column, so non-string cells (e.g. integer
        # codes) are cast to str and missing values become an empty string.
        # Assumes cells are scalars.
        cell_text = '' if pd.isna(value) else str(value)

        # Annotate once, then feed the same annotated frame to both stages.
        annotated = model.transform(
            spark.createDataFrame(pd.DataFrame({'text': [cell_text]}))
        )

        res_m[col] = _apply_deid_stage(deidentification, annotated, 'masked')
        res_o[col] = _apply_deid_stage(obfuscation, annotated, 'obfuscated')

    return pd.DataFrame([res_m]), pd.DataFrame([res_o])
   
# De-identify the table one row at a time. Rows are addressed by position, so
# this also works when `df` does not carry the default 0..n-1 index.
row_df_m = []
row_df_o = []
for row in range(len(df)):
    r_m, r_o = deid_row(df.iloc[row:row + 1])
    row_df_m.append(r_m)
    row_df_o.append(r_o)

# Every per-row frame comes back with index 0; stack them and re-label with the
# source index so the masked/obfuscated tables line up with `df` row for row.
fdf_m = pd.concat(row_df_m, ignore_index=True)
fdf_o = pd.concat(row_df_o, ignore_index=True)
fdf_m.index = df.index
fdf_o.index = df.index

## **Obfuscate**

In [9]:
# Table assembled from the output of the `obfuscation` stage (mode "obfuscate").
fdf_o

Unnamed: 0,ID,SSN,NAME,DOB,POB,ADDRESS
0,G4072911,469 34 883,Aundra Catalan,1970-01-14,"South Katherinefurt, Maine","16200 I4865777d Canyon Avenue, South Katherinefurt, Iowa, 80344"
0,B6323431,09438 20 16 33,Raul Sleet,1955-11-26,"Virginia beach, Georgia","Lille Vibyvej 8, Virginia beach, Georgia, 74347"
0,K548103,(52) 878-333,Sharran Go,1965-05-10,"Greenport, Alabama","1005 East 32Nd Street, Greenport, Georgia, 11298"


## **Mask**

In [10]:
# Table assembled from the output of the `deidentification` stage (mode "mask").
fdf_m

Unnamed: 0,ID,SSN,NAME,DOB,POB,ADDRESS
0,<MEDICALRECORD>,<PHONE>,<PATIENT>,<DATE>,"<CITY>, <STATE>","<STREET>, <CITY>, <STATE>, <ZIP>"
0,<MEDICALRECORD>,<PHONE>,<PATIENT>,<DATE>,"<CITY>, <STATE>","<STREET>, <CITY>, <STATE>, <ZIP>"
0,<MEDICALRECORD>,<PHONE>,<PATIENT>,<DATE>,"<CITY>, <STATE>","<STREET>, <CITY>, <STATE>, <ZIP>"
