![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

#🔎 Financial Deidentification

In [0]:
from johnsnowlabs import *

#🔎 Deidentification Model

Some financial information can be considered sensitive (e.g., document, organization, address, signer).

In [0]:
# Turn the raw "text" column into a document annotation.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Split documents into sentences with the pretrained "xx" sentence detector.
sentenceDetector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenize sentence by sentence.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Token embeddings consumed by the NER model below.
embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Pretrained de-identification NER model from the finance model hub.
fin_ner = (
    finance.NerModel.pretrained("finner_deid", "en", "finance/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Group token-level tags into chunks; "ORG" chunks are relabelled as "PARTY".
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setReplaceLabels({"ORG": "PARTY"})
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        fin_ner,
        ner_converter,
    ]
)

# Every stage is pretrained or rule-based, so the pipeline is fitted on a
# single empty row just to obtain a PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

###🚀 The pretrained NER model extracts:
- PROFESSION
- URL
- LOCATION-OTHER
- CITY
- DATE
- ZIP
- PERSON
- STATE
- COUNTRY
- STREET
- ORG
- PHONE
- EMAIL
- FAX
- AGE

In [0]:
# Show the label set reported by the loaded NER model.
fin_ner.getClasses()

In [0]:
# Sample input for the cells below: an address block and a phone number
# surrounded by form captions.
text = """
(State or other jurisdictionof incorporation or organization)
(I.R.S. EmployerIdentification No.)
55 Almaden Boulevard, 6th Floor
San Jose, California 95113
(Address of principal executive offices and Zip Code)
799-9666
(Registrant’s telephone number, including area code) """

In [0]:
# Run the fitted NER pipeline on the sample text (single-row DataFrame).
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [0]:
from pyspark.sql import functions as F

# Zip tokens with their NER tags, then explode to one (token, tag) row each.
token_tag_pairs = F.arrays_zip(result.token.result, result.ner.result)

result_df = (
    result
    .select(F.explode(token_tag_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
)

In [0]:
# Count tokens per NER tag, most frequent tag first.
label_counts = result_df.select("token", "ner_label").groupBy('ner_label').count()
label_counts.orderBy('count', ascending=False).show(truncate=False)

###🚀 Check extracted sensitive entities

In [0]:
# One row per extracted chunk, with the entity label read from its metadata.
chunk_with_meta = F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)

(
    result
    .select(F.explode(chunk_with_meta).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

##🔎 Masking and Obfuscation

###✅ Replace these entities with Tags

In [0]:
# Chunk converter redefined without label replacement for the masking demo.
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

deidentification = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    # Also return a column that maps each masked/obfuscated entity back to
    # the original one. It is called 'aux' by default; use
    # .setMappingsColumn("MappingCol") to rename it.
    .setReturnEntityMappings(True)
)

# Reuses the assembler, sentence detector, tokenizer, embeddings and NER
# stages defined in earlier cells.
deidPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        fin_ner,
        ner_converter,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model_deid = deidPipeline.fit(empty_data)

In [0]:
# Apply the masking pipeline to the sample text.
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [0]:
# Peek at every column produced by the pipeline.
result.show()

In [0]:
# Original sentence next to its masked version, as a pandas DataFrame.
sentence_pairs = F.arrays_zip(result.sentence.result, result.deidentified.result)

(
    result
    .select(F.explode(sentence_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,(State or other jurisdictionof incorporation o...,(State or other jurisdictionof incorporation o...
1,"55 Almaden Boulevard, 6th Floor","<STREET>, 6th Floor"
2,"San Jose, California 95113","<CITY>, <STATE> <ZIP>"
3,(Address of principal executive offices and Zi...,(Address of principal executive offices and Zi...
4,"(Registrant’s telephone number, including area...","(Registrant’s telephone number, including area..."


📖 The Deidentification annotator offers three modes for masking entities. You can select the mode with the `.setMaskingPolicy()` parameter. The available modes are:

**“entity_labels”**: Mask with the entity type of that chunk. (default) <br/>
**“same_length_chars”**: Mask the deid entities with the same length of asterisks ( * ), with brackets ( [ , ] ) on both ends. <br/>
**“fixed_length_chars”**: Mask the deid entities with a fixed length of asterisks ( * ). The length is set using the `setFixedMaskLength()` method. <br/>

Let's try each of these and compare the results:

In [0]:
def _mask_deid_stage(output_col, masking_policy):
    """Build a mask-mode DeIdentification stage.

    The stage reads "sentence", "token" and "ner_chunk", writes to
    `output_col`, returns entity mappings, and masks with `masking_policy`.
    """
    return (
        finance.DeIdentification()
        .setInputCols(["sentence", "token", "ner_chunk"])
        .setOutputCol(output_col)
        .setMode("mask")
        .setReturnEntityMappings(True)
        .setMaskingPolicy(masking_policy)
    )


# One stage per masking policy so the outputs can be compared side by side.
deid_entity_labels = _mask_deid_stage("deid_entity_label", "entity_labels")
deid_same_length = _mask_deid_stage("deid_same_length", "same_length_chars")
deid_fixed_length = _mask_deid_stage("deid_fixed_length", "fixed_length_chars")
deid_fixed_length = deid_fixed_length.setFixedMaskLength(4)

deidPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        fin_ner,
        ner_converter,
        deid_entity_labels,
        deid_same_length,
        deid_fixed_length,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model_deid = deidPipeline.fit(empty_data)

In [0]:
# Run the three-policy masking pipeline on the sample text.
policy_result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [0]:
# Peek at every column, including the three deid_* outputs.
policy_result.show()

In [0]:
# Side-by-side view: original sentence and its three masked variants.
policy_columns = F.arrays_zip(
    policy_result.sentence.result,
    policy_result.deid_entity_label.result,
    policy_result.deid_same_length.result,
    policy_result.deid_fixed_length.result,
)

(
    policy_result
    .select(F.explode(policy_columns).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deid_entity_label"),
        F.expr("cols['2']").alias("deid_same_length"),
        F.expr("cols['3']").alias("deid_fixed_length"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deid_entity_label,deid_same_length,deid_fixed_length
0,(State or other jurisdictionof incorporation o...,(State or other jurisdictionof incorporation o...,(State or other jurisdictionof incorporation o...,(State or other jurisdictionof incorporation o...
1,"55 Almaden Boulevard, 6th Floor","<STREET>, 6th Floor","[******************], 6th Floor","****, 6th Floor"
2,"San Jose, California 95113","<CITY>, <STATE> <ZIP>","[******], [********] [***]","****, **** ****"
3,(Address of principal executive offices and Zi...,(Address of principal executive offices and Zi...,(Address of principal executive offices and Zi...,(Address of principal executive offices and Zi...
4,"(Registrant’s telephone number, including area...","(Registrant’s telephone number, including area...","(Registrant’s telephone number, including area...","(Registrant’s telephone number, including area..."


###✅ Mapping Column

In [0]:
# "aux" is the mappings column; `result` here is the output of the single-policy
# masking pipeline that was run with setReturnEntityMappings(True).
result.select("aux").show(truncate=False)

In [0]:
# Flatten the mappings column: original chunk and its offsets come from the
# metadata, the replacement label and its offsets from the annotation itself.
aux_fields = F.arrays_zip(
    result.aux.metadata,
    result.aux.result,
    result.aux.begin,
    result.aux.end,
)

(
    result
    .select(F.explode(aux_fields).alias("cols"))
    .select(
        F.expr("cols['0']['originalChunk']").alias("chunk"),
        F.expr("cols['0']['beginOriginalChunk']").alias("beginChunk"),
        F.expr("cols['0']['endOriginalChunk']").alias("endChunk"),
        F.expr("cols['1']").alias("label"),
        F.expr("cols['2']").alias("beginLabel"),
        F.expr("cols['3']").alias("endLabel"),
    )
    .show(truncate=False)
)

##🔎 Reidentification

We can use `ReIdentification` annotator to go back to the original sentence.

In [0]:
# Rebuild the original text from the mappings ("aux") and the masked output.
reIdentification = (
    finance.ReIdentification()
    .setInputCols(["aux", "deidentified"])
    .setOutputCol("original")
)

In [0]:
# `result` must still hold the "aux" and "deidentified" columns from the masking run.
reid_result = reIdentification.transform(result)

In [0]:
# Peek at every column, including the new "original" one.
reid_result.show()

In [0]:
# Print the input text, then the re-identified sentences, for a visual comparison.
print(text)

reid_result.select('original.result').show(truncate=False)

##📌 Using multiple NER in the same pipeline

In [0]:
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# This section splits sentences with TextSplitter.
textSplitter = (
    finance.TextSplitter()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Two embedding stages, one per NER model: RoBERTa for finner_deid, BERT for
# finner_org_per_role_date.
embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

bert_embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("bert_embeddings")
)

# First NER: the de-identification model, with ORG relabelled as PARTY.
fin_ner = (
    finance.NerModel.pretrained('finner_deid', "en", "finance/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setReplaceLabels({"ORG": "PARTY"})
)

# Second NER: only its ROLE chunks are kept.
ner_finner = (
    finance.NerModel.pretrained("finner_org_per_role_date", "en", "finance/models")
    .setInputCols(["sentence", "token", "bert_embeddings"])
    .setOutputCol("ner_finner")
)

ner_converter_finner = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_finner"])
    .setOutputCol("ner_finner_chunk")
    .setWhiteList(['ROLE'])
)

# Merge the chunks of both NER branches into one column for de-identification.
chunk_merge = (
    finance.ChunkMergeApproach()
    .setInputCols("ner_finner_chunk", "ner_chunk")
    .setOutputCol("deid_merged_chunk")
)

deidentification = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "deid_merged_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setIgnoreRegex(True)
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        bert_embeddings,
        fin_ner,
        ner_converter,
        ner_finner,
        ner_converter_finner,
        chunk_merge,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

In [0]:
# New sample sentence containing a person, a role and an organization.
text = """ Jeffrey Preston Bezos is an American entrepreneur, founder and CEO of Amazon  """ 

In [0]:
input_df = spark.createDataFrame([[text]]).toDF("text")
result = model.transform(input_df)

# Chunks produced by the first NER branch (fin_ner).
deid_chunks = F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)

(
    result
    .select(F.explode(deid_chunks).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

In [0]:
input_df = spark.createDataFrame([[text]]).toDF("text")
result = model.transform(input_df)

# Chunks produced by the second NER branch (ner_finner, ROLE only).
role_chunks = F.arrays_zip(
    result.ner_finner_chunk.result,
    result.ner_finner_chunk.metadata,
)

(
    result
    .select(F.explode(role_chunks).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

In [0]:
input_df = spark.createDataFrame([[text]]).toDF("text")
result = model.transform(input_df)

# Chunks after merging both NER branches.
merged_chunks = F.arrays_zip(
    result.deid_merged_chunk.result,
    result.deid_merged_chunk.metadata,
)

(
    result
    .select(F.explode(merged_chunks).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

In [0]:
# Original sentence next to the masked output of the merged-NER pipeline.
sentence_pairs = F.arrays_zip(result.sentence.result, result.deidentified.result)

(
    result
    .select(F.explode(sentence_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,Jeffrey Preston Bezos is an American entrepren...,"<PERSON> is an American entrepreneur, <ROLE> a..."


##🔎 Obfuscation mode

In the obfuscation mode **DeIdentificationModel** will replace sensitive entities with random values of the same type.

In [0]:
# This is the obfuscation dict for the new entities
obs_lines = """5417543010#PHONE
(123)123-1234#PHONE
+18087339090#PHONE
(555) 555-1234#PHONE
541-700-3010#PHONE
HenryWatson@world.com#EMAIL
yousef@jacob.com#EMAIL
eric.shannon@geegle.com#EMAIL
mgt@jsl.com#EMAIL
gokhan@company.com#EMAIL
richard@company.it#EMAIL
TURER INC#PARTY
Clarus llc.#PARTY
SESA CO.#PARTY
John Snow Labs Inc#PARTY
MGT Trust Company, LLC.#PARTY
26-06-1990#EFFDATE
03/08/2025#EFFDATE
01/01/2045#EFFDATE
11/7/2016#EFFDATE
12-12-2022#EFFDATE
CEO#ROLE
CTO#ROLE
Director#ROLE
James Turner#PERSON
JUAN RAMIREZ#PERSON
Benjamin Curie#PERSON"""

with open ('obfuscate.txt', 'w') as f:
    f.write(obs_lines)

In [0]:
# Obfuscation pipeline driven by a custom reference file.
#
# Fix: the converter chain used to end in a dangling "\" followed by a blank
# line. That only parsed because the next physical line was empty; any edit
# that removed the blank line would have silently chained the following
# statement onto this expression.
#
# Relies on stages defined in earlier cells: documentAssembler,
# sentenceDetector, tokenizer, embeddings, bert_embeddings, fin_ner,
# ner_converter, ner_finner, and on `empty_data`.

# Keep only ROLE chunks from the second NER model.
ner_converter_finner = finance.NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_finner"]) \
    .setOutputCol("ner_finner_chunk") \
    .setWhiteList(['ROLE'])

# Merge ROLE chunks with the chunks of the de-identification NER.
chunk_merge =  finance.ChunkMergeApproach()\
    .setInputCols("ner_finner_chunk", "ner_chunk")\
    .setOutputCol("deid_merged_chunk")

# Obfuscate instead of mask. Replacement values come from 'obfuscate.txt'
# and from the built-in faker ("both"); the default source is "faker".
obfuscation =  finance.DeIdentification()\
    .setInputCols(["sentence", "token", "deid_merged_chunk"]) \
    .setOutputCol("deidentified") \
    .setMode("obfuscate")\
    .setObfuscateDate(True)\
    .setObfuscateRefFile('obfuscate.txt')\
    .setObfuscateRefSource("both")


nlpPipeline = nlp.Pipeline(stages=[
      documentAssembler, 
      sentenceDetector,
      tokenizer,
      embeddings,
      bert_embeddings,
      fin_ner,
      ner_converter,
      ner_finner,
      ner_converter_finner,
      chunk_merge,
      obfuscation])

obfuscation_model = nlpPipeline.fit(empty_data)

In [0]:
input_df = spark.createDataFrame([[text]]).toDF("text")
result = obfuscation_model.transform(input_df)

# Original sentence next to its obfuscated version.
sentence_pairs = F.arrays_zip(result.sentence.result, result.deidentified.result)

(
    result
    .select(F.explode(sentence_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,Jeffrey Preston Bezos is an American entrepren...,"JUAN RAMIREZ is an American entrepreneur, CEO ..."


##🔎 Faker Mode

📖The faker module allows the user to use a set of fake entities that are in the memory of spark-nlp-internal. You can set up this module using the following property: `setObfuscateRefSource('faker')`.

If we select `setObfuscateRefSource('both')`, the replacement entities are chosen randomly from both the faker and the fake entities listed in the obfuscateRefFile.


The entities currently supported are the following:

* Location
* Location-other
* Hospital
* City
* State
* Zip
* Country
* Contact
* Username
* Phone
* Fax
* Url
* Email
* Profession
* Name
* Doctor
* Patient
* Id
* Idnum
* Bioid
* Age
* Organization
* Healthplan
* Medicalrecord
* Ssn
* Passport
* DLN
* NPI
* C_card
* IBAN
* DEA
* Device

In [0]:
# Obfuscation pipeline that draws replacement values from the built-in faker.
#
# Fix: two chains (the converter and the DeIdentification stage) used to end
# in a dangling "\" followed by a blank line. That only parsed because the
# next physical line was empty; removing the blank line would have silently
# chained the following statement onto the expression.
#
# Relies on stages defined in earlier cells: documentAssembler,
# sentenceDetector, tokenizer, embeddings, bert_embeddings, fin_ner,
# ner_converter, ner_finner, and on `empty_data`.

# Keep only ROLE chunks from the second NER model.
ner_converter_finner = finance.NerConverterInternal() \
    .setInputCols(["sentence", "token", "ner_finner"]) \
    .setOutputCol("ner_finner_chunk") \
    .setWhiteList(['ROLE'])

# Merge ROLE chunks with the chunks of the de-identification NER.
chunk_merge =  finance.ChunkMergeApproach()\
    .setInputCols("ner_finner_chunk", "ner_chunk")\
    .setOutputCol("deid_merged_chunk")

# Obfuscate with faker-generated values only (no reference file).
obfuscation =  finance.DeIdentification()\
    .setInputCols(["sentence", "token", "deid_merged_chunk"]) \
    .setOutputCol("deidentified") \
    .setMode("obfuscate")\
    .setObfuscateDate(True)\
    .setObfuscateRefSource("faker")


nlpPipeline = nlp.Pipeline(stages=[
      documentAssembler, 
      sentenceDetector,
      tokenizer,
      embeddings,
      bert_embeddings,
      fin_ner,
      ner_converter,
      ner_finner,
      ner_converter_finner,
      chunk_merge,
      obfuscation])

obfuscation_model = nlpPipeline.fit(empty_data)

In [0]:
# Signature-block style sample with two person names and two titles.
text = """"By  Mitesh Patel By:  Judson Hannigan Name: Mitesh Patel Name: Judson Hannigan Title: VP, Marketing Title: CEO ."""

input_df = spark.createDataFrame([[text]]).toDF("text")
result = obfuscation_model.transform(input_df)

# Original sentence next to its obfuscated version.
sentence_pairs = F.arrays_zip(result.sentence.result, result.deidentified.result)

(
    result
    .select(F.explode(sentence_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,"""By Mitesh Patel By: Judson Hannigan Name: M...","""By <PERSON> By: <PERSON> Name: <PERSON> Nam..."


##🚀 Use full pipeline in the Light model

In [0]:
# Wrap the fitted mask-mode pipeline (`model`) in a LightPipeline and annotate
# a plain Python string directly, without building a Spark DataFrame.
light_model = nlp.LightPipeline(model)
annotated_text = light_model.annotate(text)
# The annotation result is indexed by output column name.
annotated_text['deidentified']

In [0]:
# Same as above, but with the fitted obfuscation pipeline.
obf_light_model = nlp.LightPipeline(obfuscation_model)
annotated_text = obf_light_model.annotate(text)
annotated_text['deidentified']

##🚀 Shifting Days

We use the `finance.DocumentHashCoder()` annotator to determine shifting days. This annotator gets the hash of the specified column and creates a new document column containing day shift information. And then, the `finance.DeIdentification()` annotator deidentifies this new doc. We should set the seed parameter to hash consistently.

In [0]:
import pandas as pd

# Four short texts, each ending in a date; two documents per DocumentID.
document_ids = ['A001', 'A001', 'A002', 'A002']
texts = [
    'Chris Brown was arrested on 10/02/2022',
    'Mark White has bought a stock on 02/28/2020',
    'John has bought a house on 03/15/2022',
    'John Moore was discharged on 12/31/2022',
]

data = pd.DataFrame({'DocumentID': document_ids, 'text': texts})

my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate=False)

###✔️ Shifting days according to the ID column

We use the `finance.DocumentHashCoder()` annotator to determine shifting days. This annotator gets the hash of the specified column and creates a new document column containing day shift information. And then, the `finance.DeIdentification()` annotator deidentifies this new doc. We should set the seed parameter to hash consistently.

In [0]:
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Hash the "DocumentID" column into a day shift stored on a new document
# column ("document2"). The seed is fixed so the hash is reproducible.
documentHasher = (
    finance.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setPatientIdColumn("DocumentID")
    .setRangeDays(100)
    .setNewDateShift("shift_days")
    .setSeed(100)
)

# No sentence detection here: every downstream stage reads "document2" directly.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document2"])
    .setOutputCol("token")
)

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
)

fin_ner = (
    finance.NerModel.pretrained('finner_deid', "en", "finance/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)

# Plain chunk conversion; no label replacement is configured in this pipeline.
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Obfuscate entities with faker values and shift DATE entities by the
# per-document day shift.
deid = (
    finance.DeIdentification()
    .setInputCols(["document2", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource('faker')
    .setUseShifDays(True)
    .setRegion('us')
)

pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        documentHasher,
        tokenizer,
        embeddings,
        fin_ner,
        ner_converter,
        deid,
    ]
)

# The empty frame carries both columns the pipeline reads.
empty_data = spark.createDataFrame([["", ""]]).toDF("text", "DocumentID")
pipeline_model = pipeline.fit(empty_data)

In [0]:
# Apply the ID-based date-shifting pipeline to the sample rows.
output = pipeline_model.transform(my_input_df)

# Compare the input text with the de-identified text for each DocumentID.
output.select('DocumentID','text', 'deidentified.result').show(truncate = False)

###✔️ Shifting days according to specified values

Instead of shifting days according to ID column, we can specify shifting values with another column.

```python
documentHasher = finance.DocumentHashCoder()\
    .setInputCols("document")\
    .setOutputCol("document2")\
    .setDateShiftColumn("dateshift")\
```

In [0]:
# Same sample as before plus an explicit per-row day shift stored as strings.
document_ids = ['A001', 'A001', 'A002', 'A002']
texts = [
    'Chris Brown was arrested on 10/02/2019',
    'Mark White has bought a stock on 02/28/2020',
    'John has bought a house on 03/15/2022',
    'John Moore was discharged on 12/31/2022',
]
date_shifts = ['5', '5', '10', '10']

data = pd.DataFrame(
    {'DocumentID': document_ids, 'text': texts, 'dateshift': date_shifts}
)

my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate=False)

In [0]:
# Date-shifting pipeline where the shift comes from an explicit column.
#
# Fixes:
# - documentHasher used to end in a dangling "\" followed by blank lines,
#   which only parsed because the next physical line was empty.
# - Removed the commented-out sentence detector and the stale "ALIAS"/PARTY
#   comment, which described label replacement this pipeline does not do.

documentAssembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Read the day shift for each row from the "dateshift" column and attach it
# to a new document column ("document2").
documentHasher = finance.DocumentHashCoder()\
    .setInputCols("document")\
    .setOutputCol("document2")\
    .setDateShiftColumn("dateshift")

# No sentence detection: every downstream stage reads "document2" directly.
tokenizer = nlp.Tokenizer()\
    .setInputCols(["document2"])\
    .setOutputCol("token")

embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
    .setInputCols(["document2", "token"]) \
    .setOutputCol("embeddings")


fin_ner = finance.NerModel.pretrained('finner_deid', "en", "finance/models")\
    .setInputCols(["document2", "token", "embeddings"]) \
    .setOutputCol("ner") 

# Plain chunk conversion; no label replacement is configured here.
ner_converter = finance.NerConverterInternal() \
    .setInputCols(["document2", "token", "ner"])\
    .setOutputCol("ner_chunk")


# Obfuscate entities with faker values and shift DATE entities by the
# per-row day shift.
deid = finance.DeIdentification()\
    .setInputCols(["document2", "token", "ner_chunk"]) \
    .setOutputCol("deidentified") \
    .setMode("obfuscate") \
    .setObfuscateDate(True) \
    .setDateTag("DATE") \
    .setLanguage("en") \
    .setObfuscateRefSource('faker') \
    .setUseShifDays(True)\
    .setRegion('us')

pipeline = nlp.Pipeline(stages=[
      documentAssembler, 
      documentHasher,
      tokenizer,
      embeddings,
      fin_ner,
      ner_converter,
      deid])

# The empty frame carries every column the sample data has.
empty_data = spark.createDataFrame([["", "", ""]]).toDF("text", "DocumentID", "dateshift")

pipeline_model = pipeline.fit(empty_data)

In [0]:
# Apply the column-driven date-shifting pipeline to the sample rows.
output = pipeline_model.transform(my_input_df)

# Compare the input text and its shift value with the de-identified text.
output.select('text', 'dateshift', 'deidentified.result').show(truncate = False)

###✔️ Masking Unnormalized Date Formats

`setUnnormalizedDateMode()` parameter is used to mask the DATE entities that can not be normalized. In the example below, please check `03Apr2022` which couldn't be normalized and it is masked in the output.

In [0]:
# Date-shifting pipeline that masks DATE entities it cannot normalize.
#
# Fixes:
# - The pipeline used to include `sentenceDetector`, a stage leaked from a much
#   earlier cell (its definition is commented out here). It reads "document"
#   and writes "sentence", which no stage below consumes, so it only added
#   hidden-state coupling and a model inference pass. It is removed, matching
#   the two date-shifting pipelines above.
# - documentHasher used to end in a dangling "\" followed by blank lines.
# - Removed the stale "ALIAS"/PARTY comment (no label replacement happens here).

# The third row carries "03Apr2022", the date format this demo expects to be masked.
data = pd.DataFrame(
    {'DocumentID' : ['A001', 'A001', 'A002', 'A002'],
     'text' : ['Chris Brown was arrested on 10/02/2022', 
               'Mark White has bought a stock on 02/28/2020', 
               'John has bought a house on 03Apr2022',
               'John Moore has bought a property on 12/31/2022'
                            ],
     'dateshift' : ['5', '5', '10', '10']
    }
)

my_input_df = spark.createDataFrame(data)


documentAssembler = nlp.DocumentAssembler()\
    .setInputCol("text")\
    .setOutputCol("document")

# Read the day shift for each row from the "dateshift" column.
documentHasher = finance.DocumentHashCoder()\
    .setInputCols("document")\
    .setOutputCol("document2")\
    .setDateShiftColumn("dateshift")

# No sentence detection: every downstream stage reads "document2" directly.
tokenizer = nlp.Tokenizer()\
    .setInputCols(["document2"])\
    .setOutputCol("token")

embeddings = nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base","en") \
    .setInputCols(["document2", "token"]) \
    .setOutputCol("embeddings")

fin_ner = finance.NerModel.pretrained('finner_deid', "en", "finance/models")\
    .setInputCols(["document2", "token", "embeddings"]) \
    .setOutputCol("ner") 

# Plain chunk conversion; no label replacement is configured here.
ner_converter = finance.NerConverterInternal() \
    .setInputCols(["document2", "token", "ner"])\
    .setOutputCol("ner_chunk")


# Same obfuscation settings as the previous pipeline, plus
# setUnnormalizedDateMode("mask") for dates that cannot be normalized.
deid = finance.DeIdentification()\
    .setInputCols(["document2", "token", "ner_chunk"]) \
    .setOutputCol("deidentified") \
    .setMode("obfuscate") \
    .setObfuscateDate(True) \
    .setDateTag("DATE") \
    .setLanguage("en") \
    .setObfuscateRefSource('faker') \
    .setUseShifDays(True)\
    .setRegion('us')\
    .setUnnormalizedDateMode("mask")

pipeline = nlp.Pipeline(stages=[
      documentAssembler, 
      documentHasher,
      tokenizer,
      embeddings,
      fin_ner,
      ner_converter,
      deid])


output = pipeline.fit(my_input_df).transform(my_input_df)

output.select('text', 'dateshift', 'deidentified.result').show(truncate = False)

#🔎 Structured Deidentification

In [0]:
# Download the sample table with a shell escape; it lands in the current working directory.
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/finance-nlp/data/hipaa-table-001.txt

# Copy the file from the driver's local disk to the DBFS root.
# `dbutils` is only available on Databricks.
dbutils.fs.cp("file:/databricks/driver/hipaa-table-001.txt", "dbfs:/") 

In [0]:
# Read the tab-separated table with pandas, then hand it to Spark.
hipaa_table = pd.read_csv("hipaa-table-001.txt", sep="\t")
df = spark.createDataFrame(hipaa_table)
df.show(truncate=False)

In [0]:
# Column name -> entity type used to pick the replacement values.
column_entities = {"NAME": "PATIENT", "AGE": "AGE"}

obfuscator = finance.StructuredDeidentification(
    spark,
    column_entities,
    obfuscateRefSource="faker",
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.show(truncate=False)

In [0]:
# Reference values for file-based structured obfuscation: one
# "<replacement>#<ENTITY>" entry per line.
obfuscator_unique_ref_test = '''Will Perry#CLIENT
John Smith#CLIENT
Marvin MARSHALL#CLIENT
Hubert GROGAN#CLIENT
ALTHEA COLBURN#CLIENT
Kalil AMIN#CLIENT
Inci FOUNTAIN#CLIENT
Jackson WILLE#CLIENT
Jack SANTOS#CLIENT
Mahmood ALBURN#CLIENT
Marnie MELINGTON#CLIENT
Aysha GHAZI#CLIENT
Maryland CODER#CLIENT
Darene GEORGIOUS#CLIENT
Shelly WELLBECK#CLIENT
Min Kun JAE#CLIENT
Thomson THOMAS#CLIENT
Christian SUDDINBURG#CLIENT
Aberdeen#CITY
Louisburg St#STREET
France#LOC
5552312#PHONE
Calle del Libertador#ADDRESS
111#ID
20#AGE
30#AGE
40#AGE
50#AGE
60#AGE
'''

# Fix: the file used to be written to the current working directory, while
# the next cell passes obfuscateRefFile="/dbfs/obfuscator_unique_ref_test.txt".
# Write it to that same path so the reference file exists where it is read.
# Assumes the /dbfs mount is writable from the driver, as on a standard
# Databricks cluster.
with open('/dbfs/obfuscator_unique_ref_test.txt', 'w') as f:
    f.write(obfuscator_unique_ref_test)

In [0]:
# obfuscateRefSource = "file"

# File-based obfuscation (obfuscateRefSource = "file"): replacement values
# come from the reference file; both columns use the same seed.
column_entities = {"NAME": "CLIENT", "AGE": "AGE"}
column_seeds = {"NAME": 23, "AGE": 23}

obfuscator = finance.StructuredDeidentification(
    spark,
    column_entities,
    obfuscateRefFile="/dbfs/obfuscator_unique_ref_test.txt",
    obfuscateRefSource="file",
    columnsSeed=column_seeds,
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.select("NAME", "AGE").show(truncate=False)

We can **shift n days** in the structured deidentification through "days" parameter when the column is a Date.

In [0]:
# Small structured sample with a date-of-birth column to shift.
rows = [
    ["Juan García", "13/02/1977", "711 Nulla St.", "140", "673 431234"],
    ["Will Smith", "23/02/1977", "1 Green Avenue.", "140", "+23 (673) 431234"],
    ["Pedro Ximénez", "11/04/1900", "Calle del Libertador, 7", "100", "912 345623"],
]
column_names = ("NAME", "DOB", "ADDRESS", "SBP", "TEL")

df = spark.createDataFrame(rows).toDF(*column_names)
df.show(truncate=False)

In [0]:
# NAME is obfuscated as an ID and DOB as a DATE; `days=5` is the shift
# passed for date columns.
column_entities = {"NAME": "ID", "DOB": "DATE"}
column_seeds = {"NAME": 23, "DOB": 23}

obfuscator = finance.StructuredDeidentification(
    spark=spark,
    columns=column_entities,
    columnsSeed=column_seeds,
    obfuscateRefSource="faker",
    days=5,
)

In [0]:
# Obfuscate the configured columns. Note that this rebinds `result`, which
# earlier cells used for pipeline output.
result = obfuscator.obfuscateColumns(df)
result.show(truncate=False)

# Save the Pipeline and Use it from Your Local

In [0]:
# Rebuild the two-NER masking pipeline from scratch so that the saved
# artifact does not depend on stages redefined in the sections above.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

textSplitter = (
    finance.TextSplitter()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# One embedding stage per NER model.
embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

bert_embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("bert_embeddings")
)

# De-identification NER; ORG chunks are relabelled as PARTY.
fin_ner = (
    finance.NerModel.pretrained('finner_deid', "en", "finance/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setReplaceLabels({"ORG": "PARTY"})
)

# Second NER; only ROLE chunks are kept.
ner_finner = (
    finance.NerModel.pretrained("finner_org_per_role_date", "en", "finance/models")
    .setInputCols(["sentence", "token", "bert_embeddings"])
    .setOutputCol("ner_finner")
)

ner_converter_finner = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner_finner"])
    .setOutputCol("ner_finner_chunk")
    .setWhiteList(['ROLE'])
)

chunk_merge = (
    finance.ChunkMergeApproach()
    .setInputCols("ner_finner_chunk", "ner_chunk")
    .setOutputCol("deid_merged_chunk")
)

deidentification = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "deid_merged_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setIgnoreRegex(True)
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        bert_embeddings,
        fin_ner,
        ner_converter,
        ner_finner,
        ner_converter_finner,
        chunk_merge,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

In [0]:
# Persist the fitted pipeline, replacing any previous save at this path.
model.write().overwrite().save('/dbfs/pipeline_deid')

In [0]:
# PretrainedPipeline is reached through the `nlp` namespace here; with plain
# Spark NLP the equivalent import would be:
# from sparknlp.pretrained import PretrainedPipeline

# Load the pipeline back from the path it was saved to.
deid_pipeline = nlp.PretrainedPipeline.from_disk("/dbfs/pipeline_deid")

In [0]:
# Sample sentence for the reloaded pipeline, wrapped in a single-row DataFrame.
text = """ Jeffrey Preston Bezos is an American entrepreneur, founder and CEO of Amazon  """ 

data = spark.createDataFrame([[text]]).toDF("text")

In [0]:
# Inspect the stages of the reloaded pipeline.
deid_pipeline.model.stages

In [0]:
# Run the reloaded pipeline on the sample row and display every output column.
deid_pipeline.model.transform(data).show()