![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/finance-nlp/11.0.Deidentification.ipynb)

# Financial Deidentification

In [None]:
from johnsnowlabs import *

# nlp.install(force_browser=True)

# Deidentification Model

Some financial information can be considered sensitive (e.g., organization, address, phone number, ticker).

In [None]:
# Base NER pipeline:
#   text -> document -> sentence -> token -> SEC-BERT embeddings -> NER tags -> NER chunks
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

textSplitter = (
    nlp.SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

ner_model = (
    finance.NerModel.pretrained('finner_sec_10k_summary', 'en', 'finance/models')
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Groups the token-level tags in "ner" into entity chunks.
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
    ]
)

# Fit on a one-row DataFrame holding an empty string to obtain the PipelineModel.
empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_sec_10k_summary download started this may take some time.
[OK!]


### The pretrained NER model extracts the following labels

In [None]:
# List the labels known to the pretrained NER model.
ner_model.getClasses()

['O',
 'I-ADDRESS',
 'B-FISCAL_YEAR',
 'I-FISCAL_YEAR',
 'B-TICKER',
 'I-TITLE_CLASS_VALUE',
 'B-TITLE_CLASS',
 'I-TITLE_CLASS',
 'B-ADDRESS',
 'B-ORG',
 'B-CFN',
 'I-ORG',
 'B-PHONE',
 'I-PHONE',
 'I-STOCK_EXCHANGE',
 'I-CFN',
 'B-IRS',
 'B-STATE',
 'B-TITLE_CLASS_VALUE',
 'B-STOCK_EXCHANGE']

In [None]:
# Sample registrant cover-page text that is run through the pipelines below.
text= """
Commission file number 000-15867 
_____________________________________
 
CADENCE DESIGN SYSTEMS, INC. 
(Exact name of registrant as specified in its charter)
____________________________________ 
Delaware
 
00-0000000
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S. EmployerIdentification No.)
2655 Seely Avenue, Building 5,
San Jose,
California
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)
(408)
-943-1234 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12(b) of the Act:
Title of Each Class
Trading Symbol(s)
Names of Each Exchange on which Registered
Common Stock, $0.01 par value per share
CDNS
Nasdaq Global Select Market
Securities registered pursuant to Section 12(g) of the Act:"""

In [None]:
# Wrap the sample text in a one-row DataFrame with a single "text" column,
# then run the fitted NER pipeline on it.
sample_df = spark.createDataFrame([[text]]).toDF("text")
result = model.transform(sample_df)

In [None]:
from pyspark.sql import functions as F

# One row per token: pair each token with the NER tag at the same position.
token_tag_pairs = F.arrays_zip(result.token.result, result.ner.result)

result_df = (
    result
    .select(F.explode(token_tag_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("token"),
        F.expr("cols['1']").alias("ner_label"),
    )
)

In [None]:
# Count tokens per NER tag, most frequent tag first.
label_counts = (
    result_df
    .select("token", "ner_label")
    .groupBy('ner_label')
    .count()
    .orderBy('count', ascending=False)
)
label_counts.show(truncate=False)

+-------------------+-----+
|ner_label          |count|
+-------------------+-----+
|O                  |101  |
|I-ADDRESS          |10   |
|I-PHONE            |4    |
|I-ORG              |4    |
|I-STOCK_EXCHANGE   |3    |
|B-CFN              |1    |
|B-STATE            |1    |
|B-ORG              |1    |
|B-IRS              |1    |
|B-TICKER           |1    |
|B-ADDRESS          |1    |
|B-TITLE_CLASS_VALUE|1    |
|I-TITLE_CLASS      |1    |
|B-PHONE            |1    |
|B-STOCK_EXCHANGE   |1    |
|B-TITLE_CLASS      |1    |
+-------------------+-----+



### Check extracted sensitive entities

In [None]:
# One row per extracted chunk: the chunk text and the entity label stored in its metadata.
chunk_with_meta = F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)

(
    result
    .select(F.explode(chunk_with_meta).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+---------------------------------------------------+-----------------+
|chunk                                              |ner_label        |
+---------------------------------------------------+-----------------+
|000-15867                                          |CFN              |
|CADENCE DESIGN SYSTEMS, INC                        |ORG              |
|Delaware                                           |STATE            |
|00-0000000                                         |IRS              |
|2655 Seely Avenue, Building 5,
San Jose,
California|ADDRESS          |
|(408)
-943-1234                                    |PHONE            |
|Common Stock                                       |TITLE_CLASS      |
|$0.01                                              |TITLE_CLASS_VALUE|
|CDNS                                               |TICKER           |
|Nasdaq Global Select Market                        |STOCK_EXCHANGE   |
+---------------------------------------------------+-----------

## Masking and Obfuscation

### Replace these entities with tags

In [None]:
# Masking stage: works on the sentences, tokens and NER chunks produced upstream.
deidentification = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    # Return an extra column with the mappings between the masked/obfuscated
    # entities and the original entities. Required for "ReIdentification".
    .setReturnEntityMappings(True)
    # .setMappingsColumn("MappingCol")  # change the name of that column, 'aux' is default
)

# Same stages as the base NER pipeline, with the masking stage appended.
deidPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model_deid = deidPipeline.fit(empty_data)

In [None]:
# Run the masking pipeline on the sample text (rebinds `result` to the new output).
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
# Preview all output columns, including `deidentified` and the `aux` mappings column.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
Commission file ...|[{document, 0, 77...|[{document, 1, 10...|[{token, 1, 10, C...|[{word_embeddings...|[{named_entity, 1...|[{chunk, 24, 32, ...|[{document, 0, 75...|[{chunk, 23, 27, ...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
# Consumes the mappings column ("aux") together with the masked text ("deidentified")
# and writes its output to "original".
reIdentification = (
    finance.ReIdentification()
    .setInputCols(["aux", "deidentified"])
    .setOutputCol("original")
)

In [None]:
# Apply re-identification to the masked output; adds the "original" column.
reid_result = reIdentification.transform(result)

In [None]:
# Preview the re-identified DataFrame (same columns as before plus `original`).
reid_result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|            original|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
Commission file ...|[{document, 0, 77...|[{document, 1, 10...|[{token, 1, 10, C...|[{word_embeddings...|[{named_entity, 1...|[{chunk, 24, 32, ...|[{document, 0, 75...|[{chunk, 23, 27, ...|[{document, 1, 10...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

# ReIdentification

In [None]:
# Print the input text, then the re-identified text, so the two can be compared by eye.
print(text)

reid_result.select('original.result').show(truncate=False)


Commission file number 000-15867 
_____________________________________
 
CADENCE DESIGN SYSTEMS, INC. 
(Exact name of registrant as specified in its charter)
____________________________________ 
Delaware
 
00-0000000
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S. EmployerIdentification No.)
2655 Seely Avenue, Building 5,
San Jose,
California
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)
(408)
-943-1234 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12(b) of the Act:
Title of Each Class
Trading Symbol(s)
Names of Each Exchange on which Registered
Common Stock, $0.01 par value per share
CDNS
Nasdaq Global Select Market
Securities registered pursuant to Section 12(g) of the Act:
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Side-by-side view: each sentence next to its masked counterpart.
sentence_pairs = F.arrays_zip(result.sentence.result, result.deidentified.result)

(
    result
    .select(F.explode(sentence_pairs).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deidentified"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deidentified
0,Commission file number 000-15867 \n___________...,Commission file number <CFN> \n_______________...
1,(Exact name of registrant as specified in its ...,(Exact name of registrant as specified in its ...
2,EmployerIdentification No.,EmployerIdentification No.
3,")\n2655 Seely Avenue, Building 5,\nSan Jose,\n...",)\n<ADDRESS>\n \n95134\n(Address of Principal ...
4,(b) of the Act:\nTitle of Each Class\nTrading ...,(b) of the Act:\nTitle of Each Class\nTrading ...
5,(s)\nNames of Each Exchange on which Registere...,(s)\nNames of Each Exchange on which Registere...
6,(g) of the Act:,(g) of the Act:


## Other different masking strategies 

The Deidentification annotator offers three policies for masking entities. Select one with the `.setMaskingPolicy()` parameter. The available policies are:

**“entity_labels”**: Mask with the entity type of that chunk. (default) <br/>
**“same_length_chars”**: Mask the entity with asterisks ( * ) enclosed in brackets ( [ , ] ), so that the masked text has the same length as the original. <br/>
**“fixed_length_chars”**: Mask the entity with a fixed number of asterisks ( * ). The length is set with the `setFixedMaskLength()` method. <br/>

Let's try each of these and compare the results:

In [None]:
# Three masking stages that differ only in output column and masking policy.

# Policy "entity_labels"
deid_entity_labels = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_entity_label")
    .setMode("mask")
    .setReturnEntityMappings(False)
    .setMaskingPolicy("entity_labels")
)

# Policy "same_length_chars"
deid_same_length = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_same_length")
    .setMode("mask")
    .setReturnEntityMappings(False)
    .setMaskingPolicy("same_length_chars")
)

# Policy "fixed_length_chars", with the mask length set to 4
deid_fixed_length = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_fixed_length")
    .setMode("mask")
    .setReturnEntityMappings(False)
    .setMaskingPolicy("fixed_length_chars")
    .setFixedMaskLength(4)
)

# Base NER stages followed by the three masking stages, each with its own output column.
deidPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        deid_entity_labels,
        deid_same_length,
        deid_fixed_length,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model_deid = deidPipeline.fit(empty_data)

In [None]:
# Run the three-policy masking pipeline on the sample text (rebinds `result`).
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
# Preview the output columns, one `deid_*` column per masking policy.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|   deid_entity_label|    deid_same_length|   deid_fixed_length|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|
Commission file ...|[{document, 0, 77...|[{document, 1, 10...|[{token, 1, 10, C...|[{word_embeddings...|[{named_entity, 1...|[{chunk, 24, 32, ...|[{document, 0, 75...|[{document, 0, 10...|[{document, 0, 73...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [None]:
# Line up each sentence with its three masked variants, one row per sentence.
policy_columns = F.arrays_zip(
    result.sentence.result,
    result.deid_entity_label.result,
    result.deid_same_length.result,
    result.deid_fixed_length.result,
)

(
    result
    .select(F.explode(policy_columns).alias("cols"))
    .select(
        F.expr("cols['0']").alias("sentence"),
        F.expr("cols['1']").alias("deid_entity_label"),
        F.expr("cols['2']").alias("deid_same_length"),
        F.expr("cols['3']").alias("deid_fixed_length"),
    )
    .toPandas()
)

Unnamed: 0,sentence,deid_entity_label,deid_same_length,deid_fixed_length
0,Commission file number 000-15867 \n___________...,Commission file number <CFN> \n_______________...,Commission file number [*******] \n___________...,Commission file number **** \n________________...
1,(Exact name of registrant as specified in its ...,(Exact name of registrant as specified in its ...,(Exact name of registrant as specified in its ...,(Exact name of registrant as specified in its ...
2,EmployerIdentification No.,EmployerIdentification No.,EmployerIdentification No.,EmployerIdentification No.
3,")\n2655 Seely Avenue, Building 5,\nSan Jose,\n...",)\n<ADDRESS>\n \n95134\n(Address of Principal ...,)\n[******************************************...,)\n****\n \n95134\n(Address of Principal Execu...
4,(b) of the Act:\nTitle of Each Class\nTrading ...,(b) of the Act:\nTitle of Each Class\nTrading ...,(b) of the Act:\nTitle of Each Class\nTrading ...,(b) of the Act:\nTitle of Each Class\nTrading ...
5,(s)\nNames of Each Exchange on which Registere...,(s)\nNames of Each Exchange on which Registere...,(s)\nNames of Each Exchange on which Registere...,(s)\nNames of Each Exchange on which Registere...
6,(g) of the Act:,(g) of the Act:,(g) of the Act:,(g) of the Act:


### Mapping Column

In [None]:
# Show the raw `ner_chunk` annotations without truncation.
result.select("ner_chunk").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# One row per chunk: entity label plus the chunk's begin/end values.
chunk_positions = F.arrays_zip(
    result.ner_chunk.metadata,
    result.ner_chunk.begin,
    result.ner_chunk.end,
)

(
    result
    .select(F.explode(chunk_positions).alias("cols"))
    .select(
        F.expr("cols['0']['entity']").alias("label"),
        F.expr("cols['1']").alias("beginLabel"),
        F.expr("cols['2']").alias("endLabel"),
    )
    .show(truncate=False)
)

+-----------------+----------+--------+
|label            |beginLabel|endLabel|
+-----------------+----------+--------+
|CFN              |24        |32      |
|ORG              |75        |101     |
|STATE            |198       |205     |
|IRS              |209       |218     |
|ADDRESS          |320       |370     |
|PHONE            |434       |448     |
|TITLE_CLASS      |646       |657     |
|TITLE_CLASS_VALUE|660       |664     |
|TICKER           |686       |689     |
|STOCK_EXCHANGE   |691       |717     |
+-----------------+----------+--------+



## Using NER, ContextualParser and ZeroShotNER in the same Deidentification pipeline

In [None]:
# Rule definitions for the three ContextualParser entities (ALIAS, EMAIL, PHONE).
# Each dict is written to its own JSON file; the parsers defined later load
# these files through `setJsonPath`.
import json

alias = {
  "entity": "ALIAS",
  "ruleScope": "document", 
  "completeMatchRegex": "true",
  "regex": '["“].*?["”]',
  "matchScope": "sub-token",
  "contextLength": 100
}

# The regexes below are raw strings: sequences such as `\w` or `\d` are not valid
# Python string escapes and trigger a SyntaxWarning in plain literals on recent
# Python versions. The resulting string values are unchanged.
email = {
  "entity": "EMAIL",
  "ruleScope": "document", 
  "completeMatchRegex": "true",
  "regex": r'[\w-\.]+@([\w-]+\.)+[\w-]{2,4}',
  "matchScope": "sub-token",
  "contextLength": 100
}

phone = {
  "entity": "PHONE",
  "ruleScope": "document", 
  "completeMatchRegex": "true",
  "regex": r'(\+?\d{1,3}[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d+',
  "matchScope": "sub-token",
  "contextLength": 100
}

# Write one JSON file per rule, in the working directory.
for rule_file, rule in (("alias.json", alias), ("email.json", email), ("phone.json", phone)):
    with open(rule_file, 'w') as f:
        json.dump(rule, f)

In [None]:
# Pipeline combining three chunk sources -- the pretrained NER model, a zero-shot NER
# model and three regex-based ContextualParsers -- merged into one chunk column.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

textSplitter = (
    nlp.SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# --- Source 1: pretrained NER model ---
ner_model = (
    finance.NerModel.pretrained('finner_sec_10k_summary', 'en', 'finance/models')
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setGreedyMode(True)
)

# --- Source 2: zero-shot NER, driven by one list of questions per entity ---
zero_shot_questions = {
    "ADDRESS": ["Which address?", "Where is the location?"],
    "PERSON": ["Which person?", "What is the person name?"],
    "ORG": ["Which LLC?", "Which Inc?", "Which PLC?", "Which Corp?"],
}

zero_shot_ner = (
    finance.ZeroShotNerModel.pretrained("finner_roberta_zeroshot", "en", "finance/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("zero_shot_ner")
    .setPredictionThreshold(0.1)
    .setEntityDefinitions(zero_shot_questions)
)

zeroshot_ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["sentence", "token", "zero_shot_ner"])
    .setOutputCol("zero_ner_chunk")
    .setGreedyMode(True)
)

# --- Source 3: ContextualParsers, one per JSON rule file ---
alias_parser = (
    finance.ContextualParserApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("alias")
    .setJsonPath("alias.json")
    .setPrefixAndSuffixMatch(False)
    .setOptionalContextRules(True)
    .setCaseSensitive(False)
)

email_parser = (
    finance.ContextualParserApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("email")
    .setJsonPath("email.json")
    .setPrefixAndSuffixMatch(False)
    .setOptionalContextRules(True)
    .setCaseSensitive(False)
)

phone_parser = (
    finance.ContextualParserApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("phone")
    .setJsonPath("phone.json")
    .setPrefixAndSuffixMatch(False)
    .setOptionalContextRules(True)
    .setCaseSensitive(False)
)

# Merge every chunk column into "merged_ner_chunks".
chunk_merger = (
    finance.ChunkMergeApproach()
    .setInputCols("email", "phone", "ner_chunk", "zero_ner_chunk", "alias")
    .setOutputCol('merged_ner_chunks')
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        zero_shot_ner,
        zeroshot_ner_converter,
        alias_parser,
        email_parser,
        phone_parser,
        chunk_merger,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")
model = nlpPipeline.fit(empty_data)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_sec_10k_summary download started this may take some time.
[OK!]
finner_roberta_zeroshot download started this may take some time.
[OK!]


In [None]:
# Sample registrant cover-page text used as input for the merged-chunk pipeline.
text= """
Commission file number 000-15867 
_____________________________________
 
CADENCE DESIGN SYSTEMS, INC. 
(Exact name of registrant as specified in its charter)
____________________________________ 
Delaware
 
00-0000000
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S. EmployerIdentification No.)
2655 Seely Avenue, Building 5,
San Jose,
California
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)
(408)
-943-1234 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12(b) of the Act:
Title of Each Class
Trading Symbol(s)
Names of Each Exchange on which Registered
Common Stock, $0.01 par value per share
CDNS
Nasdaq Global Select Market
Securities registered pursuant to Section 12(g) of the Act:"""

In [None]:
# Run the merged pipeline on the sample text.
merged_input_df = spark.createDataFrame([[text]]).toDF("text")
result = model.transform(merged_input_df)

# financial_ner (10k summary): chunk text and entity label from the pretrained NER model
financial_chunks = F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)

(
    result
    .select(F.explode(financial_chunks).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+---------------------------------------------------+-----------------+
|chunk                                              |ner_label        |
+---------------------------------------------------+-----------------+
|000-15867                                          |CFN              |
|CADENCE DESIGN SYSTEMS, INC                        |ORG              |
|Delaware                                           |STATE            |
|00-0000000                                         |IRS              |
|2655 Seely Avenue, Building 5,
San Jose,
California|ADDRESS          |
|(408)
-943-1234                                    |PHONE            |
|Common Stock                                       |TITLE_CLASS      |
|$0.01                                              |TITLE_CLASS_VALUE|
|CDNS                                               |TICKER           |
|Nasdaq Global Select Market                        |STOCK_EXCHANGE   |
+---------------------------------------------------+-----------

In [None]:
# zero_shot_ner: chunk text and entity label from the zero-shot model
zero_shot_chunks = F.arrays_zip(result.zero_ner_chunk.result, result.zero_ner_chunk.metadata)

(
    result
    .select(F.explode(zero_shot_chunks).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(truncate=False)
)

+----------------------+---------+
|chunk                 |ner_label|
+----------------------+---------+
|CADENCE DESIGN SYSTEMS|ORG      |
|INC                   |ORG      |
+----------------------+---------+



The order of the input columns in `ChunkMergeApproach` matters. In this case, the ContextualParser for PHONE numbers is listed before the NER model for 10-K summaries, so `000-15867` is first detected as a PHONE number by the ContextualParser regular expression (which has no predefined context), and the `CFN` label from the NER model, which comes later, is ignored.

In [None]:
# merged_chunk: final chunk text and entity label after merging all chunk sources
merged_chunks = F.arrays_zip(result.merged_ner_chunks.result, result.merged_ner_chunks.metadata)

(
    result
    .select(F.explode(merged_chunks).alias("cols"))
    .select(
        F.expr("cols['0']").alias("chunk"),
        F.expr("cols['1']['entity']").alias("ner_label"),
    )
    .show(n=50, truncate=False)
)

+---------------------------------------------------+-----------------+
|chunk                                              |ner_label        |
+---------------------------------------------------+-----------------+
|000-15867                                          |PHONE            |
|CADENCE DESIGN SYSTEMS, INC                        |ORG              |
|Delaware                                           |STATE            |
|00-0000000                                         |PHONE            |
|2655 Seely Avenue, Building 5,
San Jose,
California|ADDRESS          |
|(408)
-943-1234                                    |PHONE            |
|Common Stock                                       |TITLE_CLASS      |
|$0.01                                              |TITLE_CLASS_VALUE|
|CDNS                                               |TICKER           |
|Nasdaq Global Select Market                        |STOCK_EXCHANGE   |
+---------------------------------------------------+-----------

## Obfuscation mode

In the obfuscation mode **DeIdentificationModel** will replace sensitive entities with random values of the same type. 


### Using external [Faker](https://faker.readthedocs.io/en/master/) library

In [None]:
!pip install faker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faker
  Downloading Faker-17.3.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-17.3.0


In [None]:
from faker import Faker

# Seed the shared Faker random generator so the generated obfuscation
# dictionary is the same on every run of the notebook.
Faker.seed(42)
fk = Faker()

In [None]:
# Obfuscation dictionary for the new entities, one "<replacement>#<ENTITY>" entry per line.
# Entries are collected in a list and joined once at the end.
obs_entries = [
    "NASDAQ#STOCK_EXCHANGE",
    "NYSE#STOCK_EXCHANGE",
    "London Stock Exchange#STOCK_EXCHANGE",
    "Tokyo Stock Exchange#STOCK_EXCHANGE",
    "ABCD#TICKER",
    "EFGH#TICKER",
    "YLJJ#TICKER",
    "Common Stock#TITLE_CLASS",
    "Preferred Stock#TITLE_CLASS",
    "$0.01#TITLE_CLASS_VALUE",
    "USD 0.025#TITLE_CLASS_VALUE",
    "000-00001#CFN",
    "000-00002#CFN",
    "000-00003#CFN",
]

# Faker-generated replacement candidates for the generic entity types.
for _ in range(25):
    obs_entries.append(f"{fk.name().strip()}#PERSON")
    obs_entries.append(f"{fk.date().strip()}#DATE")
    obs_entries.append(f"{fk.company().strip()}#ORG")
    obs_entries.append(f"{fk.phone_number().strip()}#PHONE")
    obs_entries.append(f"{fk.email().strip()}#EMAIL")
    obs_entries.append(f"{fk.street_address().strip()}#STREET")
    obs_entries.append(f"{fk.city().strip()}#CITY")
    obs_entries.append(f"{fk.state().strip()}#STATE")
    obs_entries.append(f"{fk.country().strip()}#COUNTRY")
    # The NER models in this notebook label addresses as ADDRESS (not STREET/CITY);
    # with no ADDRESS entry the obfuscated text kept the "<ADDRESS>" mask.
    # Faker addresses span several lines, so flatten them to fit the one-entry-per-line format.
    flat_address = fk.address().replace("\n", ", ").strip()
    obs_entries.append(f"{flat_address}#ADDRESS")

obs_lines = "\n".join(obs_entries)

with open('obfuscate.txt', 'w') as f:
    f.write(obs_lines)

In [None]:
# The masking annotators from before, now reading the merged chunk column.

# Policy "entity_labels"
deid_entity_labels = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setMaskingPolicy("entity_labels")
)

# Policy "same_length_chars"
deid_same_length = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("masked_with_chars")
    .setMode("mask")
    .setMaskingPolicy("same_length_chars")
)

# Policy "fixed_length_chars", with the mask length set to 4
deid_fixed_length = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("masked_fixed_length_chars")
    .setMode("mask")
    .setMaskingPolicy("fixed_length_chars")
    .setFixedMaskLength(4)
)


In [None]:
# Obfuscation stage: reads replacement values from 'obfuscate.txt', with the
# reference source set to "both".
obfuscation = (
    finance.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("obfuscated")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setObfuscateRefFile('obfuscate.txt')
    .setObfuscateRefSource("both")
)

# Full pipeline: chunk extraction and merging, the three masking stages, then obfuscation.
nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        textSplitter,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        zero_shot_ner,
        zeroshot_ner_converter,
        alias_parser,
        email_parser,
        phone_parser,
        chunk_merger,
        deid_entity_labels,
        deid_same_length,
        deid_fixed_length,
        obfuscation,
    ]
)

# `empty_data` is the one-row empty-string DataFrame created in an earlier cell.
obfuscation_model = nlpPipeline.fit(empty_data)

In [None]:
# Sample registrant cover-page text used as input for the obfuscation pipeline.
text= """
Commission file number 000-15867 
_____________________________________
 
CADENCE DESIGN SYSTEMS, INC. 
(Exact name of registrant as specified in its charter)
____________________________________ 
Delaware
 
00-0000000
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S. EmployerIdentification No.)
2655 Seely Avenue, Building 5,
San Jose,
California
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)
(408)
-943-1234 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12(b) of the Act:
Title of Each Class
Trading Symbol(s)
Names of Each Exchange on which Registered
Common Stock, $0.01 par value per share
CDNS
Nasdaq Global Select Market
Securities registered pursuant to Section 12(g) of the Act:"""

In [None]:
# Run the full masking + obfuscation pipeline on the sample text.
obfuscation_input_df = spark.createDataFrame([[text]]).toDF("text")
result = obfuscation_model.transform(obfuscation_input_df)

# Print the obfuscated pieces of the first (only) row, one per line.
first_row = result.select('obfuscated.result').collect()[0]
print("\n".join(first_row.result))

Commission file number 0699 273 53 89 
_____________________________________
 
Carr Group.
(Exact name of registrant as specified in its charter)
____________________________________ 
IllinoisIndiana
 
74 911 574
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S.
EmployerIdentification No.
)
<ADDRESS>
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)
0472 51 11 42 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12
(b) of the Act:
Title of Each Class
Trading Symbol
(s)
Names of Each Exchange on which Registered
Common Stock, USD 0.025 par value per share
YLJJ
NYSE
Securities registered pursuant to Section 12
(g) of the Act:


## Using Light Pipelines

In [None]:
# Wrap the fitted pipeline in a LightPipeline and annotate the raw string directly.
light_model = nlp.LightPipeline(obfuscation_model)
annotated_text = light_model.annotate(text)

# Masked output (entity-label policy), one piece per line.
masked_pieces = annotated_text['deidentified']
print("\n".join(masked_pieces))

Commission file number <PHONE> 
_____________________________________
 
<ORG>.
(Exact name of registrant as specified in its charter)
____________________________________ 
<STATE>
 
<PHONE>
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S.
EmployerIdentification No.
)
<ADDRESS>
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)<PHONE> 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12
(b) of the Act:
Title of Each Class
Trading Symbol
(s)
Names of Each Exchange on which Registered
<TITLE_CLASS>, <TITLE_CLASS_VALUE> par value per share
<TICKER>
<STOCK_EXCHANGE>
Securities registered pursuant to Section 12
(g) of the Act:


In [None]:
# Print the obfuscated output from the same LightPipeline annotation, one piece per line.
print("\n".join(annotated_text['obfuscated']))

Commission file number 0699 273 53 89 
_____________________________________
 
Carr Group.
(Exact name of registrant as specified in its charter)
____________________________________ 
IllinoisIndiana
 
74 911 574
(State or Other Jurisdiction ofIncorporation or Organization)
 
(I.R.S.
EmployerIdentification No.
)
<ADDRESS>
 
95134
(Address of Principal Executive Offices)
 
(Zip Code)
0472 51 11 42 
(Registrant’s Telephone Number, including Area Code) 
Securities registered pursuant to Section 12
(b) of the Act:
Title of Each Class
Trading Symbol
(s)
Names of Each Exchange on which Registered
Common Stock, USD 0.025 par value per share
YLJJ
NYSE
Securities registered pursuant to Section 12
(g) of the Act:


## Shifting Days

We use the `finance.DocumentHashCoder()` annotator to determine shifting days. This annotator gets the hash of the specified column and creates a new document column containing day shift information. Then the `finance.DeIdentification()` annotator deidentifies this new document. We should set the seed parameter so that hashing is consistent.  

In [None]:
import pandas as pd

# Two toy documents: an ID plus a sentence that contains a date.
data = pd.DataFrame({
    'DocumentID': ['A001', 'A002'],
    'text': [
        'Mark Johansson has bought a stock on 02/28/2020',
        'John has bought a house on 03/15/2022',
    ],
})

# Hand the pandas frame to Spark and display it without truncating the text.
my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate=False)

+----------+-----------------------------------------------+
|DocumentID|text                                           |
+----------+-----------------------------------------------+
|A001      |Mark Johansson has bought a stock on 02/28/2020|
|A002      |John has bought a house on 03/15/2022          |
+----------+-----------------------------------------------+



### Shifting days according to the ID column

We use the `finance.DocumentHashCoder()` annotator to determine the number of days to shift. This annotator hashes the specified column and creates a new document column containing the day-shift information. The `finance.DeIdentification()` annotator then deidentifies this new document. We should set the seed parameter so that the hash is consistent across runs.  

In [None]:
# Turn the raw "text" column into a document annotation.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Hash the "DocumentID" column with a fixed seed and emit "document2",
# the new document column that carries the day-shift information.
documentHasher = (
    finance.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setPatientIdColumn("DocumentID")
    .setRangeDays(100)
    .setNewDateShift("shift_days")
    .setSeed(100)
)

# No sentence detector in this pipeline: every stage below reads "document2".
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document2"])
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
)

finance_ner = (
    finance.NerModel.pretrained('finner_deid_sec', "en", "finance/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)

# Group the token-level NER tags into chunks.
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# De-identify the chunks found in "document2"; setUseShifDays(True) enables
# the day shift for DATE entities.
deid = (
    finance.DeIdentification()
    .setInputCols(["document2", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource('faker')
    .setUseShifDays(True)
    .setRegion('us')
)

stages = [
    documentAssembler,
    documentHasher,
    tokenizer,
    embeddings,
    finance_ner,
    ner_converter,
    deid,
]
pipeline = nlp.Pipeline(stages=stages)

# Fit on a one-row placeholder frame that has the columns the pipeline reads.
empty_data = spark.createDataFrame([["", ""]]).toDF("text", "DocumentID")
pipeline_model = pipeline.fit(empty_data)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_deid_sec download started this may take some time.
[OK!]


In [None]:
# Run the fitted pipeline on the sample documents.
output = pipeline_model.transform(my_input_df)

# Show each document's ID and original text next to its de-identified result.
output.select('DocumentID','text', 'deidentified.result').show(truncate = False)

+----------+-----------------------------------------------+-------------------------------------------+
|DocumentID|text                                           |result                                     |
+----------+-----------------------------------------------+-------------------------------------------+
|A001      |Mark Johansson has bought a stock on 02/28/2020|[<PERSON> has bought a stock on 02/23/2020]|
|A002      |John has bought a house on 03/15/2022          |[<PERSON> has bought a house on 04/13/2022]|
+----------+-----------------------------------------------+-------------------------------------------+



### Shifting days according to specified values

Instead of shifting days according to the ID column, we can specify the shift values explicitly in another column.

```python
documentHasher = finance.DocumentHashCoder()\
    .setInputCols("document")\
    .setOutputCol("document2")\
    .setDateShiftColumn("dateshift")
```


In [None]:
# Same two toy documents, now with an explicit per-row "dateshift" value
# (kept as strings).
data = pd.DataFrame({
    'DocumentID': ['A001', 'A002'],
    'text': [
        'Mark Johansson has bought a stock on 02/28/2020',
        'John has bought a house on 03/15/2022',
    ],
    'dateshift': ['5', '10'],
})

# Hand the pandas frame to Spark and display it without truncating the text.
my_input_df = spark.createDataFrame(data)
my_input_df.show(truncate=False)

+----------+-----------------------------------------------+---------+
|DocumentID|text                                           |dateshift|
+----------+-----------------------------------------------+---------+
|A001      |Mark Johansson has bought a stock on 02/28/2020|5        |
|A002      |John has bought a house on 03/15/2022          |10       |
+----------+-----------------------------------------------+---------+



In [None]:
# Turn the raw "text" column into a document annotation.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Take the shift for each row from the "dateshift" column (instead of hashing
# an ID column) and emit "document2", the new document column that carries
# the day-shift information.
# The chain is parenthesised so that no trailing backslash is left dangling
# after the last setter.
documentHasher = (
    finance.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setDateShiftColumn("dateshift")
)

# No sentence detector in this pipeline: every stage below reads "document2".
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document2"])
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
)

finance_ner = (
    finance.NerModel.pretrained('finner_deid_sec', "en", "finance/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)

# Group the token-level NER tags into chunks.
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# De-identify the chunks found in "document2"; setUseShifDays(True) enables
# the day shift for DATE entities.
obfuscation = (
    finance.DeIdentification()
    .setInputCols(["document2", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource('faker')
    .setUseShifDays(True)
    .setRegion('us')
)

pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    documentHasher,
    tokenizer,
    embeddings,
    finance_ner,
    ner_converter,
    obfuscation,
])

# Fit on a one-row placeholder frame that has the columns the pipeline reads.
empty_data = spark.createDataFrame([["", "", ""]]).toDF("text", "DocumentID", "dateshift")

pipeline_model = pipeline.fit(empty_data)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_deid_sec download started this may take some time.
[OK!]


In [None]:
# Run the fitted pipeline on the sample documents.
output = pipeline_model.transform(my_input_df)

# Show the original text and its requested shift next to the de-identified result.
output.select('text', 'dateshift', 'deidentified.result').show(truncate = False)

+-----------------------------------------------+---------+-------------------------------------------+
|text                                           |dateshift|result                                     |
+-----------------------------------------------+---------+-------------------------------------------+
|Mark Johansson has bought a stock on 02/28/2020|5        |[<PERSON> has bought a stock on 03/04/2020]|
|John has bought a house on 03/15/2022          |10       |[<PERSON> has bought a house on 03/25/2022]|
+-----------------------------------------------+---------+-------------------------------------------+



### Masking Unnormalized Date Formats

The `setUnnormalizedDateMode()` parameter controls how DATE entities that cannot be normalized are handled; here it is set to `mask`. In the example below, note `03Apr2022`: it cannot be normalized, so it is masked in the output.

In [None]:
# Two toy documents; the second date ("03Apr2022") is in a format that cannot
# be normalized, which is what setUnnormalizedDateMode("mask") is shown on.
data = pd.DataFrame(
    {'DocumentID' : ['A001', 'A002'],
     'text' : ['Mark Johansson has bought a stock on 02/28/2020', 
               'John has bought a house on 03Apr2022'],
     'dateshift' : ['5', '10']
    }
)

my_input_df = spark.createDataFrame(data)


# Turn the raw "text" column into a document annotation.
documentAssembler = (
    nlp.DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Take the shift for each row from the "dateshift" column and emit
# "document2", the new document column that carries the day-shift information.
documentHasher = (
    finance.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setDateShiftColumn("dateshift")
)

# No sentence detector in this pipeline: every stage below reads "document2".
# The previous version reused a `textSplitter` left over from an earlier cell,
# which split the original "document" column and therefore did not pass the
# day-shift information on to DeIdentification.
tokenizer = (
    nlp.Tokenizer()
    .setInputCols(["document2"])
    .setOutputCol("token")
)

embeddings = (
    nlp.BertEmbeddings.pretrained("bert_embeddings_sec_bert_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
)

# Variable name kept as-is so any later reference to `legal_ner` still resolves;
# the model loaded is the finance one.
legal_ner = (
    finance.NerModel.pretrained('finner_deid_sec', "en", "finance/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)

# Group the token-level NER tags into chunks.
ner_converter = (
    finance.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

# Read "document2" like every other stage so the per-row shift is applied.
# setUnnormalizedDateMode("mask") masks DATE entities that cannot be normalized.
obfuscation = (
    finance.DeIdentification()
    .setInputCols(["document2", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource('faker')
    .setUseShifDays(True)
    .setRegion('us')
    .setUnnormalizedDateMode("mask")
)

pipeline = nlp.Pipeline(stages=[
    documentAssembler,
    documentHasher,
    tokenizer,
    embeddings,
    legal_ner,
    ner_converter,
    obfuscation,
])


output = pipeline.fit(my_input_df).transform(my_input_df)

output.select('text', 'dateshift', 'deidentified.result').show(truncate = False)

bert_embeddings_sec_bert_base download started this may take some time.
Approximate size to download 390.4 MB
[OK!]
finner_deid_sec download started this may take some time.
[OK!]
+-----------------------------------------------+---------+-------------------------------------------+
|text                                           |dateshift|result                                     |
+-----------------------------------------------+---------+-------------------------------------------+
|Mark Johansson has bought a stock on 02/28/2020|5        |[<PERSON> has bought a stock on 03/10/2020]|
|John has bought a house on 03Apr2022           |10       |[<PERSON> has bought a house on <DATE>]    |
+-----------------------------------------------+---------+-------------------------------------------+



# Structured Deidentification

In [None]:
# Quietly download the sample table (shell command run from the notebook).
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/legal-nlp/data/hipaa-table-001.txt

# Copy the downloaded file from the driver's local filesystem to the DBFS root.
# Assumes wget saved it under /databricks/driver (the notebook's working directory) — TODO confirm.
dbutils.fs.cp("file:/databricks/driver/hipaa-table-001.txt", "dbfs:/") 

In [None]:
# Parse the tab-separated table with pandas, then convert it to a Spark
# DataFrame; `df` only ever refers to the Spark frame.
hipaa_pdf = pd.read_csv("hipaa-table-001.txt", sep="\t")
df = spark.createDataFrame(hipaa_pdf)
df.show(truncate=False)

+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|NAME           |DOB       |AGE|ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|Cecilia Chapman|04/02/1935|83 |711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|Iris Watson    |03/10/2009|9  |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|Bryar Pitts    |11/01/1921|98 |5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|Theodore Lowe  |13/02/2002|16 |Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|Calista Wise   |20/08/1942|76 |7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|Kyla Olsen     |12/05/1973|45 |Ap #651-8679 Sodales Av. Tamunin

In [None]:
# Columns to obfuscate, keyed by column name with an entity label as value.
faker_columns = {"NAME": "PATIENT", "AGE": "AGE"}

# Replacement values come from the "faker" reference source.
obfuscator = finance.StructuredDeidentification(
    spark,
    faker_columns,
    obfuscateRefSource="faker",
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.show(truncate=False)

+-------------------+----------+----+----------------------------------------------------+-------+--------------+---+---+
|NAME               |DOB       |AGE |ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+-------------------+----------+----+----------------------------------------------------+-------+--------------+---+---+
|[Juventino Bracken]|04/02/1935|[60]|711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|[Gregoria Holy]    |03/10/2009|[5] |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|[Reubin Hitch]     |11/01/1921|[60]|5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|[Vesta Kansas]     |13/02/2002|[16]|Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|[Lorra Brazil]     |20/08/1942|[60]|7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|[Colon Lung]       |12/

In [None]:
# Custom obfuscation reference list: one `value#LABEL` entry per line
# (CLIENT names, plus CITY, STREET, LOC, PHONE, ADDRESS, ID and AGE values).
obfuscator_unique_ref_test = '''Will Perry#CLIENT
John Smith#CLIENT
Marvin MARSHALL#CLIENT
Hubert GROGAN#CLIENT
ALTHEA COLBURN#CLIENT
Kalil AMIN#CLIENT
Inci FOUNTAIN#CLIENT
Jackson WILLE#CLIENT
Jack SANTOS#CLIENT
Mahmood ALBURN#CLIENT
Marnie MELINGTON#CLIENT
Aysha GHAZI#CLIENT
Maryland CODER#CLIENT
Darene GEORGIOUS#CLIENT
Shelly WELLBECK#CLIENT
Min Kun JAE#CLIENT
Thomson THOMAS#CLIENT
Christian SUDDINBURG#CLIENT
Aberdeen#CITY
Louisburg St#STREET
France#LOC
5552312#PHONE
Calle del Libertador#ADDRESS
111#ID
20#AGE
30#AGE
40#AGE
50#AGE
60#AGE
'''

# Write the reference list to a file, using a path relative to the working directory.
# NOTE(review): the cell that consumes this file passes
# "/dbfs/obfuscator_unique_ref_test.txt" as obfuscateRefFile — confirm that the
# relative path written here ends up at that location (or copy it there).
with open('obfuscator_unique_ref_test.txt', 'w') as f:
  f.write(obfuscator_unique_ref_test)

In [None]:
# obfuscateRefSource = "file": take replacement values from the custom
# reference file instead of faker, with a fixed seed per column.
file_columns = {"NAME": "CLIENT", "AGE": "AGE"}
file_column_seeds = {"NAME": 23, "AGE": 23}

obfuscator = finance.StructuredDeidentification(
    spark,
    file_columns,
    obfuscateRefFile="/dbfs/obfuscator_unique_ref_test.txt",
    obfuscateRefSource="file",
    columnsSeed=file_column_seeds,
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.select("NAME", "AGE").show(truncate=False)

+------------------+----+
|NAME              |AGE |
+------------------+----+
|[Inci FOUNTAIN]   |[60]|
|[Jack SANTOS]     |[30]|
|[Darene GEORGIOUS]|[30]|
|[Shelly WELLBECK] |[40]|
|[Hubert GROGAN]   |[40]|
|[Kalil AMIN]      |[40]|
|[ALTHEA COLBURN]  |[60]|
|[Thomson THOMAS]  |[60]|
|[Jack SANTOS]     |[60]|
|[Will Perry]      |[20]|
|[Jackson WILLE]   |[60]|
|[Shelly WELLBECK] |[40]|
|[Kalil AMIN]      |[30]|
|[Marnie MELINGTON]|[30]|
|[Min Kun JAE]     |[30]|
|[Marvin MARSHALL] |[60]|
|[Marvin MARSHALL] |[50]|
|[Min Kun JAE]     |[30]|
|[Maryland CODER]  |[20]|
|[Marnie MELINGTON]|[20]|
+------------------+----+
only showing top 20 rows



In structured deidentification, we can **shift dates by n days** through the `days` parameter when the column contains dates.

In [None]:
# Small in-memory table; every value, including the dates, is a string.
people_rows = [
    ["Juan García", "13/02/1977", "711 Nulla St.", "140", "673 431234"],
    ["Will Smith", "23/02/1977", "1 Green Avenue.", "140", "+23 (673) 431234"],
    ["Pedro Ximénez", "11/04/1900", "Calle del Libertador, 7", "100", "912 345623"],
]
people_columns = ["NAME", "DOB", "ADDRESS", "SBP", "TEL"]

df = spark.createDataFrame(people_rows).toDF(*people_columns)
df.show(truncate=False)

+-------------+----------+-----------------------+---+----------------+
|NAME         |DOB       |ADDRESS                |SBP|TEL             |
+-------------+----------+-----------------------+---+----------------+
|Juan García  |13/02/1977|711 Nulla St.          |140|673 431234      |
|Will Smith   |23/02/1977|1 Green Avenue.        |140|+23 (673) 431234|
|Pedro Ximénez|11/04/1900|Calle del Libertador, 7|100|912 345623      |
+-------------+----------+-----------------------+---+----------------+



In [None]:
# NAME is obfuscated as an ID and DOB as a DATE, each with a fixed seed;
# `days=5` is the number of days by which the date column is shifted.
shift_columns = {"NAME": "ID", "DOB": "DATE"}
shift_column_seeds = {"NAME": 23, "DOB": 23}

obfuscator = finance.StructuredDeidentification(
    spark=spark,
    columns=shift_columns,
    columnsSeed=shift_column_seeds,
    obfuscateRefSource="faker",
    days=5,
)

In [None]:
# Apply the obfuscator to the table and display the result without truncation.
result = obfuscator.obfuscateColumns(df)
result.show(truncate=False)

+----------+------------+-----------------------+---+----------------+
|NAME      |DOB         |ADDRESS                |SBP|TEL             |
+----------+------------+-----------------------+---+----------------+
|[N2649912]|[18/02/1977]|711 Nulla St.          |140|673 431234      |
|[W466004] |[28/02/1977]|1 Green Avenue.        |140|+23 (673) 431234|
|[M403810] |[16/04/1900]|Calle del Libertador, 7|100|912 345623      |
+----------+------------+-----------------------+---+----------------+



# Save the Pipeline and Use it from Your Local

In [None]:
# Save the pipeline held in `model`, overwriting anything already stored at this path.
# NOTE(review): `model` is whatever an earlier cell last assigned to that name — confirm it is the intended deidentification pipeline.
model.write().overwrite().save('/dbfs/pipeline_deid')

In [None]:
# Load the pipeline back from the path it was saved to.
deid_pipeline = nlp.PretrainedPipeline.from_disk("/dbfs/pipeline_deid")

In [None]:
# Wrap `text` in a one-row Spark DataFrame with a single "text" column.
data = spark.createDataFrame([[text]]).toDF("text")

In [None]:
# List the stages of the loaded pipeline model.
deid_pipeline.model.stages

[DocumentAssembler_15eaf363dbe2,
 SentenceDetector_3f5f9ca05720,
 REGEX_TOKENIZER_9dc6df10defe,
 BERT_EMBEDDINGS_29ce72cd673e,
 FinanceNerModel_99ecfbac41c1,
 NER_CONVERTER_f64e99524cd7,
 ZeroShotRobertaNer_5d06c0297d21,
 NER_CONVERTER_f1201ec2169e,
 CONTEXTUAL-PARSER_88f6b3062049,
 CONTEXTUAL-PARSER_ac56b326e140,
 CONTEXTUAL-PARSER_5809f0605309,
 MERGE_62fe12dd1c9a]

In [None]:
# Run the loaded pipeline on the one-row DataFrame and show the resulting columns.
deid_pipeline.model.transform(data).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+-----+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|       zero_shot_ner|      zero_ner_chunk|alias|email|               phone|   merged_ner_chunks|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+-----+--------------------+--------------------+
|
Commission file ...|[{document, 0, 77...|[{document, 1, 10...|[{token, 1, 10, C...|[{word_embeddings...|[{named_entity, 1...|[{chunk, 24, 32, ...|[{named_entity, 1...|[{chunk, 75, 96, ...|   []|   []|[{chunk, 24, 32, ...|[{chunk, 24, 32, ...|
+-------------------