![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/legal-nlp/11.0.Deidentification.ipynb)

# Legal Deidentification

# Installation

In [1]:
! pip install -q johnsnowlabs

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.0/141.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m683.3/683.3 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.8/620.8 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.6/95.6 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m12.5 MB/s[0

## Automatic Installation
Using my.johnsnowlabs.com SSO

In [None]:
from johnsnowlabs import nlp, legal

# nlp.install(force_browser=True)

## Manual downloading
If you are not registered in my.johnsnowlabs.com, you received a license via e-email or you are using Safari, you may need to do a manual update of the license.

- Go to my.johnsnowlabs.com
- Download your license
- Upload it using the following command

In [None]:
from google.colab import files

print("Please Upload your John Snow Labs License using the button below")
license_keys = files.upload()

- Install it

In [None]:
nlp.install()

# Starting

In [2]:
spark = nlp.start()

📋 Loading license number 0 from /root/.johnsnowlabs/licenses/license_number_{number}_for_Spark-Healthcare_Spark-OCR.json
👌 Launched [92mcpu optimized[39m session with with: 🚀Spark-NLP==5.5.0, 💊Spark-Healthcare==5.5.0, running on ⚡ PySpark==3.4.0


# Deidentification Model

Some legal information can be considered sensitive. (e.g.,document, organization, address, signer)

In [None]:
documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

sentenceDetector = (
    nlp.SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
)

tokenizer = nlp.Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setStorageRef("roberta_embeddings_legal_roberta_base_en")
)

legal_ner = (
    legal.NerModel.pretrained("legner_contract_doc_parties_lg", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)
# .setLabelCasing("upper")

ner_converter = (
    legal.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setReplaceLabels({"ALIAS": "PARTY"})
)  # "ALIAS" are secondary names of companies, so let's extract them also as PARTY

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]
legner_contract_doc_parties_lg download started this may take some time.
[OK!]


### Pretrained NER models extracts:
- Document
- Date
- Party (Organization Name)
- Alias

In [8]:
legal_ner.getClasses()

['O',
 'B-EFFDATE',
 'B-ORG',
 'I-DOC',
 'I-ORG',
 'B-ALIAS',
 'I-ALIAS',
 'I-EFFDATE',
 'B-PARTY',
 'B-FORMER_NAME',
 'I-FORMER_NAME',
 'B-DOC',
 'I-PARTY']

In [9]:
text = """THIS STRATEGIC ALLIANCE AGREEMENT ("Agreement") is made and entered into as of December 14, 2016 , by and between Hyatt Franchising Latin America, L.L.C. a limited liability company organized and existing under the laws of the State of Delaware"""

In [10]:
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [None]:
from pyspark.sql import functions as F

result_df = result.select(
    F.explode(F.arrays_zip(result.token.result, result.ner.result)).alias("cols")
).select(F.expr("cols['0']").alias("token"), F.expr("cols['1']").alias("ner_label"))

In [None]:
result_df.select("token", "ner_label").groupBy("ner_label").count().orderBy(
    "count", ascending=False
).show(truncate=False)

+---------+-----+
|ner_label|count|
+---------+-----+
|O        |31   |
|I-PARTY  |5    |
|I-EFFDATE|3    |
|I-DOC    |2    |
|B-DOC    |1    |
|B-PARTY  |1    |
|B-EFFDATE|1    |
+---------+-----+



### Check extracted sensitive entities
We have left DOC (document type) as a sensitive entity. If it is not, you only need to blacklist the entity in the NerConverterInternal:

```
legal_ner = legal.NerModel.pretrained("legner_contract_doc_parties_lg", "en", "legal/models")\
    .setInputCols(["sentence", "token", "embeddings"]) \
    .setOutputCol("ner")
    .setBlackList(["DOC"])
```

In [None]:
result.select(
    F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+--------------------------------------+---------+
|chunk                                 |ner_label|
+--------------------------------------+---------+
|STRATEGIC ALLIANCE AGREEMENT          |DOC      |
|December 14, 2016                     |EFFDATE  |
|Hyatt Franchising Latin America, L.L.C|PARTY    |
+--------------------------------------+---------+



## Masking and Obfuscation

### Replace these enitites with Tags

In [None]:
deidentification = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setReturnEntityMappings(True)
)  #  return a new column to save the mappings between the mask/obfuscated entities and original entities. REquired for "ReIdentification"
# .setMappingsColumn("MappingCol") # change the name of the column, 'aux' is default

deidPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

model_deid = deidPipeline.fit(empty_data)

In [15]:
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [16]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|THIS STRATEGIC AL...|[{document, 0, 24...|[{document, 0, 24...|[{token, 0, 3, TH...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 5, 32, S...|[{document, 0, 18...|[{chunk, 5, 9, <D...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
reIdentification = (
    legal.ReIdentification()
    .setInputCols(["aux", "deidentified"])
    .setOutputCol("original")
)

In [18]:
reid_result = reIdentification.transform(result)

In [19]:
reid_result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|            original|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|THIS STRATEGIC AL...|[{document, 0, 24...|[{document, 0, 24...|[{token, 0, 3, TH...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 5, 32, S...|[{document, 0, 18...|[{chunk, 5, 9, <D...|[{document, 0, 24...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

# ReIdentification

In [None]:
print(text)

reid_result.select("original.result").show(truncate=False)

THIS STRATEGIC ALLIANCE AGREEMENT ("Agreement") is made and entered into as of December 14, 2016 , by and between Hyatt Franchising Latin America, L.L.C. a limited liability company organized and existing under the laws of the State of Delaware
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[THIS S

In [None]:
result.select(
    F.explode(F.arrays_zip(result.sentence.result, result.deidentified.result)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")
).toPandas()

Unnamed: 0,sentence,deidentified
0,"THIS STRATEGIC ALLIANCE AGREEMENT (""Agreement""...","THIS <DOC> (""Agreement"") is made and entered i..."


## Other different masking strategies

We have three modes to mask the entities in the Deidentification annotator. You can select the modes using the `.setMaskingPolicy()` parameter. The methods are the followings:

**“entity_labels”**: Mask with the entity type of that chunk. (default) <br/>
**“same_length_chars”**: Mask the deid entities with same length of asterix ( * ) with brackets ( [ , ] ) on both end. <br/>
**“fixed_length_chars”**: Mask the deid entities with a fixed length of asterix ( * ). The length is setting up using the `setFixedMaskLength()` method. <br/>

Let's try each of these and compare the results:

In [None]:
# deid model with "entity_labels"
deid_entity_labels = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_entity_label")
    .setMode("mask")
    .setReturnEntityMappings(False)
    .setMaskingPolicy("entity_labels")
)

# deid model with "same_length_chars"
deid_same_length = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_same_length")
    .setMode("mask")
    .setReturnEntityMappings(False)
    .setMaskingPolicy("same_length_chars")
)

# deid model with "fixed_length_chars"
deid_fixed_length = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_fixed_length")
    .setMode("mask")
    .setReturnEntityMappings(False)
    .setMaskingPolicy("fixed_length_chars")
    .setFixedMaskLength(4)
)


deidPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        deid_entity_labels,
        deid_same_length,
        deid_fixed_length,
    ]
)


empty_data = spark.createDataFrame([[""]]).toDF("text")
model_deid = deidPipeline.fit(empty_data)

In [23]:
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [24]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|   deid_entity_label|    deid_same_length|   deid_fixed_length|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|THIS STRATEGIC AL...|[{document, 0, 24...|[{document, 0, 24...|[{token, 0, 3, TH...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 5, 32, S...|[{document, 0, 18...|[{document, 0, 24...|[{document, 0, 17...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [None]:
result.select(
    F.explode(
        F.arrays_zip(
            result.sentence.result,
            result.deid_entity_label.result,
            result.deid_same_length.result,
            result.deid_fixed_length.result,
        )
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("sentence"),
    F.expr("cols['1']").alias("deid_entity_label"),
    F.expr("cols['2']").alias("deid_same_length"),
    F.expr("cols['3']").alias("deid_fixed_length"),
).toPandas()

Unnamed: 0,sentence,deid_entity_label,deid_same_length,deid_fixed_length
0,"THIS STRATEGIC ALLIANCE AGREEMENT (""Agreement""...","THIS <DOC> (""Agreement"") is made and entered i...","THIS [**************************] (""Agreement""...","THIS **** (""Agreement"") is made and entered in..."


### Mapping Column

In [26]:
result.select("ner_chunk").show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ner_chunk                                                                                                                                                                                                                                                                                                                                                                                                                                             |
+---------------------------------------------------------------------------------------------------------------------

In [None]:
result.select(
    F.explode(
        F.arrays_zip(
            result.ner_chunk.metadata, result.ner_chunk.begin, result.ner_chunk.end
        )
    ).alias("cols")
).select(
    F.expr("cols['0']['entity']").alias("label"),
    F.expr("cols['1']").alias("beginLabel"),
    F.expr("cols['2']").alias("endLabel"),
).show(truncate=False)

+-------+----------+--------+
|label  |beginLabel|endLabel|
+-------+----------+--------+
|DOC    |5         |32      |
|EFFDATE|79        |95      |
|PARTY  |114       |151     |
+-------+----------+--------+



## Using NER, ContextualParser and ZeroShotNER in the same Deideintification pipeline

In [None]:
# Create JSON file for PART
alias = {
    "entity": "ALIAS",
    "ruleScope": "document",
    "completeMatchRegex": "true",
    "regex": '["“].*?["”]',
    "matchScope": "sub-token",
    "contextLength": 100,
}

email = {
    "entity": "EMAIL",
    "ruleScope": "document",
    "completeMatchRegex": "true",
    "regex": "[\w-\.]+@([\w-]+\.)+[\w-]{2,4}",
    "matchScope": "sub-token",
    "contextLength": 100,
}

phone = {
    "entity": "PHONE",
    "ruleScope": "document",
    "completeMatchRegex": "true",
    "regex": "(\+?\d{1,3}[\s-]?)?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d+",
    "matchScope": "sub-token",
    "contextLength": 100,
}

import json

with open("alias.json", "w") as f:
    json.dump(alias, f)

with open("email.json", "w") as f:
    json.dump(email, f)

with open("phone.json", "w") as f:
    json.dump(phone, f)

In [None]:
documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

sentenceDetector = (
    nlp.SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
)

tokenizer = nlp.Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setStorageRef("roberta_embeddings_legal_roberta_base_en")
)

ner_model = (
    legal.NerModel.pretrained("legner_contract_doc_parties_lg", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

ner_converter = (
    legal.NerConverterInternal()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setWhiteList(["EFFDATE", "PARTY", "ALIAS"])
    .setReplaceLabels({"FORMER_NAME": "PARTY"})
    .setGreedyMode(True)
)

zero_shot_ner = (
    legal.ZeroShotNerModel.pretrained("legner_roberta_zeroshot", "en", "legal/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("zero_shot_ner")
    .setPredictionThreshold(0.1)
    .setEntityDefinitions(
        {
            "ADDRESS": ["Which address?", "Where is the location?"],
            "SIGNING_PERSON": ["Which person?", "What is the person name?"],
            "PARTY": ["Which LLC?", "Which Inc?", "Which PLC?", "Which Corp?"],
        }
    )
)


zeroshot_ner_converter = (
    legal.NerConverterInternal()
    .setInputCols(["sentence", "token", "zero_shot_ner"])
    .setOutputCol("zero_ner_chunk")
)
ner_model2 = (
    legal.NerModel.pretrained("legner_signers", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner2")
)

ner_converter2 = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner2"])
    .setOutputCol("ner_chunk2")
)

alias_parser = (
    legal.ContextualParserApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("alias")
    .setJsonPath("alias.json")
    .setPrefixAndSuffixMatch(False)
    .setOptionalContextRules(True)
    .setCaseSensitive(False)
)

email_parser = (
    legal.ContextualParserApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("email")
    .setJsonPath("email.json")
    .setPrefixAndSuffixMatch(False)
    .setOptionalContextRules(True)
    .setCaseSensitive(False)
)

phone_parser = (
    legal.ContextualParserApproach()
    .setInputCols(["sentence", "token"])
    .setOutputCol("phone")
    .setJsonPath("phone.json")
    .setPrefixAndSuffixMatch(False)
    .setOptionalContextRules(True)
    .setCaseSensitive(False)
)

chunk_merger = (
    legal.ChunkMergeApproach()
    .setInputCols(
        "email", "phone", "ner_chunk", "ner_chunk2", "zero_ner_chunk", "alias"
    )
    .setOutputCol("merged_ner_chunks")
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        ner_model2,
        ner_converter2,
        zero_shot_ner,
        zeroshot_ner_converter,
        alias_parser,
        email_parser,
        phone_parser,
        chunk_merger,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlpPipeline.fit(empty_data)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]
legner_contract_doc_parties_lg download started this may take some time.
[OK!]
legner_roberta_zeroshot download started this may take some time.
[OK!]
legner_signers download started this may take some time.
[OK!]


In [30]:
text = """
This Commercial Lease (this “Lease”) dated February 11, 2021, but made effective as of January 1, 2021 (the “Effective Date”), is made by and between 605 NASH, LLC, a California limited liability company (“Landlord”) and NANTKWEST, INC., a Delaware corporation (“Tenant”).

605 NASH, LLC,	 	NANTKWEST, inc.,
a California limited liability company	 	a Delaware corporation


By:	 	/s/ Charles Kenworthy	 	By:	 	/s/ Richard Adcock
Name: Charles N. Kenworthy	 	Name: Richard Adcock
Title:   Manager	 	Title:   CEO

Attention: Chuck Kenworthy	 	Attention: Chief Financial Officer
Email:
juan@johnsnowlabs.com
Telephone numbers:
304.123.333
304-123-333
+34 304-123-333
0034304123333
"""

In [None]:
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

# legal_ner
result.select(
    F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+-----------------+---------+
|chunk            |ner_label|
+-----------------+---------+
|February 11, 2021|EFFDATE  |
|January 1, 2021  |EFFDATE  |
|NASH, LLC        |PARTY    |
|NANTKWEST,       |PARTY    |
|605 NASH, LLC    |PARTY    |
+-----------------+---------+



In [None]:
# ner_signers
result.select(
    F.explode(F.arrays_zip(result.ner_chunk2.result, result.ner_chunk2.metadata)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+-----------------------------+--------------+
|chunk                        |ner_label     |
+-----------------------------+--------------+
|605                          |PARTY         |
|NASH, LLC,                   |PARTY         |
|NANTKWEST, INC               |PARTY         |
|NASH, LLC,\t \tNANTKWEST, inc|PARTY         |
|Charles Kenworthy            |SIGNING_PERSON|
|Richard Adcock               |SIGNING_PERSON|
|Charles N. Kenworthy         |SIGNING_PERSON|
|Richard Adcock               |SIGNING_PERSON|
|Manager                      |SIGNING_TITLE |
|CEO                          |SIGNING_TITLE |
|Chuck Kenworthy              |SIGNING_PERSON|
|Chief Financial Officer      |SIGNING_TITLE |
+-----------------------------+--------------+



In [None]:
# zero_shot_ner
result.select(
    F.explode(
        F.arrays_zip(result.zero_ner_chunk.result, result.zero_ner_chunk.metadata)
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+-------------+---------+
|chunk        |ner_label|
+-------------+---------+
|605 NASH, LLC|PARTY    |
|California   |ADDRESS  |
|NANTKWEST    |PARTY    |
|Delaware     |ADDRESS  |
+-------------+---------+



In [None]:
# merged_chunk
result.select(
    F.explode(
        F.arrays_zip(result.merged_ner_chunks.result, result.merged_ner_chunks.metadata)
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(n=50, truncate=False)

+-----------------------------+--------------+
|chunk                        |ner_label     |
+-----------------------------+--------------+
|“Lease”                      |ALIAS         |
|February 11, 2021            |EFFDATE       |
|January 1, 2021              |EFFDATE       |
|“Effective Date”             |ALIAS         |
|605 NASH, LLC                |PARTY         |
|California                   |ADDRESS       |
|“Landlord”                   |ALIAS         |
|NANTKWEST, INC               |PARTY         |
|Delaware                     |ADDRESS       |
|“Tenant”                     |ALIAS         |
|NASH, LLC,\t \tNANTKWEST, inc|PARTY         |
|Charles Kenworthy            |SIGNING_PERSON|
|Richard Adcock               |SIGNING_PERSON|
|Charles N. Kenworthy         |SIGNING_PERSON|
|Richard Adcock               |SIGNING_PERSON|
|Manager                      |SIGNING_TITLE |
|CEO                          |SIGNING_TITLE |
|Chuck Kenworthy              |SIGNING_PERSON|
|Chief Financ

## Obfuscation mode

In the obfuscation mode **DeIdentificationModel** will replace sensitive entities with random values of the same type.


### Using external [Faker](https://faker.readthedocs.io/en/master/) library

In [35]:
!pip install faker

Collecting faker
  Downloading Faker-33.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-33.0.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.9/1.9 MB[0m [31m107.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-33.0.0


In [None]:
from faker import Faker

fk = Faker()

In [None]:
# This is the obfuscation dict for the new entities
obs_lines = """CEO#SIGNING_TITLE
Chief Executive Officer#SIGNING_TITLE
Chief Legal Officer#SIGNING_TITLE
Chief Financial officer#SIGNING_TITLE
Legal Representative#SIGNING_TILE
"Alias"#ALIAS
"Alias"#ALIAS"""

for _ in range(25):
    add = fk.address().strip()
    for ad in add.split("\n"):
        obs_lines += f"\n{ad}#ADDRESS"
    obs_lines += f"\n{fk.name().strip()}#SIGNING_PERSON"
    obs_lines += f"\n{fk.date().strip()}#EFFDATE"
    obs_lines += f"\n{fk.company().strip()}#PARTY"
    obs_lines += f"\n{fk.phone_number().strip()}#PHONE"
    obs_lines += f"\n{fk.email().strip()}#EMAIL"

with open("obfuscate.txt", "w") as f:
    f.write(obs_lines)

In [None]:
# Previous Masking Annotators
# deid model with "entity_labels"
deid_entity_labels = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setMaskingPolicy("entity_labels")
)

# deid model with "same_length_chars"
deid_same_length = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("masked_with_chars")
    .setMode("mask")
    .setMaskingPolicy("same_length_chars")
)

# deid model with "fixed_length_chars"
deid_fixed_length = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("masked_fixed_length_chars")
    .setMode("mask")
    .setMaskingPolicy("fixed_length_chars")
    .setFixedMaskLength(4)
)


In [None]:
# Obfuscation with Faker
obfuscation = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "merged_ner_chunks"])
    .setOutputCol("obfuscated")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setObfuscateRefFile("obfuscate.txt")
    .setObfuscateRefSource("both")
)

nlpPipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        ner_model,
        ner_converter,
        ner_model2,
        ner_converter2,
        zero_shot_ner,
        zeroshot_ner_converter,
        alias_parser,
        email_parser,
        phone_parser,
        chunk_merger,
        deid_entity_labels,
        deid_same_length,
        deid_fixed_length,
        obfuscation,
    ]
)

obfuscation_model = nlpPipeline.fit(empty_data)

In [40]:
text = """This Commercial Lease (this “Lease”) dated February 11, 2021, but made effective as of January 1, 2021 (the “Effective Date”), is made by and between 605 NASH, LLC, a California limited liability company (“Landlord”) and NANTKWEST, INC., a Delaware corporation (“Tenant”).

605 NASH, LLC,	 	NANTKWEST, inc.,
a California limited liability company	 	a Delaware corporation


By:	 	/s/ Charles Kenworthy	 	By:	 	/s/ Richard Adcock
Name: Charles N. Kenworthy	 	Name: Richard Adcock
Title:   Manager	 	Title:   CEO

Address:
9922 Jefferson Blvd.
Culver City, CA 90232
Attention: Chuck Kenworthy	 	Attention: Chief Financial Officer cfo@johnkopkins.com (0031) 913-123"""

In [None]:
result = obfuscation_model.transform(spark.createDataFrame([[text]]).toDF("text"))
print("\n".join(result.select("deidentified.result").collect()[0].result))

This Commercial Lease (this <ALIAS>) dated <EFFDATE>, but made effective as of <EFFDATE> (the <ALIAS>), is made by and between <PARTY> a <ADDRESS> limited liability company (<ALIAS>) and <PARTY>., a <ADDRESS> corporation (<ALIAS>).
605 <PARTY>.,
a California limited liability company	 	a Delaware corporation
 	 	 	 	 	 	 
 	 	 	 	 	 	 
By:	 	/s/ <SIGNING_PERSON>	 	By:	 	/s/ <SIGNING_PERSON>
Name: <SIGNING_PERSON>	 	Name: <SIGNING_PERSON>
Title:   <SIGNING_TITLE>	 	Title:   <SIGNING_TITLE>
 	 	 	 	 	 	 
Address:
<ADDRESS>.
<ADDRESS> 90232
Attention: Chuck Kenworthy	 	Attention: <SIGNING_TITLE> <EMAIL> (<PHONE>


In [None]:
print("\n".join(result.select("obfuscated.result").collect()[0].result))

This Commercial Lease (this Paramount Group) dated April 12, 2021, but made effective as of March 2, 2021 (the Summit Capital), is made by and between Liberty Legal Solutions, PC a Anthonyton, OR 24470 limited liability company (Sunrise Holdings) and Emerald City Law Firm, PC., a Michelleside, CA 58081 corporation (Starlight Capital).
605 Sunburst Enterprises, Inc..,
a California limited liability company	 	a Delaware corporation
 	 	 	 	 	 	 
 	 	 	 	 	 	 
By:	 	/s/ Almeta Jacobus	 	By:	 	/s/ Lary Point
Name: Conchetta Deeds	 	Name: Lary Point
Title:   Deputy Treasurer	 	Title:   Agent-in-charge
 	 	 	 	 	 	 
Address:
06232 Kelley Lock.
South Alexandra, KS 90829 90232
Attention: Chuck Kenworthy	 	Attention: Legal Advisor Ambrose@hotmail.com (4814) 546-958


## Using Light Pipelines

In [None]:
light_model = nlp.LightPipeline(obfuscation_model)
annotated_text = light_model.annotate(text)
print("\n".join(annotated_text["deidentified"]))

This Commercial Lease (this <ALIAS>) dated <EFFDATE>, but made effective as of <EFFDATE> (the <ALIAS>), is made by and between <PARTY> a <ADDRESS> limited liability company (<ALIAS>) and <PARTY>., a <ADDRESS> corporation (<ALIAS>).
605 <PARTY>.,
a California limited liability company	 	a Delaware corporation
 	 	 	 	 	 	 
 	 	 	 	 	 	 
By:	 	/s/ <SIGNING_PERSON>	 	By:	 	/s/ <SIGNING_PERSON>
Name: <SIGNING_PERSON>	 	Name: <SIGNING_PERSON>
Title:   <SIGNING_TITLE>	 	Title:   <SIGNING_TITLE>
 	 	 	 	 	 	 
Address:
<ADDRESS>.
<ADDRESS> 90232
Attention: Chuck Kenworthy	 	Attention: <SIGNING_TITLE> <EMAIL> (<PHONE>


In [None]:
print("\n".join(annotated_text["obfuscated"]))

This Commercial Lease (this Paramount Group) dated April 12, 2021, but made effective as of March 2, 2021 (the Summit Capital), is made by and between Liberty Legal Solutions, PC a Anthonyton, OR 24470 limited liability company (Sunrise Holdings) and Emerald City Law Firm, PC., a Michelleside, CA 58081 corporation (Starlight Capital).
605 Sunburst Enterprises, Inc..,
a California limited liability company	 	a Delaware corporation
 	 	 	 	 	 	 
 	 	 	 	 	 	 
By:	 	/s/ Almeta Jacobus	 	By:	 	/s/ Lary Point
Name: Conchetta Deeds	 	Name: Lary Point
Title:   Deputy Treasurer	 	Title:   Agent-in-charge
 	 	 	 	 	 	 
Address:
06232 Kelley Lock.
South Alexandra, KS 90829 90232
Attention: Chuck Kenworthy	 	Attention: Legal Advisor Ambrose@hotmail.com (4814) 546-958
This Commercial Lease (this <ALIAS>) dated <EFFDATE>, but made effective as of <EFFDATE> (the <ALIAS>), is made by and between <PARTY> a <ADDRESS> limited liability company (<ALIAS>) and <PARTY>., a <ADDRESS> corporation (<ALIAS>).
6

## Shifting Days

We use the `medical.DocumentHashCoder()` annotator to determine shifting days. This annotator gets the hash of the specified column and creates a new document column containing day shift information. And then, the `medical.DeIdentification()` annotator deidentifies this new doc. We should set the seed parameter to hash consistently.  

In [None]:
import pandas as pd

data = pd.DataFrame(
    {
        "DocumentID": ["A001", "A001", "A002", "A002"],
        "text": [
            "Chris Brown was arrested on 10/02/2022",
            "Mark White has bought a stock on 02/28/2020",
            "John has bought a house on 03/15/2022",
            "John Moore was discharged on 12/31/2022",
        ],
    }
)

my_input_df = spark.createDataFrame(data)

my_input_df.show(truncate=False)

+----------+-------------------------------------------+
|DocumentID|text                                       |
+----------+-------------------------------------------+
|A001      |Chris Brown was arrested on 10/02/2022     |
|A001      |Mark White has bought a stock on 02/28/2020|
|A002      |John has bought a house on 03/15/2022      |
|A002      |John Moore was discharged on 12/31/2022    |
+----------+-------------------------------------------+



### Shifting days according to the ID column

We use the `legal.DocumentHashCoder()` annotator to determine shifting days. This annotator gets the hash of the specified column and creates a new document column containing day shift information. And then, the `legal.DeIdentification()` annotator deidentifies this new doc. We should set the seed parameter to hash consistently.  

In [None]:
documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

documentHasher = (
    legal.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setPatientIdColumn("DocumentID")
    .setRangeDays(100)
    .setNewDateShift("shift_days")
    .setSeed(100)
)

# sentenceDetector = nlp.SentenceDetector()\
#     .setInputCols(["document2"])\
#     .setOutputCol("sentence")

tokenizer = nlp.Tokenizer().setInputCols(["document2"]).setOutputCol("token")

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
    .setStorageRef("roberta_embeddings_legal_roberta_base_en")
)

legal_ner = (
    legal.NerModel.pretrained("legner_deid", "en", "legal/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)
# .setLabelCasing("upper")

ner_converter = (
    legal.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

deid = (
    legal.DeIdentification()
    .setInputCols(["document2", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource("faker")
    .setUseShifDays(True)
    .setRegion("us")
)

pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        documentHasher,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        deid,
    ]
)

empty_data = spark.createDataFrame([["", ""]]).toDF("text", "DocumentID")

pipeline_model = pipeline.fit(empty_data)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]
legner_deid download started this may take some time.
[OK!]


In [None]:
output = pipeline_model.transform(my_input_df)

output.select("DocumentID", "text", "deidentified.result").show(truncate=False)

+----------+-------------------------------------------+----------------------------------------------+
|DocumentID|text                                       |result                                        |
+----------+-------------------------------------------+----------------------------------------------+
|A001      |Chris Brown was arrested on 10/02/2022     |[Devota Fontan was arrested on 09/27/2022]    |
|A001      |Mark White has bought a stock on 02/28/2020|[Juana Nones has bought a stock on 02/23/2020]|
|A002      |John has bought a house on 03/15/2022      |[Darcy Eaton has bought a house on 04/13/2022]|
|A002      |John Moore was discharged on 12/31/2022    |[Phyllis Breeze was discharged on 01/29/2023] |
+----------+-------------------------------------------+----------------------------------------------+



### Shifting days according to specified values

Instead of shifting days according to ID column, we can specify shifting values with another column.

```python
documentHasher = legal.DocumentHashCoder()\
    .setInputCols("document")\
    .setOutputCol("document2")\
    .setDateShiftColumn("dateshift")\
```


In [None]:
data = pd.DataFrame(
    {
        "DocumentID": ["A001", "A001", "A002", "A002"],
        "text": [
            "Chris Brown was arrested on 10/02/2019",
            "Mark White has bought a stock on 02/28/2020",
            "John has bought a house on 03/15/2022",
            "John Moore was discharged on 12/31/2022",
        ],
        "dateshift": ["5", "5", "10", "10"],
    }
)


my_input_df = spark.createDataFrame(data)

my_input_df.show(truncate=False)

+----------+-------------------------------------------+---------+
|DocumentID|text                                       |dateshift|
+----------+-------------------------------------------+---------+
|A001      |Chris Brown was arrested on 10/02/2019     |5        |
|A001      |Mark White has bought a stock on 02/28/2020|5        |
|A002      |John has bought a house on 03/15/2022      |10       |
|A002      |John Moore was discharged on 12/31/2022    |10       |
+----------+-------------------------------------------+---------+



In [None]:
documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

documentHasher = (
    legal.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setDateShiftColumn("dateshift")
)


# sentenceDetector = nlp.SentenceDetecto()\
#     .setInputCols(["document2"])\
#     .setOutputCol("sentence")

tokenizer = nlp.Tokenizer().setInputCols(["document2"]).setOutputCol("token")

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
    .setStorageRef("roberta_embeddings_legal_roberta_base_en")
)

legal_ner = (
    legal.NerModel.pretrained("legner_deid", "en", "legal/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)
# .setLabelCasing("upper")

ner_converter = (
    legal.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

obfuscation = (
    legal.DeIdentification()
    .setInputCols(["document2", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource("faker")
    .setUseShifDays(True)
    .setRegion("us")
)

pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        documentHasher,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        obfuscation,
    ]
)

empty_data = spark.createDataFrame([["", "", ""]]).toDF(
    "text", "DocumentID", "dateshift"
)

pipeline_model = pipeline.fit(empty_data)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]
legner_deid download started this may take some time.
[OK!]


In [None]:
output = pipeline_model.transform(my_input_df)

output.select("text", "dateshift", "deidentified.result").show(truncate=False)

+-------------------------------------------+---------+-------------------------------------------------+
|text                                       |dateshift|result                                           |
+-------------------------------------------+---------+-------------------------------------------------+
|Chris Brown was arrested on 10/02/2019     |5        |[Alisia Apple was arrested on 10/07/2019]        |
|Mark White has bought a stock on 02/28/2020|5        |[Fayrene Hope has bought a stock on 03/04/2020]  |
|John has bought a house on 03/15/2022      |10       |[Rosebud Confer has bought a house on 03/25/2022]|
|John Moore was discharged on 12/31/2022    |10       |[Chandler Combs was discharged on 01/10/2023]    |
+-------------------------------------------+---------+-------------------------------------------------+



### Masking Unnormalized Date Formats

`setUnnormalizedDateMode()` parameter is used to mask the DATE entities that can not be normalized. In the example below, please check `03Apr2022` which couldn't be normalized and it is masked in the output.

In [None]:
data = pd.DataFrame(
    {
        "DocumentID": ["A001", "A001", "A002", "A002"],
        "text": [
            "Chris Brown was arrested on 10/02/2022",
            "Mark White has bought a stock on 02/28/2020",
            "John has bought a house on 03Apr2022",
            "John Moore was discharged on 12/31/2022",
        ],
        "dateshift": ["5", "5", "10", "10"],
    }
)

my_input_df = spark.createDataFrame(data)


documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

documentHasher = (
    legal.DocumentHashCoder()
    .setInputCols("document")
    .setOutputCol("document2")
    .setDateShiftColumn("dateshift")
)


# sentenceDetector = nlp.SentenceDetector()\
#     .setInputCols(["document2"])\
#     .setOutputCol("sentence")

tokenizer = nlp.Tokenizer().setInputCols(["document2"]).setOutputCol("token")

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["document2", "token"])
    .setOutputCol("embeddings")
    .setStorageRef("roberta_embeddings_legal_roberta_base_en")
)

legal_ner = (
    legal.NerModel.pretrained("legner_deid", "en", "legal/models")
    .setInputCols(["document2", "token", "embeddings"])
    .setOutputCol("ner")
)
# .setLabelCasing("upper")

ner_converter = (
    legal.NerConverterInternal()
    .setInputCols(["document2", "token", "ner"])
    .setOutputCol("ner_chunk")
)

obfuscation = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setDateTag("DATE")
    .setLanguage("en")
    .setObfuscateRefSource("faker")
    .setUseShifDays(True)
    .setRegion("us")
    .setUnnormalizedDateMode("mask")
)

pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        documentHasher,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        obfuscation,
    ]
)


output = pipeline.fit(my_input_df).transform(my_input_df)

output.select("text", "dateshift", "deidentified.result").show(truncate=False)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]
legner_deid download started this may take some time.
[OK!]
+-------------------------------------------+---------+----------------------------------------------+
|text                                       |dateshift|result                                        |
+-------------------------------------------+---------+----------------------------------------------+
|Chris Brown was arrested on 10/02/2022     |5        |[Kathalene Pali was arrested on 10/14/2022]   |
|Mark White has bought a stock on 02/28/2020|5        |[Leldon Push has bought a stock on 03/11/2020]|
|John has bought a house on 03Apr2022       |10       |[Garen Juneau has bought a house on 03Apr2022]|
|John Moore was discharged on 12/31/2022    |10       |[Heddy Liverpool was discharged on 01/12/2023]|
+-------------------------------------------+---------+----------------------------------------

# Structured Deidentification

In [None]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/legal-nlp/data/hipaa-table-001.txt

df = (
    spark.read.format("csv")
    .option("sep", "\t")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("hipaa-table-001.txt")
)

df.show(truncate=False)

+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|NAME           |DOB       |AGE|ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+---------------+----------+---+----------------------------------------------------+-------+--------------+---+---+
|Cecilia Chapman|04/02/1935|83 |711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|Iris Watson    |03/10/2009|9  |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|Bryar Pitts    |11/01/1921|98 |5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|Theodore Lowe  |13/02/2002|16 |Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|Calista Wise   |20/08/1942|76 |7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|Kyla Olsen     |12/05/1973|45 |Ap #651-8679 Sodales Av. Tamunin

In [None]:
obfuscator = legal.StructuredDeidentification(
    spark, {"NAME": "PATIENT", "AGE": "AGE"}, obfuscateRefSource="faker"
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.show(truncate=False)

+------------------+----------+-----+----------------------------------------------------+-------+--------------+---+---+
|NAME              |DOB       |AGE  |ADDRESS                                             |ZIPCODE|TEL           |SBP|DBP|
+------------------+----------+-----+----------------------------------------------------+-------+--------------+---+---+
|[Chanetta Comes]  |04/02/1935|[94] |711-2880 Nulla St. Mankato Mississippi              |69200  |(257) 563-7401|101|42 |
|[Odella Bending]  |03/10/2009|[8]  |P.O. Box 283 8562 Fusce Rd. Frederick Nebraska      |20620  |(372) 587-2335|159|122|
|[Carylon Claude]  |11/01/1921|[89] |5543 Aliquet St. Fort Dodge GA                      |20783  |(717) 450-4729|149|52 |
|[Mathews Solomons]|13/02/2002|[17] |Ap #867-859 Sit Rd. Azusa New York                  |39531  |(793) 151-6230|134|115|
|[Abel Hoe]        |20/08/1942|[66] |7292 Dictum Av. San Antonio MI                      |47096  |(492) 709-6392|139|78 |
|[Chrystal Crape]  |12/0

In [None]:
obfuscator_unique_ref_test = """Will Perry#CLIENT
John Smith#CLIENT
Marvin MARSHALL#CLIENT
Hubert GROGAN#CLIENT
ALTHEA COLBURN#CLIENT
Kalil AMIN#CLIENT
Inci FOUNTAIN#CLIENT
Jackson WILLE#CLIENT
Jack SANTOS#CLIENT
Mahmood ALBURN#CLIENT
Marnie MELINGTON#CLIENT
Aysha GHAZI#CLIENT
Maryland CODER#CLIENT
Darene GEORGIOUS#CLIENT
Shelly WELLBECK#CLIENT
Min Kun JAE#CLIENT
Thomson THOMAS#CLIENT
Christian SUDDINBURG#CLIENT
Aberdeen#CITY
Louisburg St#STREET
France#LOC
5552312#PHONE
Calle del Libertador#ADDRESS
111#ID
20#AGE
30#AGE
40#AGE
50#AGE
60#AGE
"""

with open("obfuscator_unique_ref_test.txt", "w") as f:
    f.write(obfuscator_unique_ref_test)

In [None]:
# obfuscateRefSource = "file"

obfuscator = legal.StructuredDeidentification(
    spark,
    {"NAME": "CLIENT", "AGE": "AGE"},
    obfuscateRefFile="/content/obfuscator_unique_ref_test.txt",
    obfuscateRefSource="file",
    columnsSeed={"NAME": 23, "AGE": 23},
)
obfuscator_df = obfuscator.obfuscateColumns(df)
obfuscator_df.select("NAME", "AGE").show(truncate=False)

+----------------------+----+
|NAME                  |AGE |
+----------------------+----+
|[Christian SUDDINBURG]|[60]|
|[Christian SUDDINBURG]|[30]|
|[Thomson THOMAS]      |[30]|
|[Aysha GHAZI]         |[40]|
|[Jack SANTOS]         |[40]|
|[Mahmood ALBURN]      |[40]|
|[Jackson WILLE]       |[60]|
|[Maryland CODER]      |[60]|
|[Kalil AMIN]          |[60]|
|[Kalil AMIN]          |[20]|
|[Thomson THOMAS]      |[60]|
|[Kalil AMIN]          |[40]|
|[Mahmood ALBURN]      |[30]|
|[Darene GEORGIOUS]    |[30]|
|[Jack SANTOS]         |[30]|
|[Maryland CODER]      |[60]|
|[Darene GEORGIOUS]    |[50]|
|[Maryland CODER]      |[30]|
|[Mahmood ALBURN]      |[20]|
|[Thomson THOMAS]      |[20]|
+----------------------+----+
only showing top 20 rows



We can **shift n days** in the structured deidentification through "days" parameter when the column is a Date.

In [None]:
df = spark.createDataFrame(
    [
        ["Juan García", "13/02/1977", "711 Nulla St.", "140", "673 431234"],
        ["Will Smith", "23/02/1977", "1 Green Avenue.", "140", "+23 (673) 431234"],
        ["Pedro Ximénez", "11/04/1900", "Calle del Libertador, 7", "100", "912 345623"],
    ]
).toDF("NAME", "DOB", "ADDRESS", "SBP", "TEL")
df.show(truncate=False)

+-------------+----------+-----------------------+---+----------------+
|NAME         |DOB       |ADDRESS                |SBP|TEL             |
+-------------+----------+-----------------------+---+----------------+
|Juan García  |13/02/1977|711 Nulla St.          |140|673 431234      |
|Will Smith   |23/02/1977|1 Green Avenue.        |140|+23 (673) 431234|
|Pedro Ximénez|11/04/1900|Calle del Libertador, 7|100|912 345623      |
+-------------+----------+-----------------------+---+----------------+



In [None]:
obfuscator = legal.StructuredDeidentification(
    spark=spark,
    columns={"NAME": "ID", "DOB": "DATE"},
    columnsSeed={"NAME": 23, "DOB": 23},
    obfuscateRefSource="faker",
    days=5,
)

In [59]:
result = obfuscator.obfuscateColumns(df)
result.show(truncate=False)

+---------------+------------+-----------------------+---+----------------+
|NAME           |DOB         |ADDRESS                |SBP|TEL             |
+---------------+------------+-----------------------+---+----------------+
|[MRDW ZDKBXD]  |[18/02/1977]|711 Nulla St.          |140|673 431234      |
|[UGJJ QKGBF]   |[28/02/1977]|1 Green Avenue.        |140|+23 (673) 431234|
|[EFEIV WTXRUFA]|[16/04/1900]|Calle del Libertador, 7|100|912 345623      |
+---------------+------------+-----------------------+---+----------------+



# Save the Pipeline and Use it from Your Local

In [None]:
model.write().overwrite().save("pipeline_deid")

In [61]:
deid_pipeline = nlp.PretrainedPipeline.from_disk("pipeline_deid")

In [62]:
data = spark.createDataFrame([[text]]).toDF("text")

In [63]:
deid_pipeline.model.stages

[DocumentAssembler_b12257598998,
 SentenceDetector_61976255ccfb,
 REGEX_TOKENIZER_929e7567a2e7,
 ROBERTA_EMBEDDINGS_ad42a9ffcbbd,
 LegalNerModel_2581d15a167e,
 NER_CONVERTER_fa87d8c4a8a1,
 LegalNerModel_a75ed7366584,
 NerConverter_401903d11811,
 ZeroShotRobertaNer_5d06c0297d21,
 NER_CONVERTER_051406fa648b,
 CONTEXTUAL-PARSER_885c814223c2,
 CONTEXTUAL-PARSER_319c4be4cdee,
 CONTEXTUAL-PARSER_2e1f324e5be3,
 MERGE_6d7115c64635]

In [64]:
deid_pipeline.model.transform(data).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|                ner2|          ner_chunk2|       zero_shot_ner|      zero_ner_chunk|               alias|               email|               phone|   merged_ner_chunks|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|This Commercial L...|[{document, 0, 70...|[{docu

# Pretrained Deidentification Pipeline

We have this pipeline can be used to deidentify legal information from texts.The legal information will be masked and obfuscated in the resulting text. The pipeline can mask and obfuscate `DOC`, `EFFDATE`, `PARTY`, `ALIAS`, `SIGNING_PERSON`, `SIGNING_TITLE`, `COUNTRY`, `CITY`, `STATE`, `STREET`, `ZIP`, `EMAIL`, `FAX`, `LOCATION-OTHER`, `DATE`,`PHONE` among many others.

In [3]:
deid_pipeline = nlp.PretrainedPipeline("legpipe_deid", "en", "legal/models")

legpipe_deid download started this may take some time.
Approx size to download 479.9 MB
[OK!]


In [4]:
deid_pipeline.model.stages

[DocumentAssembler_bedb6ad8e416,
 SentenceDetector_7cb9cad055b9,
 REGEX_TOKENIZER_33de4324d9e9,
 DateMatcher_8860a592901e,
 Date2Chunk_d279ac477b89,
 ROBERTA_EMBEDDINGS_b915dff90901,
 LegalNerModel_8f2f91053ea4,
 NER_CONVERTER_f2f2eec81402,
 LegalNerModel_2581d15a167e,
 NER_CONVERTER_4502c0659baf,
 LegalNerModel_a75ed7366584,
 NER_CONVERTER_8d11e477e502,
 CONTEXTUAL-PARSER_eb730f0f4a6c,
 CONTEXTUAL-PARSER_c310f42dcaa2,
 CONTEXTUAL-PARSER_712bc31c58d5,
 CONTEXTUAL-PARSER_33a381cc561e,
 CONTEXTUAL-PARSER_19bbf8091ca3,
 MERGE_a653ce055651,
 DE-IDENTIFICATION_ae309c4ceb31,
 DE-IDENTIFICATION_746e07dacaec,
 DE-IDENTIFICATION_6a27cdc8a13f,
 DE-IDENTIFICATION_27363070431f]

In [None]:
text = """CARGILL, INCORPORATED

By:     Pirkko Suominen



Name: Pirkko Suominen Title: Director, Bio Technology Development  Center,  Date:   10/19/2011

BIOAMBER, SAS

By:     Jean-François Huc



Name: Jean-François Huc  Title: President Date:   October 15, 2011

email : jeanfran@gmail.com
phone : 18087339090 """

In [None]:
deid_res = deid_pipeline.annotate(text)

In [7]:
deid_res.keys()

dict_keys(['obfuscated', 'email', 'document', 'ner_signers_chunk', 'deidentified', 'ner_contracts_chunk', 'alias', 'chiefs', 'masked_fixed_length_chars', 'ner_contracts', 'date', 'token', 'ner_signers', 'ner_generic_chunk', 'embeddings', 'ner_date_chunk', 'merged_ner_chunks', 'sentence', 'phone', 'orgs', 'masked_with_chars', 'ner_generic'])

In [None]:
import pandas as pd

pd.set_option("display.max_colwidth", 100)

df = pd.DataFrame(
    list(
        zip(
            deid_res["sentence"],
            deid_res["deidentified"],
            deid_res["masked_with_chars"],
            deid_res["masked_fixed_length_chars"],
            deid_res["obfuscated"],
        )
    ),
    columns=[
        "Sentence",
        "Masked",
        "Masked with Chars",
        "Masked with Fixed Chars",
        "Obfuscated",
    ],
)

df

Unnamed: 0,Sentence,Masked,Masked with Chars,Masked with Fixed Chars,Obfuscated
0,"CARGILL, INCORPORATED\n\nBy: Pirkko Suominen\n\n\n\nName: Pirkko Suominen Title: Director, B...","<ORG>\n\nBy: <PERSON>\n\n\n\nName: <PERSON> Title: <PROFESSION>, Date: <DATE>\n\nBIOAMBER...",[*******************]\n\nBy: [*************]\n\n\n\nName: [*************] Title: [**********...,"****\n\nBy: ****\n\n\n\nName: **** Title: ****, Date: ****\n\nBIOAMBER, SAS\n\nBy: **...","Rohm and Haas\n\nBy: Aneita Keens\n\n\n\nName: Aneita Keens Title: Production assistant, rad..."
