![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

# Legal Deidentification

## Setup

In [2]:
from johnsnowlabs import *
import pyspark.sql.functions as F
import pandas as pd


spark = start_spark()

📋 Loading license number 0 from /root/.johnsnowlabs/licenses/license_number_{number}_for_Spark-Healthcare_Spark-OCR.json
🤓 Looks like you are missing some jars, trying fetching them ...
Downloading 🫘+🚀 Java Library spark-nlp-gpu-assembly-5.5.0.jar
🙆 JSL Home setup in /root/.johnsnowlabs
👌 Launched [92mgpu optimized[39m session with with: 🚀Spark-NLP==5.5.0, 💊Spark-Healthcare==5.5.0, running on ⚡ PySpark==3.4.0


# Deidentification Model

Some legal information can be considered sensitive. (e.g.,document, organization, address, signer)

In [3]:
documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

sentenceDetector = (
    nlp.SentenceDetectorDLModel.pretrained("sentence_detector_dl", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence")
)
# .setCustomBounds(["\n\n"])

tokenizer = nlp.Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

embeddings = (
    nlp.WordEmbeddingsModel.pretrained("legal_word_embeddings", "en", "legal/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

legal_ner = (
    legal.NerModel.pretrained("legner_contract_doc_parties_le", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)
# .setLabelCasing("upper")

ner_converter = (
    legal.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setReplaceLabels({"ALIAS": "PARTY"})
)  # "ALIAS" are secondary names of companies, so let's extract them also as PARTY

nlp_pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlp_pipeline.fit(empty_data)

sentence_detector_dl download started this may take some time.
Approximate size to download 514.9 KB
[OK!]
legal_word_embeddings download started this may take some time.
Approximate size to download 80.9 MB
[OK!]
legner_contract_doc_parties_le download started this may take some time.
[OK!]


### Pretrained NER models extracts:
- Document
- Date
- Party (Organization Name)
- Alias

In [4]:
legal_ner.getClasses()

['O',
 'I-DOC',
 'B-EFFDATE',
 'B-ORG',
 'I-ORG',
 'B-ALIAS',
 'I-ALIAS',
 'I-EFFDATE',
 'B-PARTY',
 'B-FORMER_NAME',
 'I-FORMER_NAME',
 'B-DOC',
 'I-PARTY']

In [5]:
text = """THIS STRATEGIC ALLIANCE AGREEMENT ("Agreement") is made and entered into as of December 14, 2016 , by and between Hyatt Franchising Latin America, L.L.C., a limited liability company organized and existing under the laws of the State of Delaware """

In [6]:
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [7]:
result_df = result.select(
    F.explode(F.arrays_zip(result.token.result, result.ner.result)).alias("cols")
).select(F.expr("cols['0']").alias("token"), F.expr("cols['1']").alias("ner_label"))

In [8]:
result_df.select("token", "ner_label").groupBy("ner_label").count().orderBy(
    "count", ascending=False
).show(truncate=False)

+---------+-----+
|ner_label|count|
+---------+-----+
|O        |32   |
|I-EFFDATE|3    |
|I-DOC    |3    |
|I-PARTY  |3    |
|B-DOC    |1    |
|B-PARTY  |1    |
|B-EFFDATE|1    |
+---------+-----+



### Check extracted sensitive entities

In [9]:
result.select(
    F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+---------------------------------+---------+
|chunk                            |ner_label|
+---------------------------------+---------+
|THIS STRATEGIC ALLIANCE AGREEMENT|DOC      |
|December 14, 2016                |EFFDATE  |
|Hyatt Franchising Latin America  |PARTY    |
+---------------------------------+---------+



## Masking and Obfuscation

### Replace these enitites with Tags

In [10]:
ner_converter = (
    legal.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
)

deidentification = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setReturnEntityMappings(True)
)  #  return a new column to save the mappings between the mask/obfuscated entities and original entities.
# .setMappingsColumn("MappingCol") # change the name of the column, 'aux' is default

deid_pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

model_deid = deid_pipeline.fit(empty_data)

In [11]:
result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [12]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|THIS STRATEGIC AL...|[{document, 0, 24...|[{document, 0, 24...|[{token, 0, 3, TH...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 32, T...|[{document, 0, 18...|[{chunk, 0, 4, <D...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [13]:
result.select(
    F.explode(F.arrays_zip(result.sentence.result, result.deidentified.result)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")
).toPandas()

Unnamed: 0,sentence,deidentified
0,"THIS STRATEGIC ALLIANCE AGREEMENT (""Agreement""...","<DOC> (""Agreement"") is made and entered into a..."


We have three modes to mask the entities in the Deidentification annotator. You can select the modes using the `.setMaskingPolicy()` parameter. The methods are the followings:

**“entity_labels”**: Mask with the entity type of that chunk. (default) <br/>
**“same_length_chars”**: Mask the deid entities with same length of asterix ( * ) with brackets ( [ , ] ) on both end. <br/>
**“fixed_length_chars”**: Mask the deid entities with a fixed length of asterix ( * ). The length is setting up using the `setFixedMaskLength()` method. <br/>

Let's try each of these and compare the results:

In [14]:
# deid model with "entity_labels"
deid_entity_labels = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_entity_label")
    .setMode("mask")
    .setReturnEntityMappings(True)
    .setMaskingPolicy("entity_labels")
)

# deid model with "same_length_chars"
deid_same_length = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_same_length")
    .setMode("mask")
    .setReturnEntityMappings(True)
    .setMaskingPolicy("same_length_chars")
)

# deid model with "fixed_length_chars"
deid_fixed_length = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deid_fixed_length")
    .setMode("mask")
    .setReturnEntityMappings(True)
    .setMaskingPolicy("fixed_length_chars")
    .setFixedMaskLength(4)
)


deid_pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        deid_entity_labels,
        deid_same_length,
        deid_fixed_length,
    ]
)


empty_data = spark.createDataFrame([[""]]).toDF("text")

model_deid = deid_pipeline.fit(empty_data)

In [15]:
policy_result = model_deid.transform(spark.createDataFrame([[text]]).toDF("text"))

In [16]:
policy_result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|   deid_entity_label|                 aux|    deid_same_length|   deid_fixed_length|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|THIS STRATEGIC AL...|[{document, 0, 24...|[{document, 0, 24...|[{token, 0, 3, TH...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 32, T...|[{document, 0, 18...|[{chunk, 0, 3, **...|[{document, 0, 24...|[{document, 0, 17...|
+--------------------+--------------------+--------------------+----

In [17]:
policy_result.select(
    F.explode(
        F.arrays_zip(
            policy_result.sentence.result,
            policy_result.deid_entity_label.result,
            policy_result.deid_same_length.result,
            policy_result.deid_fixed_length.result,
        )
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("sentence"),
    F.expr("cols['1']").alias("deid_entity_label"),
    F.expr("cols['2']").alias("deid_same_length"),
    F.expr("cols['3']").alias("deid_fixed_length"),
).toPandas()

Unnamed: 0,sentence,deid_entity_label,deid_same_length,deid_fixed_length
0,"THIS STRATEGIC ALLIANCE AGREEMENT (""Agreement""...","<DOC> (""Agreement"") is made and entered into a...","[*******************************] (""Agreement""...","**** (""Agreement"") is made and entered into as..."


### Mapping Column

In [18]:
result.select("aux").show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|aux                                                                                                                                                                                                                                                             

In [19]:
result.select(
    F.explode(
        F.arrays_zip(
            result.aux.metadata, result.aux.result, result.aux.begin, result.aux.end
        )
    ).alias("cols")
).select(
    F.expr("cols['0']['originalChunk']").alias("chunk"),
    F.expr("cols['0']['beginOriginalChunk']").alias("beginChunk"),
    F.expr("cols['0']['endOriginalChunk']").alias("endChunk"),
    F.expr("cols['1']").alias("label"),
    F.expr("cols['2']").alias("beginLabel"),
    F.expr("cols['3']").alias("endLabel"),
).show(truncate=False)

+---------------------------------+----------+--------+---------+----------+--------+
|chunk                            |beginChunk|endChunk|label    |beginLabel|endLabel|
+---------------------------------+----------+--------+---------+----------+--------+
|THIS STRATEGIC ALLIANCE AGREEMENT|0         |32      |<DOC>    |0         |4       |
|December 14, 2016                |79        |95      |<EFFDATE>|51        |59      |
|Hyatt Franchising Latin America  |114       |144     |<PARTY>  |78        |84      |
+---------------------------------+----------+--------+---------+----------+--------+



## Reidentification

We can use `ReIdentification` annotator to go back to the original sentence.

In [20]:
reIdentification = (
    legal.ReIdentification()
    .setInputCols(["aux", "deidentified"])
    .setOutputCol("original")
)

In [21]:
reid_result = reIdentification.transform(result)

In [22]:
reid_result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|        deidentified|                 aux|            original|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|THIS STRATEGIC AL...|[{document, 0, 24...|[{document, 0, 24...|[{token, 0, 3, TH...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 0, 32, T...|[{document, 0, 18...|[{chunk, 0, 4, <D...|[{document, 0, 24...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----

In [23]:
print(text)

reid_result.select("original.result").show(truncate=False)

THIS STRATEGIC ALLIANCE AGREEMENT ("Agreement") is made and entered into as of December 14, 2016 , by and between Hyatt Franchising Latin America, L.L.C., a limited liability company organized and existing under the laws of the State of Delaware 
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|result                                                                                                                                                                                                                                                 |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[T

## Using multiple NER in the same pipeline

In [24]:
documentAssembler = nlp.DocumentAssembler().setInputCol("text").setOutputCol("document")

sentenceDetector = (
    nlp.SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")
)

tokenizer = nlp.Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

embeddings = (
    nlp.RoBertaEmbeddings.pretrained("roberta_embeddings_legal_roberta_base", "en")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
    .setStorageRef("roberta_embeddings_legal_roberta_base_en")
)

legal_ner = (
    legal.NerModel.pretrained("legner_contract_doc_parties", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)
# .setLabelCasing("upper")

ner_converter = (
    legal.NerConverter()
    .setInputCols(["sentence", "token", "ner"])
    .setOutputCol("ner_chunk")
    .setReplaceLabels({"ALIAS": "PARTY"})
)

ner_signers = (
    legal.NerModel.pretrained("legner_signers", "en", "legal/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner_signers")
)
# .setLabelCasing("upper")

ner_converter_signers = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner_signers"])
    .setOutputCol("ner_signer_chunk")
)

chunk_merge = (
    legal.ChunkMergeApproach()
    .setInputCols("ner_signer_chunk", "ner_chunk")
    .setOutputCol("deid_merged_chunk")
)

deidentification = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "deid_merged_chunk"])
    .setOutputCol("deidentified")
    .setMode("mask")
    .setIgnoreRegex(True)
)


nlp_pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        ner_signers,
        ner_converter_signers,
        chunk_merge,
        deidentification,
    ]
)

empty_data = spark.createDataFrame([[""]]).toDF("text")

model = nlp_pipeline.fit(empty_data)

roberta_embeddings_legal_roberta_base download started this may take some time.
Approximate size to download 447.2 MB
[OK!]
legner_contract_doc_parties download started this may take some time.
[OK!]
legner_signers download started this may take some time.
[OK!]


In [25]:
text = """ENTIRE AGREEMENT.  This Agreement contains the entire understanding of the parties hereto with respect to the transactions and matters contemplated hereby, supersedes all previous Agreements between i-Escrow and 2TheMart concerning the subject matter.

2THEMART.COM, INC.:                         I-ESCROW, INC.:

By:Dominic J. Magliarditi                By:Sanjay Bajaj Name: Dominic J. Magliarditi                Name: Sanjay Bajaj Title: President                            Title: VP Business Development Date: 6/21/99                               Date: 6/11/99 """

In [26]:
result = model.transform(spark.createDataFrame([[text]]).toDF("text"))


In [27]:
# legal_ner
result.select(
    F.explode(F.arrays_zip(result.ner_chunk.result, result.ner_chunk.metadata)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+----------------+---------+
|chunk           |ner_label|
+----------------+---------+
|ENTIRE AGREEMENT|DOC      |
+----------------+---------+



In [28]:
# ner_signers
result.select(
    F.explode(
        F.arrays_zip(result.ner_signer_chunk.result, result.ner_signer_chunk.metadata)
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+-----------------------+--------------+
|chunk                  |ner_label     |
+-----------------------+--------------+
|INC                    |PARTY         |
|J. Magliarditi         |SIGNING_PERSON|
|Bajaj                  |SIGNING_PERSON|
|Dominic J. Magliarditi |SIGNING_PERSON|
|Sanjay Bajaj           |SIGNING_PERSON|
|President              |SIGNING_TITLE |
|VP Business Development|SIGNING_TITLE |
+-----------------------+--------------+



In [29]:
# merged_chunk
result.select(
    F.explode(
        F.arrays_zip(result.deid_merged_chunk.result, result.deid_merged_chunk.metadata)
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("chunk"), F.expr("cols['1']['entity']").alias("ner_label")
).show(truncate=False)

+-----------------------+--------------+
|chunk                  |ner_label     |
+-----------------------+--------------+
|ENTIRE AGREEMENT       |DOC           |
|INC                    |PARTY         |
|J. Magliarditi         |SIGNING_PERSON|
|Bajaj                  |SIGNING_PERSON|
|Dominic J. Magliarditi |SIGNING_PERSON|
|Sanjay Bajaj           |SIGNING_PERSON|
|President              |SIGNING_TITLE |
|VP Business Development|SIGNING_TITLE |
+-----------------------+--------------+



In [30]:
result.select(
    F.explode(F.arrays_zip(result.sentence.result, result.deidentified.result)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")
).toPandas()

Unnamed: 0,sentence,deidentified
0,ENTIRE AGREEMENT.,<DOC>.
1,This Agreement contains the entire understandi...,This Agreement contains the entire understandi...
2,"2THEMART.COM, INC.: I-...","2THEMART.COM, INC.: I-..."


## Obfuscation mode

In the obfuscation mode **DeIdentificationModel** will replace sensitive entities with random values of the same type.


In [31]:
# This is the obfuscation dict for the new entities
obs_lines = """CTO#SIGNING_TITLE
Project Manager#SIGNING_TITLE
Sales Manager#SIGNING_TITLE
Business Manager#SIGNING_TITLE
Coordinator#SIGNING_TITLE
Officer#SIGNING_TITLE
Legal Agreement#DOC
Contract#DOC
Estate Document#DOC
official Document#DOC
Deed of Covenant#DOC
TURER INC#PARTY
Clark llc.#PARTY
SESA CO.#PARTY
John Snow Labs Inc#PARTY
MGT Trust Company, LLC.#PARTY
JAMES TURNER#SIGNING_PERSON
Juan Garcia#SIGNING_PERSON
Benjamin Dean#SIGNING_PERSON
Tommy Lee#SIGNING_PERSON
Dorothy Keen#SIGNING_PERSON
("AGREEMENT")#ALIAS
("TRADE COMPANY")#ALIAS
(the" Agreement")#ALIAS
("private company")#ALIAS
(the "Contract")#ALIAS
26-06-1990#EFFDATE
03/08/2025#EFFDATE
01/01/2045#EFFDATE
11/7/2016#EFFDATE
12-12-2022#EFFDATE """

with open("obfuscate.txt", "w") as f:
    f.write(obs_lines)

In [32]:
ner_converter_signers = (
    nlp.NerConverter()
    .setInputCols(["sentence", "token", "ner_signers"])
    .setOutputCol("ner_signer_chunk")
)

chunk_merge = (
    legal.ChunkMergeApproach()
    .setInputCols("ner_signer_chunk", "ner_chunk")
    .setOutputCol("deid_merged_chunk")
)

obfuscation = (
    legal.DeIdentification()
    .setInputCols(["sentence", "token", "ner_signer_chunk"])
    .setOutputCol("deidentified")
    .setMode("obfuscate")
    .setObfuscateDate(True)
    .setObfuscateRefFile("obfuscate.txt")
    .setObfuscateRefSource("both")
)  # default: "faker"


nlp_pipeline = nlp.Pipeline(
    stages=[
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        legal_ner,
        ner_converter,
        ner_signers,
        ner_converter_signers,
        chunk_merge,
        obfuscation,
    ]
)

obfuscation_model = nlp_pipeline.fit(empty_data)

In [33]:
text = """"Newegg" "Allied" Newegg Inc. Allied Esports International, Inc. By  Mitesh Patel By:  Judson Hannigan Name: Mitesh Patel Name: Judson Hannigan Title: VP, Marketing Title: CEO Newegg Inc. Allied Esports International, Inc. """

result = obfuscation_model.transform(spark.createDataFrame([[text]]).toDF("text"))

In [34]:
result.select(
    F.explode(F.arrays_zip(result.sentence.result, result.deidentified.result)).alias(
        "cols"
    )
).select(
    F.expr("cols['0']").alias("sentence"), F.expr("cols['1']").alias("deidentified")
).toPandas()

Unnamed: 0,sentence,deidentified
0,"""Newegg"" ""Allied"" Newegg Inc.","""Newegg"" ""Allied"" Newegg BlueSky Ventures, LP."
1,"Allied Esports International, Inc.","Platinum Law Group, PC."
2,By Mitesh Patel By: Judson Hannigan Name: Mi...,By Katina Parlor By: Bunny Caroli Name: Kati...
3,"Allied Esports International, Inc.","Platinum Law Group, PC."


## Use full pipeline in the Light model

In [35]:
light_model = nlp.LightPipeline(model)

annotated_text = light_model.annotate(text)


In [36]:
annotated_text["deidentified"]

['"<PARTY>" "<PARTY>" <PARTY>.',
 '<PARTY>.',
 'By  <SIGNING_PERSON> By:  <SIGNING_PERSON> Name: <SIGNING_PERSON> Name: <SIGNING_PERSON> Title: <SIGNING_TITLE> Title: <SIGNING_TITLE> <PARTY>.',
 '<PARTY>.']

In [37]:
obf_light_model = nlp.LightPipeline(obfuscation_model)

annotated_text = obf_light_model.annotate(text)


In [38]:
annotated_text["deidentified"]

['"Newegg" "Allied" Newegg BlueSky Ventures, LP.',
 'Platinum Law Group, PC.',
 'By  Katina Parlor By:  Bunny Caroli Name: Katina Parlor Name: Bunny Caroli Title: Director Title: Chief Compliance Officer (CCO) Ruby Real Estate, LLC.',
 'Platinum Law Group, PC.']

# Save the Pipeline and Use it from Your Local

In [39]:
model.write().overwrite().save("pipeline_deid")

In [40]:
deid_pipeline = nlp.PretrainedPipeline.from_disk("pipeline_deid")

In [41]:
data = spark.createDataFrame([[text]]).toDF("text")

In [42]:
deid_pipeline.model.stages

[DocumentAssembler_4f5d0cd72a34,
 SentenceDetector_94ef3f719e85,
 REGEX_TOKENIZER_596b5138ec8d,
 ROBERTA_EMBEDDINGS_ad42a9ffcbbd,
 MedicalNerModel_93f728ff96e5,
 NER_CONVERTER_fe71246827e9,
 LegalNerModel_a75ed7366584,
 NerConverter_2fb826414e91,
 MERGE_99c90f475dae,
 DE-IDENTIFICATION_86d8fa17729c]

In [43]:
result = deid_pipeline.model.transform(data)

In [44]:
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|           ner_chunk|         ner_signers|    ner_signer_chunk|   deid_merged_chunk|        deidentified|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|"Newegg" "Allied"...|[{document, 0, 22...|[{document, 0, 28...|[{token, 0, 0, ",...|[{word_embeddings...|[{named_entity, 0...|[{chunk, 1, 6, Ne...|[{named_entity, 0...|[{chunk, 25, 27, ...|[{chunk, 1, 6, Ne...|[{document, 0, 27...|
+--------------------+--------------------+--------------------+----

# Pretrained Deidentification Pipeline

We have this pipeline can be used to deidentify legal information from texts.The legal information will be masked and obfuscated in the resulting text. The pipeline can mask and obfuscate `DOC`, `EFFDATE`, `PARTY`, `ALIAS`, `SIGNING_PERSON`, `SIGNING_TITLE`, `COUNTRY`, `CITY`, `STATE`, `STREET`, `ZIP`, `EMAIL`, `FAX`, `LOCATION-OTHER`, `DATE`,`PHONE` entities.

In [45]:
deid_pipeline = nlp.PretrainedPipeline("legpipe_deid", "en", "legal/models")

legpipe_deid download started this may take some time.
Approx size to download 479.9 MB
[OK!]


In [46]:
deid_pipeline.model.stages

[DocumentAssembler_bedb6ad8e416,
 SentenceDetector_7cb9cad055b9,
 REGEX_TOKENIZER_33de4324d9e9,
 DateMatcher_8860a592901e,
 Date2Chunk_d279ac477b89,
 ROBERTA_EMBEDDINGS_b915dff90901,
 LegalNerModel_8f2f91053ea4,
 NER_CONVERTER_f2f2eec81402,
 LegalNerModel_2581d15a167e,
 NER_CONVERTER_4502c0659baf,
 LegalNerModel_a75ed7366584,
 NER_CONVERTER_8d11e477e502,
 CONTEXTUAL-PARSER_eb730f0f4a6c,
 CONTEXTUAL-PARSER_c310f42dcaa2,
 CONTEXTUAL-PARSER_712bc31c58d5,
 CONTEXTUAL-PARSER_33a381cc561e,
 CONTEXTUAL-PARSER_19bbf8091ca3,
 MERGE_a653ce055651,
 DE-IDENTIFICATION_ae309c4ceb31,
 DE-IDENTIFICATION_746e07dacaec,
 DE-IDENTIFICATION_6a27cdc8a13f,
 DE-IDENTIFICATION_27363070431f]

In [47]:
text = """CARGILL, INCORPORATED

By:     Pirkko Suominen



Name: Pirkko Suominen Title: Director, Bio Technology Development  Center,  Date:   10/19/2011

BIOAMBER, SAS

By:     Jean-François Huc



Name: Jean-François Huc  Title: President Date:   October 15, 2011

email : jeanfran@gmail.com
phone : 18087339090 """

In [48]:
deid_res = deid_pipeline.annotate(text)

In [49]:
deid_res.keys()

dict_keys(['obfuscated', 'email', 'document', 'ner_signers_chunk', 'deidentified', 'ner_contracts_chunk', 'alias', 'chiefs', 'masked_fixed_length_chars', 'ner_contracts', 'date', 'token', 'ner_signers', 'ner_generic_chunk', 'embeddings', 'ner_date_chunk', 'merged_ner_chunks', 'sentence', 'phone', 'orgs', 'masked_with_chars', 'ner_generic'])

In [50]:
pd.set_option("display.max_colwidth", 100)

df = pd.DataFrame(
    list(
        zip(
            deid_res["sentence"],
            deid_res["deidentified"],
            deid_res["masked_with_chars"],
            deid_res["masked_fixed_length_chars"],
            deid_res["obfuscated"],
        )
    ),
    columns=[
        "Sentence",
        "Masked",
        "Masked with Chars",
        "Masked with Fixed Chars",
        "Obfuscated",
    ],
)

df

Unnamed: 0,Sentence,Masked,Masked with Chars,Masked with Fixed Chars,Obfuscated
0,"CARGILL, INCORPORATED\n\nBy: Pirkko Suominen\n\n\n\nName: Pirkko Suominen Title: Director, B...","<ORG>\n\nBy: <PERSON>\n\n\n\nName: <PERSON> Title: <PROFESSION>, Date: <DATE>\n\nBIOAMBER...",[*******************]\n\nBy: [*************]\n\n\n\nName: [*************] Title: [**********...,"****\n\nBy: ****\n\n\n\nName: **** Title: ****, Date: ****\n\nBIOAMBER, SAS\n\nBy: **...","Rohm and Haas\n\nBy: Aneita Keens\n\n\n\nName: Aneita Keens Title: Production assistant, rad..."
