# De-identification of DICOM documents with encapsulated PDF documents

## Install spark-ocr python package
You need to specify:
- license
- path to `spark-ocr-assembly-[version].jar` and `spark-nlp-jsl-[version]`
- or `secret` for Spark OCR and `nlp_secret` for Spark NLP Internal
- `aws_access_key` and `aws_secret_key` for downloading pretrained models

For more details about Dicom de-identification please read:

 - [DICOM de-identification at scale in Visual NLP — Part 1.](https://medium.com/john-snow-labs/dicom-de-identification-at-scale-in-visual-nlp-part-1-68784177f5f0)

 - [DICOM de-identification at scale in Visual NLP — Part 2.](https://medium.com/john-snow-labs/dicom-de-identification-at-scale-in-visual-nlp-part-2-361af5e36412)

- [DICOM de-identification at scale in Visual NLP — Part 3.](https://medium.com/john-snow-labs/dicom-de-identification-at-scale-in-visual-nlp-part-3-61cecc3adb56)

In [1]:
# Credentials and secrets -- fill these in before running the notebook.
license = ""          # exported as both the Spark OCR and Spark NLP license when non-empty
secret = ""           # secret for Spark OCR
nlp_secret = ""       # secret for Spark NLP Internal
aws_access_key = ""   # AWS key pair, used for downloading pretrained models
aws_secret_key = ""

# Folder passed to the session starter as `jar_path`.
spark_ocr_jar_path = "../../../target/scala-2.12"

# Everything in the secret that precedes its first "-".
version, _, _ = secret.partition("-")

## Start Spark session

In [3]:
from sparkocr import start
import os
from pyspark import SparkConf

# The same license value is exported under both license variables,
# and only when a license was actually provided.
if license:
    os.environ.update({
        'JSL_OCR_LICENSE': license,
        'SPARK_NLP_LICENSE': license,
    })

# The AWS key pair is exported together; the access key alone gates the export.
if aws_access_key:
    os.environ.update({
        'AWS_ACCESS_KEY': aws_access_key,
        'AWS_SECRET_ACCESS_KEY': aws_secret_key,
    })

# Start the Spark session with the secrets and the jar location configured above.
spark = start(
    secret=secret,
    nlp_secret=nlp_secret,
    jar_path=spark_ocr_jar_path,
    nlp_internal="6.2.2",
)
# Leave the session as the last expression so the notebook displays it.
spark

Spark version: 3.5.0
Spark NLP version: 6.2.0
Spark NLP for Healthcare version: 6.2.0
Spark OCR version: 6.2.2rc1



## Import transformers and annotators

In [4]:
import os
import sys

from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp_jsl
from sparknlp_jsl.annotator import *

import sparkocr
from sparkocr.transformers import *
from sparkocr.utils import *
from sparkocr.enums import *
from sparkocr.schemas import BinarySchema

from pyspark.ml import PipelineModel, Pipeline
from pyspark.sql.functions import *

# Report the versions of the three libraries, one per line.
print(
    f"Spark NLP version: {sparknlp.version()}",
    f"Spark NLP internal version: {sparknlp_jsl.version()}",
    f"Spark OCR version: {sparkocr.version()}",
    sep="\n",
)

Spark NLP version: 6.2.0
Spark NLP internal version: 6.2.0
Spark OCR version: 6.2.2rc1


## Define Spark NLP pipeline for text de-identification

In [5]:
def deidentification_nlp_pipeline(input_column, prefix="", model="ner_deid_large"):
    """Build and fit the Spark NLP pipeline that finds sensitive entities in text.

    Stages, in order: document assembly, document normalization, sentence
    detection, tokenization, clinical word embeddings, NER tagging, and
    conversion of the NER tags into chunks limited to a fixed entity whitelist.

    Args:
        input_column: Name of the column that holds the text to analyze.
        prefix: String prepended to the name of every column the stages produce.
        model: Name of the pretrained NER model, loaded for "en" from
            "clinical/models".

    Returns:
        The pipeline fitted on a one-row DataFrame containing an empty string.
        The entity chunks are written to the ``prefix + "ner_chunk"`` column.

    Note:
        Uses the notebook-level ``spark`` session to create the fitting DataFrame.
    """
    def column(name):
        """Return ``name`` with the caller-supplied prefix applied."""
        return prefix + name

    assembler = (
        DocumentAssembler()
        .setInputCol(input_column)
        .setOutputCol(column("document_raw"))
    )

    # "<...>" spans and colons are the patterns handed to the normalizer,
    # with a single space configured as the replacement.
    normalizer = (
        DocumentNormalizer()
        .setInputCols(column("document_raw"))
        .setOutputCol(column("document"))
        .setAction("clean")
        .setPatterns(["<[^>]*>", ":"])
        .setReplacement(" ")
        .setPolicy("pretty_all")
    )

    # Sentence detection; a single line may hold several sentences.
    sentence_splitter = (
        SentenceDetector()
        .setInputCols([column("document")])
        .setOutputCol(column("sentence"))
    )

    word_tokenizer = (
        Tokenizer()
        .setInputCols([column("sentence")])
        .setOutputCol(column("token"))
    )

    # Clinical word embeddings feeding the NER model.
    clinical_embeddings = (
        WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
        .setInputCols([column("sentence"), column("token")])
        .setOutputCol(column("embeddings"))
        .setEnableInMemoryStorage(True)
    )

    ner_tagger = (
        MedicalNerModel.pretrained(model, "en", "clinical/models")
        .setInputCols([column("sentence"), column("token"), column("embeddings")])
        .setOutputCol(column("ner"))
    )

    # Only the whitelisted entity labels are turned into chunks.
    chunk_converter = (
        NerConverter()
        .setInputCols([column("sentence"), column("token"), column("ner")])
        .setOutputCol(column("ner_chunk"))
        .setWhiteList(['NAME', 'AGE', 'CONTACT', 'ID',
                       'LOCATION', 'PROFESSION', 'PERSON', 'DATE', 'DOCTOR'])
    )

    stages = [
        assembler,
        normalizer,
        sentence_splitter,
        word_tokenizer,
        clinical_embeddings,
        ner_tagger,
        chunk_converter,
    ]

    # Fit on a one-row frame holding an empty string in the input column.
    bootstrap_df = spark.createDataFrame([[""]]).toDF(input_column)
    return Pipeline(stages=stages).fit(bootstrap_df)

## Define Spark OCR pipeline

In [6]:
# Pull the encapsulated PDF out of each DICOM file; the input is kept.
dicom_to_pdf = (
    DicomToPdf()
    .setInputCols(["path"])
    .setOutputCol("pdf")
    .setKeepInput(True)
)

# Render the PDF as an image, with "text_image" as the fallback column.
pdf_to_image = (
    PdfToImage()
    .setInputCol("pdf")
    .setOutputCol("image")
    .setFallBackCol("text_image")
)

# Recognize the text on the image.
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setIgnoreResolution(False)
    .setPageIteratorLevel(PageIteratorLevel.SYMBOL)
    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)
    .setConfidenceThreshold(70)
)

# Find the coordinates of the sensitive chunks on the page.
position_finder = (
    PositionFinder()
    .setInputCols("ner_chunk")
    .setOutputCol("regions")
    .setPageMatrixCol("positions")
    .setOcrScaleFactor(1)
)

# Hide the sensitive data by drawing filled gray rectangles over the regions.
drawRegions = (
    ImageDrawRegions()
    .setInputCol("image")
    .setInputRegionsCol("regions")
    .setOutputCol("image_with_regions")
    .setFilledRect(True)
    .setRectColor(Color.gray)
)

# Turn the redacted image back into a PDF (written to the "pdf" column).
image_to_pdf = (
    ImageToPdf()
    .setInputCol("image_with_regions")
    .setOutputCol("pdf")
)

# Put the redacted PDF back into the DICOM file.
# NOTE: the variable name `dciom_update_pdf` (sic) is kept unchanged.
dciom_update_pdf = (
    DicomUpdatePdf()
    .setInputCol("path")
    .setInputPdfCol("pdf")
    .setOutputCol("dicom")
    .setKeepInput(True)
)

# De-identify the DICOM metadata.
dicom_deidentifier = (
    DicomMetadataDeidentifier()
    .setInputCols(["dicom"])
    .setOutputCol("dicom_cleaned")
)

# End-to-end pipeline: extract PDF -> image -> OCR -> NER -> locate -> redact
# -> rebuild PDF -> update DICOM -> clean metadata.
pipeline = PipelineModel(stages=[
    dicom_to_pdf,
    pdf_to_image,
    ocr,
    deidentification_nlp_pipeline(
        input_column="text",
        prefix="",
        model="ner_deid_generic_augmented",
    ),
    position_finder,
    drawRegions,
    image_to_pdf,
    dciom_update_pdf,
    dicom_deidentifier,
])

No error
No error
No error
No error
No error
embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[ | ]embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
Download done! Loading the resource.
[OK!]
ner_deid_generic_augmented download started this may take some time.
Approximate size to download 13.8 MB
[ | ]ner_deid_generic_augmented download started this may take some time.
Approximate size to download 13.8 MB
[ / ]Download done! Loading the resource.
[ \ ]

2025-12-19 20:19:23.188014: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-19 20:19:23.336935: W external/org_tensorflow/tensorflow/core/common_runtime/colocation_graph.cc:1218] Failed to place the graph without changing the devices of some resources. Some of the operations (that had to be colocated with resource generating operations) are not supported on the resources' devices. Current candidate devices are [
  /job:localhost/replica:0/task:0/device:CPU:0].
See below for details of this colocation group:
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_de

[OK!]


## Read dicom files

In [7]:
# Glob selecting the .dcm files under the "encapsulated" data folder.
dicom_path = './../data/dicom/encapsulated/*.dcm'

# Load the matching files through Spark's "binaryFile" data source.
dicom_df = (
    spark.read
    .format("binaryFile")
    .load(dicom_path)
)

## Run pipeline and store results

In [8]:
# Directory the de-identified DICOM files are saved to.
output_path = "./deidentified_pdf/"

def get_name(path: str, keep_subfolder_level: int = 0) -> str:
    """Return the file name from ``path`` without its last extension.

    The path is split on "/". The final component loses everything from its
    last "." onward, and the trailing ``keep_subfolder_level`` parent folders
    are kept in front of it.

    A component with no "." (e.g. ``"scan"``) or with only a leading "."
    (e.g. ``".hidden"``) is returned unchanged. Previously such names were
    reduced to an empty string, which would give the writer an empty value
    in the column used as ``nameField``.

    Args:
        path: "/"-separated path or URI of the source file.
        keep_subfolder_level: Number of parent folders to keep before the
            file name; 0 keeps the bare file name only.

    Returns:
        The "/"-joined parent folders (if requested) and extension-less name.
    """
    parts = path.split("/")
    stem, dot, _extension = parts[-1].rpartition(".")
    # Strip the extension only when something is left to serve as the name.
    if dot and stem:
        parts[-1] = stem
    return "/".join(parts[-keep_subfolder_level - 1:])

# Apply the whole pipeline to the loaded DICOM files.
result = pipeline.transform(dicom_df)

# Derive an output file name from each source path, then write the
# "dicom_cleaned" column as DICOM files, overwriting earlier output.
(
    result
    .withColumn("fileName", udf(get_name, StringType())(col("path")))
    .write
    .format("binaryFormat")
    .options(
        type="dicom",
        field="dicom_cleaned",
        prefix="",
        nameField="fileName",
    )
    .mode("overwrite")
    .save(output_path)
)

25/12/19 20:20:32 ERROR PositionFinder: PositionFinder unmatched:::Annotation(type: chunk, begin: 946, end: 1000, result: Industries Served Computer software, Banking, Insurance), index: 9
20:20:38, INFO Run DicomMetadataDeidentifier                        (0 + 1) / 1]
20:20:42, INFO DicomMetadataDeidentifier : Tag '00020100' Present in Strategy file but missing in Dicom.
20:20:42, ERROR DicomMetadataDeidentifier : Invalid VR : 'OB' For Action : 'hashId' Unsupported Action.
20:20:42, ERROR DicomMetadataDeidentifier : Invalid VR : 'CS' For Action : 'hashId' Unsupported Action.
20:20:42, ERROR DicomMetadataDeidentifier : Invalid VR : 'CS' For Action : 'hashId' Unsupported Action.
20:20:42, INFO DicomMetadataDeidentifier : Tag '00041432' Present in Strategy file but missing in Dicom.
20:20:42, ERROR DicomMetadataDeidentifier : Invalid VR : 'CS' For Action : 'hashId' Unsupported Action.
20:20:42, INFO DicomMetadataDeidentifier : Tag '00080012' Present in Strategy file but missing in Dicom.

## Remove results

In [9]:
%%bash
# Recursively delete the output folder; -f suppresses the error if it is absent.
rm -rf ./deidentified_pdf