# Example of using Spark OCR with the DICOM Image Deidentifier

## Install spark-ocr python package
You need to specify:
- `license`
- the path to `spark-ocr-assembly-[version].jar` and `spark-nlp-jsl-[version]`
- or `secret` for Spark OCR and `nlp_secret` for Spark NLP Internal
- `aws_access_key` and `aws_secret_key` for downloading pretrained models

In [1]:
# Credentials: fill these in before running; the empty strings are placeholders.
license = ""
secret = ""            # secret for Spark OCR
nlp_secret = ""        # secret for Spark NLP Internal
aws_access_key = ""
aws_secret_key = ""

# The version is the part of the Spark OCR secret that precedes the first "-".
version = secret.partition("-")[0]
# Passed to start() as jar_path.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Only when the google.colab module is importable: install a headless OpenJDK 8
# and print the Java version that ends up on PATH.
if python -c 'import google.colab' > /dev/null 2>&1; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import os
import sys

# On Google Colab, point JAVA_HOME at the Java 8 JDK and put its bin/ directory
# at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "".join([os.environ["JAVA_HOME"], "/bin:", os.environ["PATH"]])

In [4]:
# install from PYPI using secret
%pip install --upgrade spark-nlp-jsl==2.5.0  --extra-index-url https://pypi.johnsnowlabs.com/$nlp_secret
%pip install spark-nlp==2.5.3
#%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

Looking in indexes: https://pypi.org/simple, https://pypi.johnsnowlabs.com/<nlp_secret>
Requirement already up-to-date: spark-nlp-jsl==2.5.0 in c:\users\pc\appdata\local\programs\python\python37\lib\site-packages (2.5.0)
Collecting spark-nlp==2.5.0
  Using cached spark_nlp-2.5.0-py2.py3-none-any.whl (120 kB)
Installing collected packages: spark-nlp
  Attempting uninstall: spark-nlp
    Found existing installation: spark-nlp 2.5.3
    Uninstalling spark-nlp-2.5.3:
      Successfully uninstalled spark-nlp-2.5.3
Successfully installed spark-nlp-2.5.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\pc\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


Collecting spark-nlp==2.5.3
  Using cached spark_nlp-2.5.3-py2.py3-none-any.whl (123 kB)
Installing collected packages: spark-nlp
  Attempting uninstall: spark-nlp
    Found existing installation: spark-nlp 2.5.0
    Uninstalling spark-nlp-2.5.0:
      Successfully uninstalled spark-nlp-2.5.0
Successfully installed spark-nlp-2.5.3
Note: you may need to restart the kernel to use updated packages.


ERROR: spark-nlp-jsl 2.5.0 has requirement spark-nlp==2.5.0, but you'll have spark-nlp 2.5.3 which is incompatible.
You should consider upgrading via the 'c:\users\pc\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


In [5]:
# Install Spark OCR 1.5.0 for the current user from the source distribution in ../../python/dist.
%pip install --user ../../python/dist/spark-ocr-1.5.0.tar.gz

Processing e:\ideaproject\spark-ocr-master\python\dist\spark-ocr-1.5.0.tar.gz
Building wheels for collected packages: spark-ocr
  Building wheel for spark-ocr (setup.py): started
  Building wheel for spark-ocr (setup.py): finished with status 'done'
  Created wheel for spark-ocr: filename=spark_ocr-1.5.0-py3-none-any.whl size=7211429 sha256=990de7d731ebe5cfd6c37a45f96e94ae207558b8dc6c7627d2f97853adab7f6e
  Stored in directory: c:\users\pc\appdata\local\pip\cache\wheels\e5\4d\42\9b9d11a5c5ce2bcae6da03ec982593f1c2baef04fb159121c5
Successfully built spark-ocr
Installing collected packages: spark-ocr
  Attempting uninstall: spark-ocr
    Found existing installation: spark-ocr 1.5.0
    Uninstalling spark-ocr-1.5.0:
      Successfully uninstalled spark-ocr-1.5.0
Successfully installed spark-ocr-1.5.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\pc\appdata\local\programs\python\python37\python.exe -m pip install --upgrade pip' command.


## Initialization of spark session

In [6]:
from sparkocr import start

# Export the license under both variable names, but only when one was provided.
if license:
    os.environ.update({
        'JSL_OCR_LICENSE': license,
        'SPARK_NLP_LICENSE': license,
    })

# Export the AWS credentials only when an access key was provided.
if aws_access_key:
    os.environ.update({
        'AWS_ACCESS_KEY': aws_access_key,
        'AWS_SECRET_ACCESS_KEY': aws_secret_key,
    })

# Start the session with the secrets, the jar location and Spark NLP Internal 2.5.0.
spark = start(
    secret=secret,
    nlp_secret=nlp_secret,
    jar_path=spark_ocr_jar_path,
    nlp_version="2.5.0",
    nlp_internal=True,
)

spark

## Import Spark OCR transformers and Spark NLP annotators

In [7]:
# `sparknlp` is imported explicitly because sparknlp.version() is called below;
# relying on a wildcard import to leak the package name is fragile.
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp_jsl
from sparknlp_jsl.annotator import *

import sparkocr
from sparkocr.transformers import *
from sparkocr.utils import display_image
from pyspark.sql.functions import  lit

# Print the versions of the three libraries that were actually imported.
print(f"Spark NLP version: {sparknlp.version()}")
print(f"Spark NLP internal version: {sparknlp_jsl.version()}")
print(f"Spark OCR version: {sparkocr.version()}")

Spark NLP version: 2.5.3
Spark NLP internal version: 2.5.0
Spark OCR version: 1.5.0


## Define de-identification NLP pipeline

In [8]:
def deidentification_nlp_pipeline(input_column, prefix=""):
    """Build and fit the NLP pipeline used for de-identification.

    The pipeline chains, in this order: DocumentAssembler, SentenceDetector,
    Tokenizer, the pretrained "embeddings_clinical" word embeddings, the
    pretrained "ner_deid_large" NER model and a NerConverter with a white list
    of entity labels. Every output column name is ``prefix`` followed by
    "document", "sentence", "token", "embeddings", "ner" or "ner_chunk".

    Uses the notebook-level ``spark`` session to create the DataFrame the
    pipeline is fitted on.

    :param input_column: name of the column read by the DocumentAssembler.
    :param prefix: string prepended to every output column name.
    :return: the result of fitting the pipeline on a one-row DataFrame whose
        only column, ``input_column``, holds an empty string.
    """
    def col(name):
        # Output/input column name for this pipeline instance.
        return prefix + name

    stages = [
        DocumentAssembler()
            .setInputCol(input_column)
            .setOutputCol(col("document")),

        # Sentence Detector annotator, processes various sentences per line
        SentenceDetector()
            .setInputCols([col("document")])
            .setOutputCol(col("sentence")),

        Tokenizer()
            .setInputCols([col("sentence")])
            .setOutputCol(col("token")),

        # Clinical word embeddings
        WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
            .setInputCols([col("sentence"), col("token")])
            .setOutputCol(col("embeddings")),

        # NER model trained on i2b2 (sampled from MIMIC) dataset
        NerDLModel.pretrained("ner_deid_large", "en", "clinical/models")
            .setInputCols([col("sentence"), col("token"), col("embeddings")])
            .setOutputCol(col("ner")),

        # Only chunks whose label is in the white list are passed to setWhiteList
        NerConverter()
            .setInputCols([col("sentence"), col("token"), col("ner")])
            .setOutputCol(col("ner_chunk"))
            .setWhiteList(['NAME', 'AGE', 'CONTACT',
                           'LOCATION', 'PROFESSION', 'PERSON']),
    ]

    # Fit on a one-row DataFrame containing an empty string in `input_column`.
    bootstrap_df = spark.createDataFrame([[""]]).toDF(input_column)
    return Pipeline(stages=stages).fit(bootstrap_df)

## Define OCR transformers and pipeline

In [9]:
# Extract the image from the DICOM content; metadata goes to the "metadata" column
dicom_to_image = (
    DicomToImage()
    .setInputCol("content")
    .setOutputCol("image_raw")
    .setMetadataCol("metadata")
    .setDeIdentifyMetadata(False)
)

# Extract text from the image
ocr = (
    ImageToText()
    .setInputCol("image_raw")
    .setOutputCol("text")
    .setIgnoreResolution(False)
    .setOcrParams(["preserve_interword_spaces=0"])
)

# Find sensitive data in the metadata annotations using DeIdentificationModel
deidentification_rules = (
    DeIdentificationModel.pretrained("deidentify_rb_no_regex", "en", "clinical/models")
    .setInputCols(["metadata_sentence", "metadata_token", "metadata_ner_chunk"])
    .setOutputCol("deidentified_metadata_raw")
)

# Turn "deidentified_metadata_raw" into "deidentified_metadata"
# (non-array output, empty value/annotation split symbols)
finisher = (
    Finisher()
    .setInputCols(["deidentified_metadata_raw"])
    .setOutputCols("deidentified_metadata")
    .setOutputAsArray(False)
    .setValueSplitSymbol("")
    .setAnnotationSplitSymbol("")
)

# Find the coordinates of sensitive data
position_finder = (
    PositionFinder()
    .setInputCols("ner_chunk")
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
    .setMatchingWindow(10)
    .setPadding(0)
)

# Draw filled rectangles to hide sensitive data
drawRegions = (
    ImageDrawRegions()
    .setInputCol("image_raw")
    .setInputRegionsCol("coordinates")
    .setOutputCol("image_with_regions")
    .setFilledRect(True)
)

# Store the image back into a DICOM document
imageToDicom = (
    ImageToDicom()
    .setInputCol("image_with_regions")
    .setOutputCol("dicom")
    .setInputMetadata("deidentified_metadata")
)

# OCR pipeline: the NLP pipeline is fitted twice, once on the OCR text
# (unprefixed columns) and once on the metadata ("metadata_" prefixed columns)
pipeline = Pipeline(stages=[
    dicom_to_image,
    ocr,
    deidentification_nlp_pipeline(input_column="text"),
    deidentification_nlp_pipeline(input_column="metadata", prefix="metadata_"),
    deidentification_rules,
    position_finder,
    drawRegions,
    finisher,
    imageToDicom,
])

## Read DICOM objects as binary files and display metadata


### Useful Patient Identification Tags to check
 - (00100010)	PN	Patient Name
 - (00100020)	LO	Patient ID
 - (00100030)	DA	Patient's Birth Date
 - (00100040)	CS	Patient's Sex
 - (00101010)	AS	Patient's Age
 - (00101030)	DS	Patient's Weight

In [10]:
dicom_path = '././patient_data/*.dcm'

# Load the files in binary form and add a "text" column holding an empty string.
dicom_df = (
    spark.read.format("binaryFile")
    .load(dicom_path)
    .withColumn("text", lit(""))
)

# Convert each file with a default DicomToImage and print its exception and metadata.
for row in DicomToImage().transform(dicom_df).collect():
    print(row.exception)
    print(row.metadata)
    #display_image(row.image)
   

None

{
    "00020001": {
        "vr": "OB",
        "InlineBinary": "AAE="
    },
    "00020002": {
        "vr": "UI",
        "Value": [
            "1.2.840.10008.5.1.4.1.1.4"
        ]
    },
    "00020003": {
        "vr": "UI",
        "Value": [
            "1.3.12.2.1107.5.2.30.26137.3.2011101312051826650713142"
        ]
    },
    "00020010": {
        "vr": "UI",
        "Value": [
            "1.2.840.10008.1.2.1"
        ]
    },
    "00020012": {
        "vr": "UI",
        "Value": [
            "1.2.276.0.7230010.3.0.3.6.0"
        ]
    },
    "00020013": {
        "vr": "SH",
        "Value": [
            "OFFIS_DCMTK_360"
        ]
    },
    "00080005": {
        "vr": "CS",
        "Value": [
            "ISO_IR 100"
        ]
    },
    "00080008": {
        "vr": "CS",
        "Value": [
            "ORIGINAL",
            "PRIMARY",
            "M",
            "ND",
            "NORM"
        ]
    },
    "00080012": {
        "vr": "DA",
        "Value": [


## Run pipeline and store result

In [11]:
#dicom_to_image.setDeIdentifyMetadata(True)
# Fit and run the whole pipeline; the result is cached before it is collected below.
de_dicom_df = pipeline.fit(dicom_df).transform(dicom_df).cache()
output_path = "././data/patients/dicom/deidentified/"

# exist_ok=True replaces the separate exists() check, so re-running the cell
# (or a concurrent creation of the directory) does not fail.
os.makedirs(output_path, exist_ok=True)

# Write each "dicom" value to <output_path>/<source file name without extension>.dcm
for r in de_dicom_df.select("dicom", "path").collect():
    stem = os.path.splitext(os.path.basename(r.path))[0]
    file_name = os.path.join(output_path, stem + ".dcm")
    print(f"Storing to {file_name}")
    with open(file_name, "wb") as file:
        file.write(r.dicom)

IllegalArgumentException: 'requirement failed: because there is no column with name ner_chunk and an appropriate Annotation schema'

## Display Deidentified Image and Deidentified metadata

In [None]:
# Read the stored .dcm files back and convert them with a default DicomToImage.
dicom_gen_df = (
    spark.read.format("binaryFile")
    .load("././data/patients/dicom/deidentified/*.dcm")
)
de_dicom_gen_df = DicomToImage().transform(dicom_gen_df)

# Print the metadata of every file; display the image only when one is present.
for row in de_dicom_gen_df.select("image", "metadata").collect():
    print(row.metadata)
    if not row.image:
        continue
    display_image(row.image)

## Remove results

In [None]:
%%bash
# Remove the directory the de-identified DICOM files are written to
# (././data/patients/dicom/deidentified); the previous target "deidentified2"
# is not created anywhere in this notebook.
rm -r -f ././data/patients/dicom/deidentified
