# Example of using Spark OCR with the DICOM Image Deidentifier

## Import Spark OCR transformers and Spark NLP annotators

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [1]:
# Secrets and license keys (empty placeholders here).
secret = ""
nlp_secret = ""
license = ""
nlp_license = ""

# The version is the part of the secret that precedes the first "-".
version = secret.partition("-")[0]

# Locations of the locally built jars.
spark_ocr_jar_path = "../../target/scala-2.11"
spark_nlp_jsl_jar = "../../target/spark-nlp-jsl-2.5.0rc3.jar"

In [None]:
%%bash
# Only when the google.colab module can be imported: install a headless
# OpenJDK 8 package quietly and print the Java version.
if python -c 'import google.colab' > /dev/null 2>&1; then
    printf '%s\n' "Run on Google Colab!" "Install Open JDK"
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [None]:
import os
import sys
# `display` is imported from its public location; the `IPython.core.display`
# import path is deprecated.
from IPython.display import display

# When running on Google Colab, point JAVA_HOME at the Java 8 OpenJDK directory
# and put its bin directory first on PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

In [None]:
# install from PYPI using secret
%pip install --upgrade spark-nlp-jsl==2.5.0rc3  --extra-index-url https://pypi.johnsnowlabs.com/$nlp_secret
%pip install spark-nlp==2.5.0
%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# %pip install --user ../dist/spark-ocr-[version].tar.gz

## Initialization of spark session
You need to specify either the path to `spark-ocr-assembly.jar` or a `secret`.

In [2]:
from pyspark import SparkConf
from sparkocr import start

# Export the license keys as environment variables when they are provided.
if license:
    os.environ['JSL_OCR_LICENSE'] = license
if nlp_license:
    # Fix: this used to export `license` (the OCR key) under SPARK_NLP_LICENSE.
    os.environ['SPARK_NLP_LICENSE'] = nlp_license

import sparknlp
import sparknlp_jsl
print(sparknlp.version())
print(sparknlp_jsl.version())

# You can set the AWS API keys through environment variables:
# os.environ['AWS_ACCESS_KEY'] = "your key"
# os.environ['AWS_SECRET_ACCESS_KEY'] = "your secret"

# Additional dependencies for reading data from S3
conf = SparkConf() \
    .set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")

# ... or set the AWS API keys on the Spark configuration instead:
#    .set('spark.hadoop.fs.s3a.access.key', "your key" ) \
#    .set('spark.hadoop.fs.s3a.secret.key', "your secret")

# Arguments shared by both start-up modes.
start_args = dict(
    secret=secret,
    jar_path=spark_ocr_jar_path,
    extra_conf=conf,
    nlp_version="2.5.0",
    nlp_internal=True,
)
if not secret:
    # Without a secret, also pass the path of the local Spark NLP JSL jar.
    start_args["jar_nlp_internal_path"] = spark_nlp_jsl_jar

spark = start(**start_args)

spark

SparkConf Configured, Starting to listen on port: 59744
JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar


In [4]:
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from sparkocr.transformers import ImageToDicom, ImageDrawRegions, ImageToText,DicomToImage,PositionFinder
from sparkocr.enums import PageSegmentationMode
from sparkocr.utils import display_image

## Define OCR transformers and pipeline

In [5]:
# DICOM to image: reads "content", writes "image_raw", metadata column "metadata"
dicom_to_image = (
    DicomToImage()
    .setInputCol("content")
    .setOutputCol("image_raw")
    .setMetadataCol("metadata")
)

# Run tesseract OCR
ocr = (
    ImageToText()
    .setInputCol("image_raw")
    .setOutputCol("text")
    .setConfidenceThreshold(60)
    .setLanguage("eng")
    .setIgnoreResolution(False)
    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)
)

# Position finder: input "ner_chunk", page matrix column "positions",
# output "coordinates"
position_finder = (
    PositionFinder()
    .setInputCols("ner_chunk")
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
    .setMatchingWindow(10)
    .setPadding(0)
)

# Draw filled rectangles for the "coordinates" regions onto "image_raw"
drawRegions = (
    ImageDrawRegions()
    .setInputCol("image_raw")
    .setInputRegionsCol("coordinates")
    .setOutputCol("image_with_regions")
    .setFilledRect(True)
)

# Image back to DICOM: reads "image_with_regions" and "metadata", writes "dicom"
imageToDicom = (
    ImageToDicom()
    .setInputCol("image_with_regions")
    .setOutputCol("dicom")
    .setInputMetadata("metadata")
)

from sparknlp_jsl.annotator import *

# Turn the "text" column into "document" annotations
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Sentence Detector annotator, processes various sentences per line
sentenceDetector = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# Tokenize each sentence
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Clinical word embeddings trained on PubMED dataset
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# NER model trained on i2b2 (sampled from MIMIC) dataset
clinical_ner = (
    NerDLModel.pretrained("ner_deid_large", "en", "clinical/models")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

In [None]:
# Pretrained de-identification model: reads "sentence", "token" and
# "ner_chunk", writes "deidentified"
deidentification_rules = (
    DeIdentificationModel.pretrained("deidentify_rb_no_regex", "en", "clinical/models")
    .setInputCols(["sentence", "token", "ner_chunk"])
    .setOutputCol("deidentified")
)


def get_deidentify_model():
    """Build and fit the text de-identification pipeline.

    Chains the notebook-level stages (document assembler, sentence detector,
    tokenizer, word embeddings, NER model) with a NER converter and the
    de-identification model, then fits the pipeline on a one-row DataFrame
    whose "text" column holds an empty string.

    Returns:
        The fitted model returned by ``Pipeline.fit``.
    """
    # .setWhiteList(entity_types) is left disabled on this converter.
    chunk_converter = (
        NerConverter()
        .setInputCols(["sentence", "token", "ner"])
        .setOutputCol("ner_chunk")
    )

    stages = [
        documentAssembler,
        sentenceDetector,
        tokenizer,
        word_embeddings,
        clinical_ner,
        chunk_converter,
        deidentification_rules,
    ]
    deidentify_pipeline = Pipeline(stages=stages)

    # One row with an empty "text" value, used only to fit the pipeline.
    placeholder_df = spark.createDataFrame([[""]]).toDF("text")

    return deidentify_pipeline.fit(placeholder_df)

# Fit the text de-identification stages
deid_model = get_deidentify_model()

# OCR pipeline: the stages in the order they are applied
ocr_stages = [
    dicom_to_image,
    ocr,
    deid_model,
    position_finder,
    drawRegions,
    imageToDicom,
]
pipeline = PipelineModel(stages=ocr_stages)

## Read the DICOM object as a binary file and run the OCR pipeline

In [6]:
%%time
dicom = '././data/dicom/deidentify-medical.DCM'
df=spark.read.format("binaryFile").load(dicom)

## Display Original Image and Dicom metadata

In [None]:
# Show the image and metadata of each distinct row of the original file
de_dicom_df = dicom_to_image.transform(df)
original_rows = de_dicom_df.distinct().collect()
for row in original_rows:
    display_image(row.image_raw)
    display(row.metadata)

# Run the full pipeline and collect the resulting rows
ocr_data = pipeline.transform(df)
pd = ocr_data.collect()



In [7]:
# Write the "dicom" value of the first collected row to a local file
with open("deidentified-medical.DCM", "wb") as out_file:
    first_row = pd[0]
    out_file.write(first_row.dicom)

## Display Deidentified Image and Deidentified metadata

In [None]:
# Load the de-identified file and convert it with metadata de-identification on
de_df = spark.read.format("binaryFile").load("deidentified-medical.DCM")
# NOTE(review): the setter is called on the shared `dicom_to_image` object;
# confirm whether this also changes its behaviour when earlier cells are re-run.
metadata_deid_converter = dicom_to_image.setDeIdentifyMetadata(True)
de_dicom_df = metadata_deid_converter.transform(de_df)

deidentified_rows = de_dicom_df.distinct().collect()
for row in deidentified_rows:
    display_image(row.image_raw)
    display(row.metadata)



In [None]:
%%bash
# Remove the de-identified DICOM file
rm -rf deidentified-medical.DCM