![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/tutorials/streamlit_notebooks/ocr/DEID_PDF.ipynb)

# **De-identify PDF Documents**
De-identify text and metadata.

To run this yourself, you will need to upload your **Spark OCR & Spark NLP** license keys to the notebook. Otherwise, you can look at the example outputs at the bottom of the notebook. To upload license keys, open the file explorer on the left side of the screen and upload `workshop_license_keys.json` to the folder that opens.

# 1. Colab Setup

Install the correct version of Pillow and restart the runtime.

In [1]:
# Install correct Pillow version
import PIL
if PIL.__version__  != '6.2.1':
  print ('Installing correct version of Pillow. Kernel will restart automatically')
  !pip install --upgrade pillow==6.2.1
  # hard restart runtime
  import os
  os.kill(os.getpid(), 9)
else:
  print ('Correct Pillow detected')

Correct Pillow detected


Read License Key

In [2]:
import os
import json

# Load the license bundle that was uploaded next to the notebook
with open('workshop_license_keys.json') as keys_file:
    license_keys = json.load(keys_file)

# Secrets are used later for the package index URLs and for starting Spark
secret = license_keys['JSL_OCR_SECRET']
jsl_secret = license_keys['JSL_SECRET']

# Export licenses and AWS credentials as (environment variable, JSON key) pairs.
# JSL_OCR_LICENSE is filled from the same JSON entry as SPARK_OCR_LICENSE.
for env_name, key_name in (
    ('SPARK_OCR_LICENSE', 'SPARK_OCR_LICENSE'),
    ('JSL_OCR_LICENSE', 'SPARK_OCR_LICENSE'),
    ('AWS_ACCESS_KEY_ID', 'AWS_ACCESS_KEY_ID'),
    ('AWS_SECRET_ACCESS_KEY', 'AWS_SECRET_ACCESS_KEY'),
):
    os.environ[env_name] = license_keys[key_name]

# The library version is the part of each secret before the first '-'
version = secret.partition('-')[0]
jsl_version = jsl_secret.partition('-')[0]

print('Spark OCR Version:', version)
print('OCR Version:', version)
print('JSL Version:', jsl_version)

Spark OCR Version: 1.5.0
OCR Version: 1.5.0
JSL Version: 2.5.5


Install Dependencies

In [None]:
# Install Java 8 (OpenJDK) via apt and print the installed version
!apt-get update
!apt-get install -y openjdk-8-jdk
!java -version

# Install pyspark, pinned to 2.4.4
!pip install --ignore-installed -q pyspark==2.4.4
# Install Spark OCR from the John Snow Labs package index; `version` and `secret`
# are the Python variables set in the "Read License Key" cell
!python -m pip install --upgrade spark-ocr==$version  --extra-index-url https://pypi.johnsnowlabs.com/$secret

# Install Spark NLP (public PyPI) and Spark NLP JSL (licensed index, pinned to `jsl_version`)
# NOTE(review): spark-nlp is installed without a version pin -- confirm the latest
# release is compatible with the pinned spark-nlp-jsl and pyspark versions
! pip install --ignore-installed -q spark-nlp
!python -m pip install --upgrade spark-nlp-jsl==$jsl_version --extra-index-url https://pypi.johnsnowlabs.com/$jsl_secret


Importing Libraries

In [4]:
import pandas as pd
import numpy as np
import os

#Pyspark Imports
from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
from pyspark.sql import functions as F

# Necessary imports from Spark OCR library
from sparkocr import start
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import display_image, to_pil_image
from sparkocr.metrics import score
import pkg_resources

# import sparknlp packages
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp_jsl
from sparknlp_jsl.annotator import *

# Point JAVA_HOME at the OpenJDK 8 install and put its bin/ directory first on PATH
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])


Start Spark Session

In [5]:
# Create the Spark session through sparkocr's start(), passing the OCR secret
# together with the Spark NLP JSL secret and version read from the license file
spark = start(
    secret=secret,
    nlp_version=jsl_version,
    nlp_secret=jsl_secret,
    nlp_internal=True,
)

# Display the session object as the cell output
spark

# 2. Download and read PDF Document

In [6]:
# Resolve the path of the sample PDF shipped inside the sparkocr package
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/test_document.pdf',
)

# Load it with Spark's binaryFile source and cache the resulting DataFrame
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)
pdf_example_df.show()

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/usr/local/l...|2020-08-21 17:28:08|693743|[25 50 44 46 2D 3...|
+--------------------+-------------------+------+--------------------+



Convert & View PDF as images

In [10]:
# Convert the PDF to images with a default PdfToImage stage and show each row's image
page_rows = PdfToImage().transform(pdf_example_df).collect()
for page in page_rows:
    display_image(page.image)

Output hidden; open in https://colab.research.google.com to view.

# 3. Construct OCR and DEID (NLP) Pipelines

De-identification Pipeline

In [11]:
def deidentification_nlp_pipeline(input_column, prefix = ""):
    """Build and fit the Spark NLP pipeline that extracts sensitive entity chunks.

    Stages, in order: DocumentAssembler -> SentenceDetector -> Tokenizer ->
    "embeddings_clinical" word embeddings -> "ner_deid_large" NER -> NerConverter.

    Args:
        input_column: Name of the text column the pipeline reads.
        prefix: String prepended to every column name the stages produce
            (for example the final output column is prefix + "ner_chunk").

    Returns:
        The pipeline fitted on a one-row DataFrame holding an empty string.

    Note:
        Uses the notebook-level `spark` session and calls `.pretrained(...)`
        for the embeddings and NER models.
    """
    # Column names passed between stages
    document_col = prefix + "document"
    sentence_col = prefix + "sentence"
    token_col = prefix + "token"
    embeddings_col = prefix + "embeddings"
    ner_col = prefix + "ner"
    chunk_col = prefix + "ner_chunk"

    # Only chunks carrying one of these entity labels are kept by the converter
    kept_labels = ['NAME', 'AGE', 'CONTACT',
                   'LOCATION', 'PROFESSION', 'PERSON']

    stages = [
        DocumentAssembler()
            .setInputCol(input_column)
            .setOutputCol(document_col),
        # Sentence Detector annotator, processes various sentences per line
        SentenceDetector()
            .setInputCols([document_col])
            .setOutputCol(sentence_col),
        Tokenizer()
            .setInputCols([sentence_col])
            .setOutputCol(token_col),
        # Clinical word embeddings
        WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
            .setInputCols([sentence_col, token_col])
            .setOutputCol(embeddings_col),
        # NER model trained on i2b2 (sampled from MIMIC) dataset
        NerDLModel.pretrained("ner_deid_large", "en", "clinical/models")
            .setInputCols([sentence_col, token_col, embeddings_col])
            .setOutputCol(ner_col),
        NerConverter()
            .setInputCols([sentence_col, token_col, ner_col])
            .setOutputCol(chunk_col)
            .setWhiteList(kept_labels),
    ]

    # Fit on a single empty string to turn the Pipeline into a PipelineModel
    placeholder_df = spark.createDataFrame([[""]]).toDF(input_column)
    return Pipeline(stages=stages).fit(placeholder_df)

OCR and PDF to Image Conversion Pipeline.

In [12]:
# If the PDF carries a text layer, extract the text directly (whole document, no page split)
pdf_to_text = (
    PdfToText()
    .setInputCol("content")
    .setOutputCol("text")
    .setSplitPage(False)
)

# If the PDF is image-based, render it to an image
pdf_to_image = (
    PdfToImage()
    .setInputCol("content")
    .setOutputCol("image_raw")
    .setKeepInput(True)
)

# Extract text from the rendered image
ocr = (
    ImageToText()
    .setInputCol("image_raw")
    .setOutputCol("text")
    .setIgnoreResolution(False)
    .setOcrParams(["preserve_interword_spaces=0"])
)

# Find coordinates of the sensitive chunks produced by the NLP pipeline
position_finder = (
    PositionFinder()
    .setInputCols("ner_chunk")
    .setOutputCol("coordinates")
    .setPageMatrixCol("positions")
    .setMatchingWindow(10)
    .setPadding(0)
)

# Draw filled rectangles over those coordinates to hide the sensitive data
draw_regions = (
    ImageDrawRegions()
    .setInputCol("image_raw")
    .setInputRegionsCol("coordinates")
    .setOutputCol("image_with_regions")
    .setFilledRect(True)
)

# Store the masked image back as a PDF
image_to_pdf = (
    ImageToPdf()
    .setInputCol("image_with_regions")
    .setOutputCol("pdf")
)

# Full pipeline: text extraction / OCR -> NER on the "text" column -> masking -> PDF
pipeline = PipelineModel(
    stages=[
        pdf_to_text,
        pdf_to_image,
        ocr,
        deidentification_nlp_pipeline(input_column="text"),
        position_finder,
        draw_regions,
        image_to_pdf,
    ]
)

embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_deid_large download started this may take some time.
Approximate size to download 13.9 MB
[OK!]


# 4. Run the pipelines and save De-identified PDF Document

Run Pipeline

In [14]:
# Apply the OCR + de-identification pipeline to the sample PDF and cache the output
result = (
    pipeline
    .transform(pdf_example_df)
    .cache()
)

Save PDF

In [15]:
# Fetch the "pdf" column of the first result row
pdf = result.select("pdf").head().pdf

# Write it to disk; the context manager closes the file even if write() raises
with open("Result.pdf", "wb") as pdfFile:
    pdfFile.write(pdf)

# 5. Load De-identified PDF and Visualize Results

In [17]:
# Load the saved, de-identified PDF into its own DataFrame. A separate name keeps
# `pdf_example_df` bound to the original document, so re-running the
# "Run Pipeline" cell still processes the source PDF rather than Result.pdf.
deid_pdf_df = spark.read.format("binaryFile").load("Result.pdf")

# Convert the de-identified PDF to images and show each row's image
for image in PdfToImage().transform(deid_pdf_df).collect():
  display_image(image.image)