# Example of Spark OCR usage

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [1]:
# Fill these in before running the notebook; both are empty placeholders here.
secret = ""
license = ""

# The package version is the part of the secret before its first "-".
version = secret.partition("-")[0]

# Directory passed to `start(...)` as `jar_path` when initializing the session.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Only when the `google.colab` module is importable (i.e. running on Google
# Colab): install a headless OpenJDK 8 and print the Java version.
if python -c 'import google.colab' &> /dev/null; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # -qq plus the stdout redirect keep apt-get's output out of the cell.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its
# bin directory at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

In [None]:
# install from PyPI using secret
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# or install from local path
# %pip install ../../python/dist/spark-ocr-3.6.0+spark30.tar.gz

## Initialize the Spark session

In [4]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable, but
# only when a non-empty license string was filled in above.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Start the Spark session from the secret and the local jar path.
spark = start(secret=secret, jar_path=spark_ocr_jar_path)
# Bare last expression so the notebook displays the session object.
spark

Spark version: 2.4.4
Spark NLP version: 2.5.5
Spark OCR version: 3.1.0



## Import OCR transformers

In [5]:
from sparkocr.transformers import *
from pyspark.ml import PipelineModel

## Define OCR transformers and pipeline

In [6]:
def pipeline():
    """Assemble the two-stage OCR pipeline used in this notebook.

    The first stage (``PdfToImage``) reads the ``content`` column and writes
    the ``image`` column; the second stage (``ImageToText``) reads ``image``
    and writes ``text``, with its confidence threshold set to 65.

    Returns:
        A ``PipelineModel`` whose stages are the two transformers, in order.
    """
    # Stage 1: transform the PDF document into images, one per page.
    to_image = PdfToImage()
    to_image.setInputCol("content")
    to_image.setOutputCol("image")

    # Stage 2: run OCR on the page images.
    recognizer = ImageToText()
    recognizer.setInputCol("image")
    recognizer.setOutputCol("text")
    recognizer.setConfidenceThreshold(65)

    stages = [to_image, recognizer]
    return PipelineModel(stages=stages)

## Read PDF document as binary file

In [7]:
import pkg_resources
# Resolve the filesystem path of the sample PDF shipped inside the `sparkocr` package.
pdf_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/test_document.pdf')
# Read the file with Spark's "binaryFile" format and cache the resulting DataFrame.
pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache()

## Run OCR pipelines

In [8]:
# Build the pipeline and apply it to the PDF DataFrame; the result is cached
# because the cells below query it more than once.
result = pipeline().transform(pdf_example_df).cache()

IllegalArgumentException: 'requirement failed: License Key not set please set environment variable JSL_OCR_LICENSE or property jsl.sparkocr.settings.license!'

## Display results

In [7]:
# Show the page number, recognized text and confidence columns of the result.
result.select("pagenum","text", "confidence").show()

+-------+--------------------+-----------------+
|pagenum|                text|       confidence|
+-------+--------------------+-----------------+
|      0|Patient Nam
Finan...|80.66660189628601|
|      1|Random Hospital

...|69.66282038534841|
+-------+--------------------+-----------------+



### Display recognized text

In [8]:
# Collect the "text" column and print the rows' text joined by newlines.
print("\n".join(row.text for row in result.select("text").collect()))

Patient Nam
Financial Numbe

Random Hospital Date of Birth

Patient Location

  
   

| H & P |
Chief Complaint Arincitis | |
Shortness of breath CHF - Congestive heart failure
Chronic kidney disease
History of Present Illness Chronic venous insufficiency
Edema
a . . | GI bleeding
Patient is an 64-year-old male wilh a past medical history of hypertension, HFpEF last Glaucoma
known EF 55%, mild to moderate TA, pulmonary hypertension, permanent atrial Gout
fibrillation on Eliquis, history of GI blesd, CK-M8, and anemia who presents with full weeks vpertension
oi ccneralized fatigue and fecling unwell. He also notes some shortness oi Breath and Peptic ulcer
worsening dyspnea will) minimal exerlion. His major comp!aints are shoulder and join nha TT, a“
ng 2791 : Pe CONC eee en Peripheral ncusopath,
pains. diffuscly. He also complains of "bone pain’. He denics having any fevers or chills. Peripheral vascular disease
e demes having any chest pain, palpitalicns, He denies any worse extremity 

## Clear cache

In [10]:
# Release both DataFrames that were cached earlier in the notebook: the input
# PDF DataFrame and the OCR result. `result.unpersist()` stays last so the
# value displayed by this cell is unchanged.
pdf_example_df.unpersist()
result.unpersist()

DataFrame[path: string, modificationTime: timestamp, length: bigint, image: struct<origin:string,height:int,width:int,nChannels:int,mode:int,resolution:int,data:binary>, pagenum: int, confidence: double, positions: array<struct<mapping:array<struct<c:string,p:int,x:float,y:float,width:float,height:float,fontSize:int>>>>, exception: string, text: string]