# Example of building REST API on top of Spark OCR

In [1]:
secret = ""
license = ""


version = secret.split("-")[0]
spark_ocr_jar_path = "../../target/scala-2.11"
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""


import os

SERVER_HOST = "localhost"
SERVER_PORT = 8889
SERVER_API_NAME = "spark_ocr_api"

if AWS_ACCESS_KEY_ID != "":
    os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
    os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY
    
if license:
    os.environ['JSL_OCR_LICENSE'] = license

In [10]:
#!pip install synapseml
#!pip install --upgrade spark-ocr==4.0.0+spark32 --user --extra-index-url https://pypi.johnsnowlabs.com/$secret --upgrade

## Start Spark session

In [3]:
from sparkocr import start
from pyspark import SparkConf

conf = SparkConf() \
   .set("spark.jars.packages", "com.microsoft.azure:synapseml_2.12:0.9.4") \
   .set("spark.jars.repositories", "https://repo1.maven.org/maven2,https://mmlspark.azureedge.net/maven")

spark = start(secret = secret, extra_conf=conf)
spark

Spark version: 3.2.1
Spark NLP version: 4.0.0
Spark OCR version: 4.0.0



## Define Spark OCR pipeline

In [4]:
import synapse.ml
from synapse.ml.io import *
import pyspark
import tempfile
from pyspark.sql.functions import udf, col, length
from pyspark.sql.types import *
from pyspark.ml import PipelineModel

import pyspark.sql.functions as f
from sparkocr.transformers import *


pdf_to_image = PdfToImage() \
    .setOutputCol("image")

ocr = ImageToText() \
    .setInputCol("image")\
    .setOutputCol("text")

pipeline = PipelineModel(stages=[
    pdf_to_image,
    ocr
])


## Start server

In [5]:
checkpoint_dir = tempfile.TemporaryDirectory("_spark_ocr_server_checkpoint")

df = spark.readStream.server() \
    .address(SERVER_HOST, SERVER_PORT, SERVER_API_NAME) \
    .load() \
    .parseRequest(SERVER_API_NAME, schema=StructType().add("image", BinaryType())) \
    .withColumn("path", f.lit("")) \
    .withColumnRenamed("image", "content")

replies = pipeline.transform(df)\
    .makeReply("text") 

server = replies\
    .writeStream \
    .server() \
    .replyTo(SERVER_API_NAME) \
    .queryName("spark_ocr") \
    .option("checkpointLocation", checkpoint_dir) \
    .start()

print(f"Checkpoint: {checkpoint_dir}")



Checkpoint: <TemporaryDirectory '/tmp/tmpejopbze7_spark_ocr_server_checkpoint'>


# Call API

Display image

In [7]:
import pkg_resources
import json
import base64
import requests
from IPython.display import Image

imagePath = pkg_resources.resource_filename('sparkocr', '/resources/ocr/pdfs/test_document.pdf')

#display(Image(filename=imagePath))

## Send request

In [8]:
with open(imagePath, "rb") as image_file:
    im_bytes = image_file.read() 

im_b64 = base64.b64encode(im_bytes).decode("utf8")

headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}
  
payload = json.dumps({"image": im_b64})
r = requests.post(data=payload, headers=headers, url=f"http://{SERVER_HOST}:{SERVER_PORT}/{SERVER_API_NAME}")

print("Response:\n\n{}".format(r.text))

Response:

Patient Nam
Financial Numbe

Random Hospital Date of Birth

Patient Location

t
ere st eee - eR eo ee. - ee 8 ee et —— oe

Chief Complaint
Shortness of breath

History of Present Illness

Patient is an 84-year-old male wilh a past medical history of hypertension, HFpEF last
known EF 55%, mild to moderate TA, pulmonary hypertension, permanent atrial
fibrillation on Eliquis, history of GI blesd, CK-M8, and anemia who presents with full weeks
oi ccneralized fatigue and fecling unwell. He also notes some shortness oi Breath and
worsening dyspnea willy minimal exerlion. His major complaints are shoulder and joint
pains. diffusely. He also complains of "bone pain’. He denics having any fevers or cnills.

e demes having any chest pain, palpitalicns, He denies any worse extremity
swelling than his baseline. He states he’s been compliant with his mcdications. Although
he stales he ran out of his Eliquis & few weeks ago. He denies having any blood in his
stools or mc!ena, although he 

## Stop server

In [9]:
server.stop()