# Save/Load Spark OCR pipeline
## Set credentials and install Spark OCR

In [1]:
# Location of a locally built Spark OCR jar, passed to start() further down.
spark_ocr_jar_path = "../../target/scala-2.11"

# Credentials: fill both in before running the notebook.
secret = ""
license = ""

# The text before the first "-" in the secret is used as the package version to install.
version = secret.partition("-")[0]

In [2]:
%%bash
# Only on Google Colab (i.e. when the google.colab module is importable):
# install a headless OpenJDK 8 quietly and print the Java version.
if python -c 'import google.colab' > /dev/null 2>&1
then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import os
import sys

# On Google Colab, point JAVA_HOME at the OpenJDK 8 location and put its
# bin/ directory at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

In [4]:
# Install spark-ocr from the John Snow Labs package index, using the secret.
# `$version` and `$secret` are filled in from the Python variables of the same
# name; the secret forms the last path segment of the extra index URL.
# The `.spark24` suffix presumably selects the Spark 2.4 build - TODO confirm.
# NOTE(review): the backslash in `$version\.spark24` presumably keeps IPython
# from reading `.spark24` as an attribute of `version` - confirm before editing.
%pip install spark-ocr==$version\.spark24 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# or install from local path
# %pip install --user ../../python/dist/spark-ocr-1.9.0.tar.gz

## Initialize the Spark session

In [6]:
from pyspark import SparkConf
from sparkocr import start

# Export the license through the environment, but only when one was provided.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Extra Spark configuration: pull in the hadoop-aws package
# (presumably required for the s3a:// image path read later - TODO confirm).
conf = SparkConf()
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")

# Start the session with the secret, the local jar path and the extra config;
# the bare `spark` expression displays the session as the cell output.
spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)
spark

SparkConf Configured, Starting to listen on port: 54494
JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar


## Imports

In [7]:
from pyspark.ml import PipelineModel

from sparkocr.transformers import *

## Define path to images on S3

In [8]:
# Glob over the s3a:// scheme that matches every .png file under the input images prefix.
images_path = "s3a://dev.johnsnowlabs.com/ocr/datasets/input/images/*.png"

## Read images

In [9]:
# Load the matched files with the "binaryFile" source and cache the result,
# since the same DataFrame is transformed by both the original and the reloaded pipeline.
images = (
    spark.read
    .format("binaryFile")
    .load(images_path)
    .cache()
)

# Number of files read (also materializes the cache).
images.count()

1

## Define OCR pipeline 

In [10]:
# Stage 1: transform the binary "content" column into an "image" column.
binary_to_image = (
    BinaryToImage()
    .setInputCol("content")
    .setOutputCol("image")
)

# Stage 2: run OCR on the "image" column and write the result to "text".
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setIgnoreResolution(False)
)

# OCR pipeline: both stages are wrapped directly in a PipelineModel,
# so the pipeline is usable (and savable) without a fit step.
pipeline = PipelineModel(stages=[binary_to_image, ocr])

## Run OCR

In [11]:
# Apply the pipeline and print page number, recognized text and confidence.
(
    pipeline
    .transform(images)
    .select("pagenum", "text", "confidence")
    .show()
)

+-------+--------------------+-----------------+
|pagenum|                text|       confidence|
+-------+--------------------+-----------------+
|      0|> Confidential Cl...|84.30319298638238|
+-------+--------------------+-----------------+



## Save the pipeline to disk

In [12]:
# Persist the pipeline to the relative path "ocr_model"; overwrite() replaces
# whatever a previous run left at that path.
pipeline.write().overwrite().save("ocr_model")

## Load back the model pipeline

In [13]:
# Rebuild the pipeline from the "ocr_model" path written by the save step.
stored_pipeline = PipelineModel.load("ocr_model")

## Run loaded pipeline

In [14]:
# Apply the reloaded pipeline to the same images; the printed table should
# match the one produced by the original pipeline.
(
    stored_pipeline
    .transform(images)
    .select("pagenum", "text", "confidence")
    .show()
)

+-------+--------------------+-----------------+
|pagenum|                text|       confidence|
+-------+--------------------+-----------------+
|      0|> Confidential Cl...|84.30319298638238|
+-------+--------------------+-----------------+



In [2]:
%%bash
# Remove the saved pipeline directory; -f keeps this quiet if it does not exist.
rm -rf ./ocr_model