# Save Image Objects to S3 using Spark OCR
## Initialize spark session

In [None]:
# Credentials for Spark OCR; fill both in before running the notebook.
secret = ""
license = ""

# The package version is the part of the secret that precedes the first "-".
version = secret.partition("-")[0]

# Directory passed to the Spark OCR session as `jar_path`.
spark_ocr_jar_path = "../../target/scala-2.11"

In [None]:
%%bash
# When the google.colab module can be imported we are running on Google Colab:
# install a headless OpenJDK 8 quietly and print the resulting Java version.
if python -c 'import google.colab' > /dev/null 2>&1; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    apt-get install -qq -y openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [None]:
import os
import sys

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its
# bin directory at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(
        os.environ["JAVA_HOME"], os.environ["PATH"]
    )

In [None]:
# Install spark-ocr from the John Snow Labs PyPI index: `$secret` completes the
# extra index URL and `$version` pins the release to install.
%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

## Initialization of spark session

In [None]:

from pyspark import SparkConf
from sparkocr import start

# Export the Spark OCR license to the environment only when one was provided.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# AWS API keys can be supplied through environment variables:
# os.environ['AWS_ACCESS_KEY_ID'] = "your key"
# os.environ['AWS_SECRET_ACCESS_KEY'] = "your secret"

# Additional dependency needed to read data from S3.
conf = SparkConf()
conf.set("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.3")
# Alternatively, the AWS API keys can be set on the Spark configuration:
# conf.set('spark.hadoop.fs.s3a.access.key', "your key")
# conf.set('spark.hadoop.fs.s3a.secret.key', "your secret")

# Start the Spark OCR session with the extra configuration, then display it.
spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)
spark

## Imports

In [55]:
from pyspark.ml import PipelineModel

from sparkocr.transformers import *

## Define paths to images on S3

In [56]:
# Glob (s3a scheme) matching every PNG file under the input images prefix.
images_path = (
    "s3a://dev.johnsnowlabs.com/ocr/datasets/input/images/"
    "*.png"
)

## Read images

In [57]:
# Load every file matching `images_path` through the binaryFile data source
# and mark the resulting DataFrame for caching.
images = (
    spark.read
    .format("binaryFile")
    .load(images_path)
    .cache()
)
# The row count is shown as the cell output.
images.count()

## Define OCR pipeline 

In [58]:
# `Pipeline` is the estimator that exposes fit(); `PipelineModel` is an
# already-fitted transformer and has no fit() method. Imported here so this
# cell does not depend on the imports cell being edited as well.
from pyspark.ml import Pipeline

# Transform binary file content into an image column
binary_to_image = BinaryToImage()
binary_to_image.setInputCol("content")
binary_to_image.setOutputCol("image")

# Run Tesseract OCR on the image column and write the result to "text"
ocr = TesseractOcr()
ocr.setInputCol("image")
ocr.setOutputCol("text")
ocr.setIgnoreResolution(False)

# Unfitted OCR pipeline: fit() on it yields the PipelineModel used for
# transform(), and the pipeline itself can be saved in its unfitted form.
pipeline = Pipeline(stages=[
    binary_to_image,
    ocr,
])

## Fit the pipeline to training images.

In [59]:
# Fit the pipeline on the loaded images to obtain the model used for OCR.
# NOTE(review): fit() is provided by pyspark.ml.Pipeline (an Estimator), not by
# PipelineModel -- confirm `pipeline` is constructed as a Pipeline.
model = pipeline.fit(images)

## Run OCR

In [60]:
# Apply the fitted pipeline to the images; the OCR output is kept in `results`.
results = model.transform(images)

+-------+--------------------+-----------------+
|pagenum|                text|       confidence|
+-------+--------------------+-----------------+
|      0|FOREWORD

Electro...|95.88622707790799|
+-------+--------------------+-----------------+



## Save the fitted pipeline to disk

In [None]:
# Persist the fitted model under "ocr_model", replacing any existing copy.
(
    model
    .write()
    .overwrite()
    .save("ocr_model")
)

## Save the unfitted pipeline to disk

In [None]:
# Persist the pipeline definition under "unfit_ocr_model", replacing any
# existing copy.
(
    pipeline
    .write()
    .overwrite()
    .save("unfit_ocr_model")
)

## Load the fitted pipeline model back from disk

In [None]:
# Reload the model previously saved under "ocr_model" as a PipelineModel.
sameModel = PipelineModel.load("ocr_model")
