# Example of using Spark OCR for Signature Detection

## Install the spark-ocr Python package
You need to specify:
- secret
- license
- aws credentials

In [None]:
# --- User-supplied settings: fill these in before running the notebook ---
# Secret used for the package index URL and passed to the session start call.
secret = ""
# Spark OCR license; exported to the environment later when non-empty.
license = ""
# AWS credentials; exported to the environment later when the key id is set.
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""

# The package version is whatever precedes the first "-" in the secret
# (the whole secret when it contains no "-").
version = secret.partition("-")[0]

# Directory handed to the session start call as the jar location.
spark_ocr_jar_path = "../../target/scala-2.11"
# Sample image used throughout this notebook.
imagePath = "./data/signature/LIL18369-Lease_Z-1.jpg"

In [None]:
%%bash
# Colab-only setup: the probe below succeeds only when the `google.colab`
# module can be imported; its stdout and stderr are discarded.
if python -c 'import google.colab' &> /dev/null; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # Install the headless OpenJDK 8 package quietly (-qq, stdout discarded).
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    # Print the Java version so the install can be checked in the cell output.
    java -version
fi

In [None]:
# install from local package 
#%pip install ../../python/dist/spark-ocr-3.2.0.spark30.tar.gz

In [None]:
# install from PYPI using secret
#%pip install spark-ocr==$version\.spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
import os
import sys

# Export the AWS credential pair only when an access key id was provided.
if AWS_ACCESS_KEY_ID != "":
    os.environ.update({
        "AWS_ACCESS_KEY_ID": AWS_ACCESS_KEY_ID,
        "AWS_SECRET_ACCESS_KEY": AWS_SECRET_ACCESS_KEY,
    })

# Export the license only when one was provided.
if license:
    os.environ["JSL_OCR_LICENSE"] = license

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its
# bin directory at the front of PATH.
if "google.colab" in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = f"{os.environ['JAVA_HOME']}/bin:{os.environ['PATH']}"

## Initialization of the Spark session
You need to specify either the path to `spark-ocr-assembly.jar` or the `secret`

In [None]:
from pyspark import SparkConf
from sparkocr import start

# Start the Spark session, passing both the secret and the local jar path.
spark = start(
    secret=secret,
    jar_path=spark_ocr_jar_path,
    nlp_version="3.0.0",  # "3.0.3"
)

# Leave the session as the cell's last expression so the notebook renders it.
spark

In [None]:
# Set the log level of the session's Spark context to "INFO".
spark.sparkContext.setLogLevel("INFO")

## Read images and display them

In [None]:
from pyspark.ml import PipelineModel
import pyspark.sql.functions as f
from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import display_images

# Load the sample image through Spark's "binaryFile" data source.
image_df = (
    spark.read
    .format("binaryFile")
    .load(imagePath)
)

# Convert the binary content with BinaryToImage and display the "image" column.
display_images(
    BinaryToImage().transform(image_df),
    "image",
)

## Define OCR Pipeline

In [None]:
# Stage 1: BinaryToImage, with its image type set to 3-byte BGR.
# NOTE(review): the original trailing comment read "imageType=5)"; it looks
# like a leftover constructor argument replaced by the explicit setter call
# below - confirm.
binary_to_image = BinaryToImage()
binary_to_image.setImageType(ImageType.TYPE_3BYTE_BGR)

# Stage 2: signature detector backed by a pretrained model.
# The tuple is unpacked into pretrained(); presumably it is
# (model name, language, remote location) - TODO confirm.
pretrained_model = ("image_signature_detector_gsa0611", "en", "public/ocr/models")
signature_detector = ImageSignatureDetector()
# NOTE(review): the return value of pretrained() is discarded, so this relies
# on pretrained() configuring `signature_detector` in place. Confirm against
# the Spark OCR API; if it returns a new instance, that result must be
# assigned and used instead.
signature_detector.pretrained(*pretrained_model)
# The detector reads the "image" column and writes "signature_regions".
signature_detector.setInputCol("image")
signature_detector.setOutputCol("signature_regions")

# Assemble the two stages, in order, into a PipelineModel.
pipeline = PipelineModel(stages=[
    binary_to_image,
    signature_detector,
])

## Run pipeline and show results

In [None]:
# Apply the pipeline to the loaded image DataFrame.
result =  pipeline.transform(image_df)

In [None]:
# Print the schema of the pipeline output to inspect its columns.
result.printSchema()

In [None]:
#result.show(truncate=False)

In [None]:
# Explode "signature_regions.coordinates" into one "coordinate" row per
# element, keep only that column, and drop null entries.
result = (
    result
    .withColumn("coordinate", f.explode(f.col("signature_regions.coordinates")))
    .select("coordinate")
    .where(f.col("coordinate").isNotNull())
)

In [None]:
# Sanity check: `result` must contain at least one row.
assert result.count() > 0

In [None]:
# Show the rows of `result` without truncating long cell values.
result.show(truncate=False)