# Example of using Spark OCR to recognize text and store the results as hOCR

## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Directory passed to `start()` as `jar_path`
# (presumably where the locally built spark-ocr assembly jar lives).
spark_ocr_jar_path = "../../target/scala-2.11"

# Credentials: fill these in before running. `license` is optional and is only
# exported to the environment when it is non-empty.
secret = ""
license = ""

# Everything before the first "-" of the secret is used as the package version.
version = secret.partition("-")[0]

In [2]:
%%bash
# Install a JDK only when running on Google Colab.
# The probe imports `google.colab`; it succeeds only where that module is
# installed, and both stdout and stderr of the probe are discarded.
if python -c 'import google.colab' &> /dev/null; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # Non-interactive, quiet install; stdout is discarded to keep the cell output short.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    # Print the version of the `java` binary found on PATH.
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, point JAVA_HOME at the OpenJDK 8 location and put its
# bin/ directory at the front of PATH.
if 'google.colab' in sys.modules:
    java_home = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["JAVA_HOME"] = java_home
    os.environ["PATH"] = java_home + "/bin:" + os.environ["PATH"]

In [None]:
# install from PYPI using secret
%pip install spark-ocr==$version.spark24 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# or install from local path
# %pip install ../../python/dist/spark-ocr-1.9.0.spark24.tar.gz

## Initialization of spark session

In [4]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable,
# but only when one was actually provided.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Start the Spark session with the configured secret and jar location.
spark = start(
    secret=secret,
    jar_path=spark_ocr_jar_path,
)

# Leave the session as the last expression so the notebook renders it.
spark

Spark version: 2.4.4
Spark NLP version: 2.5.5
Spark OCR version: 1.9.0rc1



## Import OCR transformers

In [5]:
from sparkocr.transformers import *
from pyspark.ml import PipelineModel

## Define OCR transformers and pipeline

In [6]:
def pipeline():
    """Build the PDF-to-hOCR pipeline.

    Returns:
        A ``PipelineModel`` with two stages, in order: ``PdfToImage``
        (column ``content`` -> ``image``) and ``ImageToHocr``
        (column ``image`` -> ``hocr``).
    """
    # Stage 1: transform the PDF document into images per page.
    to_image = PdfToImage()
    to_image.setInputCol("content")
    to_image.setOutputCol("image")

    # Stage 2: run OCR on each image and emit hOCR.
    to_hocr = ImageToHocr()
    to_hocr.setInputCol("image")
    to_hocr.setOutputCol("hocr")
    to_hocr.setIgnoreResolution(False)

    stages = [to_image, to_hocr]
    return PipelineModel(stages=stages)

## Read PDF document as binary file

In [7]:
import pkg_resources
# Resolve the path of the sample PDF inside the installed `sparkocr` package.
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/test_document.pdf',
)

# Load it with Spark's "binaryFile" source and cache the resulting DataFrame.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [8]:
# Build the pipeline, apply it to the PDF DataFrame and cache the output.
result = (
    pipeline()
    .transform(pdf_example_df)
    .cache()
)

## Display results

In [9]:
# Preview the page number and the (truncated) hOCR markup of each row.
(
    result
    .select("pagenum", "hocr")
    .show()
)

+-------+--------------------+
|pagenum|                hocr|
+-------+--------------------+
|      0|  <div class='ocr...|
|      1|  <div class='ocr...|
+-------+--------------------+



### Display hOCR as HTML

In [10]:
# `display` and `HTML` are part of the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Render the hOCR markup of the first result row as HTML.
# `first()` fetches a single row instead of collecting every page to the driver.
display(HTML(result.select("hocr").first().hocr))

## Clear cache

In [None]:
# Release every DataFrame this notebook cached: the OCR output and the
# input PDF DataFrame (cached when it was loaded).
result.unpersist()
pdf_example_df.unpersist()