# Example of using Base64ToImage Transformer on Spark OCR
## Install spark-ocr python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [1]:
# Credentials and keys: placeholders to be filled in by the user before
# running. They are intentionally left empty in the shared notebook.
license = ""
secret = ""
nlp_secret = ""
aws_access_key = ""
aws_secret_key = ""

# The version is the text of `secret` before its first "-" (an empty string
# while `secret` is empty).
version = secret.partition("-")[0]
# Location handed to start(jar_path=...) further down; presumably the
# directory holding the spark-ocr-assembly jar (confirm for your checkout).
spark_ocr_jar_path = "../../target/scala-2.12"

In [2]:
%%bash
# Colab-only setup: the body runs only when the google.colab module can be
# imported by `python`; on any other machine this cell does nothing.
if python -c 'import google.colab' > /dev/null 2>&1; then
    printf '%s\n' "Run on Google Colab!" "Install Open JDK"
    # Quiet install of the headless JDK 8; stdout is discarded.
    apt-get install -y -qq openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import sys
import os

# When running on Google Colab, point JAVA_HOME at the JDK 8 location and
# put its bin directory at the front of PATH. Outside Colab the environment
# is left untouched.
if 'google.colab' in sys.modules:
    java_home = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["JAVA_HOME"] = java_home
    os.environ["PATH"] = java_home + "/bin:" + os.environ["PATH"]

## Initialization of spark session

In [4]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable, but
# only when a non-empty value was filled in earlier in the notebook.
# (`os` is imported in an earlier cell.)
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Start Spark OCR using the jar location configured in spark_ocr_jar_path;
# the result (presumably the SparkSession - confirm) is kept in `spark`.
# Pass nlp_secret=nlp_secret, if you don't have a jar
spark = start(jar_path=spark_ocr_jar_path)

Spark version: 3.2.1
Spark NLP version: 4.4.1
Spark NLP for Healthcare version: 4.3.0
Spark OCR version: 4.4.1rc6



In [13]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from sparkocr.transformers import *

In [6]:
import pkg_resources
# Look up the sample image 'check.jpg' shipped under the sparkocr package's
# resources and keep its path for the encoding step that follows.
# NOTE(review): pkg_resources is deprecated in recent setuptools releases;
# consider importlib.resources once the minimum Python version allows it.
img_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/images/check.jpg')

## Convert to Base64

In [7]:
import base64

# Read the image as raw bytes, then base64-encode it and turn the encoded
# bytes into an ASCII text string.
with open(img_path, 'rb') as image_file:
    raw_bytes = image_file.read()
base64_img = str(base64.b64encode(raw_bytes), 'ascii')

## Define OCR pipeline

In [8]:
# Stage 1: decode the base64 input into the "image" column.
# No input column is set here, so the transformer's own default is used.
base64_to_image = Base64ToImage()
base64_to_image.setOutputCol("image")

# Stage 2: run OCR on the "image" column produced by stage 1 and write the
# result to "text".
ocr = ImageToText()
ocr.setInputCol("image")
ocr.setOutputCol("text")
# Confidence threshold of 60 - presumably text recognised with lower
# confidence is dropped; confirm against the ImageToText documentation.
ocr.setConfidenceThreshold(60)

# OCR pipeline: both stages assembled directly into a PipelineModel, in the
# order they must run (decode first, then OCR).
pipeline = PipelineModel(stages=[
    base64_to_image,
    ocr
])

## Create LightPipeline

In [9]:
from sparkocr.base import LightPipeline

In [10]:
# Wrap the PipelineModel in a LightPipeline so it can be called on an
# in-memory string (see fromString in the next cell).
lp = LightPipeline(pipeline)

In [11]:
%%time
# Run the light pipeline on the base64-encoded image string and display the
# returned value; %%time reports how long the call took.
result = lp.fromString(base64_img)
result

base64_to_image
ocr_pipeline_exception
image_to_text
page_matrix
base64_source_file
CPU times: user 15.3 ms, sys: 2.46 ms, total: 17.8 ms
Wall time: 1.09 s


[{'image': ImageOutput(path: base64_in_memory, exception: None),
  'exception': Light Pipeline Exception(message: [ocr_pipeline_exception::]),
  'text': Annotation(image_to_text, 0, 324, STARBUCKS Store #19208
  11902 Euclid Avenue
  Cleveland, OH (216) 229-U749
  
  CHK 664250
  12/07/2014 06:43 PM
  112003. Drawer: 2. Reg: 2
  
  ¥t Pep Mocha 4.95
  
  Sbux Card 4.95
  
  AXKANRKAXERAK, 3228
  
  Subtotal $4.95
  
  Total $4.95
  Change Cue BOC
  
  - Check Closed ~
  “12/01/2014 06:43 py
  
  oBUX Card «3228 New Balance: 37.45
  Card is registertd
  , Map(confidence -> 73.71952590942382, exception -> , sourcePath -> base64_in_memory), []),
  'positions': None,
  'content': source base64 file buffer, of len: 20. First 20 chars: /9j/4AAQSkZJRgABAQEA}]