# Example of using Base64ToImage Transformer on Spark OCR
## Install spark-ocr python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [1]:
# Credentials and secrets: fill these in locally (left empty here on purpose).
license = ''
secret = ''
nlp_secret = ''
aws_access_key = ''
aws_secret_key = ''

# The version is whatever precedes the first "-" in the secret.
version = secret.partition("-")[0]
# Location handed to start() as jar_path further down.
spark_ocr_jar_path = "../../target/scala-2.12"

In [2]:
%%bash
# Nothing to install unless the google.colab module is importable.
python -c 'import google.colab' > /dev/null 2>&1 || exit 0

echo "Run on Google Colab!"
echo "Install Open JDK"
# Quiet install of the headless OpenJDK 8 package, then show the active Java version.
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
java -version

In [3]:
import sys
import os

if 'google.colab' in sys.modules:
    # On Colab, point JAVA_HOME at the OpenJDK 8 install and put its bin/ first on PATH.
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

## Initialization of spark session

In [None]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the environment only when one was provided.
if license:
    os.environ.update({'JSL_OCR_LICENSE': license})

# Pass nlp_secret=nlp_secret, if you don't have a jar
spark = start(jar_path=spark_ocr_jar_path)

In [3]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from sparkocr.transformers import *

## Images
### Convert to Base64

In [4]:
import pkg_resources
# Path of the sample check image shipped inside the sparkocr package.
img_path = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/images/check.jpg',
)

In [7]:
import base64
# Read the raw image bytes, then base64-encode them into an ASCII string.
with open(img_path, 'rb') as image_file:
    image_bytes = image_file.read()
base64_img = base64.b64encode(image_bytes).decode('ascii')

## Define OCR pipeline

In [8]:
# Stage 1: decode the base64 string into the "image" column
base64_to_image = Base64ToImage().setOutputCol("image")

# Stage 2: recognize text from the "image" column into the "text" column
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setConfidenceThreshold(60)
)

# OCR pipeline: decode first, then recognize
pipeline = PipelineModel(stages=[base64_to_image, ocr])

## Create LightPipeline

In [8]:
from sparkocr.base import LightPipeline

In [10]:
# Wrap the base64-image OCR pipeline in a LightPipeline; it is invoked via fromString below.
lp = LightPipeline(pipeline)

In [11]:
%%time
# Run the pipeline on the base64-encoded image string; the bare `result` displays the output.
result = lp.fromString(base64_img)
result

CPU times: user 3.78 ms, sys: 3.96 ms, total: 7.73 ms
Wall time: 742 ms


[{'image': ImageOutput(path: base64_in_memory, exception: None),
  'exception': Light Pipeline Exception(message: [ocr_pipeline_exception::]),
  'text': Annotation(image_to_text, 0, 324, STARBUCKS Store #19208
  11902 Euclid Avenue
  Cleveland, OH (216) 229-U749
  
  CHK 664250
  12/07/2014 06:43 PM
  112003. Drawer: 2. Reg: 2
  
  ¥t Pep Mocha 4.95
  
  Sbux Card 4.95
  
  AXKANRKAXERAK, 3228
  
  Subtotal $4.95
  
  Total $4.95
  Change Cue BOC
  
  - Check Closed ~
  “12/01/2014 06:43 py
  
  oBUX Card «3228 New Balance: 37.45
  Card is registertd
  , Map(confidence -> 73.70487899780274, exception -> , sourcePath -> base64_in_memory), []),
  'positions': PositionsOutput(mappings: [{'c': 'STARBUCKS Store #19208\n11902 Euclid Avenue\nCleveland, OH (216) 229-U749\n\n', 'x': 103.0, 'width': 371.0, 'y': 32.0, 'fontSize': 11, 'source': 'ocr', 'height': 92.0}, {'c': 'CHK 664250\n12/07/2014 06:43 PM\n112003. Drawer: 2. Reg: 2\n\n', 'x': 108.0, 'width': 352.0, 'y': 164.0, 'fontSize': 10, 'so

## PDFs
### Convert to Base64

In [12]:
import pkg_resources
# NOTE: this rebinds `img_path` (previously the check image) to the sample PDF
# shipped inside the sparkocr package.
img_path = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/four_pages_clinical_note/4_pages_clinical_note.pdf',
)

In [17]:
import base64
# Read the raw PDF bytes, then base64-encode them into an ASCII string.
with open(img_path, 'rb') as pdf_file:
    pdf_bytes = pdf_file.read()
base64_pdf = base64.b64encode(pdf_bytes).decode('ascii')

In [19]:
from sparkocr.enums import ImageType
# Stage 1: decode the base64 string into the binary "content" column
base64_to_bin = Base64ToBinary().setOutputCol("content")

# Stage 2: turn the PDF "content" into the "image" column
pdf_to_image = (
    PdfToImage()
    .setInputCol("content")
    .setOutputCol("image")
    .setImageType(ImageType.TYPE_3BYTE_BGR)
    .setResolution(72)
)

# Stage 3: recognize text from the "image" column into the "text" column
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setConfidenceThreshold(60)
)

# OCR pipeline: decode, convert the PDF to images, then recognize
pipeline = PipelineModel(stages=[base64_to_bin, pdf_to_image, ocr])

In [20]:
# Rebind `lp` to a LightPipeline around the PDF pipeline defined in the previous cell.
lp = LightPipeline(pipeline)

In [22]:
# Run the PDF pipeline on the base64-encoded PDF string; the bare `r` displays the returned list.
r = lp.fromString(base64_pdf)
r

[{'image': ImageOutput(path: base64_in_memory:JVBERi0xLjUKJbXtrvsK, exception: ),
  'exception': [Light Pipeline Exception(message: [ocr_pipeline_exception::]),
   Light Pipeline Exception(message: [ocr_pipeline_exception::])],
  'text': Annotation(image_to_text, 0, 476, Result Information
  ‘Status Provider Status
  
  Entry Date
  
   
  
   
  
   
  
   
  
  Component Results
  ‘Component ta
  Surgical Pathology
  
   
  
  ©. Loft intemal and extemal tae lymph node, resection:
  Single lymph node with metastatic yrothesal carcinoma, 1.8 cm largest dametor
  ony
  
  Left obturator mph node, resection:
  “Three benign lymph nodes and flrcadipose tssue, negative for tunar (0/3)
  
  G. Let common tac mph, resection:
  One benign lym nodes and Rorozdpose tissue, negate for tumor (0/1)
  
   
  , Map(confidence -> 61.55688985188802, exception -> , sourcePath -> base64_in_memory:JVBERi0xLjUKJbXtrvsK), []),
  'b64string': source base64 file buffer, of len: 20. First 20 chars: JVBERi0xL

In [25]:
# Show the recognized text and the positions of the entry at index 1 of the result list.
tuple(r[1][field] for field in ('text', 'positions'))

(Annotation(image_to_text, 0, 539,  
 
  
 
 Surgical resecton margins are negative for tumor
 “Incidental prostatic adenocarcinoma, Gleason grade 2+3=6, organ confined (p72a)
 and negatve surgical margins
 
 -See CAP cancer template fr futher detais
 
 Specimen
 Bladder and prostate
 
 Histologic Grade
 High grade
 
 Margins
 Margins uninvolved by invasive carcinoma
 
 ‘Addiional Pathologic Findings
 Evidence of previous biopsy
 Incidental adenocarcinoma of prostate (see CAP template below)
 
 Anciary Studies:
 ‘lock J18 has been sent for ancilary studies, see separate report,
 
 Page 2 of 8
 , Map(confidence -> 61.146457290649415, exception -> , sourcePath -> base64_in_memory:JVBERi0xLjUKJbXtrvsK), []),
 PositionsOutput(mappings: [{'c': ' \n\n', 'x': 7.0, 'width': 578.0, 'y': 55.0, 'fontSize': 7, 'source': 'ocr', 'height': 13.0}, {'c': ' \n\n', 'x': 74.0, 'width': 57.0, 'y': 57.0, 'fontSize': 20, 'source': 'ocr', 'height': 10.0}, {'c': 'Surgical resecton margins are negative for tumo

## Example of calling LightPipeline with the fromBinary method

### Define pipeline

In [5]:
# Stage 1: convert the raw binary input into the "image" column
binary_to_image = BinaryToImage().setOutputCol("image")

# Stage 2: recognize text from the "image" column into the "text" column
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setConfidenceThreshold(60)
)

# OCR pipeline: build the image first, then recognize
pipeline = PipelineModel(stages=[binary_to_image, ocr])


### Call pipeline

In [10]:
# Resolve the sample check image explicitly instead of reusing `img_path`:
# in top-to-bottom order `img_path` has been rebound to the sample PDF in the
# "PDFs" section, so reusing it on a fresh Restart & Run All would feed PDF
# bytes into BinaryToImage rather than the image this example expects.
check_img_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/images/check.jpg')
with open(check_img_path, 'rb') as f:
    image = f.read()

# Wrap the BinaryToImage pipeline and run it directly on the raw image bytes.
lp = LightPipeline(pipeline)
result = lp.fromBinary(image)
print(result)

Detected 82 diacritics


[{'image': ImageOutput(path: memory, exception: None), 'text': Annotation(image_to_text, 0, 324, STARBUCKS Store #19208
11902 Euclid Avenue
Cleveland, OH (216) 229-U749

CHK 664250
12/07/2014 06:43 PM
112003. Drawer: 2. Reg: 2

¥t Pep Mocha 4.95

Sbux Card 4.95

AXKANRKAXERAK, 3228

Subtotal $4.95

Total $4.95
Change Cue BOC

- Check Closed ~
“12/01/2014 06:43 py

oBUX Card «3228 New Balance: 37.45
Card is registertd
, Map(confidence -> 73.70487899780274, exception -> , sourcePath -> memory), []), 'positions': PositionsOutput(mappings: [{'c': 'STARBUCKS Store #19208\n11902 Euclid Avenue\nCleveland, OH (216) 229-U749\n\n', 'x': 103.0, 'width': 371.0, 'y': 32.0, 'fontSize': 11, 'source': 'ocr', 'height': 92.0}, {'c': 'CHK 664250\n12/07/2014 06:43 PM\n112003. Drawer: 2. Reg: 2\n\n', 'x': 108.0, 'width': 352.0, 'y': 164.0, 'fontSize': 10, 'source': 'ocr', 'height': 92.0}, {'c': '¥t Pep Mocha 4.95\n\nSbux Card 4.95\n\nAXKANRKAXERAK, 3228\n\nSubtotal $4.95\n\nTotal $4.95\nChange Cue BOC\n\n'