# Example of using the Spark OCR layout text stripper

## Install the spark-ocr Python package
You need to specify the license and either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Credentials and license are intentionally blank here; fill them in before running.
secret = ""
license = ""
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""

# The package version is whatever precedes the first "-" in the secret
# (an empty secret yields an empty version).
version = secret.partition("-")[0]

# Directory holding the Spark OCR jar, and a glob for sample images.
spark_ocr_jar_path = "../../target/scala-2.12/"
imagePath = "./data/tab_images/*.jpg"

In [2]:
%%bash
# Install a headless OpenJDK 8 only when `google.colab` is importable,
# i.e. when this notebook runs on Google Colab; otherwise do nothing.
if python -c 'import google.colab' &> /dev/null; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # -qq plus the stdout redirect keep apt-get's regular output out of the cell.
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    # Print the Java version that is now on PATH.
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its
# bin directory at the front of PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "".join(
        [os.environ["JAVA_HOME"], "/bin:", os.environ["PATH"]]
    )

In [4]:
# install from PYPI using secret
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [5]:
import os

# Export AWS credentials only when an access key was provided.
# A truthiness test (rather than `!= ""`) matches the license check below and
# also skips a None value instead of failing on the os.environ assignment.
if AWS_ACCESS_KEY_ID:
    os.environ["AWS_ACCESS_KEY_ID"] = AWS_ACCESS_KEY_ID
    os.environ["AWS_SECRET_ACCESS_KEY"] = AWS_SECRET_ACCESS_KEY

# Expose the license through the JSL_OCR_LICENSE environment variable.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# On Google Colab, make sure Java 8 is configured. An earlier cell performs the
# same setup, so only prepend the JDK bin directory when it is not already on
# PATH; otherwise every run would add a duplicate entry.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    if (os.environ["JAVA_HOME"] + "/bin") not in os.environ["PATH"].split(":"):
        os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

## Initialize the Spark session

In [6]:
from pyspark import SparkConf
from sparkocr import start

# Expose the license through the SPARK_OCR_LICENSE environment variable
# when one was provided.
if license:
    os.environ["SPARK_OCR_LICENSE"] = license

# Start the session with the jar directory configured earlier.
# NOTE(review): the recorded output of this cell reports Spark NLP 3.3.2 while
# 3.2.3 is requested here — confirm which version is intended.
spark = start(
    jar_path=spark_ocr_jar_path,
    nlp_version="3.2.3",
)

# Bare expression so the notebook displays the session object.
spark

Spark version: 3.0.2
Spark NLP version: 3.3.2
Spark OCR version: 3.9.0



## Import OCR transformers

In [7]:
from sparkocr.transformers import *
from sparkocr.enums import *
from pyspark.ml import PipelineModel
from sparkocr.utils import *

## Sample of keeping the layout during OCR on a PDF

In [8]:
%%html
<style type='text/css'>
/* Font size for CodeMirror editor elements. */
.CodeMirror{
font-size: 15px;
}

/* Font size for preformatted text inside cell output areas. */
div.output_area pre {
    font-size: 9px;
}
</style>

In [9]:
def pipeline():
    """Assemble the layout-preserving OCR pipeline for PDF input.

    Stages, in order:
      1. PdfToImage: "content" -> "image" (transform the PDF document to
         images per page).
      2. ImageSkewCorrector: "image" -> "corrected_image", with automatic
         skew correction enabled.
      3. ImageToTextPdf: "corrected_image" -> "pdf_page" (run OCR and render
         the results to PDF).
      4. PdfToText: "pdf_page" -> "text", using the "PDFLayoutTextStripper"
         text stripper and "page1" as the page-number column.

    Returns:
        A PipelineModel wrapping the four stages.
    """
    # Transform the PDF document to images per page.
    render_pages = (
        PdfToImage()
        .setInputCol("content")
        .setOutputCol("image")
    )

    # Correct page skew automatically before OCR.
    deskew = (
        ImageSkewCorrector()
        .setInputCol("image")
        .setOutputCol("corrected_image")
        .setAutomaticSkewCorrection(True)
    )

    # Run OCR and render the results to PDF.
    ocr_to_pdf = (
        ImageToTextPdf()
        .setInputCol("corrected_image")
        .setOutputCol("pdf_page")
    )

    # Extract text from the rendered PDF pages with the layout text stripper.
    layout_text = (
        PdfToText()
        .setInputCol("pdf_page")
        .setTextStripper("PDFLayoutTextStripper")
        .setPageNumCol("page1")
        .setOutputCol("text")
    )

    return PipelineModel(stages=[render_pages, deskew, ocr_to_pdf, layout_text])

# Read the sample PDF as a binary-file DataFrame and cache it.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load("data/keeplayout/formLayoutText.pdf")
    .cache()
)

# Run the PDF pipeline, cache the result, and print the text of the first collected row.
result = pipeline().transform(pdf_example_df).cache()
print(result.select("text").collect()[0].text)

                                                                                                                                                    
              PDF      Form       Example                                                                                                           
                                                                                                                                                    
                                                                                                                                                    
                                                                                                                                                    
              This is an example of a user fillable       PDF form.    Normally PDF is used as a        final publishing format.                    
              However PDF has an option to be used as an entry form that can                  be edited an

## Sample of losing the layout during OCR on an image

In [10]:
def pipeline_nolayout():
    """Assemble the plain OCR pipeline for image input.

    Stages, in order:
      1. BinaryToImage: "content" -> "image".
      2. ImageSkewCorrector: "image" -> "corrected_image", with automatic
         skew correction enabled.
      3. ImageToText: "corrected_image" -> "text".

    Returns:
        A PipelineModel wrapping the three stages.
    """
    # Decode the binary file content into an image column.
    decode_image = (
        BinaryToImage()
        .setInputCol("content")
        .setOutputCol("image")
    )

    # Correct image skew automatically before OCR.
    deskew = (
        ImageSkewCorrector()
        .setInputCol("image")
        .setOutputCol("corrected_image")
        .setAutomaticSkewCorrection(True)
    )

    # OCR the corrected image into the "text" column.
    recognize_text = (
        ImageToText()
        .setInputCol("corrected_image")
        .setOutputCol("text")
    )

    return PipelineModel(stages=[decode_image, deskew, recognize_text])

# Read the sample JPG as a binary-file DataFrame and cache it.
bin_example_df = (
    spark.read
    .format("binaryFile")
    .load("data/keeplayout/formLayoutText.jpg")
    .cache()
)

# Run the plain OCR pipeline, cache the result, and print the text of the first collected row.
result_bin = pipeline_nolayout().transform(bin_example_df).cache()
print(result_bin.select("text").collect()[0].text)

Foersom

PDF Form Example

This is an example of a user fillable PDF form. Normally PDF is used as a final publishing format.
However PDF has an option to be used as an entry form that can be edited and saved by the user.

The fields of this form have been selected to demonstrate as many as possible of the common
entry fields.

This document and PDF form have been created with OpenOffice (version 3.4.0).

To fill out the form, make sure the PDF file is not read-only. If the file is read-only save it first to a
folder or computer desktop. Close this file and open the saved file.

Please fill out the following fields. Important fields are marked yellow.

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

Given Name: Jonathan

Family Name: Link

Address 1: House nr:
Address 2:

Postcode: City:

Country: Germany

Gender: Man

Height (cm): 150

Driving License: []

| speak and understand (tick all that apply):

[_] Deutsch [¥] English Francais [_] Esperanto [_] Latin

## Sample of keeping the layout during OCR on an image

In [13]:
def pipeline_keeplayout():
    """Assemble the layout-keeping OCR pipeline for image input.

    Stages, in order:
      1. BinaryToImage: "content" -> "image".
      2. ImageToText: "image" -> "text", with the OCR parameter
         "preserve_interword_spaces=1" and keep-layout switched on.

    Returns:
        A PipelineModel wrapping the two stages.
    """
    # Decode the binary file content into an image column.
    decode_image = (
        BinaryToImage()
        .setInputCol("content")
        .setOutputCol("image")
    )

    # OCR the image with keep-layout enabled.
    recognize_with_layout = (
        ImageToText()
        .setInputCol("image")
        .setOutputCol("text")
        .setOcrParams(["preserve_interword_spaces=1"])
        .setKeepLayout(True)
    )

    return PipelineModel(stages=[decode_image, recognize_with_layout])

# Read the sample JPG as a binary-file DataFrame and cache it.
example2_df = (
    spark.read
    .format("binaryFile")
    .load("data/keeplayout/formLayoutText.jpg")
    .cache()
)

# Run the layout-keeping pipeline, cache the result, and print the text of the first collected row.
result2 = pipeline_keeplayout().transform(example2_df).cache()
print(result2.select("text").collect()[0].text)


                  Foersom                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   
               PDF      Form        Example                                                                                                                                                                                                                                                                                                                                                