# Benchmark Spark OCR Libs  PdfToText V1.0.0
## Configure credentials and install dependencies

In [1]:
import os

from sparkocr.enums import PageSegmentationMode

# Notebook configuration.
#
# Credentials are read from the environment so they never have to be pasted
# into (and accidentally committed with) the notebook. Both fall back to the
# empty string, so behaviour is unchanged when the variables are not set.
#   - JSL_OCR_LICENSE is the same variable a later cell exports before
#     starting the session.
#   - SPARK_OCR_SECRET is the variable name assumed here for the package-index
#     secret -- TODO confirm it matches your local convention.
secret = os.environ.get("SPARK_OCR_SECRET", "")
license = os.environ.get("JSL_OCR_LICENSE", "")

# Spark OCR release under benchmark; also used to pin the pip install.
version = "1.0.0"

# Directory handed to `start(jar_path=...)` when the session is created.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Install a headless OpenJDK 8 only when the notebook runs on Google Colab.
# The probe imports `google.colab`; outside Colab that import fails and the
# cell finishes successfully without doing anything.
python -c 'import google.colab' > /dev/null 2>&1 || exit 0

printf '%s\n' "Run on Google Colab!" "Install Open JDK"
# Standard output of apt-get is discarded to keep the cell output short.
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Print the Java version that is now on the PATH.
java -version

In [3]:
import os
import sys

# On Google Colab, point JAVA_HOME at the OpenJDK 8 installation and put its
# `bin` directory first on PATH. Outside Colab the environment is left as is.
running_on_colab = 'google.colab' in sys.modules
if running_on_colab:
    java_home = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["JAVA_HOME"] = java_home
    os.environ["PATH"] = java_home + "/bin:" + os.environ["PATH"]

In [None]:
# Install the pinned spark-ocr release, adding the John Snow Labs package
# index as an extra index. `$version` and `$secret` are expanded by IPython
# from the notebook variables of the same names before pip runs.
# NOTE(review): the secret becomes part of the index URL, so it may show up in
# this cell's saved output -- clear the output before sharing the notebook.
%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

## Initialize Spark session

In [4]:
from pyspark import SparkConf
from sparkocr import start

# Export the license through the environment only when one was supplied;
# an empty value leaves any existing JSL_OCR_LICENSE untouched.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# No extra Spark settings are applied for this benchmark; the empty SparkConf
# is still passed so options can be added here in one place.
conf = SparkConf()

spark = start(
    secret=secret,
    jar_path=spark_ocr_jar_path,
    extra_conf=conf,
)

# Display the session object as the cell output.
spark

SparkConf Configured, Starting to listen on port: 50980
JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar


## Imports

In [5]:
from pyspark.ml import PipelineModel
from sparkocr.transformers import *

## Define path to the PDF

In [6]:
# Path of the PDF document used as benchmark input; it is passed to the
# `binaryFile` reader in the next cell.
binary_pdf = "data/pdfs/test_doc_output.pdf"

## Read PDF files

In [7]:
# Load the PDF through Spark's `binaryFile` source and mark the resulting
# DataFrame for caching so repeated pipeline runs can reuse it.
pdfs = (
    spark.read
    .format("binaryFile")
    .load(binary_pdf)
    .cache()
)

18

## Define OCR pipeline 

In [9]:

# Stage 1: PdfToText reads the `content` column and writes `text`,
# without splitting the document per page.
pdf_to_text = (
    PdfToText()
    .setInputCol("content")
    .setOutputCol("text")
    .setSplitPage(False)
)

# Stage 2: render the PDF to images at resolution 200, keeping the input
# column. `text` is configured as the fallback column with a minimum size
# of 10 before falling back.
pdf_to_image = (
    PdfToImage()
    .setOutputCol("image")
    .setFallBackCol("text")
    .setMinSizeBeforeFallback(10)
    .setKeepInput(True)
    .setResolution(200)
)

# Stage 3: automatic skew correction, `image` -> `deskewed_image`.
skewCorrector = (
    ImageSkewCorrector()
    .setInputCol("image")
    .setOutputCol("deskewed_image")
    .setAutomaticSkewCorrection(True)
)

# Stage 4: layout analysis on the deskewed image, producing `region`.
# NOTE(review): page segmentation mode is given as the bare number 6 here,
# while the OCR stage below uses the PageSegmentationMode enum -- consider
# using the enum for both once the matching member is confirmed.
layoutAnalyzer = (
    ImageLayoutAnalyzer()
    .setInputCol("deskewed_image")
    .setOutputCol("region")
    .setPageIteratorLevel(0)
    .setPageSegMode(6)
)

# Stage 5: split the deskewed image by the detected regions into
# `image_region`.
splitter = (
    ImageSplitRegions()
    .setInputCol("deskewed_image")
    .setInputRegionsCol("region")
    .setOutputCol("image_region")
)

# Stage 6: Tesseract OCR writing to `text`.
# NOTE(review): this stage reads `deskewed_image`, not the `image_region`
# column produced by the splitter above -- confirm that is intended for the
# benchmark.
ocr = (
    TesseractOcr()
    .setInputCol("deskewed_image")
    .setOutputCol("text")
    .setConfidenceThreshold(60)
    .setLanguage("eng")
    .setIgnoreResolution(False)
    .setPageSegMode(PageSegmentationMode.SPARSE_TEXT)
)

# OCR pipeline: the stages are applied in the order listed.
pipeline = PipelineModel(
    stages=[
        pdf_to_text,
        pdf_to_image,
        skewCorrector,
        layoutAnalyzer,
        splitter,
        ocr,
    ]
)

## Run OCR pipeline

In [10]:
%%time
# Timed benchmark step: apply the pipeline to the loaded PDFs and collect
# every resulting row to the driver, which forces the transformation to run
# inside the timed cell.
results = pipeline.transform(pdfs)
# NOTE(review): `pd` holds the collected rows, but the name is the
# conventional alias for pandas -- consider renaming it if no later cell
# depends on the current name.
pd=results.collect()


