# Example of running OCR in streaming mode to process PDFs
## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Credentials and build location consumed by the install and session-start cells.
secret = ""
license = ""
# The package version is the part of the secret that precedes the first "-".
version = secret.partition("-")[0]
# Location handed to `start(jar_path=...)` when the session is created.
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
# Colab-only setup: install a headless OpenJDK 8. Outside Colab this cell is a no-op.
python -c 'import google.colab' &> /dev/null || exit 0

echo "Run on Google Colab!"
echo "Install Open JDK"
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
java -version

In [3]:
import sys
import os

# When running on Google Colab, point JAVA_HOME at the OpenJDK 8 location and
# put its bin directory first on PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = f'{os.environ["JAVA_HOME"]}/bin:{os.environ["PATH"]}'

In [4]:
# install from PyPI using the secret
#%pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [5]:
# or install from local path
# %pip install ../../python/dist/spark-ocr-1.9.0.spark24.tar.gz

## Initialize the Spark session

In [6]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the JSL_OCR_LICENSE environment variable, but only
# when one was filled in (an empty string leaves the environment untouched).
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Create the session with sparkocr's `start` helper, passing the secret and the jar location.
spark = start(secret=secret, jar_path=spark_ocr_jar_path)
# Bare last expression so the notebook displays the session object.
spark

Spark version: 3.2.1
Spark NLP version: 4.0.0
Spark NLP for Healthcare version: 4.0.0
Spark OCR version: 4.0.0



In [7]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *

from sparkocr.transformers import *

In [8]:
# Glob pattern selecting the input PDFs; change it to point at your own folder
dataset_path = "data/pdfs/*.pdf"

In [9]:
# Batch-read the dataset (limited to one row) only to obtain a schema;
# the streaming reader further down is given this schema explicitly.
pdfs_df = spark.read.format("binaryFile").load(dataset_path).limit(1)

## Define OCR pipeline

In [10]:
# Transform binary to image: PdfToImage writes its result to the "image" column
pdf_to_image = PdfToImage()
pdf_to_image.setOutputCol("image")

# Run OCR on the "image" column and write the recognized text to "text"
ocr = ImageToText()
ocr.setInputCol("image")
ocr.setOutputCol("text")
# NOTE(review): 60 is presumably a minimum confidence (in percent) for keeping recognized text — confirm
ocr.setConfidenceThreshold(60)

# OCR pipeline: the stages are listed in the order PDF -> image -> text
pipeline = PipelineModel(stages=[
    pdf_to_image,
    ocr
])

## Define streaming pipeline and start it
Note: each start erases the previous results.

In [11]:
# Upper bound on the number of files pulled into a single micro-batch
maxFilesPerTrigger = 4

# Stream the PDFs as binary files, reusing the schema taken from the batch read
pdf_stream_df = (
    spark.readStream
    .format("binaryFile")
    .schema(pdfs_df.schema)
    .option("maxFilesPerTrigger", maxFilesPerTrigger)
    .load(dataset_path)
)

# Run the OCR pipeline on the stream and add a "timestamp" column
result = (
    pipeline.transform(pdf_stream_df)
    .withColumn("timestamp", current_timestamp())
)

# Write the output to an in-memory table named 'result'
query = (
    result.writeStream
    .format('memory')
    .queryName('result')
    .start()
)

In [12]:
# Most recent progress report of the streaming job
query.lastProgress

In [12]:
# Block until every file currently available to the source has been processed
# and committed to the sink. Without this, running the notebook top to bottom
# stops the query right after start(), leaving the 'result' table empty or
# incomplete for the cells that read it.
query.processAllAvailable()

# Stop the streaming job
query.stop()

## Show results from 'result' table

In [13]:
# Number of rows in the 'result' table, i.e. the number of pages processed so far
spark.table("result").count() 

1

In [14]:
# Preview up to 10 rows of the 'result' table, restricted to the columns of interest
spark.table("result").select("timestamp","pagenum", "path", "text").show(10)

+--------------------+-------+--------------------+--------------------+
|           timestamp|pagenum|                path|                text|
+--------------------+-------+--------------------+--------------------+
|2022-07-14 17:41:...|      0|file:/home/jose/s...| \n\n \n\n \n\nne...|
+--------------------+-------+--------------------+--------------------+



## Run streaming job for storing results to disk

In [15]:
# Stream only the recognized text to text files under results/, keeping the
# streaming checkpoint in checkpointDir
query = (
    result.select("text").writeStream
    .format('text')
    .option("path", "results/")
    .option("checkpointLocation", "checkpointDir")
    .start()
)

In [18]:
# Most recent progress report of the streaming job
query.lastProgress

{'id': '843edde6-9e8e-4fd3-ba8a-5f7cc3ef5de0',
 'runId': '2d9fab9d-1aa7-4d6f-bef8-1f82840ef0b3',
 'name': None,
 'timestamp': '2022-07-14T20:43:36.505Z',
 'batchId': 1,
 'numInputRows': 0,
 'inputRowsPerSecond': 0.0,
 'processedRowsPerSecond': 0.0,
 'durationMs': {'latestOffset': 3, 'triggerExecution': 3},
 'stateOperators': [],
 'sources': [{'description': 'FileStreamSource[file:/home/jose/spark-ocr/workshop/jupyter/data/pdfs/*.pdf]',
   'startOffset': {'logOffset': 0},
   'endOffset': {'logOffset': 0},
   'latestOffset': None,
   'numInputRows': 0,
   'inputRowsPerSecond': 0.0,
   'processedRowsPerSecond': 0.0}],
 'sink': {'description': 'FileSink[results/]', 'numOutputRows': -1}}

In [19]:
# Block until every file currently available to the source has been processed
# and written to results/. Without this, running the notebook top to bottom
# stops the query right after start(), so the cell that reads results/*.txt
# may find nothing to load.
query.processAllAvailable()

# Stop the streaming job
query.stop()

## Read results from disk

In [20]:
# NBVAL_SKIP
results = spark.read.format("text").load("results/*.txt")
results.show()

+--------------------+
|               value|
+--------------------+
|                    |
|                    |
|                    |
|                    |
|                    |
|                    |
|ne Pa a Date: 7/1...|
|er ‘Sample No. _ ...|
|“ Original reques...|
|                    |
|Sample specificat...|
| , BLEND CASING R...|
|                    |
|- OLD GOLD STRAIG...|
|                    |
|Control for Sampl...|
|                    |
|         Cigarettes:|
|                    |
|   OLD GOLD STRAIGHT|
+--------------------+
only showing top 20 rows



## Clean results and checkpoint folders

In [21]:
%%bash
# Remove the streaming output and its checkpoint folder
rm -rf results checkpointDir