# Example of usage Spark OCR with Update Text Position

## Import Spark OCR transformers and Spark NLP annotators

## Install spark-ocr python packge
Need specify path to `spark-ocr-assembly-[version].jar` or `secret`

In [1]:
secret = ""
license = ""
version = secret.split("-")[0]
spark_ocr_jar_path = "../../target/scala-2.11"

In [2]:
%%bash
if python -c 'import google.colab' &> /dev/null; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
    java -version
fi

In [4]:
import os
import sys

if 'google.colab' in sys.modules:
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

In [None]:
# install from PYPI using secret
%pip install spark-nlp==2.5.5
%pip install spark-ocr==$version --user --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# %pip install --user ../dist/spark-ocr-[version].tar.gz

## Initialization of spark session
Need specify path to `spark-ocr-assembly.jar` or `secret`

In [5]:
from sparkocr import start

if license:
    os.environ['JSL_OCR_LICENSE'] = license

spark = start(secret=secret, jar_path=spark_ocr_jar_path, nlp_version="2.5.5")
spark

--- Logging error ---
Traceback (most recent call last):
  File "/Users/nmelnik/.pyenv/versions/3.7.9/lib/python3.7/logging/__init__.py", line 1025, in emit
    msg = self.format(record)
  File "/Users/nmelnik/.pyenv/versions/3.7.9/lib/python3.7/logging/__init__.py", line 869, in format
    return fmt.format(record)
  File "/Users/nmelnik/.pyenv/versions/3.7.9/lib/python3.7/logging/__init__.py", line 608, in format
    record.message = record.getMessage()
  File "/Users/nmelnik/.pyenv/versions/3.7.9/lib/python3.7/logging/__init__.py", line 369, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/Users/nmelnik/.pyenv/versions/3.7.9/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/Users/nmelnik/.pyenv/versions/3.7.9/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/Users/nmelnik/.local/share/virtualenvs/OcrMissedPages-W3gpzUuJ/lib/python3.

In [4]:
from pyspark.ml import Pipeline
from pyspark.ml import PipelineModel
from sparkocr.transformers import *
from sparknlp.annotator import *
from sparknlp.base import *
from sparkocr.enums import PageSegmentationMode

## Define OCR transformers and pipeline

In [5]:
def update_text_pipeline():

    document_assembler = DocumentAssembler() \
        .setInputCol("text") \
        .setOutputCol("document")

    sentence_detector = SentenceDetector() \
        .setInputCols(["document"]) \
        .setOutputCol("sentence")

    tokenizer = Tokenizer() \
        .setInputCols(["sentence"]) \
        .setOutputCol("tokens")

    spell = NorvigSweetingModel().pretrained("spellcheck_norvig", "en") \
          .setInputCols("tokens") \
          .setOutputCol("spell")
    
    tokenAssem = TokenAssembler() \
          .setInputCols(["spell", "document"]) \
          .setOutputCol("newDocs")

    updatedText = UpdateTextPosition() \
          .setInputCol("positions") \
          .setOutputCol("output_positions") \
          .setInputText("newDocs.result")

    pipeline = Pipeline(stages=[
        document_assembler,
        sentence_detector,
        tokenizer,
        spell,
        tokenAssem,
        updatedText
    ])
    
    return pipeline


def ocr_pipeline():
    # Transforrm PDF document to images per page
        pdf_to_image = PdfToImage() \
            .setInputCol("content") \
            .setOutputCol("image_raw") \
            .setKeepInput(True)

        binarizer = ImageBinarizer() \
            .setInputCol("image_raw") \
            .setOutputCol("image") \
            .setThreshold(130)

        ocr = ImageToText() \
            .setInputCol("image") \
            .setOutputCol("text") \
            .setIgnoreResolution(False) \
            .setPageSegMode(PageSegmentationMode.SPARSE_TEXT) \
            .setConfidenceThreshold(60)

        pipeline = Pipeline(stages=[
            pdf_to_image,
            binarizer,
            ocr
        ])
        return pipeline

## Read PDF document as binary file

In [6]:
import pkg_resources
pdf_example = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/multiplepages/image_3_pages.pdf')
pdf_example_df = spark.read.format("binaryFile").load(pdf_example).cache()

## Run OCR pipelines

In [7]:
ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)
updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)
textToPdf = TextToPdf() \
      .setInputCol("output_positions") \
      .setInputImage("image") \
      .setOutputCol("pdf")
result = textToPdf.transform(updated_result).cache()

## Store results to pdf file

In [9]:
pdf_raw_bytes = result.head()
pdfFile = open("pdf_raw_bytes.pdf", "wb")
pdfFile.write(pdf_raw_bytes.pdf)


72914

In [None]:
%%bash
rm -r -f pdf_raw_bytes.pdf