# Example of using LightPipelines on Spark OCR
## Install the spark-ocr Python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Credentials for Spark OCR: both are blank placeholders to fill in before running.
secret, license = "", ""
# Text of the secret up to its first "-" (empty while the secret is blank).
version = secret.partition("-")[0]
# Directory handed to `start(...)` as `jar_path` when the session is created.
spark_ocr_jar_path = "../../target/scala-2.12"

In [3]:
%%bash
# Nothing to do unless the google.colab module can be imported (i.e. we are on Colab).
python -c 'import google.colab' &> /dev/null || exit 0

echo "Run on Google Colab!"
echo "Install Open JDK"
# Quiet install of the headless OpenJDK 8 package, then print the Java version as a check.
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
java -version

In [4]:
import sys
import os

# On Google Colab, point JAVA_HOME at the OpenJDK 8 install and put its bin/ first on PATH.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

## Initialization of spark session

In [2]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license only when one was provided; a blank value leaves the environment untouched.
if license:
    os.environ.update(JSL_OCR_LICENSE=license)

# Start the session from the secret and the jar location configured at the top of the notebook.
spark = start(jar_path=spark_ocr_jar_path, secret=secret)
# Bare expression so the notebook displays the session object.
spark

Spark version: 3.2.1
Spark NLP version: 4.2.1
Spark NLP for Healthcare version: 4.2.1
Spark OCR version: 4.2.1rc1



In [3]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from sparkocr.transformers import *

In [4]:
# Relative path of the input location; passed to `lp.fromLocalPath(...)` further down.
pdfs_path = "./pdfs"

## Define OCR pipeline

In [5]:
# Column that carries the image data from the first stage to the OCR stage.
image_col = "image"

# Stage 1: PdfToImage writes its output to the image column.
pdf_to_image = PdfToImage()
pdf_to_image.setOutputCol(image_col)

# Stage 2: ImageToText reads the image column and writes the recognized text to "text".
ocr = ImageToText()
ocr.setInputCol(image_col)
ocr.setOutputCol("text")
# NOTE(review): 60 is presumably a minimum recognition confidence - confirm against the ImageToText docs.
ocr.setConfidenceThreshold(60)

# Assemble the stages, in execution order, into the OCR pipeline.
pipeline = PipelineModel(stages=[pdf_to_image, ocr])

## Create LightPipeline

In [6]:
from sparkocr.base import LightPipeline

In [7]:
# Wrap the OCR pipeline in a LightPipeline; its `fromLocalPath` method is called on `pdfs_path` below.
lp = LightPipeline(pipeline)

In [8]:
%%time
# Run the LightPipeline on the local path; %%time reports how long the call takes.
result = lp.fromLocalPath(pdfs_path)

CPU times: user 7.98 ms, sys: 0 ns, total: 7.98 ms
Wall time: 10.5 s


In [9]:
# Display the value returned by `fromLocalPath`; the saved output is a dict mapping an integer key to recognized text.
result

{0: 'Result Information\nStatus Provider Status\n\nFinal result Reviewed\na Date\n\nComponent Results\n\nComponent Lab\nSurgical Pathology\n\nPatient Nam\nMR#:! }CD-O-3\nSpecime\nCarcinome unsthelialWos\n\nFinal Diagnosis Z)20f/3\n\nA. Right ureter, biopsy: in\n-Segment of benign ureter, negative for tumor Sitez L lacs Aer, Wall, 5\n-Frozen section diagnosis confirmed\n\nCw. 4\n\n \n\n \n\nB. Left ureter, biopsy: Br2ATM IA\n-Segment of benign ureter, negative for tumor £0\n-Frozen section diagnosis confirmed\n\nC. Left external and internal lymph nodes, resection:\n-Benign fibrovascular tissue, negative for tumor\n-No lymph nodes identified\n\nD. Left internal and external iliac lymph node, resection:\n-Single lymph node with metastatic urothelial carcinoma, 1.8 cm largest diameter\n(1/1)\n\nE. Left internal and external iliac lymph nodes, resection:\n-Eight benign lymph nodes and fibroadipose tissue, negative for tumor (0/8)\n\nF. Left obturator lymph node, resection:\n-Three benign l

## [TODO Integration with Spark-NLP pipelines]

In [None]:
!wget https://github.com/tesseract-ocr/tessdata/blob/main/eng.traineddata?raw=true -o eng.traineddata 

In [None]:
# coming soon!