# Example of Spark OCR usage

## Install spark-ocr python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [1]:
# Provide either `secret` or point `spark_ocr_jar_path` at the directory that
# holds the spark-ocr assembly jar.
secret = ""
# When non-empty, this is exported as JSL_OCR_LICENSE before the session starts.
license = ""
# Text of the secret up to its first "-" (presumably the version -- confirm).
version = secret.partition("-")[0]
spark_ocr_jar_path = "../../target/scala-2.12"

## Initialization of spark session

In [3]:
import os

from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the environment only when one was provided.
# `os` must be imported in this cell: without it, a fresh kernel raises
# NameError here as soon as `license` is non-empty.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Start the Spark session using the secret and/or the local jar directory.
spark = start(secret=secret, jar_path=spark_ocr_jar_path)
# Last expression of the cell: displays the session summary.
spark

Spark version: 3.4.1
Spark NLP version: 6.1.1
Spark NLP for Healthcare version: 6.1.0
Spark OCR version: 6.1.0



## Import OCR transformers

In [4]:
from sparkocr.transformers import *
from pyspark.ml import PipelineModel

## Define OCR transformers and pipeline

In [5]:
def pipeline():
    """Build the two-stage OCR pipeline: PDF content -> page images -> text.

    Returns:
        A PipelineModel whose stages are a PdfToImage (reads "content",
        writes "image") followed by an ImageToText (reads "image",
        writes "text", confidence threshold set to 65).
    """
    # Stage 1: transform the PDF document into one image per page.
    page_renderer = PdfToImage()
    page_renderer.setInputCol("content")
    page_renderer.setOutputCol("image")

    # Stage 2: run OCR on the page images.
    text_recognizer = ImageToText()
    text_recognizer.setInputCol("image")
    text_recognizer.setOutputCol("text")
    text_recognizer.setConfidenceThreshold(65)

    ocr_stages = [page_renderer, text_recognizer]
    return PipelineModel(stages=ocr_stages)

## Read PDF document as binary file

In [6]:
import pkg_resources
# Resolve the sample PDF shipped inside the `sparkocr` package to a file path.
pdf_example = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/test_document.pdf',
)
# Read the file with Spark's binaryFile source and cache the resulting frame.
pdf_example_df = (
    spark.read
    .format("binaryFile")
    .load(pdf_example)
    .cache()
)

## Run OCR pipelines

In [7]:
# Build the OCR pipeline, apply it to the loaded PDF, and cache the output so
# the display cells below reuse the same DataFrame.
result = (
    pipeline()
    .transform(pdf_example_df)
    .cache()
)

## Display results

In [8]:
# Show the page number, recognized text, and confidence columns.
result.select(
    "pagenum",
    "text",
    "confidence",
).show()

+-------+--------------------+-----------------+
|pagenum|                text|       confidence|
+-------+--------------------+-----------------+
|      0|Patient Nam\nFina...| 84.6904648674859|
|      1|Patient Name\nFin...|83.06709993802585|
+-------+--------------------+-----------------+



### Display recognized text

In [9]:
# Collect the "text" column to the driver and print each row's text,
# separated by newlines.
print(
    "\n".join(
        page.text for page in result.select("text").collect()
    )
)

Patient Nam
Financial Numbe

Random Hospital Date of Birth

Patient Location

  
   

 

 

 

Chief Complaint
Shortness of breath

History of Present Illness

 

Patient is an 84-year-old male wilh a past medical history of hypertension, HFpEF last
known EF 55%, mild to moderate TA, pulmonary hypertension, permanent atrial
fibrillation on Eliquis, history of GI blesd, CK-IM8, and anemia who presents with full weeks
oi ccncralized fatigue and fecling unwell. He also notes some shortness oi Breath and
worsening dyspnea willy minimal exerlion. His major complaints are shoulder and joint
pains. diffusely. He also complains of "bone pain’. He denics having any fevers or crills.

e demes having any chest pain, palpitalicns, He denies any worse extremity
swelling than his baseline. He states he’s been compliant with his mcdications. Although
he stales he ran out of his Eliquis @ few weeks ago. He denies having any blood in his
Stools or mc!ena, although he does take iron pills and states his

## Clear cache

In [10]:
# Release every DataFrame this notebook cached. The input frame was cached
# when the PDF was loaded, so it is unpersisted here as well as the OCR output.
pdf_example_df.unpersist(blocking=False)
# Kept as the last expression so the cell still displays the result DataFrame.
result.unpersist(blocking=False)

path,modificationTime,length,image,total_pages,pagenum,documentnum,confidence,exception,text,positions
file:/home/jose/....,2025-08-24 22:54:...,693743,{file:/home/jose/...,2,0,0,84.6904648674859,,Patient Nam\nFina...,[{[{Patient Nam\n...
file:/home/jose/....,2025-08-24 22:54:...,693743,{file:/home/jose/...,2,1,0,83.06709993802585,,Patient Name\nFin...,[{[{Patient Name\...
