# Example of using LightPipelines on Spark OCR
## Install the spark-ocr Python package
Specify either the path to `spark-ocr-assembly-[version].jar` or a `secret`.

In [5]:
# Credentials and keys used by this notebook. Fill them in before running.
license = ""
secret = ""
nlp_secret = ""
aws_access_key = ""
aws_secret_key = ""

# Everything before the first "-" of the secret is kept as the version
# (presumably the Spark OCR version; confirm against your secret's format).
version = secret.partition("-")[0]
# Location handed to start() as jar_path when the session is created.
spark_ocr_jar_path = "../../target/scala-2.12"

In [2]:
%%bash
# Only act when the google.colab module can be imported, i.e. the notebook
# is running on Google Colab; elsewhere this cell does nothing.
if python -c 'import google.colab' > /dev/null 2>&1; then
    echo "Run on Google Colab!"
    echo "Install Open JDK"
    # Quiet install of the headless OpenJDK 8 package; stdout is discarded.
    apt-get install -qq -y openjdk-8-jdk-headless > /dev/null
    java -version
fi

In [3]:
import sys
import os

# On Google Colab, set JAVA_HOME to the OpenJDK 8 directory and put its
# bin/ folder at the front of PATH. Outside Colab nothing is changed.
if 'google.colab' in sys.modules:
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    os.environ["PATH"] = "{}/bin:{}".format(os.environ["JAVA_HOME"], os.environ["PATH"])

## Initialization of the Spark session

In [6]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the license through the environment only when one was provided.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Start the session with the jar found under spark_ocr_jar_path.
# If you don't have a jar, also pass nlp_secret=nlp_secret.
spark = start(
    secret=secret,
    jar_path=spark_ocr_jar_path,
    nlp_internal=True,
)

Spark version: 3.2.1
Spark NLP version: 4.3.1
Spark NLP for Healthcare version: 4.3.0
Spark OCR version: 4.4.0rc1



In [5]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from sparkocr.transformers import *

In [6]:
# Resolve the filesystem path of the sample resource shipped inside the
# installed sparkocr package.
# pkg_resources is deprecated by setuptools in favour of importlib.resources,
# so importlib.resources.files (Python 3.9+) is used when available and
# pkg_resources is kept only as a fallback for older interpreters.
try:
    from importlib.resources import files as _package_files
except ImportError:
    import pkg_resources
    pdfs_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/four_pages_clinical_note')
else:
    # str() keeps pdfs_path a plain string, as it was with resource_filename.
    pdfs_path = str(_package_files('sparkocr') / 'resources/ocr/pdfs/four_pages_clinical_note')


## Define OCR pipeline

In [7]:
# Stage 1: transform the binary PDF content into the "image" column.
pdf_to_image = PdfToImage().setOutputCol("image")

# Stage 2: run OCR on the "image" column and write the result to "text",
# with the confidence threshold set to 60.
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setConfidenceThreshold(60)
)

# OCR pipeline: built directly as a PipelineModel from the two stages,
# so no fit() call is made here.
pipeline = PipelineModel(stages=[pdf_to_image, ocr])

## Create LightPipeline

In [8]:
from sparkocr.base import LightPipeline

In [9]:
# Wrap the OCR PipelineModel in a LightPipeline (imported from sparkocr.base).
lp = LightPipeline(pipeline)

In [10]:
%%time
# Run the LightPipeline on the sample resource at pdfs_path and time the call.
# The returned value is displayed as the cell output.
lp.fromLocalPath(pdfs_path)

CPU times: user 60.9 ms, sys: 8.25 ms, total: 69.1 ms
Wall time: 6.33 s


[{'image': ImageOutput(path: 4_pages_clinical_note.pdf, exception: ),
  'exception': [Light Pipeline Exception(message: [ocr_pipeline_exception::]),
   Light Pipeline Exception(message: [ocr_pipeline_exception::])],
  'text': Annotation(image_to_text, 0, 1752, Result Information
  
   
  
   
  
  Status Provider Status
  Final result iy Reviewed
  Entry Date
  
   
  
  Component Results
  
  Component Lab
  Surgical Pathology
  
  Patient Nam
  MR#: JCD-O-3
  Specime
  Ca rCtnome unshelia| NOS
  
  Final Diagnosis ZBIAO/B3
  
  A. Right ureter, biopsy: x
  -Segment of benign ureter, negative for tumor Sitdez Lb lad cNer ,wall, m5
  -Frozen section diagnosis confirmed
  
  Cw7.4
  
  B. Left ureter, biopsy: B27 IU
  -Segment of benign ureter, negative for tumor #0
  -Frozen section diagnosis confirmed
  
  C. Left external and internal lymph nodes, resection:
  ~Benign fibrovascular tissue, negative for tumor
  -No lymph nodes identified
  
  D. Left internal and external iliac lymph 

## Integration with Spark-NLP pipelines

In [11]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

# Turns the OCR "text" column into a "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Produces "sentence" from "document".
sentenceDetector = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# Produces "token" from "sentence".
tokenizer = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

# Pretrained "embeddings_clinical" model (downloaded on first use).
embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
    .setInputCols(["sentence","token"])
    .setOutputCol("embeddings")
)

# Pretrained "ner_jsl" model (downloaded on first use).
ner = (
    MedicalNerModel.pretrained("ner_jsl", "en","clinical/models")
    .setInputCols(["sentence","token","embeddings"])
    .setOutputCol("ner")
)

# Converts the "ner" output, together with "sentence" and "token", into "entities".
converter = (
    NerConverter()
    .setInputCols("sentence", "token", "ner")
    .setOutputCol("entities")
)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
[OK!]


## Mixed Pipeline

In [12]:
# Mixed pipeline: the two OCR stages followed by the Spark NLP stages.
from pyspark.ml import Pipeline, PipelineModel

# One-row frame with empty "content" and "path" values, used only as the
# input to fit().
empty_data = spark.createDataFrame([[b'', ""]]).toDF("content", "path")

# Fit right away so `pipeline` ends up holding the fitted model.
pipeline = Pipeline(stages=[
    pdf_to_image, ocr,
    documentAssembler, sentenceDetector, tokenizer,
    embeddings, ner, converter,
]).fit(empty_data)

# Rebind lp to a LightPipeline over the fitted mixed pipeline.
lp = LightPipeline(pipeline)

In [13]:
# Run the mixed OCR + NLP LightPipeline on the sample resource and keep the
# returned value for the tabular display in the next cell.
result = lp.fromLocalPath(pdfs_path)

In [14]:
import pandas as pd
# Build a DataFrame from the LightPipeline result records and display it.
pd.DataFrame.from_records(result)

Unnamed: 0,entities,image,document,exception,text,pagenum,positions,token,content,ner,embeddings,sentence
0,"[Annotation(chunk, 230, 234, Right, Map(entity...","ImageOutput(path: 4_pages_clinical_note.pdf, e...","Annotation(document, 0, 1751, Result Informati...",[Light Pipeline Exception(message: [ocr_pipeli...,"Annotation(image_to_text, 0, 1752, Result Info...",0,,"[Annotation(token, 0, 5, Result, Map(sentence ...","source file 4_pages_clinical_note.pdf, 426742 ...","[Annotation(named_entity, 0, 5, O, Map(word ->...","[Annotation(word_embeddings, 0, 5, Result, Map...","[Annotation(document, 0, 1267, Result Informat..."
1,"[Annotation(chunk, 45, 49, tumor, Map(entity -...","ImageOutput(path: 4_pages_clinical_note.pdf, e...","Annotation(document, 0, 1475, -Surgical resect...",[Light Pipeline Exception(message: [ocr_pipeli...,"Annotation(image_to_text, 0, 1476, -Surgical r...",1,,"[Annotation(token, 0, 0, -, Map(sentence -> 0)...","source file 4_pages_clinical_note.pdf, 426742 ...","[Annotation(named_entity, 0, 0, O, Map(word ->...","[Annotation(word_embeddings, 0, 0, -, Map(isOO...","Annotation(document, 0, 1474, -Surgical resect..."
2,"[Annotation(chunk, 3, 12, CAP cancer, Map(enti...","ImageOutput(path: 4_pages_clinical_note.pdf, e...","Annotation(document, 0, 1348, \n\nCAP cancer ...",[Light Pipeline Exception(message: [ocr_pipeli...,"Annotation(image_to_text, 0, 1349, \n\nCAP ca...",2,,"[Annotation(token, 3, 5, CAP, Map(sentence -> ...","source file 4_pages_clinical_note.pdf, 426742 ...","[Annotation(named_entity, 3, 5, B-Oncological,...","[Annotation(word_embeddings, 3, 5, CAP, Map(is...","[Annotation(document, 3, 485, CAP cancer templ..."
3,"[Annotation(chunk, 6, 9, Left, Map(entity -> D...","ImageOutput(path: 4_pages_clinical_note.pdf, e...","Annotation(document, 0, 2030, \n\nL. Left obt...",[Light Pipeline Exception(message: [ocr_pipeli...,"Annotation(image_to_text, 0, 2031, \n\nL. Lef...",3,,"[Annotation(token, 3, 3, L, Map(sentence -> 0)...","source file 4_pages_clinical_note.pdf, 426742 ...","[Annotation(named_entity, 3, 3, O, Map(word ->...","[Annotation(word_embeddings, 3, 3, L, Map(isOO...","[Annotation(document, 3, 270, L. Left obturato..."
