# Example of using LightPipelines on Spark OCR
## Install the spark-ocr Python package
Specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [1]:
# Credentials: fill these in before running the notebook.
secret = ""
license = ""

# The library version is whatever precedes the first "-" in the secret.
version = secret.partition("-")[0]

# Directory that holds the locally built spark-ocr assembly jar.
spark_ocr_jar_path = "../../target/scala-2.12"

In [2]:
%%bash
# Nothing to do outside Google Colab: leave quietly when the import probe fails.
python -c 'import google.colab' > /dev/null 2>&1 || exit 0

echo "Run on Google Colab!"
echo "Install Open JDK"
# Quiet, non-interactive install of the headless OpenJDK 8 package.
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
java -version

In [3]:
import os
import sys

# Colab only: point JAVA_HOME at the OpenJDK 8 install and put its bin/
# directory at the front of PATH.
if 'google.colab' in sys.modules:
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  os.environ["PATH"] = f'{os.environ["JAVA_HOME"]}/bin:{os.environ["PATH"]}'

## Initialization of the Spark session

In [4]:
from pyspark.sql import SparkSession
from sparkocr import start

# Export the OCR license only when one was provided in the first cell.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Keep a secret that was configured in the first cell. The value below is a
# placeholder showing the expected "<version>-<token>" shape, not a real
# credential, and is used only when no secret was set.
if not secret:
    secret = '4.2.1-XYZXYZXYZXYZXYZXYZXYZXYZXYZ'

# Take the Spark NLP version from the secret prefix so the two cannot drift.
nlp_version = secret.split("-")[0]

spark = start(jar_path=spark_ocr_jar_path, nlp_secret=secret, nlp_jsl=True, nlp_version=nlp_version)
spark

Spark version: 3.2.1
Spark NLP version: 4.2.1
Spark NLP for Healthcare version: 4.2.1
Spark OCR version: 4.2.1rc1



In [5]:
from pyspark.ml import PipelineModel
from pyspark.sql.functions import *
from sparkocr.transformers import *

In [6]:
import pkg_resources

# Resolve the sample folder bundled in the sparkocr package to a filesystem path.
pdfs_path = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/pdfs/four_pages_clinical_note',
)


In [7]:
# Display the resolved sample path.
pdfs_path

'/home/jose/.local/lib/python3.7/site-packages/sparkocr/resources/ocr/pdfs/four_pages_clinical_note'

## Define OCR pipeline

In [8]:
# OCR stage: reads the "image" column and writes its result to "text".
ocr = ImageToText()
ocr.setConfidenceThreshold(60)
ocr.setOutputCol("text")
ocr.setInputCol("image")

# PDF stage: its "image" output column is what the OCR stage consumes.
pdf_to_image = PdfToImage()
pdf_to_image.setOutputCol("image")

# Assemble both stages, PDF conversion first, then OCR.
pipeline = PipelineModel(stages=[pdf_to_image, ocr])

## Create LightPipeline

In [9]:
from sparkocr.base import LightPipeline

In [10]:
# Wrap the OCR-only pipeline defined above in a LightPipeline.
lp = LightPipeline(pipeline)

In [11]:
%%time
# Run the LightPipeline on the local path held in pdfs_path and time the call.
lp.fromLocalPath(pdfs_path)

CPU times: user 9.93 ms, sys: 2.23 ms, total: 12.2 ms
Wall time: 5.67 s


[{'text': [Annotation(ImageToText, 0, 1752, Result Information
   
    
   
    
   
   Status Provider Status
   Final result iy Reviewed
   Entry Date
   
    
   
   Component Results
   
   Component Lab
   Surgical Pathology
   
   Patient Nam
   MR#: JCD-O-3
   Specime
   Ca rCtnome unshelia| NOS
   
   Final Diagnosis ZBIAO/B3
   
   A. Right ureter, biopsy: x
   -Segment of benign ureter, negative for tumor Sitdez Lb lad cNer ,wall, m5
   -Frozen section diagnosis confirmed
   
   Cw7.4
   
   B. Left ureter, biopsy: B27 IU
   -Segment of benign ureter, negative for tumor #0
   -Frozen section diagnosis confirmed
   
   C. Left external and internal lymph nodes, resection:
   ~Benign fibrovascular tissue, negative for tumor
   -No lymph nodes identified
   
   D. Left internal and external iliac lymph node, resection:
   -Single lymph node with metastatic urothelial carcinoma, 1.8 cm largest diameter
   (1/1)
   
   E. Left internal and external iliac lymph nodes, resection:
  

## Integration with Spark-NLP pipelines

In [12]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.annotator import *
from sparknlp_jsl.annotator import *

# Turn the OCR "text" column into a "document" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence"
sentenceDetector = (
    SentenceDetector()
    .setInputCols("document")
    .setOutputCol("sentence")
)

# "sentence" -> "token"
tokenizer = (
    Tokenizer()
    .setInputCols("sentence")
    .setOutputCol("token")
)

# Pretrained clinical word embeddings computed from sentences and tokens.
embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical","en","clinical/models")
    .setInputCols(["sentence","token"])
    .setOutputCol("embeddings")
)

# Pretrained "ner_jsl" model; writes its tags to "ner".
ner = (
    MedicalNerModel.pretrained("ner_jsl", "en","clinical/models")
    .setInputCols(["sentence","token","embeddings"])
    .setOutputCol("ner")
)

# Convert the "ner" tags into the "entities" column.
converter = (
    NerConverter()
    .setInputCols("sentence", "token", "ner")
    .setOutputCol("entities")
)


embeddings_clinical download started this may take some time.
Approximate size to download 1.6 GB
[OK!]
ner_jsl download started this may take some time.
[OK!]


## Mixed Pipeline

In [13]:
# Mixed pipeline: the two OCR stages followed by the Spark NLP annotators.
from pyspark.ml import Pipeline, PipelineModel

# One placeholder row (empty bytes, empty path) with the "content"/"path"
# columns, used only as the input to fit().
empty_data = spark.createDataFrame([[b'', ""]]).toDF("content", "path")

pipeline = Pipeline(
    stages=[
        pdf_to_image,
        ocr,
        documentAssembler,
        sentenceDetector,
        tokenizer,
        embeddings,
        ner,
        converter,
    ]
).fit(empty_data)

# Rebind lp so the following cells run the fitted mixed pipeline.
lp = LightPipeline(pipeline)

In [14]:
# Run the mixed LightPipeline on the local path held in pdfs_path and keep the output.
result = lp.fromLocalPath(pdfs_path)

In [15]:
import pandas as pd
import itertools as it

# Tabulate the annotations of the first element of the result.
single_page = result[0]

# One row per entity. The text and sentence annotations are cycled to fill the
# other two columns, so the sentence shown in a row is simply the next one in
# the cycle, not necessarily the sentence that contains that entity.
page_rows = zip(
    it.cycle(single_page["text"]),
    it.cycle(single_page["sentence"]),
    single_page["entities"],
)
p_df = pd.DataFrame(list(page_rows), columns=["Text", "Sentence", "Entities"])

p_df

Unnamed: 0,Text,Sentence,Entities
0,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 0, 1267, Result Informati...","Annotation(chunk, 230, 234, Right, Map(entity ..."
1,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1269, 1624, Right obturat...","Annotation(chunk, 236, 241, ureter, Map(entity..."
2,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1626, 1715, involving bla...","Annotation(chunk, 244, 249, biopsy, Map(entity..."
3,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1717, 1718, 8., Map(sente...","Annotation(chunk, 266, 278, benign ureter, Map..."
4,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 0, 1267, Result Informati...","Annotation(chunk, 294, 298, tumor, Map(entity ..."
5,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1269, 1624, Right obturat...","Annotation(chunk, 320, 323, wall, Map(entity -..."
6,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1626, 1715, involving bla...","Annotation(chunk, 330, 353, Frozen section dia..."
7,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1717, 1718, 8., Map(sente...","Annotation(chunk, 376, 379, Left, Map(entity -..."
8,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 0, 1267, Result Informati...","Annotation(chunk, 381, 386, ureter, Map(entity..."
9,"Annotation(ImageToText, 0, 1752, Result Inform...","Annotation(document, 1269, 1624, Right obturat...","Annotation(chunk, 389, 394, biopsy, Map(entity..."


## Try a single file

In [17]:
# Same LightPipeline, this time given the path of one PDF file.
lp.fromLocalPath("./pdfs/a68.pdf")

[{'entities': [Annotation(chunk, 92, 123, Component Lab
   Surgical Pathology, Map(entity -> Clinical_Dept, sentence -> 0, chunk -> 0, confidence -> 0.63225)),
   Annotation(chunk, 213, 217, Right, Map(entity -> Direction, sentence -> 1, chunk -> 1, confidence -> 0.9391)),
   Annotation(chunk, 219, 224, ureter, Map(entity -> Internal_organ_or_component, sentence -> 1, chunk -> 2, confidence -> 0.7579)),
   Annotation(chunk, 227, 232, biopsy, Map(entity -> Procedure, sentence -> 1, chunk -> 3, confidence -> 0.9858)),
   Annotation(chunk, 239, 262, Segment of benign ureter, Map(entity -> Internal_organ_or_component, sentence -> 1, chunk -> 4, confidence -> 0.34007502)),
   Annotation(chunk, 278, 282, tumor, Map(entity -> Oncological, sentence -> 1, chunk -> 5, confidence -> 0.8625)),
   Annotation(chunk, 284, 299, Sitez L lacs Aer, Map(entity -> Medical_Device, sentence -> 1, chunk -> 6, confidence -> 0.21432501)),
   Annotation(chunk, 311, 334, Frozen section diagnosis, Map(entity -> Te