# Visual Document NER

In [None]:
import os

# Credentials are taken from the environment when available, so they do not
# have to be pasted into (and accidentally committed with) the notebook.
# To set them by hand, replace the "" fallbacks below.
secret = os.environ.get("SPARK_OCR_SECRET", "")
# NOTE: `license` shadows the interactive `license` builtin; the name is kept
# because the session-start cell reads it.
license = os.environ.get("JSL_OCR_LICENSE", "")
# Passed as `jar_path` to `sparkocr.start` in the session-start cell.
spark_ocr_jar_path = "../../target/scala-2.11"

# Unpacked positionally into `VisualDocumentNer.pretrained(...)`;
# presumably (model name, language, remote location) - confirm against the API.
pretrained_model = ("visual_document_NER_SROIE0518", "en", "public/ocr/models")


In [None]:
#%pip install spark-ocr==3.0.0.spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
#%pip install ../../python/dist/spark-ocr-3.0.0.spark30.tar.gz
#%pip install pyspark==3.0.1

In [None]:
import pyspark
import sparkocr
import json
import os

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
import pyspark.sql.functions as f

from sparkocr.transformers import ImageToHocr, VisualDocumentNer, BinaryToImage
from sparkocr.utils import display_image

## Initialize Spark session

In [None]:
from sparkocr import start
from pyspark import SparkConf

# Publish the license (when one was supplied) through the JSL_OCR_LICENSE
# environment variable before the session is created.
if license:
    os.environ['JSL_OCR_LICENSE'] = license

# Create the session; the trailing bare name renders it as the cell output.
spark = start(
    secret=secret,
    jar_path=spark_ocr_jar_path,
    nlp_version="3.0.0",
)
spark

## Load test images

In [None]:
import pkg_resources

# Resolve the SROIE sample directory from the resources of the installed
# `sparkocr` package.
test_image_path = pkg_resources.resource_filename(
    'sparkocr',
    'resources/ocr/images/SROIE/',
)

# Load every file under that directory through Spark's "binaryFile" source
# and preview the resulting DataFrame.
bin_df = (
    spark.read
    .format("binaryFile")
    .load(test_image_path)
)
bin_df.show()

## Display images

In [None]:
# Run BinaryToImage over the loaded files, pull the "image" column back to
# the driver, and render each image inline.
image_rows = (
    BinaryToImage()
    .transform(bin_df)
    .select("image")
    .collect()
)
for row in image_rows:
    display_image(row.image)

## Define pipeline

In [None]:
# Stage 1: BinaryToImage writes its result to the "image" column.
binary_to_image = BinaryToImage().setOutputCol("image")

# Stage 2: ImageToHocr reads "image" and writes "hocr".
img_to_hocr = (
    ImageToHocr()
    .setInputCol("image")
    .setOutputCol("hocr")
    .setIgnoreResolution(False)
    .setOcrParams(["preserve_interword_spaces=0"])
)

# Stage 3: pretrained VisualDocumentNer reads "hocr"; its label and
# confidence columns are named "label" and "conf".
doc_ner = (
    VisualDocumentNer()
    .pretrained(*pretrained_model)
    .setInputCol("hocr")
    .setLabelCol("label")
    .setConfidenceCol("conf")
)

# Chain the three stages, in order, into a single PipelineModel.
ocr_stages = [binary_to_image, img_to_hocr, doc_ner]
pipeline = PipelineModel(stages=ocr_stages)

## Call pipeline

In [None]:
# Apply the three-stage pipeline to the loaded binary files; `results` is the
# DataFrame consumed by the "Show results" cell below.
results = pipeline.transform(bin_df)

## Show results

In [None]:
# Add a "filename" column holding the last "/"-separated segment of `path`,
# then print the selected columns without truncating cell values.
(
    results
    .withColumn("filename", f.element_at(f.split("path", "/"), -1))
    .select("filename", "word", "token", "label")
    .show(truncate=False)
)