# Visual Document NER v2 FineTune

## Install the spark-ocr Python package

You need to specify either the path to `spark-ocr-assembly-[version].jar` or the secret.

### To simulate the build 

In [None]:
!git clone -b GH337_LayoutLMv2NER_Finetune --recurse-submodules https://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@github.com/JohnSnowLabs/spark-ocr.git

In [None]:
import os

# Fill these placeholders in before running the notebook.
secret = ""
license = ""
# The part of the secret before the first "-" is used as the package version
# (see the commented-out pip install command further down).
version = secret.split("-")[0]
AWS_ACCESS_KEY_ID = ""
AWS_SECRET_ACCESS_KEY = ""
# Directory passed as jar_path when the Spark session is started.
spark_ocr_jar_path = "../../target/scala-2.11"

# Export only the values that were actually filled in. Writing a blank
# placeholder would overwrite a credential that is already configured in
# the environment with an empty string.
for _env_name, _env_value in (
    ("JSL_OCR_LICENSE", license),
    ("AWS_ACCESS_KEY_ID", AWS_ACCESS_KEY_ID),
    ("AWS_SECRET_ACCESS_KEY", AWS_SECRET_ACCESS_KEY),
):
    if _env_value:
        os.environ[_env_name] = _env_value

In [None]:
!pip install -r ../requirements.txt

In [1]:
# !pip install spark-ocr==$version+spark30 --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade
# !pip install ../../python/dist/spark-ocr-3.9.0+spark30.tar.gz

In [None]:
import sys
import os

# Make the Python sources under "spark-ocr/python" (relative to the current
# working directory) importable by appending their absolute path to sys.path.
sparkocr_python_dir = os.path.abspath("spark-ocr/python")
sys.path.append(sparkocr_python_dir)

In [None]:
import pyspark
import sparkocr
import os

from pyspark.sql import SparkSession
from pyspark.ml import PipelineModel
import pyspark.sql.functions as f

from sparkocr.transformers import *
from sparkocr.utils import display_images
from sparkocr.enums import *

from pathlib import Path
import logging
import shutil

### Initialize Spark session

In [None]:
from sparkocr import start
from pyspark import SparkConf

# Start the session through sparkocr's start() helper, passing the jar
# location configured earlier in the notebook.
spark = start(jar_path=spark_ocr_jar_path)

# Bare expression so the notebook renders the session object.
spark

## Load test images

In [None]:
import pkg_resources

# Resolve the 'resources/ocr/forms' directory inside the installed sparkocr
# package.
test_image_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/forms')

# Load everything under that path with Spark's "binaryFile" source and
# preview the resulting DataFrame.
binary_reader = spark.read.format("binaryFile")
bin_df = binary_reader.load(test_image_path)
bin_df.show()

## Display images

In [None]:
# Convert the binary rows with a default BinaryToImage transformer, then
# render the result inline with display_images.
to_image = BinaryToImage()
image_df = to_image.transform(bin_df)
display_images(image_df)

## Prepare Pipeline 

In [None]:
# Stage 1: binary file content -> "image" column, using the 3-byte BGR type.
binary_to_image = (
    BinaryToImage()
    .setOutputCol("image")
    .setImageType(ImageType.TYPE_3BYTE_BGR)
)

# Stage 2: "image" -> "hocr", with the OCR parameter
# preserve_interword_spaces=0 and resolution not ignored.
img_to_hocr = (
    ImageToHocr()
    .setInputCol("image")
    .setOutputCol("hocr")
    .setIgnoreResolution(False)
    .setOcrParams(["preserve_interword_spaces=0"])
)

# Stage 3: "hocr" -> "token".
tokenizer = (
    HocrTokenizer()
    .setInputCol("hocr")
    .setOutputCol("token")
)

# Labels passed to the NER stage's white list.
ner_labels = [
    "other",
    "b-header", "i-header",
    "b-question", "i-question",
    "b-answer", "i-answer",
]

# Stage 4: pretrained "layoutlmv2_funsd" model that reads the "token" and
# "image" columns and writes "entities".
doc_ner = (
    VisualDocumentNerV2()
    .pretrained("layoutlmv2_funsd", "en", "clinical/ocr")
    .setInputCols(["token", "image"])
    .setOutputCol("entities")
    .setWhiteList(ner_labels)
)

# Stage 5: annotate "image" using the "entities" chunks and write the result
# to "image_with_annotations".
draw = (
    ImageDrawAnnotations()
    .setInputCol("image")
    .setInputChunksCol("entities")
    .setOutputCol("image_with_annotations")
    .setFontSize(10)
    .setLineWidth(4)
    .setRectColor(Color.red)
)

# OCR pipeline: the stages above, in execution order.
ocr_stages = [binary_to_image, img_to_hocr, tokenizer, doc_ner, draw]
pipeline = PipelineModel(stages=ocr_stages)

## Prepare Pyspark Dataframe

In [None]:
# Run the pipeline over the binary image DataFrame and cache the output;
# `results` is used again below and later passed to fit().
results = pipeline.transform(bin_df).cache()

# PySpark 2.3 has no element_at, so getItem(size - 1) is invoked to pick the
# last "/"-separated segment of the path (the file name).
path_array = f.split(results['path'], '/')

# From PySpark 2.4 onwards the equivalent is:
# results.withColumn("filename", f.element_at(f.split("path", "/"), -1))

# One row per entity, keeping only the file name and the entity itself.
# The chain is parenthesised so it no longer ends in a dangling backslash
# continuation; as the cell's last expression it is still displayed.
(
    results
    .withColumn('filename', path_array.getItem(f.size(path_array) - 1))
    .withColumn("exploded_entities", f.explode("entities"))
    .select("filename", "exploded_entities")
)

### Define VisualDocumentNerV2 instance object for training

In [None]:
# Rebind doc_ner to a new VisualDocumentNerV2 instance configured for
# training: same input/output columns and label white list as the inference
# stage, plus batch size, shuffling, base model, vocabulary path, device and
# epoch count.
# NOTE(review): the setter names (setbatchSize, setmodelNameOrPath,
# setvocabPath, setcudeDevice, setnumTrainEpochs) are kept exactly as
# written; confirm them against the VisualDocumentNerV2 API of this branch.
training_labels = [
    "other",
    "b-header", "i-header",
    "b-question", "i-question",
    "b-answer", "i-answer",
]

doc_ner = (
    VisualDocumentNerV2()
    .setInputCols(["token", "image"])
    .setOutputCol("entities")
    .setWhiteList(training_labels)
    .setbatchSize(4)
    .setShuffleBatchTraining(True)
    .setmodelNameOrPath("nielsr/layoutlmv2-finetuned-funsd")
    .setvocabPath("sparkocr/resources/models/layoutlm/LayoutLM.v2.voc.txt")
    .setcudeDevice('cuda:0')
    .setnumTrainEpochs(3)
)

### Calling fit method for training

In [None]:
# Fit the training-configured doc_ner on the cached pipeline output and keep
# the returned object for saving.
model_nerv2 = doc_ner.fit(pyspark_dataframe=results)

In [2]:
# Path handed to saveModel(); left blank here, so set it before saving.
model_save_path = ''

In [None]:
# Save the fitted model to model_save_path.
model_nerv2.saveModel(model_path=model_save_path)