![JohnSnowLabs](https://nlp.johnsnowlabs.com/assets/images/logo.png)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp-workshop/blob/master/visual-nlp/1.1.Text_recognition.ipynb)

## Blogposts and videos

- [Text Detection in Spark OCR](https://medium.com/spark-nlp/text-detection-in-spark-ocr-dcd8002bdc97)

- [Table Detection & Extraction in Spark OCR](https://medium.com/spark-nlp/table-detection-extraction-in-spark-ocr-50765c6cedc9)

- [Extract Tabular Data from PDF in Spark OCR](https://medium.com/spark-nlp/extract-tabular-data-from-pdf-in-spark-ocr-b02136bc0fcb)

- [Signature Detection in Spark OCR](https://medium.com/spark-nlp/signature-detection-in-spark-ocr-32f9e6f91e3c)

- [GPU image pre-processing in Spark OCR](https://medium.com/spark-nlp/gpu-image-pre-processing-in-spark-ocr-3-1-0-6fc27560a9bb)

- [How to Setup Spark OCR on UBUNTU - Video](https://www.youtube.com/watch?v=cmt4WIcL0nI)


**More examples here**

https://github.com/JohnSnowLabs/spark-ocr-workshop

To get a trial license, please go to:

https://www.johnsnowlabs.com/install/

### Colab Setup

In [None]:
# Install the johnsnowlabs library to access Spark-OCR and Spark-NLP for Healthcare, Finance, and Legal.
!pip install -q johnsnowlabs 

In [None]:
# Colab-only module used here to upload files from the local machine
from google.colab import files
print('Please Upload your John Snow Labs License using the button below')
# The return value of the upload call is kept in license_keys
license_keys = files.upload()

In [None]:
from johnsnowlabs import nlp, visual

# After uploading your license, run this to install all licensed Python wheels
# and pre-download the jars for the Spark session JVM
nlp.install(refresh_install=True, visual=True)

In [None]:
from johnsnowlabs import nlp, visual
import pandas as pd

# Automatically load the license data and start a session with all the jars
# the user has access to; the session object is used as `spark` in later cells
spark = nlp.start(visual=True)

In [None]:
import pkg_resources

from pyspark.ml import PipelineModel
from pyspark.sql import functions as F

## Image to Text

In [None]:
# Resolve the path of the sample check image inside the sparkocr package resources
image_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/images/check.jpg')
# Read the file with the "binaryFile" source and cache the DataFrame for the cells below
image_example_df = spark.read.format("binaryFile").load(image_path).cache()

# Convert the binary content with BinaryToImage and display the result
visual.display_images(visual.BinaryToImage().transform(image_example_df))

In [None]:
# BinaryToImage stage: reads the "content" column, writes the "image" column
binary_to_image = (
    visual.BinaryToImage()
    .setInputCol("content")
    .setOutputCol("image")
)

# OCR stage: reads the "image" column, writes the "text" column
ocr = (
    visual.ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setConfidenceThreshold(65)
    # .setKeepLayout(True)  # to preserve the layout of the input
)

# Chain both stages into a single ready-to-use pipeline
image_to_text_pipeline = PipelineModel(stages=[binary_to_image, ocr])

In [None]:
# Run the image-to-text pipeline on the sample image and cache the output
result = image_to_text_pipeline.transform(image_example_df).cache()
# Show the page number, recognized text and confidence columns
result.select("pagenum", "text", "confidence").show()

# Print the schema of the pipeline output
result.printSchema()

In [None]:
# Print the recognized text of every row, one row per line
print("\n".join(row.text for row in result.select("text").collect()))

## Image to HOCR

In [None]:
def pipeline():
    """Build the image-to-HOCR pipeline that also draws tokens on the image.

    Stages, in order:
      1. BinaryToImage: "content" -> "image" (image type TYPE_3BYTE_BGR).
      2. ImageToHocr: "image" -> "hocr".
      3. HocrDocumentAssembler: "hocr" -> "document".
      4. HocrTokenizer: "hocr" -> "token".
      5. ImageDrawAnnotations: draws the "token" chunks on "image" and
         writes the result to "image_with_annotations".

    Returns:
        PipelineModel: the assembled pipeline, ready for ``.transform(df)``.
    """
    # Convert the binary file content to an image
    binary_to_image = (
        visual.BinaryToImage()
        .setInputCol("content")
        .setOutputCol("image")
        .setImageType(visual.ImageType.TYPE_3BYTE_BGR)
    )

    # Run OCR and produce HOCR output
    ocr = (
        visual.ImageToHocr()
        .setInputCol("image")
        .setOutputCol("hocr")
        .setIgnoreResolution(False)
    )

    # Assemble a document from the HOCR output
    document_assembler = (
        visual.HocrDocumentAssembler()
        .setInputCol("hocr")
        .setOutputCol("document")
    )

    # Split the HOCR output into tokens
    tokenizer = (
        visual.HocrTokenizer()
        .setInputCol("hocr")
        .setOutputCol("token")
    )

    # Draw the tokens as red, unfilled rectangles on the image
    draw_annotations = (
        visual.ImageDrawAnnotations()
        .setInputCol("image")
        .setInputChunksCol("token")
        .setOutputCol("image_with_annotations")
        .setFilledRect(False)
        .setFontSize(10)
        .setRectColor(visual.Color.red)
    )

    # Returned directly: a local named `pipeline` would shadow this function
    return PipelineModel(stages=[
        binary_to_image,
        ocr,
        document_assembler,
        tokenizer,
        draw_annotations,
    ])

### Run pipeline and show results

In [None]:
# Build the HOCR pipeline, run it on the sample image and cache the output
result = pipeline().transform(image_example_df).cache()
# Print the "hocr" value of the first collected row
print(result.select("hocr").collect()[0].hocr)

### Display recognized text on original image

In [None]:
# Display the "image_with_annotations" column written by the ImageDrawAnnotations stage
visual.display_images(result, "image_with_annotations", width=1000)

## Pdf to Text

In [None]:
# Resolve the path of the sample PDF inside the sparkocr package resources
pdf_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/test_document.pdf')
# Read the file with the "binaryFile" source and cache the DataFrame for the cells below
pdf_example_df = spark.read.format("binaryFile").load(pdf_path).cache()

# Display the PDF in the notebook
visual.display_pdf(pdf_example_df)

In [None]:
# Transform the PDF document to images per page
pdf_to_image = (
    visual.PdfToImage()
    .setInputCol("content")
    .setOutputCol("image")
)

# Run OCR: reads the "image" column, writes the "text" column
ocr = (
    visual.ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
    .setConfidenceThreshold(65)
)

# Chain both stages into a single ready-to-use pipeline
pdf_to_text_pipeline = PipelineModel(stages=[pdf_to_image, ocr])

In [None]:
# Run the PDF-to-text pipeline on the sample PDF and cache the output
result = pdf_to_text_pipeline.transform(pdf_example_df).cache()
# Show the page number, recognized text and confidence columns
result.select("pagenum", "text", "confidence").show()

In [None]:
# Print the recognized text of every row, one row per line
print("\n".join(row.text for row in result.select("text").collect()))

## Skew correction

In [None]:
# Resolve the path of the rotated sample PDF inside the sparkocr package resources
pdf_path = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/rotated/400/400_rot.pdf')
# Read the file with the "binaryFile" source and cache the DataFrame for the cells below
pdf_rotated_df = spark.read.format("binaryFile").load(pdf_path).cache()

# Display the rotated PDF in the notebook
visual.display_pdf(pdf_rotated_df)

In [None]:
def ocr_pipeline(skew_correction=False):
    """Build a PDF OCR pipeline that includes an ImageSkewCorrector stage.

    Args:
        skew_correction: value forwarded to
            ``ImageSkewCorrector.setAutomaticSkewCorrection``.

    Returns:
        PipelineModel with the stages PdfToImage -> ImageSkewCorrector ->
        ImageToText. The OCR stage reads "corrected_image" and writes "text".
    """
    # Transform the PDF document to images per page
    to_image = (
        visual.PdfToImage()
        .setInputCol("content")
        .setOutputCol("image")
    )

    # Image skew corrector: "image" -> "corrected_image"
    deskew = (
        visual.ImageSkewCorrector()
        .setInputCol("image")
        .setOutputCol("corrected_image")
        .setAutomaticSkewCorrection(skew_correction)
    )

    # Run OCR on the output of the skew corrector
    recognizer = (
        visual.ImageToText()
        .setInputCol("corrected_image")
        .setOutputCol("text")
    )

    return PipelineModel(stages=[to_image, deskew, recognizer])

In [None]:
# OCR the rotated PDF with automatic skew correction switched off
result = ocr_pipeline(False).transform(pdf_rotated_df).cache()
print("\n".join(row.text for row in result.select("text").collect()))

In [None]:
# OCR the rotated PDF again, this time with automatic skew correction switched on
corrected_result = ocr_pipeline(True).transform(pdf_rotated_df).cache()
print("\n".join(row.text for row in corrected_result.select("text").collect()))

In [None]:
# Display the "corrected_image" column written by the ImageSkewCorrector stage
visual.display_images(corrected_result, "corrected_image")

### Calculate scores for showing improvement

In [None]:
# Join the recognized text of every row, without and with skew correction.
# Only the "text" column is selected so the other columns (including the
# images) are not collected to the driver.
detected = "\n".join(row.text for row in result.select("text").collect())
corrected_detected = "\n".join(
    row.text for row in corrected_result.select("text").collect()
)

# Read the original text from the sparkocr package resources
path_to_pdf_rotated_text = pkg_resources.resource_filename('sparkocr', 'resources/ocr/pdfs/rotated/400.txt')

# Context manager so the file handle is closed as soon as the text is read
with open(path_to_pdf_rotated_text, "r") as text_file:
    pdf_rotated_text = text_file.read()

# Compute scores against the original text
detected_score = visual.score(pdf_rotated_text, detected)
corrected_score = visual.score(pdf_rotated_text, corrected_detected)

# Print scores
print(f"Score without skew correction: {detected_score}")
print(f"Score with skew correction: {corrected_score}")

## Image Text Cleaner

In [None]:
!wget -q https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jupyter/data/pdfs/noised.pdf

In [None]:
# Read the downloaded noised.pdf with the "binaryFile" source and cache the DataFrame
pdf_noised_df = spark.read.format("binaryFile").load('noised.pdf').cache()

# Display the noisy PDF in the notebook
visual.display_pdf(pdf_noised_df)

In [None]:
pdf_to_image = visual.PdfToImage() \
    .setInputCol("content") \
    .setOutputCol("image") \
    .setResolution(150)

ocr = visual.ImageToText() \
    .setInputCol("image") \
    .setOutputCol("text") \
    .setConfidenceThreshold(70) \
    .setIgnoreResolution(False)

cleaner = visual.ImageTextCleaner \
    .pretrained("text_cleaner_v1", "en", "clinical/ocr") \
    .setInputCol("image") \
    .setOutputCol("corrected_image") \
    .setMedianBlur(0) \
    .setSizeThreshold(10) \
    .setTextThreshold(0.3) \
    .setLinkThreshold(0.2) \
    .setPadding(5) \
    .setBinarize(False)

ocr_corrected = visual.ImageToText() \
    .setInputCol("corrected_image") \
    .setOutputCol("corrected_text") \
    .setConfidenceThreshold(70) \
    .setIgnoreResolution(False)

pipeline = PipelineModel(stages=[
    pdf_to_image,
    ocr,
    cleaner,
    ocr_corrected
])

In [None]:
results = pipeline.transform(pdf_noised_df).cache()

print(f"Detected text:\n{results.select('text').collect()[0].text}")

In [None]:
# Display the "corrected_image" column written by the ImageTextCleaner stage
visual.display_images(results, "corrected_image")

In [None]:
# Print the "corrected_text" value (OCR of the cleaned image) of the first collected row
print(f"Detected text from corrected image:\n{results.select('corrected_text').collect()[0].corrected_text}")

In [None]:
# Display the "image" and "corrected_image" columns together for comparison
# (presumably side by side, going by the function name)
visual.display_images_horizontal(results, "image,corrected_image")