# OCR: remove background noise and draw regions
## Install spark-ocr python package
You need to specify either the path to `spark-ocr-assembly-[version].jar` or the `secret`.

In [None]:
# Spark OCR credentials; both are left blank in this notebook.
secret = ""
license = ""

# Directory passed to `start()` as the location of the spark-ocr jar.
spark_ocr_jar_path = "../../target/scala-2.12"

# The package version is whatever precedes the first "-" in the secret.
version = secret.partition("-")[0]

In [None]:
# install from PyPI using secret
#%pip install spark-ocr==$version --extra-index-url=https://pypi.johnsnowlabs.com/$secret --upgrade

In [None]:
# Alternative: install spark-ocr from a locally built distribution archive
# %pip install ../dist/spark-ocr-[version].tar.gz
# termcolor supplies `colored`, which the result cells use to print file names in red
%pip install termcolor

## Initialization of spark session


In [None]:
from pyspark.sql import SparkSession
from sparkocr import start
import os

# Hand the license over through the environment, but only when one is set.
if license:
    os.environ["SPARK_OCR_LICENSE"] = license

# Start the Spark session using the configured secret and jar location.
spark = start(
    jar_path=spark_ocr_jar_path,
    secret=secret,
)
spark  # last expression of the cell: display the session

Spark version: 3.4.1
Spark NLP version: 5.1.2
Spark OCR version: 5.1.0



## Imports

In [None]:
from termcolor import colored

from pyspark.ml import PipelineModel
from pyspark.sql import functions as F

from sparkocr.transformers import *
from sparkocr.enums import *
from sparkocr.utils import display_image
from sparkocr.metrics import score

## Define OCR pipeline

In [None]:
# Stage 1: convert the binary PDF content into page images (resolution 400).
pdf_to_image = (
    PdfToImage()
    .setInputCol("content")
    .setOutputCol("image")
    .setResolution(400)
)

# Stage 2: binarize with adaptive thresholding; the input image column is kept
# so that OCR can also be run on the untouched page later on.
binarizer = (
    ImageAdaptiveThresholding()
    .setInputCol("image")
    .setOutputCol("binarized_image")
    .setBlockSize(91)
    .setOffset(50)
    .setKeepInput(True)
)

# Stage 3: morphological opening with a 3-pixel square kernel.
opening = (
    ImageMorphologyOperation()
    .setKernelShape(KernelShape.SQUARE)
    .setOperation(MorphologyOperationType.OPENING)
    .setKernelSize(3)
    .setInputCol("binarized_image")
    .setOutputCol("opening_image")
)

# Stage 4: remove small objects (minimum object size 130).
remove_objects = (
    ImageRemoveObjects()
    .setInputCol("opening_image")
    .setOutputCol("corrected_image")
    .setMinSizeObject(130)
)

# Stage 5: layout analysis to detect regions on the cleaned image.
# Optional tweak, disabled by default:
#   image_layout_analyzer.setPageSegMode(PageSegmentationMode.SPARSE_TEXT)
image_layout_analyzer = (
    ImageLayoutAnalyzer()
    .setInputCol("corrected_image")
    .setOutputCol("region")
)

# Stage 6: draw the detected regions on top of the cleaned image.
draw_regions = (
    ImageDrawRegions()
    .setInputCol("corrected_image")
    .setInputRegionsCol("region")
    .setOutputCol("image_with_regions")
)

# OCR on the cleaned image, with a positions column and a confidence threshold of 65.
ocr_corrected = (
    ImageToText()
    .setInputCol("corrected_image")
    .setOutputCol("corrected_text")
    .setPositionsCol("corrected_positions")
    .setConfidenceThreshold(65)
)

# OCR on the original, uncleaned image, used as the baseline for comparison.
ocr = (
    ImageToText()
    .setInputCol("image")
    .setOutputCol("text")
)

# Assemble the stages in execution order.
ocr_stages = [
    pdf_to_image,
    binarizer,
    opening,
    remove_objects,
    image_layout_analyzer,
    draw_regions,
    ocr,
    ocr_corrected,
]
pipeline = PipelineModel(stages=ocr_stages)

## Read image with noise

In [None]:
import pkg_resources
# Load the noisy sample PDF as a binary-file DataFrame and cache it for reuse.
imagePath = 'data/pdfs/noised.pdf'
image_df = (
    spark.read
    .format("binaryFile")
    .load(imagePath)
    .cache()
)
image_df.show()

+--------------------+--------------------+-------+--------------------+
|                path|    modificationTime| length|             content|
+--------------------+--------------------+-------+--------------------+
|file:/content/dat...|2023-11-17 17:47:...|2115939|[25 50 44 46 2D 3...|
+--------------------+--------------------+-------+--------------------+



## Run OCR pipelines

In [None]:
# Run every pipeline stage on the loaded document; the result is cached
# because the following cells query it several times.
result = pipeline.transform(image_df).cache()

## Results with original image

In [None]:
# Concatenate the text recognized on the original image for each (file, page) pair.
page_text = F.concat_ws("", F.collect_list("text")).alias("text")
grouped_results = result.groupBy("path", "pagenum").agg(page_text)

for row in grouped_results.collect():
    # File name and page number are highlighted in red.
    header = "Filename:\n%s , page: %d" % (row.path, row.pagenum)
    print(colored(header, "red"))
    print("Recognized text:\n%s" % row.text)

Filename:
file:/content/data/pdfs/noised.pdf , page: 0
Recognized text:
a 1S at ee et ere ee ee
= —.

omit — ee ee

——— | ee ES ee ee oe

 

 

 

 

Sates wos: wots
< ° i se - Date: 7/16/68
- ‘Sample No. _ 5031 * Some S a bel ing

a Original request made by Mr. €. L. Tucker, Jr. ° on | ' 7/10/68

Sample specifications written by

 

 

. , BLEND CASING RECASING
OLD GOLD STRAIGHT Tobacco Blend

Control for Sample No. 5030

 

af Cigarettes:

OLD GOLD STRAIGHT

 

Brand ---------
' Length --------- 85 mm.
Circumference--~ 25.3 mn.
Paper ~---------- Ecusta 556 ;
Fixmness. «-—--.. OLD GOLD STRAIGHT.
Draw -----~-{~--~~ OLD GOLD STRAIGHT
Weight --------- OLD GOLD STRAIGHT
Tipping Paper -- _— _
Print--~~~------~ OLD GOLD STRAIGHT
C - Filter Length--- —=
- =. Requirements:
Laboratory ----- One Tray

Others

 

| ; £
Laborat Analysis: ° ale
ory Analysis:
) Ee
Tars and Nicotine, Taste Panel
Benzo (A) Pyrene Analyses —
Responsibility:

 

Tobacco Blend
Filter Production---
Making & Packing -~-- 

## Results with corrected image

In [None]:
# Concatenate the text recognized on the cleaned image for each (file, page) pair.
page_corrected_text = F.concat_ws("", F.collect_list("corrected_text")).alias("corrected_text")
grouped_results = result.groupBy("path", "pagenum").agg(page_corrected_text)

for row in grouped_results.collect():
    # File name and page number are highlighted in red.
    header = "Filename:\n%s , page: %d" % (row.path, row.pagenum)
    print(colored(header, "red"))
    print("Recognized text:\n%s" % row.corrected_text)

Filename:
file:/content/data/pdfs/noised.pdf , page: 0
Recognized text:
 

 

 

 

 

2 7 Date: 7/16/68
Sample No. —_ 5031 >
* Oraginal request made by Mr. €. L Tucker, Jr. on ‘7/10/68
Sample specifications written by John H. M. Bohlken
, BLEND CASING RECASING FINAL FLAVOR MENTHOL FLAVOR

 

 

 

OLD GOLD STRAIGHT Tobacco Blend

Control for Sample No. 5030

Cigarettes:

 

 

 

Brand -+--------- OLD GOLD STRAIGHT
Length --------- 85 mm.
Circumference~~-~ 25.3 mm.
Paper eee me meme Ecusta 556
Firymness -----—-. OLD GOLD STRAIGHT
Draw -----~~.~——. OLD GOLD STRAIGHT
Weaght mee eee OLD GOLD STRAIGHT Wrappings ‘
Tipping Paper -- --
Prant-~---------- OLD GOLD STRAIGHT pene “777 OLD GOLD STRAIGHT
Filter Length--- __ sures--- Standard Blue
~ Tear Tape-- Gold
. Cartons --- OLD GOLD STRAIGHT
Requirements Markings-- Sample number on each

Laboratory ----- One Tray . pack and carton
Others ---~--~-----

 

Tars and Nicotine, Taste Panel, Burning Time, Gas Phase Analysis,
Benzo (A) Pyrene Analys

## ABBYY Lingvo results

In [None]:
# Transcription of the same page labelled "ABBYY Lingvo results" in this notebook,
# stored as a literal string. No later cell shown here references it.
# The backslash in the "Cigarettes" line is written as "\\" so the literal has no
# invalid escape sequence (a SyntaxWarning on Python 3.12+); the value is unchanged.
abbyy = """-----
% Date: 7/16/68
X*: I; * • ■ Sample No. 5031___ — .*
•* Original request made by _____Mr. C. L. Tucker, Jr. on
Sample specifications written by
BLEND CASING RECASING
OLD GOLD STRAIGHT Tobacco Blend
Control for Sample No. 5030
John H. M. Bohlken
FINAL FLAVOR
) 7/10/68
MENTHOL FLAVOR
• Cigarettes; * . .v\\ . /,*, *, S •
Brand --------- OLD GOLD STRAIGHT -V . ••••
; . L e n g t h ------- — 85 mm. . : '
Circumference-- 25.3 mm. • ' *;. • •
P a p e r ---------- Ecusta 556 • * .
F i r m n e s s---- —— OLD GOLD STRAIGHT . ! •■'
D r a w ___________ OLD GOLD STRAIGHT
W e i g h t --------- 0LD GOLD STRAIGHT Wrappings: « -
Tipping Paper — — *
p H n f —. — — _ _ ~ L a b e l s ----OLD GOLD STRAIGHT
( • Filter Length-- . — Closures--- Standard Blue .
^ ^ ; • Tear Tape— Gold
Cartons --- OLD GOLD STRAIGHT
s Requirements: . - •' • Markings-- Sample number on each
• pack and carton Laboratory----- One Tray .
O t h e r s --------- * , s • • . 4
Laboratory A n a l ysis^ I " '/***• * 7 ' ^ ^
Tars and Nicotine, Taste Panel, Burning Time, Gas Phase Analysis,
Benzo (A) Pyrene Analyses — J-ZZ-Zf'- (£. / •
Responsibility;
Tobacco B l e n d ------Manufacturing - A. Kraus . . * -
Filter Production--- —
• Making & P a c k i n g---Product Development , John H. M. Bohlken
Shipping -----------
Reports:
t
Written by — John H. M. Bohlken
Original to - Mr. C. L. Tucker, Jr.
Copies t o ---Dr. A. W. Spears
• 9 ..
"""

## Display original and corrected images with regions

In [None]:
# Collect one row per distinct (path, original image, annotated image) combination.
image_rows = (
    result
    .select("path", "image", "image_with_regions")
    .distinct()
    .collect()
)

# Show the original page first, then the cleaned page with the detected regions drawn on it.
for image_row in image_rows:
    print("Original: %s" % image_row.path)
    display_image(image_row.image)
    print("Corrected: %s" % image_row.path)
    display_image(image_row.image_with_regions)