In [1]:
import os
# Tell Tesseract where its language data lives. An already-configured
# TESSDATA_PREFIX takes precedence; the Windows install path below is only a
# fallback, so the notebook does not clobber a working setup on other machines.
os.environ.setdefault("TESSDATA_PREFIX", "C:/Program Files/Tesseract-OCR/tessdata")

In [2]:
import fitz

## Image preprocessing for OCR


In [3]:
import tempfile

import cv2
import numpy as np
from PIL import Image

# Reference width in pixels: set_image_dpi enlarges an image by the integer
# factor int(IMAGE_SIZE / width), and never by less than 1.
IMAGE_SIZE = 1800
# Fixed cut-off passed to cv2.threshold in image_smoothening.
# NOTE(review): the name is misspelled ("THREHOLD"); it is left unchanged here
# because image_smoothening refers to it by this exact name.
BINARY_THREHOLD = 180

def process_image_for_ocr(file_path):
    """Prepare an image file for OCR and return the cleaned-up image.

    The image is first upscaled and written to a temporary JPEG by
    ``set_image_dpi``; that file is then binarised and smoothed by
    ``remove_noise_and_smooth``.

    Args:
        file_path: Path of the source image.

    Returns:
        The image array produced by ``remove_noise_and_smooth``.
    """
    # TODO : Implement using opencv
    temp_filename = set_image_dpi(file_path)
    try:
        return remove_noise_and_smooth(temp_filename)
    finally:
        # set_image_dpi creates its file with delete=False, so nothing else
        # cleans it up. Removal is best-effort: a failure to delete must not
        # mask the result (or the real error) of the preprocessing itself.
        try:
            os.remove(temp_filename)
        except OSError:
            pass

def set_image_dpi(file_path):
    """Upscale an image and save it as a temporary JPEG tagged with 300 DPI.

    The image is enlarged by the integer factor ``int(IMAGE_SIZE / width)``,
    with a minimum of 1, so images are never shrunk.

    Args:
        file_path: Path of the source image, as accepted by ``Image.open``.

    Returns:
        str: Path of the temporary ``.jpg`` file. The file is created with
        ``delete=False``; the caller is responsible for removing it.
    """
    # Close the source image as soon as the resized copy exists.
    with Image.open(file_path) as im:
        # JPEG cannot store alpha or palette data; without this conversion
        # saving such images raises an OSError.
        source = im.convert("RGB") if im.mode in ("RGBA", "LA", "P") else im
        length_x, width_y = source.size
        factor = max(1, int(IMAGE_SIZE / length_x))
        size = factor * length_x, factor * width_y
        im_resized = source.resize(size, Image.LANCZOS)

    # Only the unique name is needed. Closing the handle immediately avoids
    # leaking it and lets Pillow reopen the path by name on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
        temp_filename = temp_file.name

    im_resized.save(temp_filename, dpi=(300, 300))
    return temp_filename

def image_smoothening(img):
    """Binarise ``img`` with a fixed threshold, then Otsu, blur, and Otsu again.

    Args:
        img: Image array accepted by ``cv2.threshold``.

    Returns:
        The thresholded image from the final Otsu pass.
    """
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    # Pass 1: fixed global cut-off.
    _, fixed_binary = cv2.threshold(img, BINARY_THREHOLD, 255, cv2.THRESH_BINARY)
    # Pass 2: Otsu picks its own threshold (the 0 is ignored by OpenCV here).
    _, otsu_binary_img = cv2.threshold(fixed_binary, 0, 255, otsu_binary)
    # Gaussian blur with a 1x1 kernel, followed by a final Otsu pass.
    blurred = cv2.GaussianBlur(otsu_binary_img, (1, 1), 0)
    return cv2.threshold(blurred, 0, 255, otsu_binary)[1]

def remove_noise_and_smooth(file_name):
    """Load an image as grayscale and return a binarised, smoothed version.

    Two binarisations of the same image are OR-ed together: a mean adaptive
    threshold (block size 41, constant 3) followed by morphological
    open/close, and the output of ``image_smoothening``.

    Args:
        file_name: Path of the image file to read.

    Returns:
        The combined binary image array.

    Raises:
        OSError: If OpenCV cannot read ``file_name``.
    """
    img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than an AttributeError below.
        raise OSError(f"Could not read image file: {file_name!r}")

    filtered = cv2.adaptiveThreshold(
        img.astype(np.uint8), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 41, 3
    )
    # NOTE(review): with a 1x1 structuring element, opening and closing leave
    # the image effectively unchanged; a larger kernel may have been intended.
    kernel = np.ones((1, 1), np.uint8)
    opening = cv2.morphologyEx(filtered, cv2.MORPH_OPEN, kernel)
    closing = cv2.morphologyEx(opening, cv2.MORPH_CLOSE, kernel)
    img = image_smoothening(img)
    or_image = cv2.bitwise_or(img, closing)
    return or_image


In [4]:
# Preprocess the handwritten sample, then write the result to disk.
# cv2.imwrite stays the last expression so the cell still shows its status.
preprocessed_handwritten = process_image_for_ocr('images/handwritten.jpg')
cv2.imwrite('images/preprocessed_handwritten.jpg', preprocessed_handwritten)


True

## OCR


In [5]:
# Image that the OCR and question-answering cells below operate on.
path = 'images/Template1_Instance0.jpg'
# Alternative sample input, left disabled:
# path = 'images/rvKPY.jpg'

In [6]:
import easyocr
# Create the English EasyOCR reader. This loads the model into memory, so it
# only needs to run once per kernel session.
reader = easyocr.Reader(['en'])
# Run EasyOCR on the sample image.
# NOTE(review): `result` is not read by any of the cells shown here.
result = reader.readtext(path)

  from .autonotebook import tqdm as notebook_tqdm
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [7]:
# Open the sample image as a PyMuPDF document and take its first page.
doc = fitz.open(path)
page = doc.load_page(0)

In [8]:
# Build the TextPage by running OCR over the page; this "full" run uses
# full=True at dpi=300.
# NOTE(review): full=True presumably OCRs the whole rendered page rather than
# only embedded images -- confirm against the PyMuPDF docs.
full_tp = page.get_textpage_ocr(flags=0, dpi=300, full=True)

# Print the plain text extracted from the OCR'd TextPage.
print(page.get_text(textpage=full_tp))

TAX INVOICE
Date: 20-Mar-2008
Due Date
: 16-Oct-2016
PO Number :35
Address:16424 Timothy Mission
Bill to:Denise Perez
Markville, AK 58294 US
16424 Timothy Mission
Markville, AK 58294 US
Email:melvindO@example.net
Tal:+(352)259-8443
www.ThompsonandSons.org
Email:melvind0@example.net
(GSTIN: 12345670 00070007
Site:http://smith_org/
GSTIN: OG@AAMFCO376K124
a
7
Pvc.
CT
Ste
Total in words: seven hundred and thirt-
;
y-four point three three
SUB_TOTAL : 725.30 EUR
Bank Name
State Bank of California
DISCOUNT(1.85%): (-) 13.42
Branch Name
Raf CAMP
TAX:VAT (3.88%): 28.18 EUR
Bank Account Number 11695435
Bank Swift Code
SBININBB250
TOTAL : 734.33 EUR
Note:
This order is shipped through blue dart courier



In [9]:
# Same OCR call with full=False and no explicit dpi (the library default is
# used), for comparison with the full=True run.
partial_tp = page.get_textpage_ocr(full=False, flags=0)

# Print the extracted text. No `sort` argument is passed to get_text here, so
# no explicit vertical/horizontal sorting is requested.
print(page.get_text(textpage=partial_tp))

TAX INVOICE
Hels
Es SR
Due Date: 18-0ct2016
Po Manor
8
Asdress:16424 Timothy Mission
bil to:Dense Perez
fare
A S204 US
$424 Toth scion
Molle
ar eeese US
Email:melvin40@example.net
Tel:+(352)259-8443
cutee ooo
eS
bensikinn
UST abasoT0 00070005
Shorntpatomitergy
GSTIN: OG@AAMFCO376K124
a
|
a
CT |
2
|
|
a
TT |
Teal
esate ro anata
Teel ocean
suB_TOTAL: 72530 EUR
ploy potttiree
tee of Calforola
DISCOUNTIL ASN) () 1342
mates
etcame
Peruri ore
Bani Aecour Number 11895435
Bonk Suit Code
SSMNNBDISO
TOTAL: 73439 EUR
note
Ts oiesete ened evn bles aarrcoae



In [10]:
def normalize_bbox(bbox, width, height):
    """Scale an (x0, y0, x1, y1) box from page coordinates to a 0-1000 grid.

    LayoutLM-family models expect integer box coordinates normalised to
    0-1000 regardless of the page size.

    Args:
        bbox: Four numbers (x0, y0, x1, y1) in page coordinates.
        width: Page width in the same units as ``bbox``.
        height: Page height in the same units as ``bbox``.

    Returns:
        list[int]: The scaled box, each value clamped to [0, 1000].
    """
    x0, y0, x1, y1 = bbox
    scaled = (1000 * x0 / width, 1000 * y0 / height, 1000 * x1 / width, 1000 * y1 / height)
    # Clamp because OCR boxes can overshoot the page edge by a fraction.
    return [min(1000, max(0, int(value))) for value in scaled]


page_width, page_height = page.rect.width, page.rect.height

# Parallel lists: data['words'][i] is the text of a span, data['bboxes'][i]
# its normalised bounding box.
data = {'words': [], 'bboxes': []}

for block in page.get_text("dict", textpage=full_tp)["blocks"]:
    # Blocks without a "lines" entry (e.g. image blocks) carry no text; skip
    # them instead of raising KeyError.
    for line in block.get("lines", []):
        for span in line["spans"]:
            data['words'].append(span['text'])
            data["bboxes"].append(normalize_bbox(span['bbox'], page_width, page_height))


# LayoutLM for question answering


In [11]:
from transformers import AutoTokenizer, LayoutLMForQuestionAnswering
from datasets import load_dataset
import torch

tokenizer = AutoTokenizer.from_pretrained("impira/layoutlm-document-qa", add_prefix_space=True)
model = LayoutLMForQuestionAnswering.from_pretrained("impira/layoutlm-document-qa", revision="1e3ebac")

# dataset = load_dataset("nielsr/funsd", split="train")
# example = dataset[0]
question = "What's the note?"
example = data
words = example["words"]
boxes = example["bboxes"]

# Encode the question (sequence 0) and the document words (sequence 1) as a pair.
encoding = tokenizer(
    question.split(), words, is_split_into_words=True, return_token_type_ids=True, return_tensors="pt"
)

# One box per token: document tokens get the box of their word, separator
# tokens get [1000] * 4, everything else (question, other specials) gets zeros.
bbox = []
for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)):
    if s == 1:
        bbox.append(boxes[w])
    elif i == tokenizer.sep_token_id:
        bbox.append([1000] * 4)
    else:
        bbox.append([0] * 4)
encoding["bbox"] = torch.tensor([bbox])

sequence_ids = encoding.sequence_ids(0)
word_ids = encoding.word_ids(0)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**encoding)
loss = outputs.loss  # None here, because no labels are passed
start_scores = outputs.start_logits
end_scores = outputs.end_logits

# Batch size is 1, so each argmax yields a single token index.
start_token = start_scores.argmax(-1).item()
end_token = end_scores.argmax(-1).item()

# Only token positions inside the document (sequence 1) map to indices of
# `words`. A prediction on a special or question token would otherwise give a
# None word id (TypeError on `end + 1`) or index the wrong word list.
if sequence_ids[start_token] == 1 and sequence_ids[end_token] == 1:
    start, end = word_ids[start_token], word_ids[end_token]
    print(" ".join(words[start : end + 1]))
else:
    start = end = None
    print("No answer found in the document.")

This  order  is  shipped  through  blue  dart  courier


# LayoutLMv3 for question answering


In [None]:

# Load the page image for the LayoutLMv3 processor. It is converted to RGB so
# grayscale or palette inputs are handled the same way as colour ones.
image = Image.open(path).convert("RGB")

In [None]:
from transformers import AutoProcessor, AutoModelForQuestionAnswering
from datasets import load_dataset
import torch

# apply_ocr=False: the words and boxes are supplied explicitly in the
# processor call below.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
# NOTE: the saved output of this cell reports that the qa_outputs.* weights are
# newly initialized for this checkpoint, so its answers are not meaningful
# until the model has been trained on a question-answering task.
# This also rebinds `model`, replacing the LayoutLM model loaded earlier.
model = AutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

# Disabled alternative input (FUNSD sample):
# dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train")
# example = dataset[0]
# image = example["image"]

# Use the words and boxes collected from the OCR'd page.
example = data
question = "what is the title"
words = example["words"]
boxes = example["bboxes"]

# NOTE(review): LayoutLMv3 presumably expects boxes on a 0-1000 scale --
# confirm that data["bboxes"] is normalized before relying on this call.
encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
# Fixed start/end label positions; they are only used to compute `loss`.
# NOTE(review): these look like placeholder values, not real answer labels
# for this document -- TODO confirm.
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])


outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits

Some weights of LayoutLMv3ForQuestionAnswering were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['qa_outputs.dense.bias', 'qa_outputs.dense.weight', 'qa_outputs.out_proj.bias', 'qa_outputs.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Most likely start and end token positions according to the model.
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

# Take the inclusive token span from the first (only) sequence and decode it.
first_sequence_ids = encoding.input_ids[0]
predict_answer_tokens = first_sequence_ids[answer_start_index : answer_end_index + 1]
answer_text = processor.decode(predict_answer_tokens, skip_special_tokens=True)
print(answer_text)

 13.42 Branch Name Raf CAMP TAX:VAT (3.88%): 28.18 EUR Bank Account Number 11695435 Bank Swift Code SBININBB250 TOTAL : 734.33 EUR Note: This order is shipped through blue dart courier
