extracting and validating invoice data using OCR 
Code uses Tesseract OCR and Python

In [2]:
import cv2  # OpenCV for image processing
import pytesseract  # Tesseract for OCR
from pytesseract import Output  # Output format for structured OCR results
from PIL import Image  # Pillow for image manipulation
import numpy as np  # NumPy for array manipulation
import re  # Regular expressions for data validation

In [3]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [4]:
# Load the invoice image
image_path = 'invoice5.jpg'
image = cv2.imread(image_path)

simplifying our data by converting our image into a single channel using grayscale which will also reduce the noise

In [5]:
#Convert to grayscale to reduce complexity
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

applying thresholding to convert the grayscale images into binary where pixels are either white or black

In [6]:
# Apply Otsu's thresholding
_, thresh_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)


In [7]:
# Apply dilation to connect some of the broken letters
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
dilated_image = cv2.dilate(thresh_image, kernel, iterations=1)

In [8]:
# Apply erosion to reduce noise after dilation
eroded_image = cv2.erode(dilated_image, kernel, iterations=1)

In [9]:
# Optional: resize the image to improve OCR accuracy
resized_image = cv2.resize(eroded_image, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)

further reduction of noise on our image by applying filtering technique => medianblur

In [10]:
processed_image = cv2.medianBlur(resized_image, 3)

In [11]:
# Perform OCR on the processed image
ocr_result = pytesseract.image_to_string(processed_image)

In [12]:
# Display the OCR result (raw text)
print("OCR Result:\n", ocr_result)

OCR Result:
 PER ot *~, ~
CHANOARANA SUPERMA’.. ..° -"O
Sigonivuce Mal
Sbecpersatsaces AAO L OP

Ott A dvpomas POereecl
finbanb:,

Na.tobl, Kenya
Pn 0793371973
PO.Box 14078 00600 Naitobs
Pin Ito PO00601 772P
wic@chardalanasupermatkels co.ke
TAX INVOICE

————

Customor; CASH
Involee No. : 1101021017186 Dale : 29-Aug-2024

Cashie + AmbeaMulheuN Time : 08. 198

Ervin

Session No: 1 TiWNo: 04
Beserpion SOS —— “Sic! fax”
Barcode Cty Price Amaunt__
KARTASI CREENUFVCH ARCH FILE BK A
4034674200022 100 24900 245.00
YOSOGO CLIPBOARO W PEN CLIP C8 A
0546452001867 200 we 00 693.00
LUXOR COMBO PACK 1? ASSORTED! A
9901089059292 100 «= 750.00 750.00
FLOURESCENT PAFEH Ad A
5019 1000 20000 200 00
Nema: 4 Qty: 5.03 Ds 000 1807 00
_ Gros sale Value : “T1997 OO
Net Payable : 1e97 00
Recelvod Amount: 1 897 00

Balance Paid : G00

PAYMENT DETAILS
Equlty Credit Card . 1897.00

14 Summary :-
TaxO° pt? =. axable Aniount Tas Amount
4 1035 34 261 68
. 261 66

CU Date 29-Aug-2024 8 19am
CU Senal NO KRAMYV00920220

In [13]:
# 1. Extract the Invoice Number (Assumes a pattern like Invoice Number:)
invoice_number_pattern = r'Invoice Number:\s*\d+'
invoice_number = re.findall(invoice_number_pattern, ocr_result)
# Extract the number part if found
if invoice_number:
    # Extract just the digits from the matched string
    invoice_number = re.search(r'\d+', invoice_number[0]).group()
    print("Invoice Number:", invoice_number)
else:
    print("Invoice Number: Not found")

Invoice Number: Not found


In [14]:
# 2. Extract the Invoice Date (Assumes a date format like MM/DD/YYYY or similar)
date_pattern = r'Date:\s*(\d{1,2}[\/.-]\d{1,2}[\/.-]\d{2,4}|\d{1,2}\s+\w+\s+\d{4})'
invoice_date = re.findall(date_pattern, ocr_result)
print("Invoice Date:", invoice_date[0] if invoice_date else "Not found")

Invoice Date: Not found


In [15]:
# 3. Extract the Total Amount 
amount_pattern = r'(?:Taxable Amount|Gross sale value|Total Amount|Ksh)\s*:\s*(\d+(?:,\d{3})*(?:\.\d{2})?)'
total_amount = re.findall(amount_pattern, ocr_result)
print("Total Amount:", total_amount[0] if total_amount else "Not found")

Total Amount: Not found


In [16]:
# Validate extracted data

# 1. Validate Invoice Number (Check against a predefined format)
if invoice_number and re.match(r'Invoice Number:\s*\d+', invoice_number[0]):
    print("Invoice Number is valid.")
else:
    print("Invoice Number is invalid.")

Invoice Number is invalid.


In [17]:
# 2. Validate Date (Check for a valid date format)
if invoice_date and re.match(r'Date:\s*(\d{1,2}[\/.-]\d{1,2}[\/.-]\d{2,4}|\d{1,2}\s+\w+\s+\d{4})', invoice_date[0]):
    print("Invoice Date is valid.")
else:
    print("Invoice Date is invalid.")

Invoice Date is invalid.


In [21]:
# 3. Validate Total Amount (Check for a valid currency format)
if total_amount and re.match( r'(?:Taxable Amount|Gross sale value|Total Amount|Ksh)\s*:\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', total_amount[0]):
    print("Total Amount is valid.")
else:
    print("Total Amount is invalid.")

Total Amount is invalid.


In [19]:
# Optional: Draw bounding boxes around detected text (visualization)
detailed_ocr_result = pytesseract.image_to_data(processed_image, output_type=Output.DICT)
n_boxes = len(detailed_ocr_result['level'])
for i in range(n_boxes):
    (x, y, w, h) = (detailed_ocr_result['left'][i], detailed_ocr_result['top'][i],
                    detailed_ocr_result['width'][i], detailed_ocr_result['height'][i])
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)

In [20]:
# Save the image with bounding boxes (optional)
cv2.imwrite('invoice_with_boxes.jpg', image)

# Display the image with bounding boxes (optional)
cv2.imshow('Invoice with Bounding Boxes', image)
cv2.waitKey(0)  # Wait for a key press to close the window
cv2.destroyAllWindows()