Extracting and validating invoice data using OCR
The code uses Tesseract OCR and Python.

In [129]:
import cv2  # OpenCV for image processing
import pytesseract  # Tesseract for OCR
from pytesseract import Output  # Output format for structured OCR results
from PIL import Image  # Pillow for image manipulation
import numpy as np  # NumPy for array manipulation
import re  # Regular expressions for data validation

In [130]:
# Windows only: if the Tesseract executable is not on PATH, uncomment the line
# below and adjust the path to the local Tesseract install.
#pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [131]:
# Load the invoice image from disk
image_path = 'invoice_sample.jpg'
image = cv2.imread(image_path)
# cv2.imread does not raise when the file is missing or unreadable; it returns
# None. Fail here with a clear message instead of letting a later OpenCV call
# crash with an opaque assertion error.
if image is None:
    raise FileNotFoundError(f"Could not read image file: {image_path!r}")

We simplify the data by converting the image to a single-channel grayscale image, which also helps reduce noise.

In [132]:
# Collapse the BGR colour image into a single intensity channel
gray_image = cv2.cvtColor(src=image, code=cv2.COLOR_BGR2GRAY)

In [133]:
# Smooth the grayscale image with a 5x5 Gaussian kernel to suppress noise
# (sigmaX=0 lets OpenCV derive the standard deviation from the kernel size)
blurred_image = cv2.GaussianBlur(
    src=gray_image,
    ksize=(5, 5),
    sigmaX=0,
)

Next we apply thresholding to convert the grayscale image into a binary image, in which every pixel is either black or white.

In [134]:
# Binarise with adaptive Gaussian thresholding: each pixel is compared against
# a Gaussian-weighted mean of its 11x11 neighbourhood minus the constant 2,
# and set to 255 (white) or 0 (black).
thresh_image = cv2.adaptiveThreshold(
    src=blurred_image,
    maxValue=255,
    adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    thresholdType=cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)


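Further noise reduction with a filtering technique such as a median blur could be applied at this point; this notebook does not apply one and passes the thresholded image directly to Tesseract for OCR.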

In [135]:
# Run Tesseract on the binarised image and request the result as a dict of
# parallel lists (text, confidence, bounding boxes, layout indices)
ocr_result = pytesseract.image_to_data(
    image=thresh_image,
    output_type=Output.DICT,
)

In [136]:
# Show the raw OCR dictionary for inspection
print(f"OCR Result:\n {ocr_result}")

OCR Result:
 {'level': [1, 2, 3, 4, 5], 'page_num': [1, 1, 1, 1, 1], 'block_num': [0, 1, 1, 1, 1], 'par_num': [0, 0, 1, 1, 1], 'line_num': [0, 0, 0, 1, 1], 'word_num': [0, 0, 0, 0, 1], 'left': [0, 0, 0, 0, 0], 'top': [0, 0, 0, 0, 0], 'width': [1024, 1024, 1024, 1024, 1024], 'height': [1024, 1024, 1024, 1024, 1024], 'conf': [-1, -1, -1, -1, 95], 'text': ['', '', '', '', '']}


In [137]:
# Extract text from the detailed OCR result.
# Keep only entries whose confidence exceeds 60 AND that contain visible
# characters: Tesseract can report blank entries with a high confidence, and
# joining those would insert extra spaces that break matching of multi-word
# labels such as "Invoice Number".
# The confidence is passed through float() before int() because it may arrive
# as a decimal string (e.g. '95.5'), which int() alone rejects.
extracted_text = " ".join(
    word
    for word, conf in zip(ocr_result['text'], ocr_result['conf'])
    if int(float(conf)) > 60 and word.strip()
)

In [138]:
# Regular expressions for locating the invoice fields in the OCR text.
# Each pattern matches the field label followed by optional colons/whitespace
# and then the value; findall() returns the whole "label + value" match.
invoice_number_pattern = re.compile(r'Invoice\s*Number[:\s]*\d+')
invoice_date_pattern = re.compile(r'Date[:\s]*\d{1,2}[\/\.-]\d{1,2}[\/\.-]\d{2,4}')
# Label words are separated by \s+ so irregular OCR spacing still matches.
# The amount accepts any number of '.'/',' separated digit groups, so values
# with a thousands separator (e.g. 1,234.56) are captured in full instead of
# being cut off after the first group.
total_amount_pattern = re.compile(
    r'(?:Total\s+Amount|Net\s+Payable|Gross\s+sale\s+value)[:\s]*\d+(?:[.,]\d+)*'
)

In [139]:
# Search the OCR text once per field; each name receives the list of all
# matches found for its pattern (empty list when nothing matched)
invoice_number, invoice_date, total_amount = (
    field_pattern.findall(extracted_text)
    for field_pattern in (
        invoice_number_pattern,
        invoice_date_pattern,
        total_amount_pattern,
    )
)


In [140]:
# Summarise the outcome: report the first match for each field, or 'Invalid'
# when the field was not found in the OCR text
results = {
    label: next(iter(matches), 'Invalid')
    for label, matches in (
        ('Invoice Number', invoice_number),
        ('Invoice Date', invoice_date),
        ('Total Amount', total_amount),
    )
}

results


{'Invoice Number': 'Invalid',
 'Invoice Date': 'Invalid',
 'Total Amount': 'Invalid'}