In [1]:
! pip install pytesseract opencv-python pillow

Defaulting to user installation because normal site-packages is not writeable
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [None]:
import json
import os
import re

import cv2
import pytesseract
from PIL import Image

# Point pytesseract at the default Windows install location only when that file
# actually exists. Otherwise leave pytesseract's own default command in place so
# the executable is looked up on PATH (other platforms, non-default installs).
TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

# Load the image
img_path = "smaple bill image.jpg"
image = cv2.imread(img_path)

# cv2.imread does not raise on a missing or unreadable file; it returns None.
# Fail here with a clear message instead of letting cv2.cvtColor crash later
# with an unrelated-looking assertion error.
if image is None:
    raise FileNotFoundError(f"Could not read image file: {img_path!r}")

# --- Pre-process the image, then run OCR ---
# Every stage is bound to its own name so intermediate results can be inspected;
# the final binarised image is still published as `gray`.

# Stage 1: collapse the BGR image to a single grayscale channel
gray_raw = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Stage 2: non-local-means denoising to remove background noise
# NOTE(review): h=30 is a strong filter setting and a large h also smooths away
# fine detail -- TODO confirm on processed.jpg that thin strokes survive.
gray_denoised = cv2.fastNlMeansDenoising(
    gray_raw,
    None,
    h=30,
    templateWindowSize=7,
    searchWindowSize=21,
)

# Stage 3: upscale 2x in both directions (bicubic) to improve accuracy
gray_upscaled = cv2.resize(
    gray_denoised,
    None,
    fx=2,
    fy=2,
    interpolation=cv2.INTER_CUBIC,
)

# Stage 4: adaptive (Gaussian-weighted) thresholding to a black/white image
gray = cv2.adaptiveThreshold(
    gray_upscaled,
    255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    blockSize=11,
    C=2,
)

# Keep a copy of what is handed to Tesseract, for debugging
cv2.imwrite("processed.jpg", gray)

# Run English OCR on the processed image and show the raw text
text = pytesseract.image_to_string(gray, lang='eng')
print(text)


# --- Extract Metadata and Total Amounts ---
# One regex per field; capture group 1 of each pattern holds the field value.
# Every pattern is searched anywhere in the OCR text and is case-sensitive.
FIELD_PATTERNS = {
    # Metadata
    "invoice_no": r'No[:.]?\s*(\d+)',                               # digits after "No", "No:" or "No."
    "date": r'Date[:.]?\s*([A-Za-z]+\s+\d{1,2},?\s*\d{4})',         # word, 1-2 digit day, 4-digit year
    "sold_to": r'Sold to M\s*(.*)',                                 # rest of the line after "Sold to M"
    "address": r'Address\s*(.*)',                                   # rest of the line after "Address"
    # Total amounts (digits/commas, a dot, then decimals)
    "total_sales": r'Amount Due\s*([\d,]+\.\d+)',
    "vat": r'Add: VAT\s*([\d,]+\.\d+)',
    "total_due": r'TOTAL AMOUNT DUE\s*([\d,]+\.\d+)',
}

# Each entry is an re.Match object, or None when the pattern was not found.
field_matches = {
    field: re.search(pattern, text)
    for field, pattern in FIELD_PATTERNS.items()
}

invoice_no = field_matches["invoice_no"]
date = field_matches["date"]
sold_to = field_matches["sold_to"]
address = field_matches["address"]

total_sales = field_matches["total_sales"]
vat = field_matches["vat"]
total_due = field_matches["total_due"]

# --- Extract Items Table ---
# A line is treated as an item row when, ignoring surrounding whitespace, it
# starts with a quantity (digits followed by whitespace) and splits into at
# least five columns on runs of two or more whitespace characters.
items = []
lines = text.split('\n')
for line in lines:
    row = line.strip()
    # re.match only matches at the very start of the string, so the quantity
    # test must run on the stripped row; testing the raw line rejected indented
    # rows even though the columns below were split from the stripped text.
    if not re.match(r'\d+\s+', row):
        continue
    parts = re.split(r'\s{2,}', row)  # Split by large spaces
    if len(parts) >= 5:
        # The first three columns are taken from the left and the two money
        # columns from the right, so extra middle columns do not shift them.
        items.append({
            "qty": parts[0],
            "unit": parts[1],
            "article": parts[2],
            "unit_price": parts[-2],
            "amount": parts[-1]
        })

# --- Create Final JSON ---
def _captured(match, strip=False):
    """Return capture group 1 of `match`, or None when the search found nothing.

    With strip=True, surrounding whitespace is removed from the captured text.
    """
    if not match:
        return None
    value = match.group(1)
    if strip:
        return value.strip()
    return value


# Assemble the result; fields whose pattern did not match are stored as None.
bill_data = dict(
    invoice_no=_captured(invoice_no),
    date=_captured(date),
    sold_to=_captured(sold_to, strip=True),
    address=_captured(address, strip=True),
    items=items,
    total_sales=_captured(total_sales),
    vat=_captured(vat),
    total_due=_captured(total_due),
)

# Print JSON output
bill_json = json.dumps(bill_data, indent=2)
print(bill_json)


ire :

SALES INVOICE

ame al Date -\
Some ML Hu, one eo Checked by
Buntiiss See ~Webandes " . ~ Vp ; ee PO Ne.
Aditruxs ~ avdcleu ‘a = t 1 tina_— Terms
OSCAWAVIETID No SCOPWD Sue

Unit
' Amuunt
ve Price n d.00

oma uF JI fe0= | Sew.00
Alon LOND Sanda Sealy [60 | 66.00.

Poa [= Cla ils Tage | 630° | 63020
i |_| lacy. a 48500

a dealt (Ua Gace 2.0000
A. “wee | 250°] ZS020
ZN] Kio _| Cotlon ask ___ 7580"
ZL Nd 3m. &. Parte * tay | Total Sates (VAT Inctuswe) 215- 430.00
SO AC we. Ss. A 1SQ| Leas VAT Soe 750

\AL Mn Salles
WAL-Caumn Salus [less SCPWD Ducount | owen nate Tliscount

Zito Haid Satine Ee 5. 265 3?

TOTAL AMOUNT DUE \9, 695 <o

Recewed tn good orde/m can] es h
to the terms and condita there
AV Ape


{
  "invoice_no": null,
  "date": null,
  "sold_to": null,
  "address": null,
  "items": [],
  "total_sales": null,
  "vat": null,
  "total_due": null
}
