<a href="https://colab.research.google.com/github/Manish-Singh-Mehra/Invoice_OCR/blob/main/Invoice_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Tesseract OCR engine as a system package; pytesseract only wraps
# the `tesseract` executable and does not ship it.
!sudo apt install -q tesseract-ocr
# Python bindings used below as `pytesseract`.
!pip install -q pytesseract
# OpenCV (imported below as `cv2`), in its build without GUI dependencies.
!pip install opencv-python-headless

In [None]:
import numpy as np
import cv2
import pytesseract

In [None]:
def preprocess_image(image):
    """Binarise and denoise a colour image so that it is ready for OCR.

    Args:
        image: Image array in OpenCV's BGR channel order (it is converted
            with ``cv2.COLOR_BGR2GRAY``).

    Returns:
        The grayscale image after Otsu thresholding followed by
        non-local-means denoising.
    """
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU

    # With the OTSU flag the threshold argument (0) is ignored: OpenCV picks
    # the threshold itself. Index 1 of the result is the thresholded image.
    binarised = cv2.threshold(
        cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), 0, 255, otsu_binary
    )[1]

    # Positional arguments: dst=None, h=10, templateWindowSize=7,
    # searchWindowSize=21.
    return cv2.fastNlMeansDenoising(binarised, None, 10, 7, 21)

In [None]:
def ocr_extract_info(image_path):
    """Run OCR on an invoice image and return the recognised text.

    The image is loaded with OpenCV, turned 90 degrees clockwise, cleaned up
    by ``preprocess_image`` and handed to Tesseract through ``pytesseract``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        FileNotFoundError: If OpenCV cannot read an image from ``image_path``.
    """
    image = cv2.imread(image_path)

    # cv2.imread does not raise on a missing or unreadable file; it returns
    # None, which would otherwise surface later as an obscure OpenCV error.
    if image is None:
        raise FileNotFoundError(f"Could not read an image from {image_path!r}")

    # NOTE(review): the rotation is a fixed quarter turn and is not derived
    # from the image content, so it only suits scans that are stored rotated
    # by 90 degrees. No skew estimation is performed here: a Canny/Hough
    # angle estimate adds cost and a failure mode (HoughLinesP returns None
    # when it finds no line) unless the resulting angle is actually applied.
    rotated_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

    # Grayscale + Otsu threshold + denoise before recognition.
    preprocessed_image = preprocess_image(rotated_image)

    return pytesseract.image_to_string(preprocessed_image)

In [None]:
if __name__ == "__main__":
    # Hard-coded sample invoice.
    # NOTE(review): the path looks like a Google Drive mount inside Colab; no
    # mounting step is visible in this notebook, so confirm the file is
    # reachable before running this cell.
    image_path = "/content/drive/MyDrive/Colab Notebooks/Sample_invoices/Permali Wallace Pvt Ltd-1.jpg"
    # `text` is assigned at module level on purpose: the later cells read it.
    text = ocr_extract_info(image_path)

    # Print the extracted information
    print(text)

In [None]:
import re

def _first_group(pattern, source, flags=0):
    """Return capture group 1 of the first match of ``pattern`` in ``source``, or None."""
    match = re.search(pattern, source, flags)
    return match.group(1) if match else None


def extract_invoice_details(extracted_text):
    """Pull labelled invoice fields out of OCR text with regular expressions.

    Every field is looked up by a literal label such as ``Invoice Number:``.
    A field whose label (or value format) is not found is reported as None.

    Args:
        extracted_text: The OCR output to search, as a single string.

    Returns:
        A dict with the keys ``invoice_number``, ``invoice_date``,
        ``gst_number``, ``vendor_name``, ``vendor_address``,
        ``delivery_address``, ``buyer_name``, ``buyer_address``,
        ``item_details``, ``total_invoice_amount``, ``total_tax_amount`` and
        ``po_number``. Amounts are floats, ``item_details`` is a list of
        ``{"item_code", "price", "quantity"}`` dicts (empty when no item line
        matches), and all other values are strings or None.
    """
    # All searches run on the argument, never on a module-level variable, so
    # the function works for any text it is given.
    source = extracted_text

    def rest_of_line(label):
        # `.` does not match a newline, so `(.*)` captures up to the end of
        # the line and also works when the text ends without a trailing "\n".
        return _first_group(label + r':\s*(.*)', source, re.IGNORECASE)

    def amount(label):
        # Amounts must have exactly two decimals; the label is case-sensitive.
        value = _first_group(label + r':\s*(\d+\.\d{2})', source)
        return float(value) if value is not None else None

    # The GSTIN is matched right after its label, so the pattern must not be
    # anchored with ^/$ (those would only match at the edges of the text).
    gst_number = _first_group(
        r'GSTIN:\s*([0-9]{2}[A-Z]{5}[0-9]{4}[A-Z][1-9A-Z]Z[0-9A-Z])',
        source,
        re.IGNORECASE,
    )

    item_rows = re.findall(
        r'Item:\s*(\w+)\s+Price:\s*(\d+\.\d{2})\s+Quantity:\s*(\d+)\s+', source
    )

    return {
        "invoice_number": _first_group(r'Invoice Number:\s*(\w+)', source, re.IGNORECASE),
        "invoice_date": _first_group(r'Invoice Date:\s*(\d{2}/\d{2}/\d{4})', source),
        "gst_number": gst_number,
        "vendor_name": rest_of_line('Vendor Name'),
        "vendor_address": rest_of_line('Vendor Address'),
        "delivery_address": rest_of_line('Delivery Address'),
        "buyer_name": rest_of_line('Buyer Name'),
        "buyer_address": rest_of_line('Buyer Address'),
        "item_details": [
            {"item_code": code, "price": float(price), "quantity": int(quantity)}
            for code, price, quantity in item_rows
        ],
        "total_invoice_amount": amount('Total Invoice Amount'),
        "total_tax_amount": amount('Total Tax Amount'),
        "po_number": _first_group(r'PO Number:\s*(\w+)', source, re.IGNORECASE),
    }

if __name__ == "__main__":
    # `text` is not assigned in this block: it must already exist at module
    # level (the earlier OCR cell assigns it), so that cell has to run first.
    extracted_text = text
    invoice_details = extract_invoice_details(extracted_text)

    # Print the extracted invoice details
    for key, value in invoice_details.items():
        print(f"{key}: {value}")


In [None]:
# Inspect the type of the OCR result held in `text`; as the last expression of
# the cell its value is shown as the cell output.
type(text)

In [None]:
# GSTIN candidates anywhere in the OCR text: 2 digits, 5 capital letters,
# 4 digits, 1 capital letter, 1 capital letter or digit, a literal "Z", and
# 1 capital letter or digit. Raw strings keep "\d" as a regex escape instead
# of an (invalid) Python string escape.
pattern = r'\d{2}[A-Z]{5}\d{4}[A-Z]{1}[A-Z\d]{1}[Z]{1}[A-Z\d]{1}'
matches = []
matches.append(re.findall(pattern, text))
# Hyphen-separated dates: two digits, two digits, then two to four digits.
matches.append(re.findall(r'\d{2}-\d{2}-\d{2,4}', text))
# matches[0] holds the GSTIN hits, matches[1] the date hits.
matches