<a href="https://colab.research.google.com/github/Manish-Singh-Mehra/Invoice_OCR/blob/main/Invoice_OCR_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup: install the Tesseract OCR system package via apt, then the
# Python packages imported in the next cell (pytesseract, and OpenCV as cv2) via pip.
!sudo apt install -q tesseract-ocr
!pip install -q pytesseract
!pip install opencv-python-headless

In [None]:
import os

import cv2
import numpy as np
import pytesseract

In [None]:
def preprocess_image(image_path):
    """Load an invoice image and prepare it for OCR.

    The image is rotated 90 degrees clockwise, converted to grayscale,
    binarized with Otsu's threshold and finally denoised.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The single-channel, binarized and denoised image produced by
        ``cv2.fastNlMeansDenoising``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface later as an unrelated-looking OpenCV error.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Could not decode image file: {image_path}")

    # Orientation fix is a fixed quarter turn; no skew angle is estimated.
    # NOTE(review): presumably the source scans are stored rotated by 90
    # degrees - confirm this holds for every invoice fed to this function.
    rotated_image = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

    # Convert the image to grayscale
    gray_image = cv2.cvtColor(rotated_image, cv2.COLOR_BGR2GRAY)

    # Apply thresholding to enhance text visibility. With THRESH_OTSU the
    # threshold argument (0) is a placeholder: the value is chosen automatically.
    _, threshold_image = cv2.threshold(
        gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # Perform image denoising (arguments after dst: h=10,
    # templateWindowSize=7, searchWindowSize=21).
    denoised_image = cv2.fastNlMeansDenoising(threshold_image, None, 10, 7, 21)

    return denoised_image

In [None]:
from google.colab.patches import cv2_imshow
# Path (a string, not pixel data) of the sample invoice on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive - no mount
# call is visible in this notebook; confirm before a fresh "Run all".
image = '/content/drive/MyDrive/Colab Notebooks/Sample_invoices/Permali Wallace Pvt Ltd-1.jpg'
# Run the preprocessing pipeline and show the resulting image inline.
no_noise = preprocess_image(image)
cv2_imshow(no_noise)

In [None]:
# OCR the preprocessed image; `text` is read again by the field-extraction cell below.
text = pytesseract.image_to_string(no_noise)
print(text)

In [None]:
import re

def extract_invoice_information(text):
    """Pull selected invoice fields out of OCR text with regular expressions.

    Args:
        text: The OCR output to search.

    Returns:
        dict: Only the fields that were found. Possible keys:

        * ``'Invoice Number'`` (str): first ``<alphanumeric>/<digits>`` token.
        * ``'Invoice Date'`` (str): last ``NN-NN-NNNN`` value that follows
          the label ``"Invoice Date:"``.
        * ``'GST Number'`` (str): first 15-character GSTIN-shaped token.
        * ``'Email'`` (str): first e-mail address.
        * ``'Total Invoice Amount'`` (str): last numeric token in the text.
        * ``'Address'`` (list of str): every fragment matched by the
          address pattern.
    """
    extracted_info = {}

    # All patterns are raw strings: in a plain literal '\b' is a backspace
    # character rather than a regex word boundary, so nothing would match.
    invoice_numbers = re.findall(r'\b[A-Za-z0-9]+/\d+\b', text)
    if invoice_numbers:
        extracted_info['Invoice Number'] = invoice_numbers[0]

    invoice_dates = re.findall(r'Invoice Date:\s*(\d{2}-\d{2}-\d{4})', text)
    if invoice_dates:
        extracted_info['Invoice Date'] = invoice_dates[-1]

    gst_numbers = re.findall(r'\d{2}[A-Z]{5}\d{4}[A-Z][A-Z\d]Z[A-Z\d]', text)
    if gst_numbers:
        extracted_info['GST Number'] = gst_numbers[0]

    # Unanchored so an address embedded in a larger text is found, and
    # without capturing groups so findall yields whole addresses (strings)
    # instead of tuples of parts.
    emails = re.findall(
        r'[a-z\d.\-]+@[a-z\d-]+\.[a-z]{2,8}(?:\.[a-z]{2,8})?',
        text,
        re.IGNORECASE,
    )
    if emails:
        extracted_info['Email'] = emails[0]

    # A leading digit is required so a lone comma between two words
    # cannot be reported as an amount.
    amounts = re.findall(r'\b\d[\d,]*(?:\.\d{2})?\b', text)
    if amounts:
        extracted_info['Total Invoice Amount'] = amounts[-1]

    # NOTE(review): this pattern accepts almost any run of letters, digits,
    # whitespace and ". , # -", so it returns many non-address fragments;
    # it needs tightening against real invoice layouts.
    addresses = re.findall(
        r'\b(?:\d+\s+)?(?:[A-Za-z0-9\s.,#-]+)\s*,?\s*(?:[A-Za-z\s.,#-]+)?'
        r'\s*,?\s*(?:[A-Za-z\s.,#-]+)?\s*\b',
        text,
    )
    if addresses:
        extracted_info['Address'] = addresses

    return extracted_info

# Parse the OCR output from the earlier cell (`text`) into a dict of invoice fields.
ocr_text = text
extracted_info = extract_invoice_information(ocr_text)


In [None]:
import json

# Fields to persist, and the output file (relative to the working directory).
data = extracted_info
json_file_path = "invoice_data.json"

# Serialize the dictionary as pretty-printed JSON (4-space indent).
with open(json_file_path, mode="w") as json_file:
    json.dump(data, fp=json_file, indent=4)

print(f"JSON data has been written to '{json_file_path}'")