# Importing Necessary Libraries


In [17]:
import json
import re
import shutil

import cv2
import pytesseract

# Load Image Function

In [18]:
def load_image(image_path):
    """Read an image file from disk in colour mode.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array returned by ``cv2.imread(image_path, cv2.IMREAD_COLOR)``,
        or ``None`` if the file could not be read or loading raised.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    except Exception as e:
        # Bad arguments (e.g. a non-path object) surface as exceptions.
        print("Error loading image:", e)
        return None

    # cv2.imread does not raise for a missing or undecodable file; it just
    # returns None, so that case has to be reported explicitly.
    if img is None:
        print("Error loading image: could not read", repr(image_path))
        return None
    return img


# Preprocess Image Function

In [19]:
def preprocess_image(image, blur_kernel=(7, 7), block_size=17, c=3):
    """Convert a BGR image into a binarized image for OCR.

    The image is converted to grayscale, smoothed with a Gaussian blur and
    binarized with Gaussian adaptive thresholding. Pixels brighter than their
    local threshold become 255 (white); all others become 0 (black).

    Args:
        image: BGR image array, as produced by ``load_image``.
        blur_kernel: Gaussian kernel size passed to ``cv2.GaussianBlur``.
        block_size: Neighbourhood size passed to ``cv2.adaptiveThreshold``.
        c: Constant subtracted from the local weighted mean.

    Returns:
        The binarized single-channel image, or ``None`` if ``image`` is
        ``None`` or any OpenCV step raised.
    """
    if image is None:
        print("Error preprocessing image: no image provided")
        return None

    try:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Blur first so sensor noise does not turn into speckles after
        # thresholding.
        blurred = cv2.GaussianBlur(gray, blur_kernel, 0)
        # THRESH_BINARY yields exactly what THRESH_BINARY_INV followed by
        # cv2.bitwise_not produced, without the extra pass over the image.
        return cv2.adaptiveThreshold(
            blurred,
            255,
            cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
            cv2.THRESH_BINARY,
            block_size,
            c,
        )
    except Exception as e:
        print("Error preprocessing image:", e)
        return None


# Extract Text Function

In [20]:
def extract_text(image):
    """Run pytesseract OCR on ``image`` and return the recognised text.

    Any exception raised by the OCR call is printed and an empty string is
    returned instead, so callers always receive a ``str``.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as err:
        print("Error extracting text:", err)
    # Only reached when the OCR call failed.
    return ""


# Parse Item Line Function


In [21]:
def parse_item_line(line):
    """Parse a receipt line of the form ``[<token> ]<name> <price>``.

    Args:
        line: One line of OCR text.

    Returns:
        ``{"item": <name>, "price": <price string>}`` when the last
        space-separated token looks like a price (digits, a dot, two digits)
        and a non-empty name remains after cleaning; otherwise ``None``.
    """
    # The price must be the last space-separated token on the line.
    parts = line.rsplit(' ', 1)
    if len(parts) != 2 or not re.fullmatch(r'\d+\.\d{2}', parts[1]):
        return None

    # Drop the first token when there is more than one (presumably the
    # quantity column -- confirm against real receipts).
    item = parts[0].split(' ', 1)[-1]
    # Remove non-ASCII OCR artefacts BEFORE stripping, so whitespace that was
    # next to an artefact does not survive at the edges of the name.
    item = re.sub(r'[^\x00-\x7F]+', '', item)
    # Strip set: space, '=', backslash and comma (backslash escaped
    # explicitly; the former '\,' was an invalid escape sequence).
    item_name = item.strip(' =\\,')

    # A price with no readable name is OCR noise, not an item.
    if not item_name:
        return None
    return {"item": item_name, "price": parts[1]}


# Extract Key-Value Pairs Function

In [22]:
def extract_key_value_pairs(line):
    """Split ``line`` at its first colon into a cleaned ``(key, value)`` pair.

    Both halves have leading/trailing '+', '-' and space characters removed.
    Returns ``(None, None)`` when the line contains no colon.
    """
    head, colon, tail = line.partition(':')
    # partition yields an empty separator when no colon was found.
    if not colon:
        return None, None

    trim_chars = "+- "
    return head.strip(trim_chars), tail.strip(trim_chars)


# Main Function

In [23]:
def _parse_receipt_lines(lines):
    """Build the structured receipt dictionary from OCR'd text lines."""
    data = {
        "VAT REG TIN": "",
        "Table": "",
        "Staff": "",
        "consumed_items": [],
    }
    # Names stored as top-level, lower-cased keys instead of being listed
    # under "consumed_items".
    outside_items = {"sub-total", "vatable", "10% s.c.", "fd/by"}

    for line in lines:
        line = line.strip()

        # "key: value" lines take precedence over item lines.
        key, value = extract_key_value_pairs(line)
        if key and value:
            if key.lower() in outside_items:
                data[key.lower()] = value
            elif key != "consumed_items":
                # Never let an OCR'd key replace the items list with a
                # string; later appends would crash.
                data[key] = value
            continue

        item_data = parse_item_line(line)
        if not item_data:
            continue
        name = item_data["item"].lower()
        if name in outside_items:
            data[name] = item_data["price"]
        else:
            data["consumed_items"].append(item_data)

    return data


def main(image_path="bill.jpg",
         tesseract_cmd=r'C:\Program Files\Tesseract-OCR\tesseract.exe'):
    """Run OCR on a receipt image and print the parsed data as JSON.

    Args:
        image_path: Path of the receipt image to process.
        tesseract_cmd: Tesseract executable (path or command name). It is
            applied only if it resolves to an executable on this machine;
            otherwise pytesseract's existing setting is left untouched.

    Returns:
        None. The parsed receipt is printed; nothing is printed by this
        function if loading or preprocessing fails (those steps report
        their own errors).
    """
    # Only override the Tesseract command when it actually exists, so the
    # Windows default does not break OCR on other machines.
    if tesseract_cmd and shutil.which(tesseract_cmd):
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    img = load_image(image_path)
    if img is None:
        return

    processed_image = preprocess_image(img)
    if processed_image is None:
        return

    extracted_text = extract_text(processed_image)
    data = _parse_receipt_lines(extracted_text.split('\n'))

    # Output the extracted data in JSON format
    print(json.dumps(data, indent=4))


# Execute Main Function

In [24]:
# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

{
    "VAT REG TIN": "008-811-772-000",
    "Table": "B1 OR@ 3333099-00",
    "Staff": "ALEMIR Cover :3",
    "consumed_items": [
        {
            "item": "JASMINE",
            "price": "178.57"
        },
        {
            "item": "ST.CHX FEET x0",
            "price": "160.71"
        },
        {
            "item": "ST.PORK SIOMAI",
            "price": "196.43"
        },
        {
            "item": "ST.G RICE ABALONE",
            "price": "198.43"
        },
        {
            "item": "ST.SPINACH DUMPLING ",
            "price": "196.43"
        },
        {
            "item": "BAKED BBQ PORK BUN",
            "price": "198.43"
        },
        {
            "item": "PANFRIED ONION CAKE",
            "price": "198.43"
        }
    ],
    "sub-total": "1770.93",
    "vatable": "1770.93",
    "10% s.c.": "280.72",
    "fd/by": "49.59",
    "Total": "2293.75"
}
