## Tasks:
Extract invoice number, invoice date, line items from invoice images. Also, the model
structure should be scalable where we can train a model for any field and extract the
key-value pair. Find sample data from the kaggle, huggingface, github. you can convert
pdf files to image files.

# Install Required Python Packages

In [1]:
pip install pytesseract pillow opencv-python

Note: you may need to restart the kernel to use updated packages.


## Confirm Tesseract Path in Python


In [2]:
import pytesseract

# Set this to where Tesseract is installed on your machine
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Test version call
print(pytesseract.get_tesseract_version())

5.5.0.20241111


# Extract Text from Invoice Images

#### What it will do :
                      Read invoice images from My dataset folder

                      Extract text using Tesseract OCR

                      Save the extracted text as .txt files in my output folder

## Extract text from one image(Testing)

In [3]:
import os
from PIL import Image
import pytesseract

# Set the Tesseract executable path (adjust if needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_image(image_path, output_text_path):
    """
    Extract text from a single image and save it to a text file.
    """
    try:
        img = Image.open(image_path)
        text = pytesseract.image_to_string(img)
        
        with open(output_text_path, 'w', encoding='utf-8') as f:
            f.write(text)
        
        print(f"[SUCCESS] Extracted text saved to: {output_text_path}")
    except Exception as e:
        print(f"[ERROR] Could not process {image_path}: {e}")

if __name__ == "__main__":
    # Replace these paths with your actual file paths
    input_image_path = r'C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_1\batch1-0479.jpg'
    output_text_path = r'C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0479.txt'

    
    extract_text_from_image(input_image_path, output_text_path)

[SUCCESS] Extracted text saved to: C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0479.txt


## Image2

In [4]:
import os
from PIL import Image
import pytesseract

# Set the Tesseract executable path (adjust if needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_text_from_image(image_path, output_text_path):
    """
    Extract text from a single image and save it to a text file.
    """
    try:
        img = Image.open(image_path)
        text = pytesseract.image_to_string(img)
        
        with open(output_text_path, 'w', encoding='utf-8') as f:
            f.write(text)
        
        print(f"[SUCCESS] Extracted text saved to: {output_text_path}")
    except Exception as e:
        print(f"[ERROR] Could not process {image_path}: {e}")

if __name__ == "__main__":
    # Replace these paths with your actual file paths
    input_image_path = r'C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_1\batch1-0477.jpg'
    output_text_path = r'C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0477.txt'

    
    extract_text_from_image(input_image_path, output_text_path)

[SUCCESS] Extracted text saved to: C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0477.txt


#### OCR is working perfectly and Im  already extracting real invoice text from the image

# Extract Key Fields from OCR Text File

##### Invoice Number

      Invoice Date

      Line Items (products, quantity, price, etc.)

##### Im using  regular expressions (regex) or other text parsing methods

In [5]:
import re

def extract_invoice_number(text):
    
    match = re.search(r'Invoice\s*no[:\s]*([A-Za-z0-9\-]+)', text, re.IGNORECASE)
    if match:
        return match.group(1)
    return None

In [6]:
def extract_invoice_date(text):
    # Match dates like 05/07/2019, 05-07-2019, 2019/07/05, 2019-07-05
    date_patterns = [
        r'(\b\d{2}[/-]\d{2}[/-]\d{4}\b)',  # DD/MM/YYYY or MM/DD/YYYY
        r'(\b\d{4}[/-]\d{2}[/-]\d{2}\b)'   # YYYY/MM/DD or YYYY-MM-DD
    ]
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return None

In [7]:
def extract_line_items(text):
    # Very basic example: extract lines under "ITEMS" section until "SUMMARY"
    items = []
    try:
        # Extract text between ITEMS and SUMMARY sections
        items_section = re.search(r'ITEMS(.*)SUMMARY', text, re.DOTALL | re.IGNORECASE)
        if items_section:
            lines = items_section.group(1).strip().split('\n')
            # Clean empty lines and strip spaces
            lines = [line.strip() for line in lines if line.strip()]
            # For simplicity, return the lines directly (you can parse further)
            return lines
    except Exception as e:
        print(f"Error extracting line items: {e}")
    return items

In [8]:
def extract_fields_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    
    invoice_no = extract_invoice_number(text)
    invoice_date = extract_invoice_date(text)
    line_items = extract_line_items(text)

    return {
        'invoice_number': invoice_no,
        'invoice_date': invoice_date,
        'line_items': line_items
    }


In [9]:
if __name__ == "__main__":
    file_path = r'C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0479.txt'

    fields = extract_fields_from_file(file_path)
    print("Extracted Fields:")
    print(f"Invoice Number: {fields['invoice_number']}")
    print(f"Invoice Date: {fields['invoice_date']}")
    print(f"Line Items:\n")
    for item in fields['line_items']:
        print(item)


Extracted Fields:
Invoice Number: 69721323
Invoice Date: 05/07/2019
Line Items:

No. Description Qty UM Net price Net worth VAT [%] Gross
worth
ils Tie Dyeing Fluffy Rugs Anti-Skid 4,00 each 25,48 101,92 10% 112,11
Area Rug Dining Room Carpet
Floor Bedroom Mat
2. Galaxy Butterfly Area Rugs 2,00 each 13,69 27,38 10% 30,12
Children Bedroom Non-Slip
Doormat Floor Mat Carpet
oo Colorful Marble Printed Living 2,00 each 12,49 24,98 10% 27,48
Room and Bedroom Area Rugs
Carpet CNK2413
4. Red Traditional Oriental 4,00 each 39,98 159,92 10% 175,91
Medallion 8x10 Area Rug Carpet
2x3 Mat 5x7 Rugs
Dy YILONG 2.5'x4' Small Hand 4,00 each 2 000,00 8 000,00 10% 8 800,00
Knotted Silk Carpets Antistatic
Floor Area Rug 844B


## Image2

In [10]:
if __name__ == "__main__":
    file_path = r'C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0477.txt'

    fields = extract_fields_from_file(file_path)
    print("Extracted Fields:")
    print(f"Invoice Number: {fields['invoice_number']}")
    print(f"Invoice Date: {fields['invoice_date']}")
    print(f"Line Items:\n")
    for item in fields['line_items']:
        print(item)


Extracted Fields:
Invoice Number: 65981953
Invoice Date: 05/14/2020
Line Items:

No. Description Qty UM Net price Net worth VAT [%] Gross
worth
ils iBUYPOWER Gaming Computer 1,00 each 509,95 509,95 10% 560,95
AMD Ryzen 3 3100, Radeon RX
550, 240GB SSD, WiFi Ready
2. Dell Desktop Computer PC Core 1,00 each 124,95 124,95 10% 137,45
i5 3.1GHz 8GB RAM 500GB HD
Wifi DVD Windows 10
2 HP 8300 Slim Desktop Small 3,00 each 99,99 299,97 10% 329,97
Computer PC i5 3.2GHz 4GB
500GB Windows 10 Pro WiFi


### Invoice Number and Invoice Date are now extracted correctly

# parsing the Line Items in a structured way

### Item number & description (sometimes multiline)

    Quantity (Qty)

    Unit of measure (UM)

    Net price

    Net worth

    VAT %

    Gross worth

In [11]:
import re

def extract_line_items(text):
    # Extract text between ITEMS and SUMMARY (as before)
    items = []
    try:
        items_section = re.search(r'ITEMS(.*)SUMMARY', text, re.DOTALL | re.IGNORECASE)
        if items_section:
            lines = items_section.group(1).strip().split('\n')
            lines = [line.strip() for line in lines if line.strip()]
            return lines
    except Exception as e:
        print(f"Error extracting line items: {e}")
    return items

def parse_line_items(lines):
    items = []
    current_item_lines = []
    
    for line in lines:
        if re.match(r'^\d+\.', line.strip()):
            if current_item_lines:
                items.append(' '.join(current_item_lines))
                current_item_lines = []
            current_item_lines.append(line.strip())
        else:
            current_item_lines.append(line.strip())
    if current_item_lines:
        items.append(' '.join(current_item_lines))
    return items

# Read the OCR text file first
file_path = r'C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0479.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Extract and parse line items
line_items = extract_line_items(text)
parsed_items = parse_line_items(line_items)

for i, item in enumerate(parsed_items, 1):
    print(f"Item {i}: {item}\n")


Item 1: No. Description Qty UM Net price Net worth VAT [%] Gross worth ils Tie Dyeing Fluffy Rugs Anti-Skid 4,00 each 25,48 101,92 10% 112,11 Area Rug Dining Room Carpet Floor Bedroom Mat

Item 2: 2. Galaxy Butterfly Area Rugs 2,00 each 13,69 27,38 10% 30,12 Children Bedroom Non-Slip Doormat Floor Mat Carpet oo Colorful Marble Printed Living 2,00 each 12,49 24,98 10% 27,48 Room and Bedroom Area Rugs Carpet CNK2413

Item 3: 4. Red Traditional Oriental 4,00 each 39,98 159,92 10% 175,91 Medallion 8x10 Area Rug Carpet 2x3 Mat 5x7 Rugs Dy YILONG 2.5'x4' Small Hand 4,00 each 2 000,00 8 000,00 10% 8 800,00 Knotted Silk Carpets Antistatic Floor Area Rug 844B



## Image2

In [12]:
import re

def extract_line_items(text):
    # Extract text between ITEMS and SUMMARY (as before)
    items = []
    try:
        items_section = re.search(r'ITEMS(.*)SUMMARY', text, re.DOTALL | re.IGNORECASE)
        if items_section:
            lines = items_section.group(1).strip().split('\n')
            lines = [line.strip() for line in lines if line.strip()]
            return lines
    except Exception as e:
        print(f"Error extracting line items: {e}")
    return items

def parse_line_items(lines):
    items = []
    current_item_lines = []
    
    for line in lines:
        if re.match(r'^\d+\.', line.strip()):
            if current_item_lines:
                items.append(' '.join(current_item_lines))
                current_item_lines = []
            current_item_lines.append(line.strip())
        else:
            current_item_lines.append(line.strip())
    if current_item_lines:
        items.append(' '.join(current_item_lines))
    return items

# Read the OCR text file first
file_path = r'C:\Users\Gouthum\Downloads\Project\output\ocr_text\batch1-0477.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Extract and parse line items
line_items = extract_line_items(text)
parsed_items = parse_line_items(line_items)

for i, item in enumerate(parsed_items, 1):
    print(f"Item {i}: {item}\n")


Item 1: No. Description Qty UM Net price Net worth VAT [%] Gross worth ils iBUYPOWER Gaming Computer 1,00 each 509,95 509,95 10% 560,95 AMD Ryzen 3 3100, Radeon RX 550, 240GB SSD, WiFi Ready

Item 2: 2. Dell Desktop Computer PC Core 1,00 each 124,95 124,95 10% 137,45 i5 3.1GHz 8GB RAM 500GB HD Wifi DVD Windows 10 2 HP 8300 Slim Desktop Small 3,00 each 99,99 299,97 10% 329,97 Computer PC i5 3.2GHz 4GB 500GB Windows 10 Pro WiFi



# Parsing Based on Line Item Structure

In [13]:
import re

def extract_invoice_fields(text):
    fields = {
        "invoice_number": None,
        "invoice_date": None,
        "seller": {},
        "client": {},
        "iban": None,
    }

    # Extract Invoice Number
    invoice_no_match = re.search(r"Invoice\s+no[:\-]?\s*(\d+)", text, re.IGNORECASE)
    if invoice_no_match:
        fields["invoice_number"] = invoice_no_match.group(1)

    # Extract Invoice Date
    date_match = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", text)
    if date_match:
        fields["invoice_date"] = date_match.group(1)

    # Extract Seller Info (multiline between 'Seller:' and 'Tax Id')
    seller_match = re.search(r"Seller:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if seller_match:
        seller_info = seller_match.group(1).strip().replace("\n", " ")
        fields["seller"]["name_address"] = ' '.join(seller_info.split())
        fields["seller"]["tax_id"] = seller_match.group(2).strip()

    # Extract Client Info (multiline between 'Client:' and 'Tax Id')
    client_match = re.search(r"Client:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if client_match:
        client_info = client_match.group(1).strip().replace("\n", " ")
        fields["client"]["name_address"] = ' '.join(client_info.split())
        fields["client"]["tax_id"] = client_match.group(2).strip()

    # Extract IBAN
    iban_match = re.search(r"IBAN:\s*([A-Z0-9]+)", text, re.IGNORECASE)
    if iban_match:
        fields["iban"] = iban_match.group(1).strip()

    return fields


# Sample input
text_line = """
Invoice no: 69721323

Date of issue:

Seller:

Murray-Eaton
773 Joseph Plains
West Nicoleville, AZ 46136

Tax Id: 936-71-8228

05/07/2019

Client:

Cuevas, Reid and Hurst
98071 Daniel Heights
Careyside, MS 59400

Tax Id: 949-88-4885

IBAN: GB22XZGA27411153163644

ITEMS
...
"""

# Extracted fields
fields = extract_invoice_fields(text_line)

# Print results
print("Extracted Fields:\n")
print(f"Invoice Number: {fields['invoice_number']}")
print(f"Invoice Date: {fields['invoice_date']}")
print(f"Seller Info:\n  Name & Address: {fields['seller'].get('name_address')}\n  Tax ID: {fields['seller'].get('tax_id')}")
print(f"Client Info:\n  Name & Address: {fields['client'].get('name_address')}\n  Tax ID: {fields['client'].get('tax_id')}")
print(f"IBAN: {fields['iban']}")


Extracted Fields:

Invoice Number: 69721323
Invoice Date: 05/07/2019
Seller Info:
  Name & Address: Murray-Eaton 773 Joseph Plains West Nicoleville, AZ 46136
  Tax ID: 936-71-8228
Client Info:
  Name & Address: Cuevas, Reid and Hurst 98071 Daniel Heights Careyside, MS 59400
  Tax ID: 949-88-4885
IBAN: GB22XZGA27411153163644


## Example2

In [14]:
import re

def extract_invoice_fields(text):
    fields = {
        "invoice_number": None,
        "invoice_date": None,
        "seller": {},
        "client": {},
        "iban": None,
    }

    # Extract Invoice Number
    invoice_no_match = re.search(r"Invoice\s+no[:\-]?\s*(\d+)", text, re.IGNORECASE)
    if invoice_no_match:
        fields["invoice_number"] = invoice_no_match.group(1)

    # Extract Invoice Date
    date_match = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", text)
    if date_match:
        fields["invoice_date"] = date_match.group(1)

    # Extract Seller Info (multiline between 'Seller:' and 'Tax Id')
    seller_match = re.search(r"Seller:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if seller_match:
        seller_info = seller_match.group(1).strip().replace("\n", " ")
        fields["seller"]["name_address"] = ' '.join(seller_info.split())
        fields["seller"]["tax_id"] = seller_match.group(2).strip()

    # Extract Client Info (multiline between 'Client:' and 'Tax Id')
    client_match = re.search(r"Client:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if client_match:
        client_info = client_match.group(1).strip().replace("\n", " ")
        fields["client"]["name_address"] = ' '.join(client_info.split())
        fields["client"]["tax_id"] = client_match.group(2).strip()

    # Extract IBAN
    iban_match = re.search(r"IBAN:\s*([A-Z0-9]+)", text, re.IGNORECASE)
    if iban_match:
        fields["iban"] = iban_match.group(1).strip()

    return fields


# Sample input
text_line = """
Invoice no: 65981953

Date of issue:

Seller:

Horton LLC
44374 Watkins Points
Norriston, IL 77707

05/14/2020

Client:

Jackson-Holland
167 Howard Place Suite 420
Gordonville, NM 91832

Tax Id: 932-86-9428
IBAN: GB9OSKPB52785824368203

Tax Id: 978-99-3941

ITEMS

...
"""

# Extracted fields
fields = extract_invoice_fields(text_line)

# Print results
print("Extracted Fields:\n")
print(f"Invoice Number: {fields['invoice_number']}")
print(f"Invoice Date: {fields['invoice_date']}")
print(f"Seller Info:\n  Name & Address: {fields['seller'].get('name_address')}\n  Tax ID: {fields['seller'].get('tax_id')}")
print(f"Client Info:\n  Name & Address: {fields['client'].get('name_address')}\n  Tax ID: {fields['client'].get('tax_id')}")
print(f"IBAN: {fields['iban']}")


Extracted Fields:

Invoice Number: 65981953
Invoice Date: 05/14/2020
Seller Info:
  Name & Address: Horton LLC 44374 Watkins Points Norriston, IL 77707 05/14/2020 Client: Jackson-Holland 167 Howard Place Suite 420 Gordonville, NM 91832
  Tax ID: 932-86-9428
Client Info:
  Name & Address: Jackson-Holland 167 Howard Place Suite 420 Gordonville, NM 91832
  Tax ID: 932-86-9428
IBAN: GB9OSKPB52785824368203


# Extracting line items from Invoice Number, Invoice Date, Seller Info, Client Info, IBAN  From Image

In [15]:
import pytesseract
from PIL import Image
import re

# Path to tesseract executable (update path if needed)
# Example for Windows:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_invoice_fields(text):
    fields = {
        "invoice_number": None,
        "invoice_date": None,
        "seller": {"name": None, "address": None, "tax_id": None},
        "client": {"name": None, "address": None, "tax_id": None},
        "iban": None,
    }

    # Invoice Number
    invoice_no_match = re.search(r"Invoice\s+no[:\-]?\s*(\d+)", text, re.IGNORECASE)
    if invoice_no_match:
        fields["invoice_number"] = invoice_no_match.group(1)

    # Invoice Date
    date_match = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", text)
    if date_match:
        fields["invoice_date"] = date_match.group(1)

    # Seller
    seller_match = re.search(r"Seller:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if seller_match:
        seller_block = seller_match.group(1).strip().split("\n", 1)
        if len(seller_block) == 2:
            fields["seller"]["name"] = seller_block[0].strip()
            fields["seller"]["address"] = ' '.join(seller_block[1].strip().split())
        else:
            fields["seller"]["name"] = seller_block[0].strip()
        fields["seller"]["tax_id"] = seller_match.group(2).strip()

    # Client
    client_match = re.search(r"Client:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if client_match:
        client_block = client_match.group(1).strip().split("\n", 1)
        if len(client_block) == 2:
            fields["client"]["name"] = client_block[0].strip()
            fields["client"]["address"] = ' '.join(client_block[1].strip().split())
        else:
            fields["client"]["name"] = client_block[0].strip()
        fields["client"]["tax_id"] = client_match.group(2).strip()

    # IBAN
    iban_match = re.search(r"IBAN:\s*([A-Z0-9]+)", text, re.IGNORECASE)
    if iban_match:
        fields["iban"] = iban_match.group(1).strip()

    return fields

# ----------- MAIN IMAGE PROCESSING -----------

# Load image file
image_path = "C:/Users/Gouthum/Downloads/Project/batch_3/batch_1/batch1_1/batch1-0479.jpg"  # ← Replace with my invoice image path
image = Image.open(image_path)

# Extract text from image using OCR
ocr_text = pytesseract.image_to_string(image)

# Apply extraction logic
invoice_data = extract_invoice_fields(ocr_text)

# Print extracted data
print("Extracted Fields:\n")
print(f"Invoice Number: {invoice_data['invoice_number']}")
print(f"Invoice Date: {invoice_data['invoice_date']}")
print("Seller Info:")
print(f"  Name: {invoice_data['seller'].get('name')}")
print(f"  Address: {invoice_data['seller'].get('address')}")
print(f"  Tax ID: {invoice_data['seller'].get('tax_id')}")
print("Client Info:")
print(f"  Name: {invoice_data['client'].get('name')}")
print(f"  Address: {invoice_data['client'].get('address')}")
print(f"  Tax ID: {invoice_data['client'].get('tax_id')}")
print(f"IBAN: {invoice_data['iban']}")


Extracted Fields:

Invoice Number: 69721323
Invoice Date: 05/07/2019
Seller Info:
  Name: Murray-Eaton
  Address: 773 Joseph Plains West Nicoleville, AZ 46136
  Tax ID: 936-71-8228
Client Info:
  Name: Cuevas, Reid and Hurst
  Address: 98071 Daniel Heights Careyside, MS 59400
  Tax ID: 949-88-4885
IBAN: GB22XZGA27411153163644


## image3

In [16]:
import pytesseract
from PIL import Image
import re

# Path to tesseract executable (update path if needed)
# Example for Windows:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_invoice_fields(text):
    fields = {
        "invoice_number": None,
        "invoice_date": None,
        "seller": {"name": None, "address": None, "tax_id": None},
        "client": {"name": None, "address": None, "tax_id": None},
        "iban": None,
    }

    # Invoice Number
    invoice_no_match = re.search(r"Invoice\s+no[:\-]?\s*(\d+)", text, re.IGNORECASE)
    if invoice_no_match:
        fields["invoice_number"] = invoice_no_match.group(1)

    # Invoice Date
    date_match = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", text)
    if date_match:
        fields["invoice_date"] = date_match.group(1)

    # Seller
    seller_match = re.search(r"Seller:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if seller_match:
        seller_block = seller_match.group(1).strip().split("\n", 1)
        if len(seller_block) == 2:
            fields["seller"]["name"] = seller_block[0].strip()
            fields["seller"]["address"] = ' '.join(seller_block[1].strip().split())
        else:
            fields["seller"]["name"] = seller_block[0].strip()
        fields["seller"]["tax_id"] = seller_match.group(2).strip()

    # Client
    client_match = re.search(r"Client:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if client_match:
        client_block = client_match.group(1).strip().split("\n", 1)
        if len(client_block) == 2:
            fields["client"]["name"] = client_block[0].strip()
            fields["client"]["address"] = ' '.join(client_block[1].strip().split())
        else:
            fields["client"]["name"] = client_block[0].strip()
        fields["client"]["tax_id"] = client_match.group(2).strip()

    # IBAN
    iban_match = re.search(r"IBAN:\s*([A-Z0-9]+)", text, re.IGNORECASE)
    if iban_match:
        fields["iban"] = iban_match.group(1).strip()

    return fields

# ----------- MAIN IMAGE PROCESSING -----------

# Load image file
image_path = "C:/Users/Gouthum/Downloads/Project/batch_3/batch_1/batch1_1/batch1-0477.jpg"  # ← Replace with my invoice image path
image = Image.open(image_path)

# Extract text from image using OCR
ocr_text = pytesseract.image_to_string(image)

# Apply extraction logic
invoice_data = extract_invoice_fields(ocr_text)

# Print extracted data
print("Extracted Fields:\n")
print(f"Invoice Number: {invoice_data['invoice_number']}")
print(f"Invoice Date: {invoice_data['invoice_date']}")
print("Seller Info:")
print(f"  Name: {invoice_data['seller'].get('name')}")
print(f"  Address: {invoice_data['seller'].get('address')}")
print(f"  Tax ID: {invoice_data['seller'].get('tax_id')}")
print("Client Info:")
print(f"  Name: {invoice_data['client'].get('name')}")
print(f"  Address: {invoice_data['client'].get('address')}")
print(f"  Tax ID: {invoice_data['client'].get('tax_id')}")
print(f"IBAN: {invoice_data['iban']}")


Extracted Fields:

Invoice Number: 65981953
Invoice Date: 05/14/2020
Seller Info:
  Name: Horton LLC
  Address: 44374 Watkins Points Norriston, IL 77707 05/14/2020 Client: Jackson-Holland 167 Howard Place Suite 420 Gordonville, NM 91832
  Tax ID: 932-86-9428
Client Info:
  Name: Jackson-Holland
  Address: 167 Howard Place Suite 420 Gordonville, NM 91832
  Tax ID: 932-86-9428
IBAN: GB9OSKPB52785824368203


# Image2

In [17]:
import pytesseract
from PIL import Image
import re

# Path to tesseract executable (update path if needed)
# Example for Windows:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_invoice_fields(text):
    fields = {
        "invoice_number": None,
        "invoice_date": None,
        "seller": {"name": None, "address": None, "tax_id": None},
        "client": {"name": None, "address": None, "tax_id": None},
        "iban": None,
    }

    # Invoice Number
    invoice_no_match = re.search(r"Invoice\s+no[:\-]?\s*(\d+)", text, re.IGNORECASE)
    if invoice_no_match:
        fields["invoice_number"] = invoice_no_match.group(1)

    # Invoice Date
    date_match = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", text)
    if date_match:
        fields["invoice_date"] = date_match.group(1)

    # Seller
    seller_match = re.search(r"Seller:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if seller_match:
        seller_block = seller_match.group(1).strip().split("\n", 1)
        if len(seller_block) == 2:
            fields["seller"]["name"] = seller_block[0].strip()
            fields["seller"]["address"] = ' '.join(seller_block[1].strip().split())
        else:
            fields["seller"]["name"] = seller_block[0].strip()
        fields["seller"]["tax_id"] = seller_match.group(2).strip()

    # Client
    client_match = re.search(r"Client:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if client_match:
        client_block = client_match.group(1).strip().split("\n", 1)
        if len(client_block) == 2:
            fields["client"]["name"] = client_block[0].strip()
            fields["client"]["address"] = ' '.join(client_block[1].strip().split())
        else:
            fields["client"]["name"] = client_block[0].strip()
        fields["client"]["tax_id"] = client_match.group(2).strip()

    # IBAN
    iban_match = re.search(r"IBAN:\s*([A-Z0-9]+)", text, re.IGNORECASE)
    if iban_match:
        fields["iban"] = iban_match.group(1).strip()

    return fields

# ----------- MAIN IMAGE PROCESSING -----------

# Load image file
image_path = "C:/Users/Gouthum/Downloads/Project/batch_3/batch_1/batch1_1/batch1-0021.jpg"
image = Image.open(image_path)

# Extract text from image using OCR
ocr_text = pytesseract.image_to_string(image)

# Apply extraction logic
invoice_data = extract_invoice_fields(ocr_text)

# Print extracted data
print("Extracted Fields:\n")
print(f"Invoice Number: {invoice_data['invoice_number']}")
print(f"Invoice Date: {invoice_data['invoice_date']}")
print("Seller Info:")
print(f"  Name: {invoice_data['seller'].get('name')}")
print(f"  Address: {invoice_data['seller'].get('address')}")
print(f"  Tax ID: {invoice_data['seller'].get('tax_id')}")
print("Client Info:")
print(f"  Name: {invoice_data['client'].get('name')}")
print(f"  Address: {invoice_data['client'].get('address')}")
print(f"  Tax ID: {invoice_data['client'].get('tax_id')}")
print(f"IBAN: {invoice_data['iban']}")


Extracted Fields:

Invoice Number: 96430839
Invoice Date: 01/13/2019
Seller Info:
  Name: Bennett, Chavez and Vazquez
  Address: Unit 8361 Box 7078 DPO AE 69202
  Tax ID: 971-80-5191
Client Info:
  Name: Alexander-Petersen
  Address: 647 Joyce Ville Waltonfort, IA 57670
  Tax ID: 934-91-7455
IBAN: GB9ORYME18980824175499


# Parsing Based on Line Item Structure(More Organised way)

In [18]:
import re

def extract_invoice_fields(text):
    fields = {
        "invoice_number": None,
        "invoice_date": None,
        "seller": {"name": None, "address": None, "tax_id": None},
        "client": {"name": None, "address": None, "tax_id": None},
        "iban": None,
    }
  
    # Extract Invoice Number
    invoice_no_match = re.search(r"Invoice\s+no[:\-]?\s*(\d+)", text, re.IGNORECASE)
    if invoice_no_match:
        fields["invoice_number"] = invoice_no_match.group(1)


    # Extract Invoice Date (accepts common formats like dd/mm/yyyy)
    date_match = re.search(r"\b(\d{2}/\d{2}/\d{4})\b", text)
    if date_match:
        fields["invoice_date"] = date_match.group(1)


    # Extract Seller Info
    seller_match = re.search(r"Seller:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if seller_match:
        seller_block = seller_match.group(1).strip().split("\n", 1)
        if len(seller_block) == 2:
            fields["seller"]["name"] = seller_block[0].strip()
            fields["seller"]["address"] = ' '.join(seller_block[1].strip().split())
        else:
            fields["seller"]["name"] = seller_block[0].strip()
        fields["seller"]["tax_id"] = seller_match.group(2).strip()

    
    # Extract Client Info
    client_match = re.search(r"Client:\s*(.*?)\s*Tax Id:\s*([\d\-]+)", text, re.IGNORECASE | re.DOTALL)
    if client_match:
        client_block = client_match.group(1).strip().split("\n", 1)
        if len(client_block) == 2:
            fields["client"]["name"] = client_block[0].strip()
            fields["client"]["address"] = ' '.join(client_block[1].strip().split())
        else:
            fields["client"]["name"] = client_block[0].strip()
        fields["client"]["tax_id"] = client_match.group(2).strip()


    # Extract IBAN
    iban_match = re.search(r"IBAN:\s*([A-Z0-9]+)", text, re.IGNORECASE)
    if iban_match:
        fields["iban"] = iban_match.group(1).strip()

    return fields

# Sample input text
text_line = """
Invoice no: 69721323

Date of issue:

Seller:

Murray-Eaton
773 Joseph Plains
West Nicoleville, AZ 46136

Tax Id: 936-71-8228

05/07/2019

Client:

Cuevas, Reid and Hurst
98071 Daniel Heights
Careyside, MS 59400

Tax Id: 949-88-4885

IBAN: GB22XZGA27411153163644

ITEMS
...
"""

# Extract and print fields
fields = extract_invoice_fields(text_line)

print("Extracted Fields:\n")
print(f"Invoice Number: {fields['invoice_number']}")
print(f"Invoice Date: {fields['invoice_date']}")
print("Seller Info:")
print(f"  Name: {fields['seller'].get('name')}")
print(f"  Address: {fields['seller'].get('address')}")
print(f"  Tax ID: {fields['seller'].get('tax_id')}")
print("Client Info:")
print(f"  Name: {fields['client'].get('name')}")
print(f"  Address: {fields['client'].get('address')}")
print(f"  Tax ID: {fields['client'].get('tax_id')}")
print(f"IBAN: {fields['iban']}")


Extracted Fields:

Invoice Number: 69721323
Invoice Date: 05/07/2019
Seller Info:
  Name: Murray-Eaton
  Address: 773 Joseph Plains West Nicoleville, AZ 46136
  Tax ID: 936-71-8228
Client Info:
  Name: Cuevas, Reid and Hurst
  Address: 98071 Daniel Heights Careyside, MS 59400
  Tax ID: 949-88-4885
IBAN: GB22XZGA27411153163644


# Extracting line items from invoice OCR text

In [19]:
import re
import pandas as pd

# Example OCR text lines for items (replace with your actual OCR text)
ocr_text = """
1. Tie Dyeing Fluffy Rugs Anti-Skid 4,00 each 25,48 101,92 10% 112,11
2. Galaxy Butterfly Area Rugs 2,00 each 13,69 27,38 10% 30,12
3. Colorful Marble Printed Living 2,00 each 12,49 24,98 10% 27,48
4. Red Traditional Oriental 4,00 each 39,98 159,92 10% 175,91
5. YILONG 2.5'x4' Small Hand 4,00 each 2 000,00 8 000,00 10% 8 800,00
"""

# Extract lines (assuming each item is one line)
lines = ocr_text.strip().split('\n')

items = []

# Regex pattern to extract fields except description:
pattern = re.compile(
    r"^(?P<No>\d+)[\.:]?\s.*?\s"               # Item No and skip description (non-greedy)
    r"(?P<Qty>\d+,\d+)\s"                      # Quantity (e.g. 4,00)
    r"(?P<UM>\w+)\s"                           # Unit of Measure (e.g. each)
    r"(?P<NetPrice>[\d\s,]+)\s"                # Net price (e.g. 25,48 or 2 000,00)
    r"(?P<NetWorth>[\d\s,]+)\s"                # Net worth
    r"(?P<VAT>\d+)%\s"                         # VAT %
    r"(?P<Gross>[\d\s,]+)$"                    # Gross worth
)

for line in lines:
    match = pattern.match(line)
    if match:
        data = match.groupdict()
        # Normalize numbers: remove spaces, replace commas with dots
        for key in ['Qty', 'NetPrice', 'NetWorth', 'Gross']:
            data[key] = data[key].replace(" ", "").replace(",", ".")
        items.append(data)
    else:
        print(f"Skipped line (no match): {line}")

# Create DataFrame
df = pd.DataFrame(items)

print(df)


  No   Qty    UM  NetPrice NetWorth VAT    Gross
0  1  4.00  each     25.48   101.92  10   112.11
1  2  2.00  each     13.69    27.38  10    30.12
2  3  2.00  each     12.49    24.98  10    27.48
3  4  4.00  each     39.98   159.92  10   175.91
4  5  4.00  each  2000.008   000.00  10  8800.00


In [20]:
df

Unnamed: 0,No,Qty,UM,NetPrice,NetWorth,VAT,Gross
0,1,4.0,each,25.48,101.92,10,112.11
1,2,2.0,each,13.69,27.38,10,30.12
2,3,2.0,each,12.49,24.98,10,27.48
3,4,4.0,each,39.98,159.92,10,175.91
4,5,4.0,each,2000.008,0.0,10,8800.0


# Extracting line items from invoice images

In [35]:
import cv2
import pytesseract
import re
import pandas as pd

# Path to Tesseract executable (update if needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_line_items_from_text(text):
    lines = text.strip().split('\n')
    items = []

    pattern = re.compile(
        r"^(?P<No>\d+)[\.:]?\s.*?\s"               # Item No and skip description (non-greedy)
        r"(?P<Qty>\d+,\d+)\s"                      # Quantity (e.g. 4,00)
        r"(?P<UM>\w+)\s"                           # Unit of Measure (e.g. each)
        r"(?P<NetPrice>[\d\s,]+)\s"                # Net price (e.g. 25,48 or 2 000,00)
        r"(?P<NetWorth>[\d\s,]+)\s"                # Net worth
        r"(?P<VAT>\d+)%\s"                         # VAT %
        r"(?P<Gross>[\d\s,]+)$"                    # Gross worth
    )

    for line in lines:
        match = pattern.match(line)
        if match:
            data = match.groupdict()
            # Normalize numbers: remove spaces, replace commas with dots
            for key in ['Qty', 'NetPrice', 'NetWorth', 'Gross']:
                data[key] = data[key].replace(" ", "").replace(",", ".")
            items.append(data)
        else:
            # Uncomment to debug lines skipped:
            # print(f"Skipped line (no match): {line}")
            pass

    return pd.DataFrame(items)

def extract_line_items_from_image(image_path):
    # Load image
    img = cv2.imread(image_path)

    if img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Convert to grayscale (better OCR results)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Run OCR
    ocr_text = pytesseract.image_to_string(gray)

    # Extract line items from OCR text
    df_items = extract_line_items_from_text(ocr_text)

    return df_items

# Example usage
if __name__ == "__main__":
    image_path = "C:/Users/Gouthum/Downloads/Project/batch_3/batch_1/batch1_1/batch1-0479.jpg"


    try:
        df = extract_line_items_from_image(image_path)
        if not df.empty:
            print("Extracted Line Items from Image:")
            print(df)
        else:
            print("No line items found in the image OCR.")
    except Exception as e:
        print(f"Error: {e}")


Extracted Line Items from Image:
  No   Qty    UM  NetPrice NetWorth VAT    Gross
0  2  2.00  each     13.69    27.38  10    30.12
1  4  4.00  each     39.98   159.92  10   175.91
2  5  4.00  each  2000.008   000.00  10  8800.00


In [22]:
df

Unnamed: 0,No,Qty,UM,NetPrice,NetWorth,VAT,Gross
0,2,2.0,each,13.69,27.38,10,30.12
1,4,4.0,each,39.98,159.92,10,175.91
2,5,4.0,each,2000.008,0.0,10,8800.0


In [36]:
import os
import cv2
import pytesseract
import re
import pandas as pd

# Configure Tesseract path (update if needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Regex pattern for line items
pattern = re.compile(
    r"^(?P<No>\d+)[\.:]?\s.*?\s"
    r"(?P<Qty>\d+,\d+)\s"
    r"(?P<UM>\w+)\s"
    r"(?P<NetPrice>[\d\s,]+)\s"
    r"(?P<NetWorth>[\d\s,]+)\s"
    r"(?P<VAT>\d+)%\s"
    r"(?P<Gross>[\d\s,]+)$"
)

def extract_line_items_from_text(text, filename):
    lines = text.strip().split('\n')
    items = []

    for line in lines:
        match = pattern.match(line)
        if match:
            data = match.groupdict()
            for key in ['Qty', 'NetPrice', 'NetWorth', 'Gross']:
                data[key] = data[key].replace(" ", "").replace(",", ".")
            data["Filename"] = filename
            items.append(data)

    return items

def process_image(image_path):
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

def collect_all_invoice_data(directories):
    all_items = []

    for folder in directories:
        for root, _, files in os.walk(folder):
            for file in files:
                if file.lower().endswith(('.jpg', '.jpeg', '.png', '.tiff')):
                    full_path = os.path.join(root, file)
                    try:
                        print(f"Processing: {full_path}")
                        text = process_image(full_path)
                        items = extract_line_items_from_text(text, file)
                        all_items.extend(items)
                    except Exception as e:
                        print(f"Error processing {full_path}: {e}")

    return pd.DataFrame(all_items)

# Your image directories
directories = [
    r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1",
    r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1",
    r"C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_1",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_5",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_4",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_1",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_2\batch2_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_2\batch2_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_2\batch2_1",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_1"
]

# Run the extraction
df_all = collect_all_invoice_data(directories)

# Show or save results
print(df_all)

# Optional: Save to CSV/JSON
df_all.to_csv(r"C:\Users\Gouthum\Downloads\Project\output\all_invoiceee_items.csv", index=False)
df_all.to_json(r"C:\Users\Gouthum\Downloads\Project\output\all_invoiceee_items.json", orient='records', indent=2)


Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0001.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0002.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0003.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0004.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0005.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0006.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0007.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0008.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0009.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0010.jpg
Processing: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0011.jpg
Processing: C:\Users\Gouthum\Dow

In [43]:
import cv2
import pytesseract
import re
import pandas as pd

# Path to Tesseract executable (update if needed)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_line_items_from_text(text):
    lines = text.strip().split('\n')
    items = []

    # Updated regex to capture description
    pattern = re.compile(
        r"^(?P<No>\d+)[\.:]?\s+"                          # Item No
        r"(?P<Description>.+?)\s+"                         # Description (non-greedy)
        r"(?P<Qty>\d+,\d+)\s+"                             # Quantity
        r"(?P<UM>\w+)\s+"                                  # Unit of Measure
        r"(?P<NetPrice>[\d\s,]+)\s+"                       # Net price
        r"(?P<NetWorth>[\d\s,]+)\s+"                       # Net worth
        r"(?P<VAT>\d+)(?:%?)\s+"                           # VAT (optional % sign)
        r"(?P<Gross>[\d\s,]+)$"                            # Gross
    )

    for line in lines:
        match = pattern.match(line)
        if match:
            data = match.groupdict()
            # Normalize numbers
            for key in ['Qty', 'NetPrice', 'NetWorth', 'Gross']:
                data[key] = data[key].replace(" ", "").replace(",", ".")
            items.append(data)

    return pd.DataFrame(items)

def extract_line_items_from_image(image_path):
    # Load image
    img = cv2.imread(image_path)

    if img is None:
        raise FileNotFoundError(f"Image not found: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Run OCR
    ocr_text = pytesseract.image_to_string(gray)

    # Extract line items
    df_items = extract_line_items_from_text(ocr_text)

    return df_items

# Example usage
if __name__ == "__main__":
    image_path = "C:/Users/Gouthum/Downloads/Project/batch_3/batch_1/batch1_1/batch1-0479.jpg"

    try:
        df = extract_line_items_from_image(image_path)
        if not df.empty:
            print("Extracted Line Items from Image:")
            print(df)
        else:
            print("No line items found in the image OCR.")
    except Exception as e:
        print(f"Error: {e}")


Extracted Line Items from Image:
  No                 Description   Qty    UM  NetPrice NetWorth VAT    Gross
0  2  Galaxy Butterfly Area Rugs  2.00  each     13.69    27.38  10    30.12
1  4    Red Traditional Oriental  4.00  each     39.98   159.92  10   175.91
2  5   YILONG 2.5'x4' Small Hand  4.00  each  2000.008   000.00  10  8800.00


In [None]:
C:/Users/Gouthum/Downloads/Project/batch_3/batch_1/batch1_1/batch1-0479.jpg

# image2

# Key Value

In [32]:
import os
import cv2
import pytesseract
from PIL import Image
from tqdm import tqdm

# Set up tesseract path (update if needed)
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Define all image folder paths
image_dirs = [
    r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1",
    r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1",

    r"C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_1",

    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_5",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_4",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_1",

    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_2\batch2_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_2\batch2_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_2\batch2_1",

    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_3",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_2",
    r"C:\Users\Gouthum\Downloads\Project\batch_3\batch_1\batch1_1",
]

# Supported image extensions
image_extensions = ['.jpg', '.jpeg', '.png', '.tiff', '.bmp']

# Extract key-value pairs from OCR text (basic example, customize this later)
def extract_key_value_pairs(text):
    key_values = {}
    for line in text.split('\n'):
        if ':' in line:
            parts = line.split(':', 1)
            key = parts[0].strip()
            value = parts[1].strip()
            if key and value:
                key_values[key] = value
    return key_values

# Process all images in the given folders
all_results = []

for image_dir in image_dirs:
    if not os.path.exists(image_dir):
        print(f"Folder not found: {image_dir}")
        continue

    for filename in tqdm(os.listdir(image_dir), desc=f"Processing {image_dir}"):
        ext = os.path.splitext(filename)[-1].lower()
        if ext in image_extensions:
            image_path = os.path.join(image_dir, filename)
            try:
                image = Image.open(image_path)
                ocr_text = pytesseract.image_to_string(image)

                key_values = extract_key_value_pairs(ocr_text)

                all_results.append({
                    'image_path': image_path,
                    'ocr_text': ocr_text,
                    'key_values': key_values
                })

            except Exception as e:
                print(f"Failed to process {image_path}: {e}")

# Example: Print result summary
print("\n--- Extracted Key-Values Summary ---")
for result in all_results:
    print(f"\nImage: {os.path.basename(result['image_path'])}")
    for k, v in result['key_values'].items():
        print(f"{k}: {v}")


Processing C:\Users\Gouthum\Downloads\Project\batch_1\batch_1: 100%|█████████████████████████████| 6/6 [00:00<?, ?it/s]
Processing C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_2: 100%|████████| 501/501 [18:40<00:00,  2.24s/it]
Processing C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1: 100%|████████| 499/499 [19:02<00:00,  2.29s/it]
Processing C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_3: 100%|████████| 490/490 [13:11<00:00,  1.62s/it]
Processing C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_2: 100%|████████| 501/501 [14:29<00:00,  1.74s/it]
Processing C:\Users\Gouthum\Downloads\Project\batch_2\batch_2\batch2_1: 100%|████████| 500/500 [19:31<00:00,  2.34s/it]
Processing C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_5: 100%|████████| 222/222 [07:07<00:00,  1.93s/it]
Processing C:\Users\Gouthum\Downloads\Project\batch_3\batch_3\batch3_4: 100%|████████| 500/500 [25:12<00:00,  3.02s/it]
Processing C:\Users\Gouthum\Downloads\Pr


--- Extracted Key-Values Summary ---

Image: batch1-0500.jpg
Invoice no: 34476404
Date of issue: 07/24/2018
Seller: Client:
Tax Id: 975-86-1582 Tax Id: 902-92-1120
IBAN: GB14UTZB20101973421914

Image: batch1-0501.jpg
Invoice no: 18999056
Tax Id: 934-91-7704
IBAN: GB96KZTB85221072973831

Image: batch1-0502.jpg
Invoice no: 33289699
Tax Id: 953-82-2864
IBAN: GB32HGAS20083011657056

Image: batch1-0503.jpg
Invoice no: 22758345
Tax Id: 942-74-5002
IBAN: GB36KYWK10385657483744

Image: batch1-0504.jpg
Invoice no: 79036267
Date of issue: 09/16/2016
Seller: Client:
Tax Id: 940-81-6212 Tax Id: 903-76-2071
IBAN: GBO6EYSH87132118408113

Image: batch1-0505.jpg
Invoice no: 11556735
Tax Id: 939-73-8621
IBAN: GB82SVCE08751937948089

Image: batch1-0506.jpg
Invoice no: 35382296
Tax Id: 982-92-4270
IBAN: GB97SVBJ13467617537698

Image: batch1-0507.jpg
Invoice no: 64661204
Tax Id: 917-70-4830
IBAN: GB60AMAZ11977251874745

Image: batch1-0508.jpg
Invoice no: 95545504
Tax Id: 998-79-7169
IBAN: GB87IPTJ0558297

## Option 1: Save as JSON (recommended for structured key-value pairs)

In [33]:
import json

output_file_path = r"C:\Users\Gouthum\Downloads\Project\ocr_results.json"

with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=4, ensure_ascii=False)

print(f"\n✅ OCR results saved to: {output_file_path}")



✅ OCR results saved to: C:\Users\Gouthum\Downloads\Project\ocr_results.json


## Save My  structured OCR results

In [38]:
import json

try:
    with open(r'C:\Users\Gouthum\Downloads\Project\ocr_results.json', 'r', encoding='utf-8') as f:
        ocr_data = json.load(f)
    print("✅ Successfully loaded OCR data.")
except json.JSONDecodeError as e:
    print(f"❌ JSON decode error: {e}")
except Exception as e:
    print(f"❌ Other error: {e}")


✅ Successfully loaded OCR data.


In [39]:
import os
import json

json_path = r"C:\Users\Gouthum\Downloads\Project\ocr_results.json"

if os.path.exists(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        ocr_data = json.load(f)
    print("✅ Loaded existing OCR results.")
else:
    # your OCR code here
    ocr_data = run_ocr_and_collect_results()  # hypothetical function
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(ocr_data, f, indent=4, ensure_ascii=False)
    print("✅ OCR results saved.")


✅ Loaded existing OCR results.


In [40]:
with open('ocr_results.json', 'r', encoding='utf-8') as f:
    ocr_data = json.load(f)

In [41]:
ocr_data

[{'image_path': 'C:\\Users\\Gouthum\\Downloads\\Project\\batch_1\\batch_1\\batch1_2\\batch1-0500.jpg',
  'ocr_text': "Invoice no: 34476404\n\nDate of issue: 07/24/2018\n\nSeller: Client:\n\nCarrillo, Lara and Hooper Ellis and Sons\n\nUSNV Jones 85942 Tucker Plains Apt. 982\nFPO AE 13513 Hernandezchester, SC 88596\nTax Id: 975-86-1582 Tax Id: 902-92-1120\n\nIBAN: GB14UTZB20101973421914\n\nITEMS\nNo. Description Qty UM Net price Net worth VAT [%] Gross\nworth\nils Pack 3 Artificial Sheepskin Seat 1,00 each 14,22 14,22 10% 15,64\nPads 30cm Round Pad Floor\nArea Carpets\n2. Soft Sheepskin Mat Faux Fur 2,00 each 8,40 16,80 10% 18,48\nRug Small Shaggy Non Slip\nFloor Carpet Yellow\noo Chindi Rugs Carpet New Design 4,00 each 29,99 119,96 10% 131,96\nBohemian Garden Yoga Mat\nIndian Kilim Coverlet\n4. YILONG 9'x12' Oversize 4,00 each 7 800,00 31 200,00 10% 34 320,00\nHandmade Silk Carpet Antistatic\nLuxurious Area Rug TJ107A\nSUMMARY\nVAT [%] Net worth VAT Gross worth\n10% 31 350,98 3 135,10 3

# NLP

In [53]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [54]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to C:\Users\Gouthum/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gouthum/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gouthum/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# STEP 1: Load All CSVs

#### load and combine them into a single DataFrame

In [55]:
import pandas as pd
import os

# Folder path
folder_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1'

# File names
csv_files = ['batch1_1.csv', 'batch1_2.csv', 'batch1_3.csv']  

# Load and concatenate all CSVs
dfs = []
for file in csv_files:
    full_path = os.path.join(folder_path, file)
    df = pd.read_csv(full_path)
    dfs.append(df)

# Combine into one DataFrame
full_df = pd.concat(dfs, ignore_index=True)
print("✅ Combined CSV shape:", full_df.shape)

# Save the combined DataFrame
output_path = os.path.join(folder_path, 'combined_batch1.csv')
full_df.to_csv(output_path, index=False, encoding='utf-8-sig')
print(f"✅ Combined CSV saved to: {output_path}")

✅ Combined CSV shape: (1414, 3)
✅ Combined CSV saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\combined_batch1.csv


In [56]:
import pandas as pd
import os

# Folder path
folder_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1'

# File names
csv_files = ['batch1_1.csv', 'batch1_2.csv', 'batch1_3.csv']  # Update if different

# Load and concatenate all CSVs
dfs = []
for file in csv_files:
    full_path = os.path.join(folder_path, file)
    df = pd.read_csv(full_path)
    dfs.append(df)

# Combine into one DataFrame
full_df = pd.concat(dfs, ignore_index=True)
print("✅ Combined CSV shape:", full_df.shape)

# Save combined CSV
csv_output_path = os.path.join(folder_path, 'combined_batch1.csv')
full_df.to_csv(csv_output_path, index=False, encoding='utf-8-sig')
print(f"✅ CSV saved to: {csv_output_path}")

# Save as JSON
json_output_path = os.path.join(folder_path, 'combined_batch1.json')
full_df.to_json(json_output_path, orient='records', indent=4, force_ascii=False)
print(f"✅ JSON saved to: {json_output_path}")


✅ Combined CSV shape: (1414, 3)
✅ CSV saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\combined_batch1.csv
✅ JSON saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\combined_batch1.json


# STEP 2: Rename Columns (if needed)

In [57]:
# Let's rename for consistency
full_df.columns = ['file_name', 'json_data', 'ocr_text']

In [58]:
full_df.columns

Index(['file_name', 'json_data', 'ocr_text'], dtype='object')

In [85]:
import pandas as pd
import os

# Folder path
folder_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1'

# File names
csv_files = ['batch1_1.csv', 'batch1_2.csv', 'batch1_3.csv']  # Update if different

# Load and concatenate all CSVs
dfs = []
for file in csv_files:
    full_path = os.path.join(folder_path, file)
    df = pd.read_csv(full_path)
    dfs.append(df)

# Combine into one DataFrame
full_df = pd.concat(dfs, ignore_index=True)
print("✅ Combined CSV shape:", full_df.shape)

# ✅ Rename columns
full_df.rename(columns={
    'File Name': 'file_name',
    'Json Data': 'json_data',
    'OCRed Text': 'ocr_text'
}, inplace=True)

# ✅ Save as CSV
csv_output_path = os.path.join(folder_path, 'combined_batch1.csv')
full_df.to_csv(csv_output_path, index=False, encoding='utf-8-sig')
print(f"✅ CSV saved to: {csv_output_path}")

# ✅ Save as JSON
json_output_path = os.path.join(folder_path, 'combined_batch1.json')
full_df.to_json(json_output_path, orient='records', indent=4, force_ascii=False)
print(f"✅ JSON saved to: {json_output_path}")


✅ Combined CSV shape: (1414, 3)
✅ CSV saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\combined_batch1.csv
✅ JSON saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\combined_batch1.json


In [62]:
print(full_df.columns.tolist())

['File Name', 'Json Data', 'OCRed Text']


In [92]:
import pandas as pd
import os

# Base folder path
base_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1'

# List of CSV files to combine
csv_files = ['batch1_1.csv', 'batch1_2.csv', 'batch1_3.csv']

# Load and concatenate all CSVs
dfs = []
for file in csv_files:
    full_path = os.path.join(base_path, file)
    df = pd.read_csv(full_path)
    dfs.append(df)

# Combine into one DataFrame
full_df = pd.concat(dfs, ignore_index=True)
print("✅ Combined CSV shape:", full_df.shape)

# ✅ Rename columns for NLP readiness
full_df.rename(columns={
    'File Name': 'file_name',
    'Json Data': 'json_data',
    'OCRed Text': 'ocr_text'
}, inplace=True)

# ✅ Preprocessing output folder
preprocess_folder = os.path.join(base_path, 'pre_process')
os.makedirs(preprocess_folder, exist_ok=True)

# ✅ Save as CSV
csv_output_path = os.path.join(preprocess_folder, 'combined_batch1.csv')
full_df.to_csv(csv_output_path, index=False, encoding='utf-8-sig')
print(f"✅ CSV saved to: {csv_output_path}")

# ✅ Save as JSON
json_output_path = os.path.join(preprocess_folder, 'combined_batch1.json')
full_df.to_json(json_output_path, orient='records', indent=4, force_ascii=False)
print(f"✅ JSON saved to: {json_output_path}")


✅ Combined CSV shape: (1414, 3)
✅ CSV saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\pre_process\combined_batch1.csv
✅ JSON saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\pre_process\combined_batch1.json


In [89]:
print(full_df.columns.tolist())

['file_name', 'json_data', 'ocr_text']


In [90]:
preprocess_folder = os.path.join(base_path, 'pre_process')

In [91]:
preprocess_folder

'C:\\Users\\Gouthum\\Downloads\\Project\\batch_1\\batch_1\\pre_process'

# STEP 3: Preprocess OCR Text

##### Clean whitespace, remove \n, etc.

In [93]:
import re

def clean_text(text):
    if pd.isna(text):
        return ''
    text = text.replace('\n', ' ').replace('\r', '')
    text = re.sub(r'\s{2,}', ' ', text)  # collapse extra spaces
    return text.strip()

full_df['cleaned_text'] = full_df['ocr_text'].apply(clean_text)

In [94]:
full_df['cleaned_text']

0       Invoice no: 84652373 Date of issue: 02/23/2021...
1       Invoice no: 37451664 Date of issue: 06/11/2020...
2       Invoice no: 40108666 Date of issue: 02/07/2020...
3       Invoice no: 73285932 Date of issue: 07/25/2017...
4       Invoice no: 15288019 Date of issue: 09/07/2014...
                              ...                        
1409    Invoice no: 87519797 Date of issue: 05/13/2013...
1410    Invoice no: 94223548 Date of issue: 11/19/2012...
1411    Invoice no: 59612541 Date of issue: 08/24/2016...
1412    Invoice no: 34630909 Date of issue: 09/14/2011...
1413    Invoice no: 14287592 Date of issue: 03/05/2021...
Name: cleaned_text, Length: 1414, dtype: object

# STEP 4: OCR Cleaning

In [99]:
import pandas as pd
import re
import os

# Path to the existing combined CSV
csv_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1.csv"
df = pd.read_csv(csv_path)

# Cleaning function
def clean_ocr_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()                                  # lowercase
    text = re.sub(r'[^\w\s/:\-.]', '', text)             # keep only words, spaces, /, :, -, .
    text = re.sub(r'\s+', ' ', text).strip()             # remove extra spaces
    return text

# Apply cleaning
df['clean_text'] = df['ocr_text'].apply(clean_ocr_text)

# Save to new CSV
output_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv"
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"✅ Cleaned CSV saved to: {output_path}")
print(f"✅ New column added: 'clean_text'")
print(f"✅ Final shape: {df.shape}")


✅ Cleaned CSV saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv
✅ New column added: 'clean_text'
✅ Final shape: (1414, 4)


In [100]:
full_df['ocr_text'] = full_df['ocr_text'].apply(clean_ocr_text)


In [101]:
full_df['ocr_text'] 

0       invoice no: 84652373 date of issue: 02/23/2021...
1       invoice no: 37451664 date of issue: 06/11/2020...
2       invoice no: 40108666 date of issue: 02/07/2020...
3       invoice no: 73285932 date of issue: 07/25/2017...
4       invoice no: 15288019 date of issue: 09/07/2014...
                              ...                        
1409    invoice no: 87519797 date of issue: 05/13/2013...
1410    invoice no: 94223548 date of issue: 11/19/2012...
1411    invoice no: 59612541 date of issue: 08/24/2016...
1412    invoice no: 34630909 date of issue: 09/14/2011...
1413    invoice no: 14287592 date of issue: 03/05/2021...
Name: ocr_text, Length: 1414, dtype: object

In [None]:
Extracted Fields:

Invoice Number: 69721323
Invoice Date: 05/07/2019
Seller Info:
  Name: Murray-Eaton
  Address: 773 Joseph Plains West Nicoleville, AZ 46136
  Tax ID: 936-71-8228
Client Info:
  Name: Cuevas, Reid and Hurst
  Address: 98071 Daniel Heights Careyside, MS 59400
  Tax ID: 949-88-4885
IBAN: GB22XZGA27411153163644

In [109]:
import os
import re
import json
from PIL import Image
import pytesseract

# Set your Tesseract executable path here (update if different)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def extract_invoice_fields(text):
    text = text.replace('\r', '\n').replace('\n\n', '\n')

    invoice_number_match = re.search(r"Invoice (?:no|number)[:\s]*([\w\d\-]+)", text, re.IGNORECASE)
    invoice_number = invoice_number_match.group(1).strip() if invoice_number_match else None

    invoice_date_match = re.search(r"Date of issue[:\s]*([\d/.-]+)", text, re.IGNORECASE)
    invoice_date = invoice_date_match.group(1).strip() if invoice_date_match else None

    iban_match = re.search(r"IBAN[:\s]*([A-Z0-9]+)", text, re.IGNORECASE)
    iban = iban_match.group(1).strip() if iban_match else None

    seller_match = re.search(r"Seller:\s*(.*?)\n(?:Client:|Tax Id:|$)", text, re.DOTALL | re.IGNORECASE)
    seller_text = seller_match.group(1).strip() if seller_match else ''

    seller_name = None
    seller_address = None
    seller_taxid = None
    if seller_text:
        lines = seller_text.split('\n')
        if len(lines) >= 1:
            seller_name = lines[0].strip()
        if len(lines) >= 2:
            seller_address = lines[1].strip()
        taxid_match = re.search(r"Tax Id[:\s]*([\d\-]+)", seller_text, re.IGNORECASE)
        seller_taxid = taxid_match.group(1).strip() if taxid_match else None

    client_match = re.search(r"Client:\s*(.*?)\n(?:Tax Id:|$)", text, re.DOTALL | re.IGNORECASE)
    client_text = client_match.group(1).strip() if client_match else ''

    client_name = None
    client_address = None
    client_taxid = None
    if client_text:
        lines = client_text.split('\n')
        if len(lines) >= 1:
            client_name = lines[0].strip()
        if len(lines) >= 2:
            client_address = lines[1].strip()
        taxid_match = re.search(r"Tax Id[:\s]*([\d\-]+)", client_text, re.IGNORECASE)
        client_taxid = taxid_match.group(1).strip() if taxid_match else None

    vat_total = None
    net_worth_total = None
    gross_worth_total = None

    summary_match = re.search(r"SUMMARY(.*?)(?:Total|$)(.*)", text, re.DOTALL | re.IGNORECASE)
    summary_text = summary_match.group(0) if summary_match else text

    vat_match = re.search(r"VAT\s*\[?%\]?\s*[:\-]?\s*([\d,.]+)", summary_text, re.IGNORECASE)
    if vat_match:
        vat_total = float(vat_match.group(1).replace(',', '').strip())
    else:
        vat_match2 = re.search(r"VAT\s*[:\-]?\s*([\d,.]+)", summary_text, re.IGNORECASE)
        if vat_match2:
            vat_total = float(vat_match2.group(1).replace(',', '').strip())

    net_worth_match = re.search(r"Net worth\s*[:\-]?\s*([\d,.]+)", summary_text, re.IGNORECASE)
    if net_worth_match:
        net_worth_total = float(net_worth_match.group(1).replace(',', '').strip())

    gross_worth_match = re.search(r"Gross worth\s*[:\-]?\s*([\d,.]+)", summary_text, re.IGNORECASE)
    if gross_worth_match:
        gross_worth_total = float(gross_worth_match.group(1).replace(',', '').strip())

    return {
        "Invoice": {
            "Invoice Number": invoice_number,
            "Invoice Date": invoice_date,
            "Seller Name": seller_name,
            "Seller Address": seller_address,
            "Seller Tax ID": seller_taxid,
            "Client Name": client_name,
            "Client Address": client_address,
            "Client Tax ID": client_taxid,
            "IBAN": iban
        },
        "Summary": {
            "VAT": vat_total,
            "Net worth": net_worth_total,
            "Gross worth": gross_worth_total
        }
    }

def save_json(data, output_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"✅ JSON saved to: {output_path}")

def process_invoice_image(image_path):
    # OCR the image to text
    text = pytesseract.image_to_string(Image.open(image_path))
    print("📝 OCR Text extracted.")

    # Extract fields from OCR text
    extracted = extract_invoice_fields(text)

    # Save JSON
    output_folder = r"C:\Users\Gouthum\Downloads\Project\output\extracted_fields"
    os.makedirs(output_folder, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(image_path))[0]
    json_file_path = os.path.join(output_folder, f"{base_name}_extracted.json")
    save_json(extracted, json_file_path)

# Example usage:
image_file = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\batch1_1\batch1-0481.jpg"  # <-- Your image file path here
process_invoice_image(image_file)


📝 OCR Text extracted.
✅ JSON saved to: C:\Users\Gouthum\Downloads\Project\output\extracted_fields\batch1-0481_extracted.json


In [24]:
pip install easyocr opencv-python

Collecting easyocr
  Using cached easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting torchvision>=0.5 (from easyocr)
  Using cached torchvision-0.22.1-cp310-cp310-win_amd64.whl.metadata (6.1 kB)
Collecting scikit-image (from easyocr)
  Using cached scikit_image-0.25.2-cp310-cp310-win_amd64.whl.metadata (14 kB)
Using cached easyocr-1.7.2-py3-none-any.whl (2.9 MB)
Using cached torchvision-0.22.1-cp310-cp310-win_amd64.whl (1.7 MB)
Using cached scikit_image-0.25.2-cp310-cp310-win_amd64.whl (12.8 MB)
Installing collected packages: scikit-image, torchvision, easyocr

   ---------------------------------------- 0/3 [scikit-image]
   ---------------------------------------- 0/3 [scikit-image]
   ---------------------------------------- 0/3 [scikit-image]
   ---------------------------------------- 0/3 [scikit-image]
   ---------------------------------------- 0/3 [scikit-image]
   ---------------------------------------- 0/3 [scikit-image]
   ---------------------------------------- 0/



In [25]:
pip install opencv-python

Note: you may need to restart the kernel to use updated packages.


In [26]:
pip install easyocr

Note: you may need to restart the kernel to use updated packages.


## Preprocess OCR Text
Remove unwanted characters

Normalize whitespaces

Handle newline, tab spacing

Tokenize the OCR text

In [44]:
import re

def preprocess_text(text):
    text = text.replace('\n', ' ').replace('\r', '')
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

## Identify the Line Items Section
You can locate the start of line items using heuristics like:

Keywords: 'ITEMS', 'No.', 'Description', 'Qty', etc.

Start from the line where Description appears

In [45]:
def extract_line_items_section(text):
    lines = text.split('\n')
    start = -1
    for i, line in enumerate(lines):
        if re.search(r'\bDescription\b', line):
            start = i
            break
    return lines[start:] if start != -1 else []

## Chunk Lines into Items
Use NLP + rule-based parsing (e.g., RegEx or SpaCy) to:

Split items by each row

Extract fields (No., Description, Qty, Price, etc.)

In [46]:
def parse_line_items(lines):
    items = []
    for line in lines:
        # Simple example: assume each item is on one line
        match = re.match(r"(\d+)\s+(.*?)\s+(\d+,\d+)\s+each\s+([\d,\.]+)\s+([\d,\.]+)\s+(\d+%)\s+([\d,\.]+)", line)
        if match:
            items.append({
                'No.': match.group(1),
                'Description': match.group(2),
                'Qty': match.group(3),
                'UM': 'each',
                'Net price': match.group(4),
                'Net worth': match.group(5),
                'VAT [%]': match.group(6),
                'Gross worth': match.group(7),
            })
    return items


## Output to Structured CSV or JSON

In [47]:
import pandas as pd

def save_line_items(items, output_path):
    df = pd.DataFrame(items)
    df.to_csv(output_path, index=False)

# Advanced NLP Options(if needed)

### If RegEx isn't robust due to inconsistent formatting, you can try:

NER (Named Entity Recognition) using SpaCy to extract:

Product names

Prices

Units and quantities

Layout-aware models like LayoutLMv3 (if bounding box data is available)

### ✅ Why NLP is Required
Although the OCR already extracted the raw text, NLP is required because:

we must split and identify meaningful key-value pairs and line item blocks.

we need to segment, clean, and structure noisy and multi-line text into structured fields.

Rule-based parsing won’t scale well if the format changes slightly across invoices. NLP with pattern recognition or even ML can help generalize.

### 🔹 Step 1: Preprocess OCR Text
Read OCR text from CSV

Clean unwanted characters (\n, \t, duplicate spaces)

Convert all text to lowercase (optional)

In [50]:
import pandas as pd
import re

# Use raw string (r"...") to avoid unicode escape issues in Windows file paths
input_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv"
output_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned_textonly.csv"

# Read the CSV file
df = pd.read_csv(input_path)

# Drop NA values from the OCR column and clean the text
ocr_texts = df['clean_text'].dropna().tolist()
cleaned_texts = [re.sub(r'\s+', ' ', text.strip()) for text in ocr_texts]

# Create new DataFrame and save it to a new CSV file
cleaned_df = pd.DataFrame({'Cleaned_OCR_Text': cleaned_texts})
cleaned_df.to_csv(output_path, index=False)

print(f"Cleaned OCR texts saved to: {output_path}")


Cleaned OCR texts saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned_textonly.csv


In [54]:
import pandas as pd
import re

# Load and clean OCR text
file_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned_textonly.csv"
df = pd.read_csv(file_path)
ocr_texts = df['Cleaned_OCR_Text'].dropna().tolist()
cleaned_texts = [re.sub(r'\s+', ' ', text.strip()) for text in ocr_texts]

# Result lists
invoice_details_list = []
line_items_list = []
summary_list = []

for text in cleaned_texts:
    # Extract invoice number and date
    invoice_no = re.search(r'invoice no[:\-]?\s*([A-Za-z0-9\-\/]+)', text, re.IGNORECASE)
    invoice_date = re.search(r'date of issue[:\-]?\s*([\d\/\-]+)', text, re.IGNORECASE)

    # Seller and Client info
    seller_info = re.search(r'seller[:\-]?\s*(.*?)\s*client[:\-]?', text, re.IGNORECASE)
    client_info = re.search(r'client[:\-]?\s*(.*?)\s*tax id', text, re.IGNORECASE)

    # ----- Extract Structured Line Items -----
    line_items = []
    lines = text.split('\n')
    start = False

    for line in lines:
        if not start:
            if re.search(r'\bNo\b.*\bDescription\b.*\bQty\b.*\bUM\b.*\bNet\b.*\bVAT', line, re.IGNORECASE):
                start = True
                continue
        else:
            if re.search(r'(total|summary|vat)', line, re.IGNORECASE):  # stop when totals start
                break
            # Try extracting values using regex or split
            parts = re.split(r'\s{2,}|\t+', line.strip())  # split by 2+ spaces or tabs
            if len(parts) >= 7:
                line_items.append(f"No: {parts[0]}, Description: {parts[1]}, Qty: {parts[2]}, "
                                  f"UM: {parts[3]}, Net Price: {parts[4]}, Net Worth: {parts[5]}, VAT [%]: {parts[6]}")
            elif len(parts) >= 5:  # fallback
                line_items.append(" | ".join(parts))

    # ----- Extract Summary section -----
    summary_match = re.search(r'(net worth.*?gross worth.*?)$', text, re.IGNORECASE)
    summary_raw = summary_match.group(1).strip() if summary_match else ""

    # ----- Assemble invoice details -----
    details = f"Invoice Number: {invoice_no.group(1) if invoice_no else 'N/A'}, " \
              f"Date: {invoice_date.group(1) if invoice_date else 'N/A'}, " \
              f"Seller: {seller_info.group(1).strip() if seller_info else 'N/A'}, " \
              f"Client: {client_info.group(1).strip() if client_info else 'N/A'}"
    
    invoice_details_list.append(details)
    line_items_list.append("\n".join(line_items) if line_items else "N/A")
    summary_list.append(summary_raw)

# Save output DataFrame
result_df = pd.DataFrame({
    'invoice_details': invoice_details_list,
    'line_items': line_items_list,
    'summary': summary_list
})

output_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\structured_invoice_output.csv"
result_df.to_csv(output_path, index=False)

print("✅ Extraction completed and saved to:", output_path)


✅ Extraction completed and saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\structured_invoice_output.csv


### 🔹 Step 2: Locate Line Item Section
Use regex or keyword heuristics to isolate the portion of the invoice text that contains line items (usually comes after ITEMS, No., or Description).

In [56]:
line_item_sections = []
for text in cleaned_texts:
    match = re.search(r'ITEMS.*?SUMMARY VAT', text, re.IGNORECASE)
    if match:
        line_item_sections.append(match.group())


### 🔹 Step 3: Split Into Lines
Split the block of line items into separate rows — use:

Numbers at the start (1, 2, 3)

Bullet points

Known repeated patterns (e.g., product descriptions often followed by quantity and price)

In [57]:
line_blocks = []
for section in line_item_sections:
    items = re.split(r'\s(?=\d{1,2},\d{2} each)', section)  # crude split pattern
    line_blocks.append(items)


### 🔹 Step 4: Extract Fields from Each Line
Use NLP tools (or regex) to extract:

Description

Quantity

UM

Net price

VAT

Gross price




In [58]:
import re

structured_items = []

for lines in line_blocks:
    for line in lines:
        match = re.search(r'(.+?)\s+(\d+,\d{2})\s+each\s+(\d+,\d{2})\s+(\d+,\d{2})\s+(\d+%)\s+(\d+,\d{2})', line)
        if match:
            structured_items.append({
                'Description': match.group(1).strip(),
                'Qty': match.group(2).replace(',', '.'),
                'Net price': match.group(3).replace(',', '.'),
                'Net worth': match.group(4).replace(',', '.'),
                'VAT': match.group(5),
                'Gross worth': match.group(6).replace(',', '.')
            })


### 🔹 Step 5: Save as Structured CSV



In [59]:
# Step 5: Save to CSV
output_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\line_items_extracted_batch1.csv"
pd.DataFrame(structured_items).to_csv(output_path, index=False)

print(f"[✔] Extraction complete. File saved at:\n{output_path}")


[✔] Extraction complete. File saved at:
C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\line_items_extracted_batch1.csv


In [61]:
import pandas as pd
import re
import os

# Step 1: Read data
file_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv"
df = pd.read_csv(file_path)

# Drop nulls and clean OCR texts
ocr_texts = df['clean_text'].dropna().tolist()
cleaned_texts = [re.sub(r'\s+', ' ', text) for text in ocr_texts]

tract line item sections (between ITEMS and SUMMARY VAT)
line_item_sections = []
for text in cleaned_texts:
    match = re.search(r'(ITEMS.*?SUMMARY VAT)', text, re.IGNORECASE)
    if match:
        line_item_sections.append(match.group(1))
    else:
        line_item_sections.append("")  # Append empty if pattern not found

# Step 3: Split into individual line blocks (based on crude pattern of quantity or "each")
line_blocks = []
for section in line_item_sections:
    items = re.split(r'(?=\d+\s+each)', section)  # matches when "100 each" etc. appears
    line_blocks.append(items)

# Step 4: Extract fields using regex (Description, Qty, Net Price, Net Worth, VAT, Gross Worth)
structured_items = []

for lines in line_blocks:
    for line in lines:
        match = re.search(
            r'(.+?)\s+(\d+)\s+each\s+([\d\.]+)\s+([\d\.]+)\s+(\d+)%\s+([\d\.]+)', line
        )
        if match:
            structured_items.append({
                'Description': match.group(1).strip(),
                'Qty': match.group(2),
                'Net price': match.group(3),
                'Net worth': match.group(4),
                'VAT (%)': match.group(5),
                'Gross worth': match.group(6)
            })

# Step 5: Save to CSV
output_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\line_items_key_extracted_batch1.csv"
pd.DataFrame(structured_items).to_csv(output_path, index=False)

print(f"[✔] Extraction complete. File saved at:\n{output_path}")


[✔] Extraction complete. File saved at:
C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\line_items_key_extracted_batch1.csv


##
Still the structure varies between invoices, I  need to train an ML/DL model  (e.g., LayoutLM or spaCy NER fine-tuned on key-value pairs).

