<a href="https://colab.research.google.com/github/Jayatripathi11/Bill-Parser-/blob/main/bill.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import streamlit as st
import pytesseract
import cv2
import numpy as np
import re
from PIL import Image
from dateutil.parser import parse as parse_date
from pdf2image import convert_from_path
import io
import pandas as pd
import os
# Default location of the Tesseract binary on a standard Windows install.
_WINDOWS_TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Only override pytesseract's command when that binary is really there.
# On machines without it (Linux, macOS, Colab, custom Windows installs) the
# library default is left untouched instead of being pointed at a missing file.
if os.path.isfile(_WINDOWS_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _WINDOWS_TESSERACT_CMD


# ------------------ Keyword Dictionary ------------------ #
# Maps each extractable field to the label spellings searched for in the OCR
# text. Within a field the spellings are tried in the order listed here.
keywords = dict(
    date=[
        'date',
        'issued on',
        'bill date',
        'invoice date',
        'receipt date',
        'purchase date',
        'transaction date',
    ],
    cgst=[
        'cgst',
        'central gst',
        'central goods and services tax',
        'cgst amt',
    ],
    sgst=[
        'sgst',
        'state gst',
        'state goods and services tax',
        'sgst amt',
    ],
    total=[
        'total',
        'grand total',
        'total amount',
        'amount paid',
        'total payable',
    ],
)

# ------------------ Helper Functions ------------------ #
def extract_text_from_image(image):
    """Run Tesseract OCR on a grayscale version of *image* and return the text.

    Args:
        image: A PIL image (any mode) or an array-like of BGR pixel data.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    if isinstance(image, Image.Image):
        # PIL stores colour data as RGB, and may also hand us RGBA, L, P or 1
        # mode images. Normalise to 3-channel RGB so the grayscale conversion
        # uses the right channel weights and never sees a 2-D array.
        rgb_pixels = np.array(image.convert("RGB"))
        gray = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    else:
        # Non-PIL input keeps the original assumption of BGR channel order.
        gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(gray)

# Date-looking substrings, scanned left to right through the text.
#   ymd   - year first:       2023-05-14, 2023/5/14, 2023.05.14
#   dmy   - numeric day first: 14-05-2023, 14/05/23
#   named - month by name:     14 May 2023, 14-May-23, 14 May, 2023
# Numeric forms require a separator (the same one twice) so that plain digit
# runs such as invoice numbers or postal codes are not mistaken for dates, and
# the year-first form is listed first so it is not cut short by the dmy form.
_DATE_CANDIDATE_RE = re.compile(
    r'(?<!\d)(?:'
    r'(?P<ymd>\d{4}(?P<ymd_sep>[-/.])\d{1,2}(?P=ymd_sep)\d{1,2})'
    r'|(?P<dmy>\d{1,2}(?P<dmy_sep>[-/.])\d{1,2}(?P=dmy_sep)(?:\d{4}|\d{2}))'
    r'|(?P<named>\d{1,2}[\s./-]+[A-Za-z]{3,9}[\s.,/-]+(?:\d{4}|\d{2}))'
    r')(?!\d)'
)


def extract_date(text):
    """Return the first parseable date found in *text* as ``YYYY-MM-DD``.

    Candidates are tried in the order they appear in the text; the first one
    that ``dateutil`` can parse wins.

    Args:
        text: OCR output to search. Empty or ``None`` yields ``"Not Found"``.

    Returns:
        The ISO-formatted date string, or ``"Not Found"`` when no candidate
        parses.
    """
    if not text:
        return "Not Found"

    for candidate in _DATE_CANDIDATE_RE.finditer(text):
        # With dayfirst=True dateutil reads a year-first string as Y-D-M, so
        # year-first candidates are parsed as Y-M-D and all others day first.
        year_first = candidate.group('ymd') is not None
        try:
            parsed = parse_date(
                candidate.group(0),
                dayfirst=not year_first,
                yearfirst=year_first,
            )
        except (ValueError, OverflowError):
            # Looked like a date but is not one (e.g. month 13); keep looking.
            continue
        return str(parsed.date())

    return "Not Found"

def extract_field_value(text, field):
    """Return the first number that follows one of *field*'s keywords in *text*.

    The spellings listed under ``keywords[field]`` are tried in order. A
    keyword must start on a word boundary (so ``total`` does not match inside
    ``Subtotal``), matching is case-insensitive, and any run of whitespace in
    the text matches a space in the keyword. The number may be separated from
    the keyword by whitespace, dots or colons.

    Args:
        text: OCR output to search. Empty or ``None`` yields ``"Not Found"``.
        field: A key of the module-level ``keywords`` mapping.

    Returns:
        The matched number as a string (thousands/decimal separators kept,
        trailing punctuation dropped), or ``"Not Found"``.

    Raises:
        KeyError: If *field* is not a key of ``keywords``.
    """
    field_keywords = keywords[field]
    if not text:
        return "Not Found"

    for keyword in field_keywords:
        # Escape each word so the keyword is always matched literally, and let
        # any whitespace run (OCR often emits several spaces) join the words.
        keyword_re = r'\s+'.join(re.escape(word) for word in keyword.split())
        # The value must start and end with a digit: this rejects a lone ","
        # and keeps sentence punctuation ("500.") out of the result.
        pattern = re.compile(
            rf'\b{keyword_re}[\s.:]*(\d(?:[\d.,]*\d)?)',
            re.IGNORECASE,
        )
        match = pattern.search(text)
        if match:
            return match.group(1)
    return "Not Found"

def extract_tax_and_total(text):
    """Look up the CGST, SGST and total amounts in *text*.

    Returns a dict keyed ``'CGST'``, ``'SGST'`` and ``'Total Amount'`` whose
    values are whatever ``extract_field_value`` returns for the matching field.
    """
    label_to_field = (
        ('CGST', 'cgst'),
        ('SGST', 'sgst'),
        ('Total Amount', 'total'),
    )
    return {
        label: extract_field_value(text, field)
        for label, field in label_to_field
    }

def convert_pdf_to_images(pdf_file):
    """Render every page of an uploaded PDF and return the page images.

    The upload is spooled to a file inside a private temporary directory
    because ``convert_from_path`` needs a filesystem path. A per-call
    directory avoids clashes between concurrent uploads and is removed once
    conversion finishes.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the PDF.

    Returns:
        The value returned by ``convert_from_path`` for the spooled file.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        pdf_path = os.path.join(scratch_dir, "upload.pdf")
        with open(pdf_path, "wb") as spool:
            spool.write(pdf_file.read())
        # Conversion runs before the directory is cleaned up on exit.
        return convert_from_path(pdf_path)

def process_file(file):
    """Extract one result row per page/image from an uploaded file.

    A file whose name ends in ``.pdf`` (case-insensitive) is converted to page
    images; anything else is opened as a single image. Each image is OCR'd
    and searched for a date and the tax/total amounts.

    Returns a list of dicts with the keys ``'Filename'``, ``'Date'``,
    ``'CGST'``, ``'SGST'`` and ``'Total Amount'``.
    """
    is_pdf = file.name.lower().endswith(".pdf")
    pages = convert_pdf_to_images(file) if is_pdf else [Image.open(file)]

    def _row_for(page):
        # OCR once, then run both extractors over the same text.
        ocr_text = extract_text_from_image(page)
        found_date = extract_date(ocr_text)
        amounts = extract_tax_and_total(ocr_text)
        return {
            'Filename': file.name,
            'Date': found_date,
            'CGST': amounts['CGST'],
            'SGST': amounts['SGST'],
            'Total Amount': amounts['Total Amount'],
        }

    return [_row_for(page) for page in pages]

# ------------------ Streamlit UI ------------------ #
# Page header and short usage hint.
st.title("🧾 Billing Receipt Extractor")
st.write("Upload images or PDFs of your bills to extract Date, CGST, SGST, and Total Amount.")

# Accept any number of receipt images or PDFs in one go.
uploaded_files = st.file_uploader(
    "Upload multiple receipts (images or PDFs):",
    type=["png", "jpg", "jpeg", "pdf"],
    accept_multiple_files=True,
)

if uploaded_files:
    # Flatten the per-file row lists into one table, keeping upload order.
    all_data = [row for file in uploaded_files for row in process_file(file)]

    df = pd.DataFrame(all_data)
    st.dataframe(df)

    # Offer the same table as a UTF-8 encoded CSV download.
    csv = df.to_csv(index=False).encode('utf-8')
    st.download_button("📥 Download as CSV", csv, "billing_data.csv", "text/csv")
