In [None]:
# @title  Intelligent Document Processing (IDP) – Final Robust Version



!pip install -q easyocr pillow matplotlib google-generativeai

import easyocr
import json
import re
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import google.generativeai as genai
from google.colab import files



def run_easyocr(image_path, reader):
    """Run OCR on an image and return the recognised text as one string.

    Args:
        image_path: Path of the image passed to ``reader.readtext``.
        reader: Object exposing ``readtext(path, detail=..., paragraph=...)``
            (an ``easyocr.Reader`` in this script).

    Returns:
        The strings returned by ``readtext`` joined with newlines.
    """
    # The status line previously printed mis-decoded bytes instead of the emoji.
    print("🔎 Running OCR...")
    # detail=0 / paragraph=True are passed so the result is a list of text
    # strings that can be joined directly.
    text = reader.readtext(image_path, detail=0, paragraph=True)
    return "\n".join(text)

def visualize_ocr(image_path, reader):
    """Show the image with every detected text region outlined in red.

    Args:
        image_path: Path of the image to open and annotate.
        reader: Object exposing ``readtext(path)``; the first element of each
            returned entry is unpacked as the four corner points of a box.
    """
    # Open inside a context manager so the file handle is released, and work
    # on an RGB copy: on grayscale/palette images "red" would not render as
    # red and imshow would apply a colormap to the single-channel data.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    bounds = reader.readtext(image_path)
    draw = ImageDraw.Draw(image)

    for bound in bounds:
        p0, p1, p2, p3 = bound[0]
        # Repeat p0 at the end to close the quadrilateral.
        draw.line([*p0, *p1, *p2, *p3, *p0], fill="red", width=2)

    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    plt.axis("off")
    plt.title("OCR Text Detection")
    plt.show()

def rule_based_nlp(text, igst_rate=0.18):
    """Extract GST-invoice fields from OCR text with regular expressions.

    The text is upper-cased before matching, so the captured invoice number
    and date are returned in upper case.

    Args:
        text: Raw OCR text of the invoice.
        igst_rate: IGST rate as a fraction, used to pair a taxable amount
            with the total. Defaults to 0.18 (the previously hard-coded rate).

    Returns:
        dict with keys ``document_type``, ``company_name``, ``gstin``,
        ``invoice_number``, ``invoice_date``, ``taxable_amount``, ``igst`` and
        ``total_amount_after_tax``. Fields that could not be found are None.
    """
    print("🧠 Applying rule-based NLP...")
    text = text.upper()

    data = {
        "document_type": "GST Tax Invoice",
        "company_name": None,
        "gstin": None,
        "invoice_number": None,
        "invoice_date": None,
        "taxable_amount": None,
        "igst": None,
        "total_amount_after_tax": None,
    }

    # Company: \s+ may span an OCR line break, so collapse whitespace before
    # title-casing to avoid returning a name with an embedded newline.
    m = re.search(r"GUJARAT\s+FREIGHT\s+TOOLS", text)
    if m:
        data["company_name"] = " ".join(m.group(0).split()).title()

    # GSTIN: 2 digits, 5 letters, 4 digits, 1 letter, 1 alphanumeric, 'Z',
    # 1 alphanumeric.
    m = re.search(r"\b\d{2}[A-Z]{5}\d{4}[A-Z][A-Z0-9]Z[A-Z0-9]\b", text)
    if m:
        data["gstin"] = m.group(0)

    # Invoice number: allow '/' and '-' separated segments (e.g. GFT/2023/001)
    # instead of stopping at the first separator.
    m = re.search(
        r"INVOICE\s+NO\.?\s*[:\-]?\s*([A-Z0-9]+(?:[/\-][A-Z0-9]+)*)", text
    )
    if m:
        data["invoice_number"] = m.group(1)

    # Date: first DD-MON-YYYY occurrence; single-digit days are accepted too.
    m = re.search(r"\d{1,2}-[A-Z]{3}-\d{4}", text)
    if m:
        data["invoice_date"] = m.group(0)

    # Amounts: numbers with exactly two decimals. Groups of two OR three
    # digits are accepted so Indian lakh grouping (1,18,000.00) is read as a
    # whole number, and \d+ lets ungrouped values (118000.00) match as well.
    found = re.findall(r"\b\d+(?:,\d{2,3})*\.\d{2}\b", text)
    amounts = sorted({float(a.replace(",", "")) for a in found}, reverse=True)

    if amounts:
        # Heuristic: the largest amount on the invoice is the grand total.
        total = amounts[0]
        data["total_amount_after_tax"] = total

        # The taxable amount is the first smaller value whose tax at
        # igst_rate brings it to the total (tolerance of 2 for rounding).
        for amt in amounts[1:]:
            igst = round(amt * igst_rate, 2)
            if abs((amt + igst) - total) < 2:
                data["taxable_amount"] = amt
                data["igst"] = igst
                break

    return data

def refine_with_gemini(text, api_key):
    """Ask Gemini to extract invoice fields from OCR text and return them.

    Args:
        text: Raw OCR text embedded in the prompt.
        api_key: Gemini API key passed to ``genai.configure``.

    Returns:
        dict decoded from the first JSON object found in the model's reply.

    Raises:
        json.JSONDecodeError: If the reply contains no JSON object or the
            object cannot be decoded.
        Exception: Anything raised by the ``genai`` client is propagated.
    """
    print("✨ Refining using Gemini...")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel("gemini-1.5-flash")

    prompt = f"""
    Extract structured invoice data from the OCR text below.
    Return ONLY valid JSON.

    Fields:
    invoice_number, invoice_date, company_name, gstin,
    taxable_amount, igst, total_amount_after_tax

    OCR Text:
    {text}
    """

    response = model.generate_content(prompt)
    raw = response.text

    # Models often wrap JSON in Markdown fences or add a sentence around it.
    # Decode from the first '{' and let raw_decode stop at the end of the
    # object, instead of relying on exact fence strings being present.
    start = raw.find("{")
    if start == -1:
        raise json.JSONDecodeError("No JSON object in Gemini response", raw, 0)
    parsed, _ = json.JSONDecoder().raw_decode(raw[start:])

    # The caller iterates .items(); reject anything that is not an object.
    if not isinstance(parsed, dict):
        raise json.JSONDecodeError("Gemini response is not a JSON object", raw, start)
    return parsed



# Upload Image
print(" Upload Invoice Image")
uploaded = files.upload()
if not uploaded:
    # A cancelled upload returns no files; stop with a clear message instead
    # of failing with an IndexError on the next line.
    raise SystemExit("No file was uploaded; nothing to process.")
image_path = next(iter(uploaded))

# Ask Gemini API Key (read without echo so the secret is not left in the
# cell output). Imported here because only this script section needs it.
from getpass import getpass

gemini_key = getpass("🔑 Enter Gemini API Key (press Enter to skip): ").strip()

# OCR Reader
reader = easyocr.Reader(['en'], gpu=True)

# Visualize OCR
visualize_ocr(image_path, reader)

# Extract OCR Text
raw_text = run_easyocr(image_path, reader)

print("\n RAW OCR TEXT")
print("=" * 40)
print(raw_text)

# Rule-based extraction
data = rule_based_nlp(raw_text)

# Optional Gemini refinement
if gemini_key:
    try:
        gemini_data = refine_with_gemini(raw_text, gemini_key)
    except Exception as e:
        # Best-effort step: keep the rule-based result, but say why it failed.
        print(f" Gemini failed, using rule-based output only ({type(e).__name__}: {e})")
    else:
        if isinstance(gemini_data, dict):
            # Only overwrite known fields, and only with real values: a
            # plain truthiness test would discard a legitimate amount of 0.
            data.update({
                k: v
                for k, v in gemini_data.items()
                if k in data and v not in (None, "")
            })
        else:
            print(" Gemini failed, using rule-based output only (response was not a JSON object)")


print("\n📄 FINAL STRUCTURED OUTPUT (JSON)")
print("=" * 40)
print(json.dumps(data, indent=4))
