In [None]:
# This is where IMAGE_PATH will be set by the executor
# IMAGE_PATH = 'path/to/image.jpg'

In [None]:
# Import necessary libraries
import pytesseract
import cv2
import numpy as np
from PIL import Image
import json
import os

In [None]:
# Simple preprocessing for image enhancement
def preprocess_image(image_path):
    """Load an image from disk and binarize it for OCR.

    The image is read with OpenCV; if ``cv2.imread`` returns ``None`` the
    file is opened with Pillow instead. The pixels are then converted to
    grayscale, thresholded with Otsu's method (inverted), passed through a
    morphological opening, and inverted back.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A ``(image, error)`` tuple. On success ``image`` is the binarized
        array and ``error`` is ``None``. On failure ``image`` is ``None`` and
        ``error`` is a non-empty message describing what went wrong.
    """
    img = cv2.imread(image_path)

    if img is None:
        # OpenCV could not decode the file: fall back to Pillow.
        try:
            # The context manager releases the file handle once the pixels
            # have been copied into the array.
            with Image.open(image_path) as pil_img:
                # Normalise every mode (grayscale, palette, RGBA, ...) to
                # 3-channel RGB so the BGR->gray conversion below always
                # receives the layout it expects.
                rgb = np.array(pil_img.convert("RGB"))
        except Exception as e:
            # Guarantee a truthy message so callers testing `if error:`
            # cannot mistake a failure for success.
            return None, str(e) or type(e).__name__
        # Reverse the channel axis: RGB (Pillow) -> BGR (OpenCV).
        img = rgb[:, :, ::-1].copy()

    try:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Otsu's method picks the global threshold automatically (the 0 is ignored).
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

        # NOTE: with a 1x1 kernel the opening leaves the image unchanged;
        # enlarge the kernel if speckle noise actually needs to be removed.
        kernel = np.ones((1, 1), np.uint8)
        opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)

        # Undo the inversion applied by THRESH_BINARY_INV.
        result = cv2.bitwise_not(opening)
    except cv2.error as e:
        # Keep the (result, error) contract instead of raising to the caller.
        return None, str(e) or type(e).__name__

    return result, None

In [None]:
# OCR function to extract text
def perform_ocr(image, lang='tam'):
    """Extract text from an image with Tesseract, falling back to the default model.

    OCR is first attempted with the requested language. If that call raises,
    OCR is retried without a language argument and a note naming the
    requested language is appended to the returned text.

    Args:
        image: Image passed straight through to ``pytesseract.image_to_string``.
        lang: Tesseract language code to try first (defaults to ``'tam'``).

    Returns:
        A ``(text, error)`` tuple:
        * ``(text, None)`` when the requested language worked;
        * ``(text + note, first_error_message)`` when only the fallback worked;
        * ``("", "OCR Error: ...")`` when both attempts failed.
    """
    try:
        text = pytesseract.image_to_string(image, lang=lang)
        return text, None
    except Exception as e:
        # Deliberately broad: any failure of the first attempt triggers the
        # best-effort retry with Tesseract's default language.
        # Name the language that was actually requested; the default 'tam'
        # keeps the original "Tamil" wording.
        requested = "Tamil" if lang == 'tam' else f"'{lang}'"
        try:
            text = pytesseract.image_to_string(image)
            note = f"\n(Note: Processed using English OCR due to missing {requested} language pack)"
            return text + note, str(e)
        except Exception as e2:
            return "", f"OCR Error: {str(e2)}"

In [None]:
# Run the OCR pipeline on IMAGE_PATH and emit the outcome as a single JSON line.
# IMAGE_PATH is looked up dynamically because it is only defined when an
# earlier cell (or the executor) has set it.
image_path = globals().get('IMAGE_PATH')

if image_path is None:
    ocr_results = {
        "error": "Image not found: No image path specified",
        "extracted_text": "",
        "confidence": 0
    }
elif not os.path.exists(image_path):
    # Report the path that was actually looked up, not the variable's name.
    ocr_results = {
        "error": f"Image not found: {image_path}",
        "extracted_text": "",
        "confidence": 0
    }
else:
    processed_img, preprocess_error = preprocess_image(image_path)

    # Test the image itself rather than the truthiness of the message, so an
    # empty error string can never be mistaken for success.
    if processed_img is None:
        ocr_results = {
            "error": f"Preprocessing error: {preprocess_error}",
            "extracted_text": "",
            "confidence": 0
        }
    else:
        extracted_text, ocr_error = perform_ocr(processed_img)

        # The confidence values are fixed placeholders (85 for a clean run,
        # 60 when the fallback was used); they are not measured by Tesseract.
        ocr_results = {
            "extracted_text": extracted_text,
            "confidence": 85 if not ocr_error else 60,
            "warning": ocr_error if ocr_error else None
        }

# Output the results as JSON; ensure_ascii=False keeps non-ASCII text readable.
print(json.dumps(ocr_results, ensure_ascii=False))