In [1]:
# OCR setup: ensure dependencies, imports, and logging
from __future__ import annotations

import os
import sys
import logging
from pathlib import Path

# Ensure required packages are available inside the current kernel

def ensure_package_installed(package_name: str, import_name: str = "") -> None:
    """Make sure a package is importable in the current kernel, installing it if not.

    Args:
        package_name: Name passed to ``pip install`` (the distribution name).
        import_name: Module name to try importing. When empty, a small table of
            well-known distribution/module mismatches is consulted (for example
            ``Pillow`` is imported as ``PIL``); otherwise ``package_name`` is used.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with a non-zero status.
    """
    import importlib

    # Distribution names that differ from the module they provide. Without this,
    # importing "Pillow" always fails and pip is invoked on every run.
    known_import_names = {
        "pillow": "PIL",
        "opencv-python": "cv2",
        "scikit-learn": "sklearn",
    }
    module_name = import_name or known_import_names.get(package_name.lower(), package_name)

    try:
        importlib.import_module(module_name)
        return
    except ImportError:
        pass

    import subprocess

    # Use a named logger: calling logging.info() on the root logger before it is
    # configured installs a default handler, which would turn a later
    # logging.basicConfig(...) call into a no-op.
    logging.getLogger(__name__).info("Installing missing package: %s", package_name)
    # Install into the interpreter that is running this kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    # Let the import system notice the freshly installed files.
    importlib.invalidate_caches()

# Core OCR deps
# NOTE: the argument is the pip distribution name. "Pillow" is installed under
# that name but the module it provides is imported as `PIL` (see the
# `from PIL import ...` line below).
ensure_package_installed("pytesseract")
ensure_package_installed("Pillow")

import pytesseract  # noqa: E402
from PIL import Image, ImageOps, ImageFilter  # noqa: E402
import csv  # noqa: E402
import re  # noqa: E402
from typing import Optional, Tuple, Dict, List  # noqa: E402

# Logging
# force=True replaces any handlers already attached to the root logger.
# Without it basicConfig() silently does nothing when the root logger was
# configured earlier (a previous run of this cell, or an implicit default
# configuration triggered by an early logging.info() call), and INFO messages
# from the rest of the notebook would be dropped.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    force=True,
)

logging.info("Environment ready. PyTesseract and PIL are available.")


In [2]:
# Tesseract executable detection for Windows and other OS

def detect_tesseract_cmd() -> Optional[str]:
    """Locate the Tesseract executable.

    Search order:
      1. The ``TESSERACT_CMD`` environment variable, either a path to the
         executable file or a command name resolvable on PATH.
      2. Common Windows install locations (Program Files, Program Files (x86),
         and LOCALAPPDATA/Programs).
      3. ``tesseract`` / ``tesseract.exe`` on PATH.

    Returns:
        The path to the executable as a string, or None when nothing was found.
    """
    import shutil  # local import keeps this cell self-contained

    def _is_file(path: Path) -> bool:
        # Path.is_file() can still raise for unreadable locations; treat as "not found".
        try:
            return path.is_file()
        except OSError:
            return False

    # 1) Explicit override. It must point at a file (a directory is rejected)
    #    or name a command that PATH can resolve.
    env_cmd = os.environ.get("TESSERACT_CMD")
    if env_cmd:
        override = Path(env_cmd)
        if _is_file(override):
            return str(override)
        resolved_override = shutil.which(env_cmd)
        if resolved_override:
            return resolved_override
        logging.warning(
            "TESSERACT_CMD=%r is not an executable file; falling back to auto-detection.",
            env_cmd,
        )

    # 2) Common Windows install locations.
    program_files = os.environ.get("PROGRAMFILES", r"C:\Program Files")
    program_files_x86 = os.environ.get("PROGRAMFILES(X86)", r"C:\Program Files (x86)")
    # Derive the per-user default from the home directory instead of a literal
    # "%USERNAME%" placeholder, which is never expanded.
    local_app_data = os.environ.get("LOCALAPPDATA") or os.path.join(
        os.path.expanduser("~"), "AppData", "Local"
    )
    install_candidates: List[Path] = [
        Path(program_files) / "Tesseract-OCR" / "tesseract.exe",
        Path(program_files_x86) / "Tesseract-OCR" / "tesseract.exe",
        Path(local_app_data) / "Programs" / "Tesseract-OCR" / "tesseract.exe",
    ]
    for candidate in install_candidates:
        if _is_file(candidate):
            return str(candidate)

    # 3) PATH lookup. shutil.which handles PATHEXT on Windows and checks that
    #    the match is executable. The explicit ".exe" probe keeps finding a
    #    Windows binary exposed on a non-Windows PATH.
    return shutil.which("tesseract") or shutil.which("tesseract.exe")

# Resolve the binary once and point pytesseract at it; when nothing is found,
# tell the user how to install or configure Tesseract.
TES_CMD = detect_tesseract_cmd()
if not TES_CMD:
    logging.warning(
        "Tesseract executable not found. Install it and/or set TESSERACT_CMD env var.\n"
        "Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki"
    )
else:
    pytesseract.pytesseract.tesseract_cmd = TES_CMD
    logging.info(f"Using tesseract at: {TES_CMD}")


In [3]:
# Preprocessing and OCR helpers

# Tesseract CLI options passed to pytesseract for every digit crop
# (meanings per the Tesseract manual / `tesseract --help-extra`):
#   --oem 3   use the default OCR engine mode
#   --psm 7   treat the image as a single line of text
#   -c tessedit_char_whitelist=0123456789   restrict recognition to digits
DIGIT_CONFIG = "--oem 3 --psm 7 -c tessedit_char_whitelist=0123456789"


def preprocess_for_digits(
    img: Image.Image,
    *,
    upscale_below: int = 40,
    target_side: int = 80,
) -> Image.Image:
    """Prepare an image for digit OCR.

    Steps, in order: convert to grayscale, apply auto-contrast, apply a
    sharpen filter, and upscale by an integer factor when the shorter side is
    smaller than ``upscale_below``.

    Args:
        img: Source image.
        upscale_below: Images whose shorter side (in pixels) is below this
            value are enlarged. Defaults to 40.
        target_side: Approximate shorter-side length aimed for when enlarging;
            the factor is ``target_side // shorter_side`` but never less than 2.
            Defaults to 80.

    Returns:
        The preprocessed image.
    """
    gray = ImageOps.grayscale(img)
    # Stretch the histogram, then sharpen edges. No denoising is performed here.
    stretched = ImageOps.autocontrast(gray)
    sharpened = stretched.filter(ImageFilter.SHARPEN)

    shortest_side = min(sharpened.size)
    if shortest_side >= upscale_below:
        return sharpened

    # max(1, ...) guards against division by zero for degenerate sizes.
    factor = max(2, target_side // max(1, shortest_side))
    return sharpened.resize((sharpened.width * factor, sharpened.height * factor))


def ocr_digits_from_image_path(image_path: Path) -> Dict[str, str]:
    """OCR a single image and return a record of the digits found.

    Returns:
        ``{"file": <path>, "text": <digit runs joined by spaces>}`` on success.
        If anything raises, the failure is logged with its traceback and the
        record is ``{"file": <path>, "text": "", "error": <message>}`` instead.
    """
    try:
        with Image.open(image_path) as opened:
            prepared = preprocess_for_digits(opened)
            recognized = pytesseract.image_to_string(prepared, config=DIGIT_CONFIG)
            # Joining an empty match list already yields "", so no special case is needed.
            digit_runs = " ".join(re.findall(r"\d+", recognized))
            return {"file": str(image_path), "text": digit_runs}
    except Exception as exc:
        logging.exception(f"Failed OCR for {image_path}: {exc}")
        return {"file": str(image_path), "text": "", "error": str(exc)}


In [6]:
# Safe CSV writer with permission fallback
from datetime import datetime


def safe_write_csv(rows: List[Dict[str, str]], csv_path: Path) -> Path:
    """Write ``rows`` to ``csv_path``, falling back to a timestamped sibling file.

    The header is the sorted union of all keys found in ``rows`` (or
    ``file``/``text`` when there are no keys). If writing to ``csv_path``
    raises ``PermissionError``, the same content is written next to it as
    ``<stem>_<YYYYmmdd_HHMMSS><suffix>`` and a warning is logged.

    Returns:
        The path that was actually written.
    """
    csv_path.parent.mkdir(parents=True, exist_ok=True)

    columns = sorted({key for row in rows for key in row.keys()})
    if not columns:
        columns = ["file", "text"]

    def dump_to(destination: Path) -> None:
        # Single place that knows how the CSV is laid out.
        with open(destination, mode="w", newline="", encoding="utf-8") as handle:
            out = csv.DictWriter(handle, fieldnames=columns)
            out.writeheader()
            out.writerows(rows)

    try:
        dump_to(csv_path)
    except PermissionError:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        fallback = csv_path.with_name(csv_path.stem + f"_{stamp}" + csv_path.suffix)
        dump_to(fallback)
        logging.warning(
            f"Permission denied for {csv_path}. Wrote to fallback {fallback} instead."
        )
        return fallback
    return csv_path


def run_batch_ocr_to_csv_safe(inputs_dir: Path, csv_path: Path) -> Path:
    """OCR every image yielded for ``inputs_dir`` and write the results as CSV.

    When ``inputs_dir`` does not exist an error is logged, nothing is written,
    and ``csv_path`` is returned unchanged. Otherwise one record per image is
    collected and handed to ``safe_write_csv``; the path it reports as written
    is returned.

    Image discovery is delegated to ``iter_image_paths``, which is defined
    elsewhere in the notebook.
    """
    if not inputs_dir.exists():
        logging.error(f"Input directory does not exist: {inputs_dir}")
        return csv_path

    records: List[Dict[str, str]] = [
        ocr_digits_from_image_path(image_file)
        for image_file in iter_image_paths(inputs_dir)
    ]
    destination = safe_write_csv(records, csv_path)
    logging.info(f"Wrote {len(records)} rows to {destination}")
    return destination

# Re-run with fallback
# CROPS_DIR and OUTPUT_CSV are not defined in the cells shown here;
# presumably they are set in an earlier cell (confirm before Restart & Run All).
written_csv = run_batch_ocr_to_csv_safe(CROPS_DIR, OUTPUT_CSV)
# Bare last expression so the notebook displays the path that was actually written.
written_csv




WindowsPath('dataset/inference/train_numbers_20250930_075403.csv')