In [4]:
# paste into extract_text_from_images.py or into a notebook cell
#!/usr/bin/env python3
"""
Same script as before, but argparse uses parse_known_args() so it won't crash inside Jupyter.
"""
import argparse
from pathlib import Path
import cv2
import numpy as np
import pandas as pd
from skimage import exposure
from PIL import Image
import pytesseract
import sys

# If tesseract isn't on PATH, uncomment and set the path:
# pytesseract.pytesseract.tesseract_cmd = r"/usr/bin/tesseract"

def ensure_dirs(*dirs):
    for d in dirs:
        d.mkdir(parents=True, exist_ok=True)

def load_input_image(input_path: Path, input_dir: Path):
    if input_path and input_path.exists():
        img = cv2.imread(str(input_path))
        if img is None:
            raise RuntimeError(f"Failed to read image {input_path}")
        return img, str(input_path)
    if input_dir.exists():
        for f in sorted(input_dir.iterdir()):
            if f.suffix.lower() in (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp"):
                img = cv2.imread(str(f))
                if img is not None:
                    return img, str(f)
    fallback = Path("/Users/jyotirmoy/Desktop/Image/ancient-script-ai/external/tamil_ocr_repo/Input Images/Inscriptions - Wiki1/31.jpg")
    if fallback.exists():
        img = cv2.imread(str(fallback))
        if img is not None:
            return img, str(fallback)
    raise FileNotFoundError("No input image found. Place Original.jpg or images in 'Input Images/' or provide --input path")

def preprocess(img, denoise=True, adaptive=True):
    if img.ndim == 3:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray = img.copy()
    p2, p98 = np.percentile(gray, (2, 98))
    gray = exposure.rescale_intensity(gray, in_range=(p2, p98))
    if denoise:
        gray = cv2.medianBlur(gray, 3)
    if adaptive:
        th = cv2.adaptiveThreshold(gray.astype(np.uint8), 255,
                                   cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 25, 12)
    else:
        _, th = cv2.threshold(gray.astype(np.uint8), 0, 255,
                              cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    return gray, th

def segment_lines(th_img, min_height=12, gap_tol=5):
    hp = np.sum(th_img, axis=1)
    h = th_img.shape[0]
    lines = []
    in_line = False
    start = 0
    empty_rows = 0
    for i in range(h):
        if hp[i] > 0:
            if not in_line:
                in_line = True
                start = i
            empty_rows = 0
        else:
            if in_line:
                empty_rows += 1
                if empty_rows >= gap_tol:
                    end = i - empty_rows + 1
                    if (end - start) >= min_height:
                        s = max(0, start - 2)
                        e = min(h, end + 2)
                        lines.append((s, e))
                    in_line = False
                    empty_rows = 0
    if in_line:
        end = h - 1
        if (end - start) >= min_height:
            s = max(0, start - 2)
            e = min(h, end + 2)
            lines.append((s, e))
    return lines

def segment_words_from_line(line_th, kernel_width=40, kernel_height=12, min_area=500):
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_width, kernel_height))
    dilated = cv2.dilate(line_th, kernel, iterations=1)
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if w * h >= min_area:
            boxes.append((x, y, w, h))
    boxes = sorted(boxes, key=lambda b: b[0])
    return boxes, dilated

def save_image(path: Path, img):
    cv2.imwrite(str(path), img)

def main(args):
    root = Path('.')
    input_dir = root / "Input Images"
    out_words = root / "extracted_words"
    out_chars = root / "extracted_characters"
    ensure_dirs(out_words, out_chars)

    input_img_path = Path(args.input) if args.input else None
    img, used_path = load_input_image(input_img_path, input_dir)
    print(f"Loaded image: {used_path} shape={img.shape}")

    gray, th = preprocess(img, denoise=True, adaptive=True)
    print("Preprocessing complete.")

    lines = segment_lines(th, min_height=args.line_min_height, gap_tol=args.line_gap_tol)
    print(f"Detected {len(lines)} lines")

    word_records = []
    char_records = []
    word_id = 0
    char_id = 0

    for li, (s, e) in enumerate(lines):
        line_th = th[s:e, :]
        boxes, dil = segment_words_from_line(line_th,
                                             kernel_width=args.kernel_width,
                                             kernel_height=args.kernel_height,
                                             min_area=args.min_area)
        for (x, y, w, h) in boxes:
            full_x = x
            full_y = s + y
            word_crop_inv = th[s + y: s + y + h, x: x + w]
            save_img = 255 - word_crop_inv
            fname = out_words / f"word_line{li:02d}_{word_id:04d}.png"
            save_image(fname, save_img)
            word_records.append({
                "line": li,
                "word_id": word_id,
                "bbox_full": (int(full_y), int(full_x), int(h), int(w)),
                "file": str(fname)
            })
            word_id += 1
        print(f"Line {li}: found {len(boxes)} word(s)")

    for wrec in word_records:
        word_path = Path(wrec["file"])
        wp = cv2.imread(str(word_path), cv2.IMREAD_GRAYSCALE)
        if wp is None:
            continue
        _, bw = cv2.threshold(wp, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        bw_inv = 255 - bw
        contours, _ = cv2.findContours(bw_inv, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        boxes = []
        for c in contours:
            x, y, w, h = cv2.boundingRect(c)
            if w * h >= args.char_min_area:
                boxes.append((x, y, w, h))
        boxes = sorted(boxes, key=lambda b: b[0])
        for (x, y, w, h) in boxes:
            crop = bw[y:y + h, x:x + w]
            fname = out_chars / f"char_{char_id:05d}.png"
            save_image(fname, crop)
            char_records.append({
                "word_file": str(word_path),
                "char_file": str(fname),
                "bbox": (int(x), int(y), int(w), int(h))
            })
            char_id += 1

    print(f"Saved {word_id} words and {char_id} character candidates.")

    ocr_list = []
    use_lang = args.lang
    for wrec in word_records:
        path = wrec["file"]
        pil = Image.open(path).convert("L")
        arr = np.array(pil)
        if arr.mean() < 127:
            arr = 255 - arr
        pil_for_ocr = Image.fromarray(arr)
        try:
            if use_lang:
                txt = pytesseract.image_to_string(pil_for_ocr, lang=use_lang)
            else:
                txt = pytesseract.image_to_string(pil_for_ocr)
        except Exception as e:
            print("Tesseract exception:", e, "-> retrying without lang")
            txt = pytesseract.image_to_string(pil_for_ocr)
        ocr_list.append({"file": path, "text": txt.strip()})

    pd.DataFrame(word_records).to_csv("segmented_words.csv", index=False)
    pd.DataFrame(char_records).to_csv("segmented_characters.csv", index=False)
    pd.DataFrame(ocr_list).to_csv("ocr_results.csv", index=False)

    print("Saved segmented_words.csv, segmented_characters.csv, ocr_results.csv")
    print("Done.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Extract words/characters/text from images (segmentation + Tesseract OCR)")
    parser.add_argument("--input", "-i", help="Input image path (optional). If not provided, uses Original.jpg or first image in 'Input Images/'.")
    parser.add_argument("--kernel-width", type=int, default=40, help="Morphology kernel width for word segmentation (increase for larger gaps)")
    parser.add_argument("--kernel-height", type=int, default=12, help="Morphology kernel height for word segmentation")
    parser.add_argument("--min-area", type=int, default=500, help="Minimum area (pixels) to keep a word bounding box")
    parser.add_argument("--char-min-area", type=int, default=30, help="Minimum area for character connected-component")
    parser.add_argument("--line-min-height", type=int, default=12, help="Minimum height for a line")
    parser.add_argument("--line-gap-tol", type=int, default=5, help="Allowed small empty-row gaps inside a line")
    parser.add_argument("--lang", type=str, default="tam", help="Language code for tesseract (set to 'tam' for Tamil traineddata). Set empty to use default.")
    # Use parse_known_args so Jupyter kernel args won't break it
    args, unknown = parser.parse_known_args()
    if args.lang == "":
        args.lang = None
    try:
        main(args)
    except Exception as exc:
        print("Error:", exc)
        sys.exit(1)

Error: No input image found. Place Original.jpg or images in 'Input Images/' or provide --input path


SystemExit: 1