In [30]:
import os, csv, re
import datetime
from pdf2image import convert_from_path
from PIL import Image
from kraken import binarization, blla, rpred
from kraken.lib import models
from dateutil import parser
import pandas as pd
import json
import subprocess

In [49]:
# Create base dirs if they don't exist
os.makedirs("images", exist_ok=True)
os.makedirs("ocr", exist_ok=True)

# Create timestamped run dirs
timestamp = datetime.datetime.now().strftime("run_%Y%m%d_%H%M%S")
img_run_dir = os.path.join("images", timestamp)
ocr_run_dir = os.path.join("ocr", timestamp)
os.makedirs(img_run_dir, exist_ok=True)
os.makedirs(ocr_run_dir, exist_ok=True)

print(f"[INFO] Saving images to {img_run_dir}")
print(f"[INFO] Saving OCR text to {ocr_run_dir}")

json_path = os.path.join(ocr_run_dir, "ocr_output.json")
ocr_results = []

model_path = "models/arabic_best.mlmodel"
# Load OCR model
model = models.load_any("models/arabic_best.mlmodel")

os.makedirs("gt", exist_ok=True)

# Convert page 11 from PDF to images
pages = convert_from_path("books/attacks.pdf", dpi=300, first_page=11, last_page=11)

[INFO] Saving images to images/run_20250831_224411
[INFO] Saving OCR text to ocr/run_20250831_224411


In [None]:
import cv2
import numpy as np

def contour_crops(img_path, out_dir, prefix):
    os.makedirs(out_dir, exist_ok=True)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

    # binarize (Otsu, or replace with Sauvola)
    _, thresh = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # invert if text is white on black
    if np.mean(thresh) > 127:
        thresh = cv2.bitwise_not(thresh)

    # find contours
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    crops = []
    for j, cnt in enumerate(contours):
        x, y, w, h = cv2.boundingRect(cnt)

        # skip tiny blobs
        if w*h < 500:
            continue

        crop = img[y:y+h, x:x+w]
        out_path = os.path.join(out_dir, f"{prefix}_roi{j}.png")
        cv2.imwrite(out_path, crop)
        crops.append(out_path)

    return crops


In [None]:
page = 11

for i, page in enumerate(pages, start=page):
    w, h = page.size
    halves = {
        "right": page.crop((w // 2, 0, w, h)),
        "left": page.crop((0, 0, w // 2, h)),
    }

    for side, img in halves.items():
        # Convert PIL → OpenCV
        cv_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)

        # Binarize for contour detection (only for segmentation, not OCR input)
        _, thresh = cv2.threshold(cv_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        if np.mean(thresh) > 127:
            thresh = cv2.bitwise_not(thresh)

        # Find contours
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Sort top-to-bottom, left-to-right
        contours = sorted(contours, key=lambda c: (cv2.boundingRect(c)[1], cv2.boundingRect(c)[0]))

        roi_texts = []
        for j, cnt in enumerate(contours):
            x, y, w, h = cv2.boundingRect(cnt)

            # Skip small blobs
            if w * h < 500:
                continue

            # Crop ROI and convert back to PIL for kraken
            roi = img.crop((x, y, x + w, y + h))

            # Directly segment + OCR (no binarization here)
            seg = blla.segment(roi)
            pred = rpred.rpred(model, roi, seg)
            roi_texts.append("\n".join([line.prediction for line in pred]))

        # Collect results for JSON
        ocr_results.append({
            "page": i,
            "side": side,
            "text": "\n".join(roi_texts)
        })

In [None]:
page = 11

for i, page in enumerate(pages, start=page):
    w, h = page.size
    halves = {
        "right": page.crop((w // 2, 0, w, h)),
        "left": page.crop((0, 0, w // 2, h)),
    }
 

    for side, img in halves.items():
        img_path = os.path.join(img_run_dir, f"page_{i}_{side}.png")
        xml_out = os.path.join("gt", f"page_{i}_{side}.xml")
        img.save(img_path)
        # contour step
        roi_dir = os.path.join(img_run_dir, f"page_{i}_{side}_rois")
        roi_imgs = contour_crops(img_path, roi_dir, f"page_{i}_{side}")
    
        

        cmd = ["kraken", "-x", "-i", img_path, xml_out, "binarize", "segment", "ocr", "-m", model_path]
        
        try:
            subprocess.run(cmd, check=True)
            print(f"[OK] {xml_out}")
        except subprocess.CalledProcessError as e:
            print(f"[ERR] {img_path}: {e}")


Loading ANN models/arabic_best.mlmodel	✓
✓
Binarizing	Binarizing	✓
Segmenting images/run_20250829_164822/page_11_right.png	✓
Segmenting images/run_20250829_164822/page_11_right.png	✓
[2;36m                    [0m         [1m{[0m[32m'baselines'[0m[1m}[0m will be applied to       [2m            [0m
[2;36m                    [0m         segmentation of type bbox. This will   [2m            [0m
[2;36m                    [0m         likely result in severely degraded     [2m            [0m
[2;36m                    [0m         performace                             [2m            [0m
[2KProcessing [38;5;237m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m  0%[0m [35m0/32[0m [36m-:--:--[0m [33m0:00:00[0m✓
[2;36m                    [0m         [1m{[0m[32m'baselines'[0m[1m}[0m will be applied to       [2m            [0m
[2;36m                    [0m         segmentation of type bbox. This will   [2m            [0m
[2;36m                    [0m  