In [6]:
import pytesseract

# Point pytesseract at an explicit Tesseract executable path (a Windows install location).
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [7]:
# Sanity check: ask the Windows `where` command which tesseract executable it finds.
!where tesseract

C:\Program Files\Tesseract-OCR\tesseract.exe


In [8]:
import os
import json
import cv2
import pytesseract
from tqdm import tqdm

# -----------------------------
# Tesseract setup
# -----------------------------
# Command-line flags handed to pytesseract through its `config` argument.
# NOTE(review): per the Tesseract CLI docs, `--oem 3` selects the default OCR
# engine mode and `--psm 6` assumes a single uniform block of text -- confirm
# this matches the layout of the images being processed.
custom_config = "--oem 3 --psm 6"
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# -----------------------------
# Dataset locations
# -----------------------------
# Folder of images to read, and folder that receives one JSON file per image.
INPUT_DIR = r"D:\Y4 Research\datasets\dietary Images\denoised"
OUT_DIR = r"D:\Y4 Research\datasets\dietary Images\denoised_ocr_json"

# Create the output folder up front; a no-op when it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# OCR loop
# -----------------------------
def _parse_confidence(raw_conf):
    """Convert one Tesseract confidence value to an int.

    The value may arrive as an int, a float, or a numeric string; anything
    that cannot be converted falls back to 0 (best-effort, as before).
    """
    try:
        return int(float(raw_conf))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; KeyboardInterrupt and
        # unrelated errors now propagate instead of being hidden.
        return 0


def _collect_words(ocr_data):
    """Build per-word records from `image_to_data(..., output_type=DICT)` output.

    Entries whose text is empty after stripping whitespace are skipped.
    Each record holds the word text, its confidence, and its bounding box
    as [left, top, width, height].
    """
    words = []
    for i, raw_text in enumerate(ocr_data["text"]):
        text = raw_text.strip()
        if not text:
            continue
        words.append({
            "text": text,
            "confidence": _parse_confidence(ocr_data["conf"][i]),
            "bbox": [
                int(ocr_data["left"][i]),
                int(ocr_data["top"][i]),
                int(ocr_data["width"][i]),
                int(ocr_data["height"][i])
            ]
        })
    return words


# Filter and sort up front so the progress bar counts only images and the
# processing order is the same on every run (os.listdir order is arbitrary).
image_names = sorted(
    name for name in os.listdir(INPUT_DIR)
    if name.lower().endswith((".png", ".jpg", ".jpeg"))
)

# NOTE: the output name drops the extension, so two images that differ only by
# extension (e.g. a.png / a.jpg) write to the same JSON and the later one wins.
for img_name in tqdm(image_names):
    img_path = os.path.join(INPUT_DIR, img_name)
    image = cv2.imread(img_path)

    if image is None:
        # tqdm.write keeps the message from corrupting the progress bar.
        tqdm.write(f"Failed to read: {img_path}")
        continue

    # OpenCV loads images as BGR while pytesseract assumes RGB, so convert first.
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Full OCR text (keeps Tesseract's own line layout)
    ocr_text = pytesseract.image_to_string(image_rgb, config=custom_config)

    # OCR with bounding boxes
    data = pytesseract.image_to_data(
        image_rgb, config=custom_config, output_type=pytesseract.Output.DICT
    )

    output_json = {
        "image_id": img_name,
        "ocr_text": ocr_text,
        "words": _collect_words(data)
    }

    out_path = os.path.join(OUT_DIR, os.path.splitext(img_name)[0] + ".json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output_json, f, indent=2, ensure_ascii=False)

100%|██████████| 100/100 [02:48<00:00,  1.68s/it]
