# In [None]:
# ============================================================
# ocr_extractor.py (image -> text)
# ============================================================

import os, glob
import easyocr
from tqdm import tqdm

# Root of the data tree; every other path below is derived from it.
BASE = "/data/ephemeral/home/data"

# Input image folders for the train and test splits.
TRAIN_IMG_DIR = BASE + "/processed/stage0_6_train_v6"
TEST_IMG_DIR = BASE + "/processed/stage0_6_test_v6"

# Output folders that receive one .txt file per processed image.
SAVE_DIR_TRAIN = BASE + "/ocr/train_texts_v6"
SAVE_DIR_TEST = BASE + "/ocr/test_texts_v6"

# Create both output folders up front (no error if they already exist).
for _out_dir in (SAVE_DIR_TRAIN, SAVE_DIR_TEST):
    os.makedirs(_out_dir, exist_ok=True)

# Single shared EasyOCR reader for Korean + English, created once at
# module level and reused by every OCR call below.
reader = easyocr.Reader(["ko", "en"], gpu=True)

def extract_ocr_text(img_path):
    """Return the OCR text found in the image at ``img_path`` as one string.

    The module-level ``reader`` is called with ``detail=0`` and
    ``paragraph=True``; the pieces it returns are joined with single
    spaces and surrounding whitespace is stripped.
    """
    pieces = reader.readtext(img_path, paragraph=True, detail=0)
    joined = " ".join(pieces)
    return joined.strip()

def process_folder(raw_dir, save_dir):
    """Run OCR on every ``.jpg`` under ``raw_dir`` and save one ``.txt`` per image.

    Images are discovered recursively. The text of each image is written to
    ``<save_dir>/<image basename>.txt``. Images whose output file already
    exists are skipped, so an interrupted run can simply be restarted.

    Args:
        raw_dir: Directory searched recursively for ``*.jpg`` files.
        save_dir: Directory that receives the ``.txt`` files; created if
            it does not exist.

    A failure on a single image is printed and does not stop the run.

    NOTE: outputs are keyed by basename only, so two images with the same
    file name in different subdirectories map to the same output file and
    only the first one processed is kept.
    """
    os.makedirs(save_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sorting makes the processing
    # order (and which duplicate basename wins) reproducible across runs.
    files = sorted(glob.glob(os.path.join(raw_dir, "**", "*.jpg"), recursive=True))

    # normpath drops a trailing slash, which would otherwise make
    # basename() return an empty progress-bar label.
    label = os.path.basename(os.path.normpath(raw_dir))

    for fp in tqdm(files, desc=f"OCR Extracting {label}"):
        base = os.path.splitext(os.path.basename(fp))[0]
        save_path = os.path.join(save_dir, f"{base}.txt")
        if os.path.exists(save_path):
            continue
        tmp_path = f"{save_path}.tmp"
        try:
            text = extract_ocr_text(fp)
            # Write to a temp file and rename it into place: the resume check
            # above only tests existence, so a half-written final file would
            # be treated as finished and never regenerated.
            with open(tmp_path, "w", encoding="utf-8") as f:
                f.write(text)
            os.replace(tmp_path, save_path)
        except Exception as e:
            # Best effort: report the failing image and keep going.
            print(f"❌ {base}: {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # no temp file was created, or it is already gone

# Run OCR over the train split first, then the test split.
for _img_dir, _txt_dir in (
    (TRAIN_IMG_DIR, SAVE_DIR_TRAIN),
    (TEST_IMG_DIR, SAVE_DIR_TEST),
):
    process_folder(_img_dir, _txt_dir)

# Final status message (Korean: "OCR text extraction complete (v6)").
print("✅ OCR 텍스트 추출 완료 (v6 기준)")
