In [None]:
# ===============================
# üß† INSTALL + SERVER LAUNCH (STABLE VERSION)
# ===============================
!pip install -U bitsandbytes transformers==4.57.0 matplotlib opencv-python pandas tqdm accelerate gradio_client qwen_vl_utils hf_transfer autoawq docext shapely

In [None]:
import os, re, json, torch, cv2
import numpy as np
import pandas as pd
from difflib import SequenceMatcher
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig, pipeline
from PIL import Image
from shapely.geometry import Polygon
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

In [None]:
# ===============================
# ⚙️ Path settings
# ===============================
# All paths are relative to the notebook's working directory.
IMG_DIR = "Document06/TEST06_image"  # directory that is listed for the input .png images
JSON_DIR = "Document06/TEST06_json"  # directory that is listed for the OCR .json files
OUT_DIR = "vlm_outputs6"  # masked images are written here as masked_<stem>.png
MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"  # checkpoint id passed to from_pretrained for the VLM and its processor
os.makedirs(OUT_DIR, exist_ok=True)  # create the output directory up front; no error if it already exists

In [None]:
# ===============================
# 📦 Load the NER model
# ===============================
print("üîÑ Loading NER model (joon09/kor-naver-ner-name-v2.1)...")
ner_model_name = "joon09/kor-naver-ner-name-v2.1"
config = AutoConfig.from_pretrained(ner_model_name)
id2label = config.id2label
tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
ner_model = AutoModelForTokenClassification.from_pretrained(ner_model_name)
# aggregation_strategy="none": the pipeline returns one result per token, and
# downstream code reads each result's "entity" / "word" fields directly.
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=tokenizer, aggregation_strategy="none")
print("‚úÖ NER model loaded!")

# ===============================
# 🔺 Load the VLM
# ===============================
print("üîÑ Loading VLM model...")
# `dtype` replaces the deprecated `torch_dtype` keyword (the previous run of
# this cell emitted "`torch_dtype` is deprecated! Use `dtype` instead!").
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="auto", device_map="auto")
processor = AutoProcessor.from_pretrained(MODEL_NAME)
print("‚úÖ VLM model loaded!")

üîÑ Loading NER model (joon09/kor-naver-ner-name-v2.1)...


Device set to use cuda:0
`torch_dtype` is deprecated! Use `dtype` instead!


‚úÖ NER model loaded!
üîÑ Loading VLM model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

‚úÖ VLM model loaded!


In [None]:
# ===============================
# 🧩 UTILS
# ===============================
def natural_key(s: str):
    """Build a sort key that compares digit runs numerically.

    The string is split around runs of digits; digit chunks become ints and
    every other chunk is lower-cased, so "img2" sorts before "img10".

    Returns:
        list of alternating str / int chunks (the first chunk may be "").
    """
    key = []
    for chunk in re.split(r'(\d+)', s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

def normalize_ko(s: str) -> str:
    """Normalise text for fuzzy matching of Korean OCR / model output.

    Lower-cases the input and keeps only ASCII digits, ASCII lowercase
    letters and complete Hangul syllables. Whitespace, punctuation and
    standalone consonant jamo are removed.

    The Hangul ranges are written as ``\\uXXXX`` escapes so that a re-encoding
    of this file cannot corrupt them: the previous literal character classes
    had been garbled and no longer contained any Hangul, so every Korean
    character was stripped.

    Args:
        s: text to normalise, or None.

    Returns:
        The normalised string; "" when ``s`` is None.
    """
    if s is None:
        return ""
    s = s.lower()
    # Standalone consonant jamo U+3131..U+314E (OCR noise).
    s = re.sub(r"[\u3131-\u314e]", "", s)
    # Keep digits, a-z and Hangul syllables U+AC00..U+D7A3 only.
    s = re.sub(r"[^0-9a-z\uac00-\ud7a3]", "", s)
    return s

def seq_ratio(a: str, b: str) -> float:
    """Return the SequenceMatcher similarity ratio of two strings.

    If either argument is empty (or otherwise falsy) the result is 0.0 and no
    comparison is performed.
    """
    if a and b:
        return SequenceMatcher(None, a, b).ratio()
    return 0.0

def flatten_parsed_text(text: str) -> str:
    """Collapse OCR text onto one line and drop standalone consonant jamo.

    Newlines and tabs become spaces, consonant jamo (U+3131..U+314E) are
    removed, runs of whitespace are collapsed to one space and the result is
    stripped.

    The jamo range is written with ``\\uXXXX`` escapes: the previous literal
    character class had been garbled into a range that removed accented Latin
    letters and left the jamo in place.

    Args:
        text: raw parsed text, possibly None or empty.

    Returns:
        The flattened string; "" for falsy input.
    """
    if not text:
        return ""
    s = re.sub(r"[\n\t]+", " ", text)
    s = re.sub(r"[\u3131-\u314e]", "", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def flatten_parsed_text2(text: str) -> str:
    """Replace each run of newlines/tabs with one space and strip the ends.

    Unlike ``flatten_parsed_text`` this neither removes characters nor
    collapses runs of ordinary spaces. Falsy input yields "".
    """
    if text:
        return re.sub(r"[\n\t]+", " ", text).strip()
    return ""

def build_words_list(ocr_lines, y_thresh=15):
    """Flatten OCR lines into a reading-order list of words.

    Every entry of each line's "Words" list that has both a non-empty
    "WordText" and a "bbox" is collected. Words are sorted by the vertical
    centre of their box (mean of the y values of bbox[0] and bbox[2]) and cut
    into rows: a word stays in the current row while its centre is less than
    ``y_thresh`` away from the centre of the word just before it. Each row is
    then ordered by the x value of bbox[0].

    Args:
        ocr_lines: iterable of dicts, each optionally holding a "Words" list.
        y_thresh: maximum centre-to-centre distance between consecutive words
            of one row.

    Returns:
        list of {"text", "text_norm", "bbox"} dicts; [] when nothing usable
        was found.
    """
    candidates = []
    for ocr_line in ocr_lines:
        for entry in ocr_line.get("Words", []):
            text = entry.get("WordText", "")
            box = entry.get("bbox", None)
            if text and box:
                left, top = box[0]
                _, bottom = box[2]
                candidates.append({
                    "text": text,
                    "text_norm": normalize_ko(text),
                    "bbox": box,
                    "y_center": (top + bottom) / 2,
                    "x_left": left,
                })
    if not candidates:
        return []

    # Top-to-bottom order; the sort is stable, so ties keep OCR order.
    candidates.sort(key=lambda item: item["y_center"])

    # Chain consecutive words into rows by comparing neighbouring centres.
    rows = [[candidates[0]]]
    for above, below in zip(candidates, candidates[1:]):
        if abs(below["y_center"] - above["y_center"]) < y_thresh:
            rows[-1].append(below)
        else:
            rows.append([below])

    # Left-to-right inside each row, dropping the helper fields.
    return [
        {"text": item["text"], "text_norm": item["text_norm"], "bbox": item["bbox"]}
        for row in rows
        for item in sorted(row, key=lambda item: item["x_left"])
    ]

def merge_bboxes(bboxes):
    """Return the 4-point box enclosing all given boxes.

    Only point 0 (read as top-left) and point 2 (read as bottom-right) of each
    input box are used. The result is ordered top-left, top-right,
    bottom-right, bottom-left.
    """
    left = min(box[0][0] for box in bboxes)
    top = min(box[0][1] for box in bboxes)
    right = max(box[2][0] for box in bboxes)
    bottom = max(box[2][1] for box in bboxes)
    return [[left, top], [right, top], [right, bottom], [left, bottom]]

def bbox_iou(b1, b2):
    """Intersection-over-union of two boxes, each given as a list of points.

    Both boxes are turned into shapely Polygons. Returns 0.0 when the area of
    their union is not positive.
    """
    poly_a = Polygon(b1)
    poly_b = Polygon(b2)
    overlap_area = poly_a.intersection(poly_b).area
    combined_area = poly_a.union(poly_b).area
    if combined_area > 0:
        return overlap_area / combined_area
    return 0.0

# Name/address matching functions are kept as-is
# ===============================
# Name matching
# ===============================
def find_name_matches_by_length(words, target, masked_bboxes=None,
                                same_thresh=0.9, sim_thresh=0.6):
    """Locate the run of consecutive OCR words that best matches a name.

    For every start index, the normalised texts of consecutive words are
    concatenated. A concatenation is scored against the normalised target
    once its length reaches ``len(target.strip())``, and the run is abandoned
    as soon as the length exceeds that by more than one character. The
    highest-scoring run wins; the earliest one is kept on ties.

    Args:
        words: dicts with "text_norm" and "bbox" (as built by build_words_list).
        target: the name to look for.
        masked_bboxes: boxes that were already masked; may be None.
        same_thresh: IoU above which the match counts as already masked.
        sim_thresh: minimum similarity for a match to be accepted.

    Returns:
        ``[bbox]`` with the merged box of the best run, or ``[]`` when nothing
        qualifies.
    """
    if not target or len(words) == 0:
        return []

    raw_len = len(target.strip())
    wanted = normalize_ko(target)
    top_score, top_bbox = 0.0, None

    for start in range(len(words)):
        joined = ""
        span_bboxes = []
        for end in range(start, len(words)):
            joined += words[end]["text_norm"]
            span_bboxes.append(words[end]["bbox"])
            if len(joined) > raw_len + 1:
                break  # already too long; extending further cannot help
            if len(joined) < raw_len:
                continue  # still too short to be scored
            ratio = SequenceMatcher(None, wanted, joined).ratio()
            if ratio > top_score:
                top_score, top_bbox = ratio, merge_bboxes(span_bboxes)

    if top_bbox is None:
        return []
    # Names of one or two characters need a near-exact match.
    if raw_len <= 2 and top_score < 0.9:
        return []
    if top_score < sim_thresh:
        return []

    # Report and skip when the same location has been masked before.
    for done in masked_bboxes or ():
        if bbox_iou(done, top_bbox) > same_thresh:
            print(f"‚è© ÎèôÏùº ÏúÑÏπò Ïù¥Î¶Ñ Ïä§ÌÇµ: {target}")
            return []
    return [top_bbox]

# ===============================
# 🧩 NER name-merging function
# ===============================
def collect_full_names(ner_results):
    """Merge consecutive B-PER / I-PER tokens of an NER result into names.

    A token whose "entity" starts with "B-" opens a new name, one starting
    with "I-" extends the current name, and any token whose entity does not
    contain "PER" closes it. The word-piece marker "##" is removed from each
    token's "word" before it is joined.

    Example: B-PER "ab", I-PER "##cd", I-PER "##ef" -> ["abcdef"]

    Returns:
        list of merged name strings, in order of appearance.
    """
    names = []
    pending = ""
    for token in ner_results:
        label = token["entity"]
        if "PER" in label:
            piece = token["word"].replace("##", "")
            if label.startswith("B-"):
                # A new name begins; emit the one collected so far.
                if pending:
                    names.append(pending)
                pending = piece
            elif label.startswith("I-"):
                pending += piece
        elif pending:
            # A non-person token ends the current name.
            names.append(pending)
            pending = ""
    if pending:
        names.append(pending)
    return names

# ===============================
# Address matching
# ===============================
def find_address_span_bboxes_strict_skip(words, target, masked_bboxes,
                                         same_thresh=0.9, sim_thresh=0.6,
                                         window=6, max_area_ratio=0.4):
    """Locate the run of up to ``window`` OCR words that best matches an address.

    Every run of 1..window consecutive words is scored by comparing its
    concatenated normalised text with the normalised target. The best run is
    rejected when its score is below ``sim_thresh``, when its merged box
    covers more than ``max_area_ratio`` of the page extent (estimated from the
    largest bottom-right coordinates among all words), or when it overlaps an
    already masked box with IoU above ``same_thresh``.

    Args:
        words: dicts with "text_norm" and "bbox" (as built by build_words_list).
        target: the address text to look for.
        masked_bboxes: iterable of boxes that were already masked.

    Returns:
        ``[bbox]`` for an accepted match, otherwise ``[]``.
    """
    wanted = normalize_ko(target)
    if not wanted or len(words) == 0:
        return []

    top_score, top_bbox = 0.0, None
    for start in range(len(words)):
        stop_limit = min(start + window, len(words))
        joined = ""
        for idx in range(start, stop_limit):
            joined += words[idx]["text_norm"]
            ratio = SequenceMatcher(None, wanted, joined).ratio()
            if ratio > top_score:
                top_score = ratio
                top_bbox = merge_bboxes([w["bbox"] for w in words[start:idx + 1]])

    if top_bbox is None or top_score < sim_thresh:
        return []

    # Page extent approximated by the farthest bottom-right corner of any word.
    page_w = max(w["bbox"][2][0] for w in words)
    page_h = max(w["bbox"][2][1] for w in words)
    total_area = page_w * page_h
    (x_min, y_min), (x_max, y_max) = top_bbox[0], top_bbox[2]
    area = abs((x_max - x_min) * (y_max - y_min))
    if area > total_area * max_area_ratio:
        print(f"‚ö†Ô∏è Skipped huge bbox ({area/total_area:.1%} area): {target}")
        return []

    # Silently skip a location that has already been masked.
    for done in masked_bboxes:
        if bbox_iou(done, top_bbox) > same_thresh:
            return []
    return [top_bbox]
# ===============================
# ⚡ Fast OpenCV preprocessing
# ===============================
def preprocess_image_cv2(path):
    """Load an image, boost its contrast, sharpen it and return it as a PIL image.

    Steps: read with OpenCV, scale pixel values by 1.3, convolve with a 3x3
    sharpening kernel, convert BGR to RGB, wrap in ``PIL.Image``.

    Args:
        path: path of the image file.

    Returns:
        PIL.Image.Image in RGB channel order.

    Raises:
        FileNotFoundError: if OpenCV cannot read the file (missing, unreadable
            or not a decodable image).
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than with an opaque OpenCV error.
    if img is None:
        raise FileNotFoundError(f"cv2.imread could not load image (missing or unreadable): {path}")
    img = cv2.convertScaleAbs(img, alpha=1.3, beta=0)
    # 3x3 sharpening kernel; the weights sum to 1.
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    img = cv2.filter2D(img, -1, kernel)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(img)

In [None]:
# ===============================
# 📂 File pairing
# ===============================
# List the .png images and .json OCR files (extension check is case-insensitive),
# each in natural order.
img_files = [f for f in os.listdir(IMG_DIR) if f.lower().endswith(".png")]
img_files.sort(key=natural_key)
json_files = [f for f in os.listdir(JSON_DIR) if f.lower().endswith(".json")]
json_files.sort(key=natural_key)

# Index both listings by file stem so each image can be paired with its OCR file.
img_map = dict((os.path.splitext(f)[0], f) for f in img_files)
json_map = dict((os.path.splitext(f)[0], f) for f in json_files)

# Only stems that exist on both sides are processed.
common = sorted(set(img_map) & set(json_map), key=natural_key)
file_pairs = [(img_map[stem], json_map[stem]) for stem in common]
print(f"‚úÖ Ï¥ù {len(file_pairs)}Í∞ú ÌååÏùº ÌéòÏñ¥ÎßÅ ÏôÑÎ£å")

# ===============================
# 🧠 Per-document processing function (run in parallel)
# ===============================
def _fill_bbox(mask, bbox):
    """Set the rectangle spanned by ``bbox[0]`` and ``bbox[2]`` in ``mask`` to 255.

    Coordinates are truncated to int and clamped at 0: a negative slice bound
    would be read by NumPy as counting from the end of the axis and would
    paint the wrong region.
    """
    x1, y1, x2, y2 = (max(int(v), 0) for v in (bbox[0][0], bbox[0][1], bbox[2][0], bbox[2][1]))
    mask[y1:y2, x1:x2] = 255


def _string_items(value):
    """Return the non-blank strings contained in a value decoded from the VLM's JSON.

    A bare string counts as a single item (it is not iterated character by
    character); anything that is neither a string nor a list yields [].
    """
    if isinstance(value, str):
        value = [value]
    if not isinstance(value, list):
        return []
    return [item for item in value if isinstance(item, str) and item.strip()]


def process_document(pair):
    """Mask names and addresses in one document image and save the result.

    Pipeline:
      1. Preprocess the image and load its OCR JSON.
      2. Ask the VLM for ``{"names": [...], "addresses": [...]}``.
      3. Locate each returned name/address among the OCR words and mask its box.
      4. Run the NER pipeline on every whitespace-separated token of the OCR
         text and mask all OCR words equal to a token tagged PER or LOC.
      5. Black out the masked pixels and write ``masked_<stem>.png`` to OUT_DIR.

    If generation fails or the reply contains no usable JSON object, the
    document is skipped (no output file is written) and the reason is printed.

    Args:
        pair: ``(image_file_name, json_file_name)``, relative to IMG_DIR and
            JSON_DIR respectively.

    Returns:
        None.
    """
    img_file, json_file = pair
    img_path = os.path.join(IMG_DIR, img_file)
    json_path = os.path.join(JSON_DIR, json_file)
    masked_path = os.path.join(OUT_DIR, f"masked_{os.path.splitext(img_file)[0]}.png")

    image = preprocess_image_cv2(img_path)
    with open(json_path, "r", encoding="utf-8") as f:
        ocr = json.load(f)
    parsed_text_raw = ocr["ParsedResults"][0].get("ParsedText", "")
    ocr_lines = ocr["ParsedResults"][0]["TextOverlay"]["Lines"]
    words = build_words_list(ocr_lines)

    # Work on a NumPy copy of the image; `mask` is a single-channel map of the
    # pixels that will be blacked out.
    np_image = np.array(image)
    mask = np.zeros_like(np_image[:, :, 0], dtype=np.uint8)
    masked_bboxes = []

    # === 1) VLM inference ===
    # The Korean field labels below were restored from mis-encoded text.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": (
                "Carefully examine this Korean document image **from top-left to bottom-right**, "
                "reading it section by section just as a person would read a form.\n\n"
                "Guidelines:\n"
                "- When reading, look for **labels** like '성명', '예금주명', '신청인', '대표자' 등, "
                "and extract the corresponding **Name** to the right or below.\n"
                "- For '주소', '소재지', '변경전', '변경후', '현주소' 등, extract the **Address** nearby.\n"
                "- Names are short Korean words (2–4 syllables).\n"
                "- Addresses contain tokens like '시', '군', '구', '읍', '면', '동', '리', '길', '로'.\n\n"
                "Exclude labels and return valid JSON only: "
                '{"names": [...], "addresses": [...]}.\n\n'
            )},
        ],
    }]
    chat = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=[chat], images=[image], return_tensors="pt").to(model.device)

    try:
        with torch.no_grad():
            # NOTE(review): temperature/top_p only matter when sampling is on;
            # whether it is depends on the checkpoint's generation config — confirm.
            gen_ids = model.generate(**inputs, max_new_tokens=512, temperature=1.2, top_p=0.98)
        # Decode only the newly generated tokens, not the echoed prompt.
        output_text = processor.batch_decode(gen_ids[:, inputs["input_ids"].shape[1]:],
                                             skip_special_tokens=True)[0]
    except Exception as exc:
        # Best effort: skip this document, but say why instead of failing silently.
        print(f"[process_document] skipped {img_file}: VLM generation failed ({exc!r})")
        return

    # Greedy match from the first "{" to the last "}" of the reply.
    m = re.search(r"\{[\s\S]*\}", output_text)
    if not m:
        print(f"[process_document] skipped {img_file}: no JSON object in VLM output")
        return
    try:
        parsed = json.loads(m.group(0))
    except json.JSONDecodeError as exc:
        print(f"[process_document] skipped {img_file}: invalid JSON in VLM output ({exc})")
        return
    if not isinstance(parsed, dict):
        print(f"[process_document] skipped {img_file}: VLM output is not a JSON object")
        return
    # The model may return null, a bare string or non-string items; keep only
    # usable strings so the matchers below cannot crash the worker.
    names = _string_items(parsed.get("names"))
    addresses = _string_items(parsed.get("addresses"))

    # === 2) Mask what the VLM found ===
    for name in names:
        for b in find_name_matches_by_length(words, name, masked_bboxes):
            _fill_bbox(mask, b)
            masked_bboxes.append(b)
    for addr in addresses:
        for b in find_address_span_bboxes_strict_skip(words, addr, masked_bboxes):
            _fill_bbox(mask, b)
            masked_bboxes.append(b)

    # === 3-4) NER pass over the OCR text ===
    parsed_text_clean = flatten_parsed_text2(parsed_text_raw)
    address_words = [normalize_ko(addr) for addr in addresses]
    seen_tokens = set()

    for w in parsed_text_clean.split():
        # An identical token gives an identical NER result; run it once.
        if w in seen_tokens:
            continue
        seen_tokens.add(w)

        norm_w = normalize_ko(w)
        # A token that normalises to "" (pure punctuation) would otherwise
        # match every punctuation-only OCR word.
        if not norm_w:
            continue
        # Skip tokens containing standalone consonant jamo (U+3131..U+314E).
        if re.search(r"[\u3131-\u314e]", w):
            continue
        # Skip tokens that are part of an address returned by the VLM.
        if any(norm_w in addr for addr in address_words):
            continue

        results = ner_pipeline(w)
        has_per = any("PER" in ent["entity"] for ent in results)
        has_loc = any("LOC" in ent["entity"] for ent in results)
        if has_per or has_loc:
            # Mask every OCR word whose normalised text equals this token.
            for word_obj in words:
                if word_obj["text_norm"] == norm_w:
                    _fill_bbox(mask, word_obj["bbox"])

    # === Save ===
    np_image[mask == 255] = 0
    Image.fromarray(np_image).save(masked_path)


# ===============================
# 🧵 Parallel execution
# ===============================
# os.cpu_count() returns None when the CPU count cannot be determined, and
# min(4, None) would raise TypeError; fall back to a single worker.
max_workers = min(4, os.cpu_count() or 1)
# NOTE(review): every worker thread calls the same `model`, `processor` and
# `ner_pipeline` objects — confirm that concurrent use is safe, or set
# max_workers to 1.
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # list(...) drains the lazy map so every document is processed and any
    # exception raised in a worker is re-raised here.
    list(tqdm(executor.map(process_document, file_pairs), total=len(file_pairs)))