In [210]:
import os
import cv2
import json
from glob import glob
from tqdm import tqdm
from matplotlib import pyplot as plt
from PIL import Image

In [37]:
HRDH_IMAGES = glob("HRDH/images/**/*.*g", recursive=True)
HRDS_IMAGES = glob("HRDS/images/**/*.*g", recursive=True)

HRDH_LABELS = glob("HRDH/t*/**/*.json", recursive=True)
HRDS_LABELS = glob("HRDS/t*/**/*.json", recursive=True)
LABEL_DICT = {}
for label in HRDH_LABELS + HRDS_LABELS:
    LABEL_DICT[os.path.basename(label).split(".json")[0]] = label

print("HRDH images:", len(HRDH_IMAGES))
print("HRDS images:", len(HRDS_IMAGES))
print("HRDH labels:", len(HRDH_LABELS))
print("HRDS labels:", len(HRDS_LABELS))
print("LABEL_DICT:", len(LABEL_DICT))

HRDH images: 21427
HRDS images: 10224
HRDH labels: 1500
HRDS labels: 1000
LABEL_DICT: 2500


In [191]:
from collections import defaultdict
def simple_page_split(data):
    
    pages = defaultdict(list)
    for item in data:
        pages[item['page']].append(item)
    
    result = {}
    
    for page_num, items in pages.items():
        page_indices = {}
        for new_idx, item in enumerate(items):
            for orig_idx, orig_item in enumerate(data):
                if (item is orig_item or
                    (item['text'] == orig_item['text'] and 
                     item['box'] == orig_item['box'] and 
                     item['page'] == orig_item['page'])):
                    page_indices[orig_idx] = new_idx
                    break
        
        # parent_id 재매핑
        remapped_items = []
        for item in items:
            new_item = item.copy()
            
            if new_item['parent_id'] == -1:
                pass 
            elif new_item['parent_id'] in page_indices:
                new_item['parent_id'] = page_indices[new_item['parent_id']]
            else:
                new_item['parent_id'] = -1
                
            remapped_items.append(new_item)
        
        result[page_num] = remapped_items
    
    return result

In [207]:
from datasets import Dataset

In [249]:
records = {
    "image_path": [],
    "label": [],
    "page": [],
    "original_path": [],
}
for image_path in tqdm(HRDH_IMAGES):
    image_path = image_path.replace("\\", "/")    
    file_ident = os.path.basename(os.path.dirname(image_path))
    label_path = LABEL_DICT.get(file_ident, None)
    if label_path is None:
        print(f"Label not found for {file_ident} in {image_path}")
        continue
    # image = cv2.imread(image_path)
    # pil_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # pil_image = Image.fromarray(pil_image).convert("RGB")
    page_num = int(os.path.basename(image_path).split(".")[0])
    with open(label_path, "r", encoding="utf-8") as f:
        label = json.load(f)

    label = simple_page_split(label).get(page_num, [])
    # image_path 절대경로
    
    records["image_path"].append(os.path.abspath(image_path).replace("\\", "/"))
    records["label"].append(label) 
    records["page"].append(page_num)
    records["original_path"].append(image_path)

100%|██████████| 21427/21427 [06:34<00:00, 54.27it/s] 


In [251]:
records

KeyboardInterrupt: 

In [None]:
from datasets import load
ds_ = load_from_disk("W:/datalake/staging/pending/hrdoc_raw_image_60394d56_20250623151052_unknown")

In [254]:
for image_path in tqdm(HRDS_IMAGES):
    image_path = image_path.replace("\\", "/")    
    file_ident = os.path.basename(os.path.dirname(image_path))
    label_path = LABEL_DICT.get(file_ident, None)
    if label_path is None:
        print(f"Label not found for {file_ident} in {image_path}")
        continue
    # image = cv2.imread(image_path)
    # pil_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # pil_image = Image.fromarray(pil_image).convert("RGB")
    page_num = int(os.path.basename(image_path).rsplit(".")[-2].split("_")[-1])
    with open(label_path, "r", encoding="utf-8") as f:
        label = json.load(f)

    label = simple_page_split(label).get(page_num, [])
    # for idx, info in enumerate(label):
    #     box = info["box"]
    #     text = info["text"]
    #     cls_ = info["class"]
        
        
    #     cv2.rectangle(image, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 1)
    #     cv2.putText(image, f"{idx}", (box[0], box[1] - 10), 
    #                 cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 1)
    records["image_path"].append(image_path)
    records["label"].append(label) 
    records["page"].append(page_num)
    records["original_path"].append(image_path)

100%|██████████| 10224/10224 [03:47<00:00, 44.89it/s]


In [255]:
ds = Dataset.from_dict(records)

In [265]:

# image_path를 전부 image로 변환
def image_to_pil(image_path):
    from PIL import Image
    pil_image = Image.open(image_path).convert("RGB")
    # 'image'에 저장
    return {"image": pil_image}

image_ds = ds.map(image_to_pil, 
                    input_columns=["image_path"],
                    remove_columns=["image_path"],
                    num_proc=16,
                    batch_size=512,
)

Map (num_proc=16): 100%|██████████| 31651/31651 [02:03<00:00, 256.04 examples/s]


In [266]:
image_ds.save_to_disk("HRDH_HRDS_dataset",
                        storage_options={"use_tracing": False, "use_compression": True})

Saving the dataset (11/11 shards): 100%|██████████| 31651/31651 [00:07<00:00, 3982.67 examples/s]


In [69]:
label

[{'text': 'EXPLORING THE “RUBIK’S MAGIC” UNIVERSE',
  'box': [159, 150, 435, 160],
  'class': 'title',
  'page': 0,
  'is_meta': True,
  'parent_id': -1,
  'relation': 'meta'},
 {'text': 'MAURIZIO PAOLINI',
  'box': [256, 182, 338, 190],
  'class': 'author',
  'page': 0,
  'is_meta': True,
  'parent_id': -1,
  'relation': 'meta'},
 {'text': 'Abstract. By using two different invariants for the Rubik’s Magic puzzle,',
  'box': [154, 208, 441, 218],
  'class': 'fstline',
  'page': 0,
  'is_meta': False,
  'parent_id': -1,
  'relation': 'contain'},
 {'text': 'one of metric type, the other of top ological type, we can dramatically reduce',
  'box': [154, 220, 441, 227],
  'class': 'para',
  'page': 0,
  'is_meta': False,
  'parent_id': 2,
  'relation': 'connect'},
 {'text': 'the universe of constructible configurations of the puzzle. Finding the set',
  'box': [154, 229, 441, 237],
  'class': 'para',
  'page': 0,
  'is_meta': False,
  'parent_id': 3,
  'relation': 'connect'},
 {'text': 'of 

In [None]:
image_path

'HRDH/images\\1401.3699\\0.png'

In [41]:
LABEL_DICT

{'1401.3699': 'HRDH\\test\\1401.3699.json',
 '1402.2741': 'HRDH\\test\\1402.2741.json',
 '1406.1682': 'HRDH\\test\\1406.1682.json',
 '1410.8366': 'HRDH\\test\\1410.8366.json',
 '1411.3334': 'HRDH\\test\\1411.3334.json',
 '1412.1395': 'HRDH\\test\\1412.1395.json',
 '1412.7419': 'HRDH\\test\\1412.7419.json',
 '1412.7854': 'HRDH\\test\\1412.7854.json',
 '1501.04826': 'HRDH\\test\\1501.04826.json',
 '1502.00973': 'HRDH\\test\\1502.00973.json',
 '1503.05697': 'HRDH\\test\\1503.05697.json',
 '1503.07150': 'HRDH\\test\\1503.07150.json',
 '1504.07339': 'HRDH\\test\\1504.07339.json',
 '1504.08093': 'HRDH\\test\\1504.08093.json',
 '1505.02091': 'HRDH\\test\\1505.02091.json',
 '1505.07717': 'HRDH\\test\\1505.07717.json',
 '1506.04787': 'HRDH\\test\\1506.04787.json',
 '1506.06314': 'HRDH\\test\\1506.06314.json',
 '1507.01067': 'HRDH\\test\\1507.01067.json',
 '1507.01439': 'HRDH\\test\\1507.01439.json',
 '1507.01715': 'HRDH\\test\\1507.01715.json',
 '1507.02346': 'HRDH\\test\\1507.02346.json',
 '15