In [None]:
import hashlib
import json
import os
import re
import shutil
from glob import glob
from pathlib import Path

import pandas as pd
from PIL import Image
from tqdm import tqdm
# Root directory of the raw dataset. Defaults to the original workstation path;
# set the GDKH_DATA_DIR environment variable to run the notebook on another machine.
src = os.environ.get(
    "GDKH_DATA_DIR",
    "/home/kai/workspace/DeepDocs_Project/DataETL/source/provider=inhouse/gangdong_kyunghee_hospital/data",
)

In [None]:
# Collect every examination image and label file below the dataset root.
# glob() returns paths in arbitrary order, so the lists are sorted to keep the
# processing order (and therefore the exported rows) stable between runs.
examinations = sorted(glob(f"{src}/examinations/**/*.jpg", recursive=True))
examination_label = sorted(glob(f"{src}/examinations/**/*.json", recursive=True))
examination_jsonl = sorted(glob(f"{src}/examinations/**/*.jsonl", recursive=True))
len(examinations), len(examination_label), len(examination_jsonl)

In [None]:
# Build a mapping: image file name -> parsed ground-truth object, gathered from
# every JSONL label file. When a file name appears more than once, a warning is
# printed and the entry read last overwrites the earlier one.
exam_labels = {}
for jsonl_path in tqdm(examination_jsonl):
    # Decode explicitly as UTF-8 instead of relying on the locale default
    # (assumes the label files are UTF-8 encoded -- TODO confirm).
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank or trailing empty lines in the JSONL file.
                continue
            data = json.loads(line)
            file_name = data["file_name"]
            # "ground_truth" is itself a JSON document stored as a string.
            ground_truth = json.loads(data["ground_truth"])
            if file_name in exam_labels:
                print(f"Duplicate file name found: {file_name}")
            exam_labels[file_name] = ground_truth

# Drop entries whose ground truth decoded to null.
exam_labels = {k: v for k, v in exam_labels.items() if v is not None}

print(f"Exam labels: {len(exam_labels)}")

In [None]:
def remove_repeated_phrase(text):
    """Collapse a string made of one phrase repeated back-to-back into a single copy.

    Whitespace is ignored when detecting the repetition, so "ab ab ab" counts
    as "ab" repeated three times. The shortest repeating unit is used, and the
    returned value is the leading slice of the stripped input that contains
    that unit (inner whitespace of the first occurrence is preserved).

    If the text is not a pure repetition, the stripped text is returned as is.
    """
    stripped = text.strip()
    compact = re.sub(r'\s+', '', stripped)  # text with all whitespace removed
    total = len(compact)

    # Smallest unit length whose repetition reproduces the compact text exactly.
    unit_len = next(
        (
            k
            for k in range(1, total // 2 + 1)
            if total % k == 0 and compact[:k] * (total // k) == compact
        ),
        None,
    )
    if unit_len is None:
        # Not a repeated structure: hand back the stripped original.
        return stripped

    # Walk the stripped text until `unit_len` non-whitespace characters are seen;
    # that position marks the end of the first occurrence of the phrase.
    seen = 0
    for pos, ch in enumerate(stripped):
        if not ch.isspace():
            seen += 1
        if seen == unit_len:
            return stripped[:pos + 1].strip()
    return stripped[:0].strip()

def get_sha256(file_path, chunk_size=1 << 20):
    """Return the hexadecimal SHA-256 digest of a file's contents.

    The file is read in binary mode in chunks of ``chunk_size`` bytes, so
    memory use stays bounded regardless of file size.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops as soon as read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

In [None]:
def _clean_exam_value(key, value):
    """Normalize a single field value of an ``exam_info`` entry.

    Applied to every key/value pair; ``refer_val`` and ``unit`` get extra,
    key-specific clean-up. Assumes ``value`` is a string -- TODO confirm
    against the label schema.

    Args:
        key: Field name of the entry.
        value: Raw field value.

    Returns:
        The cleaned value: stripped, single quotes replaced by double quotes,
        and runs of whitespace collapsed to one space.
    """
    # Strip first so the bracket/range checks below are not defeated by
    # leading or trailing whitespace.
    value = value.strip()

    if key == "refer_val":
        # Remove a lone, unbalanced square bracket, e.g. "[ 2.5" -> "2.5".
        # Values whose brackets are balanced are left untouched.
        if value.startswith('[') and not value.endswith(']'):
            value = value[1:].strip()
        if value.endswith(']') and not value.startswith('['):
            value = value[:-1].strip()
        # Two bare numbers separated only by whitespace are joined with "~"
        # (e.g. "2.5 3.0" -> "2.5~3.0"). The whole whitespace run is replaced,
        # so multiple spaces or tabs still produce a single "~".
        if re.match(r'^\d+(\.\d+)?\s+\d+(\.\d+)?$', value):
            value = re.sub(r'\s+', '~', value)

    if key == "unit":
        value = value.replace('ul', 'uL')

    value = value.strip().replace("'", '"')
    return re.sub(r'\s+', ' ', value)


# Images are copied to a content-addressed location: images/<sha256>.jpg.
image_dir = Path("images")
image_dir.mkdir(parents=True, exist_ok=True)

# Convert every labelled examination image into one record (image file name,
# image size, JSON-encoded label) and export all records to a parquet file.
records = []
for image_path in tqdm(examinations):
    file_name = os.path.basename(image_path)
    if file_name not in exam_labels:
        print(f"Missing label for examination image: {file_name}")
        continue
    ground_truth = exam_labels.get(file_name)
    if ground_truth is None:
        print(f"Ground truth is None for examination image: {file_name}")
        continue

    gt = ground_truth.get("gt_parse", {})
    # NOTE: a label without these keys raises KeyError (unchanged behavior).
    kie_label = gt['examinations']
    exam_label = gt['exam_label']['issuer']

    kie_converted = {
        'items': []
    }
    # The issuer name is only emitted when it is non-blank.
    if exam_label.strip():
        kie_converted['name'] = remove_repeated_phrase(exam_label)

    for item in kie_label:
        date = item['date']
        # Malformed dates are reported but still kept in the output.
        if not re.match(r'^\d{4}-\d{2}-\d{2}$', date):
            print(f"Invalid date format in {file_name}: {date}")
        item_ = {
            "date": date,
            "info": [
                {key: _clean_exam_value(key, value) for key, value in exam_info_.items()}
                for exam_info_ in item.get("exam_info", [])
            ],
        }
        kie_converted['items'].append(item_)

    # Read the dimensions inside a context manager so the underlying file
    # handle is released immediately instead of leaking one per image.
    with Image.open(image_path) as img:
        width, height = img.width, img.height

    imgsha256 = get_sha256(image_path)
    save_path = image_dir / f"{imgsha256}.jpg"
    shutil.copy(image_path, save_path)
    records.append({
        # Stored relative to the images directory, i.e. just the file name.
        "image_path": save_path.name,
        "width": width,
        "height": height,
        "label": json.dumps(kie_converted, ensure_ascii=False),
    })

df = pd.DataFrame(records)
df.to_parquet("examinations.parquet", index=False)