In [None]:
import hashlib
import json
import os
import re
import shutil
from glob import glob
from pathlib import Path

import pandas as pd
from PIL import Image
from tqdm import tqdm
# Directory that the glob patterns in the following cell search for images
# and label files. Defaults to the original workstation path; set the
# DEEPDOCS_SRC_DIR environment variable to run on another machine without
# editing the notebook.
src = os.environ.get(
    "DEEPDOCS_SRC_DIR",
    "/home/kai/workspace/DeepDocs_Project/DataETL/source/provider=inhouse/gangdong_kyunghee_hospital/data",
)

In [None]:
# Collect prescription images and their label files. glob() yields paths in
# an arbitrary, filesystem-dependent order, so each list is sorted to make
# downstream processing (and the resulting record order) reproducible.
prescriptions = sorted(glob(f"{src}/prescriptions/**/*.jpg", recursive=True))
prescription_label = sorted(glob(f"{src}/prescriptions/**/*.json", recursive=True))
prescription_jsonl = sorted(glob(f"{src}/prescriptions/**/*.jsonl", recursive=True))
# Cell output: how many files of each kind were found.
len(prescriptions), len(prescription_label), len(prescription_jsonl)

In [None]:
# Build a {file_name: ground_truth} lookup from every JSONL label file.
# Each non-blank line is a JSON object with a "file_name" field and a
# "ground_truth" field; the latter is itself a JSON-encoded string and is
# decoded here.
pres_labels = {}
for jsonl_path in tqdm(prescription_jsonl):
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(jsonl_path, "r", encoding="utf-8") as f:
        # Stream the file line by line instead of loading it all at once.
        for line in f:
            if not line.strip():
                # Blank / trailing lines would make json.loads raise.
                continue
            data = json.loads(line)
            file_name = data["file_name"]
            ground_truth = json.loads(data["ground_truth"])
            if file_name in pres_labels:
                # Later entries win; the duplicate is only reported.
                print(f"Duplicate file name found: {file_name}")
            pres_labels[file_name] = ground_truth

# Drop entries whose decoded ground truth is null.
pres_labels = {k: v for k, v in pres_labels.items() if v is not None}
print(f"Prescription labels: {len(pres_labels)}")

In [None]:
def remove_repeated_phrase(text):
    """Collapse a string made of one phrase repeated back-to-back.

    Whitespace is ignored when detecting the repetition, so "ab ab" and
    "abab" are both recognised as "ab" repeated twice. When a repetition is
    found, the first occurrence is returned as it appears in the stripped
    input (inner whitespace kept). Otherwise the stripped input is returned.
    """
    stripped = text.strip()
    compact = re.sub(r'\s+', '', stripped)  # all whitespace removed
    total = len(compact)

    # Index in `stripped` of every non-whitespace character, in order.
    visible_positions = [
        pos for pos, ch in enumerate(stripped) if not ch.isspace()
    ]

    # Try the shortest candidate unit first.
    for unit_len in range(1, total // 2 + 1):
        if total % unit_len != 0:
            # A unit that does not divide the length cannot tile the string.
            continue
        if compact[:unit_len] * (total // unit_len) != compact:
            continue
        # Cut right after the unit_len-th visible character.
        if len(visible_positions) >= unit_len:
            cut = visible_positions[unit_len - 1] + 1
        else:
            cut = 0
        return stripped[:cut].strip()

    # Not a repeated structure: return the stripped original.
    return stripped

def get_sha256(file_path, chunk_size=1024 * 1024):
    """Return the hex SHA-256 digest of the file at `file_path`.

    The file is read in binary mode in blocks of `chunk_size` bytes, so the
    whole file never has to fit in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the empty bytes returned on EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()

In [None]:
def _normalize_item_value(key, value):
    """Return one prescription-item field as a cleaned-up string.

    - None becomes "" and other non-string values go through str(), so the
      string operations below cannot fail on non-string values.
    - For 'p_days', 'dosage' and 'times_pd', a value that is not a plain
      number has its surrounding whitespace and inner spaces removed.
    - For every key: strip, replace single quotes with double quotes and
      collapse runs of whitespace into one space.
    """
    text = "" if value is None else str(value)
    if key in ('p_days', 'dosage', 'times_pd'):
        # Not a plain number: drop the spaces inside the value.
        if not re.match(r'^\d+(\.\d+)?$', text):
            text = text.strip().replace(" ", "")
    text = text.strip().replace("'", '"')
    return re.sub(r'\s+', ' ', text)


# Convert every labelled prescription image into one record, copy the image
# to images/<sha256>.jpg and write all records to prescriptions.parquet.
records = []
image_dir = Path("images")
image_dir.mkdir(parents=True, exist_ok=True)  # loop-invariant, create once

# Iterate the list directly so tqdm knows the total and can show progress.
for image_path in tqdm(prescriptions):
    file_name = os.path.basename(image_path)
    if file_name not in pres_labels:
        print(f"Missing label for prescription image: {file_name}")
        continue
    ground_truth = pres_labels.get(file_name)
    if ground_truth is None:
        print(f"Ground truth is None for prescription image: {file_name}")
        continue

    # Read the dimensions inside a context manager so the file handle is
    # released immediately instead of staying open for the whole loop.
    with Image.open(image_path) as img:
        width, height = img.size

    gt = ground_truth.get("gt_parse", {})
    kie_label = gt['prescriptions']
    date = gt['date']
    issuer_label = gt['issuer']

    # Only report a malformed date; the record is still written.
    if not isinstance(date, str) or not re.fullmatch(r'\d{4}-\d{2}-\d{2}', date):
        print(f"Invalid date format in {file_name}: {date}")

    kie_converted = {
        'date': date,
        'items': [],
    }
    # 'name' is only emitted when the issuer is non-empty.
    if issuer_label and issuer_label.strip():
        kie_converted['name'] = remove_repeated_phrase(issuer_label)
    for item in kie_label:
        kie_converted['items'].append(
            {key: _normalize_item_value(key, value) for key, value in item.items()}
        )

    # Content-addressed copy: the target name is the SHA-256 of the image.
    imgsha256 = get_sha256(image_path)
    save_path = image_dir / f"{imgsha256}.jpg"
    shutil.copy(image_path, save_path)
    records.append({
        # Only the file name is stored (without the "images/" directory).
        "image_path": save_path.name,
        "width": width,
        "height": height,
        "label": json.dumps(kie_converted, ensure_ascii=False),
    })

df = pd.DataFrame(records)
df.to_parquet("prescriptions.parquet", index=False)