In [None]:
from datasets import load_from_disk
from PIL import Image
import os
import numpy as np
import pandas as pd
import ipywidgets as widgets
from IPython.display import display
import io
import base64
import cv2
import json
from tqdm import tqdm
# Location of the on-disk dataset that is passed to load_from_disk() below.
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
src = "/home/kai/workspace/DeepDocs_Project/datalake/source/finance_legal_mrc_merged_table"

In [None]:
# Load the dataset stored at `src` using `datasets.load_from_disk`.
ds = load_from_disk(src)

In [None]:
# Show the dataset's repr as the cell output for a quick sanity check.
ds

In [None]:
import hashlib

def get_sha256_size(img_input):
    """
    Compute a content hash and the pixel dimensions of an image.

    The image is converted to RGB before hashing, so the digest is derived
    from the decoded RGB pixel buffer followed by the array's shape and dtype
    strings — not from the encoded file bytes.

    Args:
        img_input: A file path (``str`` or ``os.PathLike``) or a
            ``PIL.Image.Image`` object.

    Returns:
        tuple: ``(hash, width, height)`` where ``hash`` is the SHA-256 hex
        digest and ``width`` / ``height`` come from the RGB image's ``size``.

    Raises:
        ValueError: If ``img_input`` is neither a path nor a PIL image.
    """
    if isinstance(img_input, (str, os.PathLike)):
        # Open through a context manager so the file handle is released
        # deterministically; convert() returns an independent in-memory image
        # that stays valid after the file is closed.
        with Image.open(img_input) as opened:
            img = opened.convert("RGB")
    elif isinstance(img_input, Image.Image):
        img = img_input.convert("RGB")
    else:
        raise ValueError("img_input은 파일 경로나 PIL.Image 객체여야 합니다.")

    width, height = img.size
    arr = np.array(img)

    # Feeding the pieces incrementally yields the same digest as hashing their
    # concatenation, without building a second copy of the pixel buffer.
    hasher = hashlib.sha256()
    hasher.update(arr.tobytes())
    hasher.update(str(arr.shape).encode())
    hasher.update(str(arr.dtype).encode())
    return hasher.hexdigest(), width, height

In [None]:
from bs4 import BeautifulSoup
def extract_tables_with_thead(html):
    """
    Extract every <table> element from an HTML string as normalized markup.

    Normalization applied to the parsed document:
      * <style> and <script> elements are removed.
      * On th/td/tr tags that carry both rowspan="1" and colspan="1", both
        attributes are dropped.
      * For tags whose content resolves to a single string, the text is
        stripped of surrounding whitespace and of U+3000 / U+200B characters.
        Tags that still contain table structure (rows, cells, sections,
        nested tables) are left alone so the structure is never replaced by
        plain text.

    Args:
        html: HTML markup to parse.

    Returns:
        list[str]: One serialized HTML string per <table> found by
        ``find_all("table")``.
    """
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all("table")
    for tag in soup(["style", "script"]):
        tag.decompose()

    # Tags that make up the table skeleton; a tag wrapping any of these must
    # not have its children replaced by a text node.
    structure_tags = ["table", "thead", "tbody", "tfoot", "tr", "th", "td"]
    unit_span = {"rowspan": "1", "colspan": "1"}

    results = []
    for table in tables:
        for cell in table.find_all(["th", "td", "tr"], attrs=unit_span):
            cell.attrs.pop("rowspan", None)
            cell.attrs.pop("colspan", None)

        for tag in table.find_all(True):
            if not tag.string:
                continue
            # Tag.string looks through single-child chains, so for markup like
            # <tr><td>x</td></tr> the <tr> also reports "x". Assigning to
            # tr.string would then wipe out the <td>; skip such containers.
            if tag.find(structure_tags) is not None:
                continue
            cleaned = tag.get_text(strip=True)
            tag.string = cleaned.replace("\u3000", "").replace("\u200b", "")

        results.append(str(table))
    return results

In [None]:
records = []
images_dir = "./images"
# Image.save() does not create missing parent directories, so make sure the
# output folder exists before the loop starts writing into it.
os.makedirs(images_dir, exist_ok=True)

for row in tqdm(ds):
    image = row["image"]
    table_title = row["table_title"]
    table_html = row["table_html"]

    # Only rows whose HTML yields exactly one table are kept; others are
    # reported and skipped.
    tables = extract_tables_with_thead(table_html)
    if len(tables) != 1:
        print(f"Error: {len(tables)} tables found in {table_title}")
        continue
    table = tables[0]

    # The file name is the hash returned by get_sha256_size, so rows with the
    # same hash are written to the same file.
    hash_val, width, height = get_sha256_size(image)
    image_name = f"{hash_val}.jpg"
    image_path = os.path.join(images_dir, image_name)
    image.convert("RGB").save(image_path, format="jpeg")

    records.append({
        # Store the bare file name (relative to images_dir) instead of
        # stripping the directory prefix back out of the joined path.
        "image_path": image_name,
        "width": width,
        "height": height,
        "label": table,
    })

In [None]:
# Build a DataFrame with one row per collected record
# (record keys: image_path, width, height, label).
df = pd.DataFrame(records)

In [None]:
# De-duplicate in two passes: first rows sharing an image_path, then rows
# sharing a label. The second pass is chained onto the first; starting it from
# `df` again would discard the image_path de-duplication.
df_cleaned = (
    df
    .drop_duplicates(subset=["image_path"], keep="first")
    .drop_duplicates(subset=["label"], keep="first")
    .reset_index(drop=True)
)

In [None]:
# Write the records to a Parquet file without the DataFrame index.
# NOTE(review): this saves `df`, not the de-duplicated `df_cleaned` — confirm
# that keeping the duplicate rows in the output file is intended.
df.to_parquet("finance_legal_mrc_merged_table.parquet", index=False)

In [None]:
# Row count of the de-duplicated frame, shown as the cell output.
len(df_cleaned)