In [6]:
import os
from glob import glob
import shutil
from zipfile import ZipFile
import json
from tqdm import tqdm
import pandas as pd
src = "/home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc"

In [7]:
data = glob(f"{src}/**/*.zip", recursive=True)

In [8]:
# unzip to "./"
for file in data:
    print(f"Unzipping {file}...")
    with ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall(os.path.dirname(file).replace(src, "./"))
    print(f"Unzipped {file} to {os.path.dirname(file)}")

Unzipping /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터/VS_생명_LA.zip...
Unzipped /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터/VS_생명_LA.zip to /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터
Unzipping /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터/VS_인공물_EE.zip...
Unzipped /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터/VS_인공물_EE.zip to /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터
Unzipping /home/kai/workspace/DeepDocs_Project/datalake/source/tech_sci_mrc/152.기술과학_문서_기계독해_데이터/01-1.정식개방데이터/Validation/01.원천데이터/VS_인공물_ED.zip...
Unzipped /home/kai/workspace/

In [9]:
json_files = glob("152.기술과학_문서_기계독해_데이터/**/*.json", recursive=True)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
def extract_tables(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["style", "script"]):
        tag.decompose()

    results = []
    while True:
        all_tables = soup.find_all("table")
        if not all_tables:
            break

        # innermost tables: 더이상 내부 table이 없는 것만 추출
        innermost_tables = [t for t in all_tables if not t.find("table")]
        if not innermost_tables:
            break

        for table in innermost_tables:
            # 불필요한 속성 제거 (예: border)
            if table.attrs is not None:
                table.attrs.pop("border", None)

            # caption 제거
            caption = table.find("caption")
            if caption:
                caption.decompose()

            # 태그 사이 공백 삭제
            str_table = re.sub(r'>\s+<', '><', str(table))

            # tds, ths, trs 후처리(특정 속성 제거 등) 필요 시 여기에 추가
            # 예시: rowspan="1", colspan="1" 속성 삭제
            for tag in table.find_all(["th", "td", "tr"]):
                if tag.attrs.get("rowspan") == "1":
                    tag.attrs.pop("rowspan")
                if tag.attrs.get("colspan") == "1":
                    tag.attrs.pop("colspan")

            # 다시 공백 정리(위에서 속성 pop하면서 soup 트리에서 변동됨)
            # table이 <table></table>이면 빈 테이블이므로 제외
            str_table = re.sub(r'>\s+<', '><', str(table))
            table.extract()  # soup 트리에서 제거(pop)
            if str_table.strip() == "<table></table>":
                continue
            results.append(str_table)
            
    return results

In [None]:
records = []
for json_path in tqdm(json_files):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    for context_info in data['dataset']['context_info']:
        context = context_info['context']
        tables = extract_tables(context)
        if not tables:
            continue
        for table in tables:
            record = {
                "html": table,
            }
            records.append(record)
df = pd.DataFrame(records)

In [None]:
df_unique = df.drop_duplicates(subset=['html'])
print(f"Total unique HTML tables: {len(df_unique)}")
print(f"Total HTML tables: {len(df)}")

In [None]:
df_unique.to_parquet("tech_sci_mrc_tables.parquet", index=False)

In [None]:
import asyncio
from playwright.async_api import async_playwright
async def render_html_and_get_bboxes(html, tags=("table", "tr", "td", "p", "h1", "h2")):
    async with async_playwright() as p:
        if not isinstance(html, str):
            html = str(html)
        browser = await p.chromium.launch()
        page = await browser.new_page(viewport={"width": 1280, "height": 1280})
        await page.set_content(html)
        await page.evaluate("""
            () => {
                // 기본 스타일 강제 적용
                let style = document.createElement('style');
                style.textContent = `
                    table { border-collapse: collapse; }
                    table, th, td { border: 1px solid #333; }
                    th, td { padding: 4px; }
                `;
                document.head.appendChild(style);
            }
        """)
        await page.wait_for_timeout(100)  # 렌더링 안정화
        bboxes = await page.evaluate(
            """
            (tags) => {
                let results = [];
                tags.forEach(tag => {
                    document.querySelectorAll(tag).forEach((el, idx) => {
                        let rect = el.getBoundingClientRect();
                        results.push({
                            tag: tag,
                            idx: idx,
                            text: el.innerText,
                            x: rect.left + window.scrollX,
                            y: rect.top + window.scrollY,
                            width: rect.width,
                            height: rect.height
                        });
                    });
                });
                return results;
            }
            """,
            list(tags)
        )
        img_bytes = await page.screenshot(full_page=True, type='jpeg')
        await browser.close()
        return img_bytes, bboxes

In [None]:
tables[0]

In [None]:
img_bytes, bboxes = await render_html_and_get_bboxes(tables[0])

In [None]:
from PIL import Image, ImageDraw, ImageFont
import io
img = Image.open(io.BytesIO(img_bytes))
draw = ImageDraw.Draw(img)
colors = [f"#{i:02x}{j:02x}00" for i in range(0, 256, 51) for j in range(0, 256, 51)]
for bbox in bboxes:
    tag = bbox['tag']
    text = bbox['text']
    x = bbox['x']
    y = bbox['y']
    width = bbox['width']
    height = bbox['height']
    # random outline color
    color = colors[bbox['idx'] % len(colors)]
    draw.rectangle([x, y, x + width, y + height], outline=color, width=3)

In [None]:
img