In [None]:

import ast
import json
import os
import shutil
from glob import glob
from zipfile import ZipFile

import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
from pptx import Presentation
from tqdm import tqdm
# Root directory that holds the downloaded zip archives.
# The location is machine-specific, so it can be overridden with the
# ACADEMIC_PAPER_SRC environment variable; the original path stays the default.
src = os.environ.get(
    "ACADEMIC_PAPER_SRC",
    "/home/kai/workspace/DeepDocs_Project/datalake/source/academic_paper_understanding/32.학술논문 이해 데이터/",
)

In [None]:
# Recursively collect every zip archive found below the source root.
zip_pattern = f"{src}/**/*.zip"
data = glob(zip_pattern, recursive=True)

In [None]:
# Extract every archive below "./", mirroring the sub-directory layout found
# under `src`.
for file in data:
    print(f"Unzipping {file}...")
    # relpath() copes with a trailing slash on `src` and with archives that sit
    # directly in the source root; the earlier str.replace() did not match the
    # latter and extracted them back into the source directory.
    target_dir = os.path.join(".", os.path.relpath(os.path.dirname(file), src))
    with ZipFile(file, 'r') as zip_ref:
        zip_ref.extractall(target_dir)
    # Report the real extraction target rather than the archive's own directory.
    print(f"Unzipped {file} to {target_dir}")

In [None]:
# Gather the extracted PPTX slides and JSON annotations into two flat folders.
pptx_dirpath = "./pptx_files"
json_dirpath = "./json_files"
for target_dir in (pptx_dirpath, json_dirpath):
    os.makedirs(target_dir, exist_ok=True)
for extension, target_dir in (("pptx", pptx_dirpath), ("json", json_dirpath)):
    matches = glob(f"./3.개방데이터/**/*.{extension}", recursive=True)
    for file in tqdm(matches):
        shutil.move(file, target_dir)

In [None]:
# PPTX 파일을 이미지로 변환하는 함수
import subprocess
import os

def convert_pptx_to_image(pptx_path, output_dir='./', format='jpeg'):
    """Convert a PPTX file to an image and return the path of that image.

    Two back ends are tried in order:

    1. LibreOffice in headless mode (Linux / macOS / Windows).
    2. The PowerPoint COM API through ``comtypes`` (Windows only).

    Args:
        pptx_path: Path of the ``.pptx`` file to convert.
        output_dir: Directory that receives the image; created when missing.
        format: Target image format, also used as the output file extension
            (for example ``'jpeg'`` or ``'png'``).

    Returns:
        ``<output_dir>/<pptx basename>.<format>`` on success, or ``None`` when
        no back end produced the file.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Base name of the presentation without its extension.
    filename = os.path.basename(pptx_path).rsplit('.', 1)[0]
    output_path = f"{os.path.join(output_dir, filename)}.{format}"

    # Method 1: LibreOffice
    # Equivalent shell command:
    #   libreoffice --headless --convert-to jpeg --outdir . slide.pptx
    cmd = [
        'libreoffice', '--headless',
        '--convert-to', format,
        '--outdir', output_dir,
        pptx_path
    ]
    try:
        result = subprocess.run(cmd, check=True, text=True, capture_output=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing `libreoffice` executable (FileNotFoundError),
        # which previously escaped and skipped the fallback entirely.
        print("LibreOffice 변환 실패:", e)
        print("Windows나 다른 방법을 시도합니다...")
    else:
        # A zero exit status alone does not prove the image was written, so the
        # output file is checked before success is reported.
        if os.path.exists(output_path):
            print(f"변환 성공: {result.stdout}")
            return output_path
        print("LibreOffice 변환 실패:", (result.stderr or result.stdout).strip())
        print("Windows나 다른 방법을 시도합니다...")

    # Method 2: PowerPoint COM API (Windows only)
    if os.name == 'nt':
        # PpSaveAsFileType constants; unknown formats keep the previous
        # behaviour of saving as PNG (18).
        save_formats = {
            'gif': 16, 'jpg': 17, 'jpeg': 17, 'png': 18,
            'bmp': 19, 'tif': 21, 'tiff': 21,
        }
        save_format = save_formats.get(str(format).lower(), 18)
        powerpoint = None
        try:
            import comtypes.client
            powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
            powerpoint.Visible = True
            presentation = powerpoint.Presentations.Open(os.path.abspath(pptx_path))
            try:
                # An absolute path is passed so the result does not depend on
                # PowerPoint's own working directory.
                presentation.SaveAs(os.path.abspath(output_path), save_format)
            finally:
                presentation.Close()
            return output_path

        except Exception as e:
            print("Windows COM 변환 실패:", e)
            print("수동으로 변환 후 진행해주세요.")
            return None

        finally:
            # Always shut PowerPoint down, including after a failed conversion.
            if powerpoint is not None:
                try:
                    powerpoint.Quit()
                except Exception as e:
                    print("Windows COM 변환 실패:", e)

    print("호환되는 변환 방법을 찾을 수 없습니다. PDF로 변환 후 pdf2image를 사용해보세요.")
    return None

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed

pptx_dirpath = "./pptx_files"
images_dirpath = "./pptx_images"
# Named `image_format` so the builtin `format` is not shadowed for the rest of
# the notebook.
image_format = 'jpeg'
pptx_files = glob(f"{pptx_dirpath}/*.pptx")

# Parallel conversion.
# Each entry of `results` is (pptx_file, output_path); output_path is None when
# the conversion failed. convert_pptx_to_image() returns only a path (or None),
# so the source file is looked up through the `futures` mapping.
results = []
with ProcessPoolExecutor(max_workers=4) as executor:
    futures = {
        executor.submit(convert_pptx_to_image, pptx_file, images_dirpath, image_format): pptx_file
        for pptx_file in pptx_files
    }
    for future in tqdm(as_completed(futures), total=len(futures), desc="Converting PPTX to Images (Parallel)"):
        pptx_file = futures[future]
        try:
            output_path = future.result()
        except Exception as exc:
            # One crashing worker must not abort the remaining conversions.
            print(f"변환 중 오류 발생 ({pptx_file}): {exc}")
            output_path = None
        results.append((pptx_file, output_path))

# Summary of the results
success = [r for r in results if r[1]]
failed = [r for r in results if not r[1]]
print(f"변환 성공: {len(success)}개, 실패: {len(failed)}개")
if failed:
    print("실패 파일:", [f[0] for f in failed])

In [None]:
def is_y_overlap(box1, box2, min_overlap_ratio=0.2):
    """Tell whether two boxes share enough vertical extent to be on one line.

    Args:
        box1: Box whose index 1 is the top y and index 3 the bottom y.
        box2: Second box with the same layout.
        min_overlap_ratio: Minimum overlap, relative to the shorter box.

    Returns:
        True when the vertical overlap divided by the height of the shorter
        box reaches ``min_overlap_ratio``; False otherwise, including when the
        boxes do not overlap at all.
    """
    top_a, bottom_a = box1[1], box1[3]
    top_b, bottom_b = box2[1], box2[3]

    # Length of the vertical intersection, clamped at zero.
    shared = max(0, min(bottom_a, bottom_b) - max(top_a, top_b))
    if shared == 0:
        return False

    # The overlap is measured against the shorter of the two boxes.
    shorter = min(bottom_a - top_a, bottom_b - top_b)
    return True if shared / shorter >= min_overlap_ratio else False


def assign_reading_order(
    data,
    column_grouping=True,
    n_col=3,
    location_field="location",
    box_field="box",
    order_field="reading_order"
):
    """Assign a 1-based reading order to layout elements.

    Every dict in ``data`` is updated in place with ``box_field`` (a
    ``(x0, y0, x1, y1)`` tuple), ``"x_center"``, ``"y_center"``,
    ``order_field`` and, when ``column_grouping`` is on, ``"col"``.

    Args:
        data: List of dicts whose ``location_field`` is a string literal of
            ``x, y, w, h``.
        column_grouping: When True, elements with no vertically overlapping
            neighbour come first (top to bottom), followed by ``n_col`` equal
            width columns from left to right, each read top to bottom. When
            False, elements are sorted by ``(y_center, x_center)``.
        n_col: Number of equal-width columns used for grouping.
        location_field: Key holding the location string.
        box_field: Key that receives the parsed corner box.
        order_field: Key that receives the reading order.

    Returns:
        The elements as a new list in reading order.

    Raises:
        ValueError: If ``location_field`` and ``box_field`` are the same key.
    """
    if location_field == box_field:
        raise ValueError("location_field와 box_field는 다르게 설정해야 합니다.")

    def y_key(item):
        return item["y_center"]

    # Turn each "x, y, w, h" literal into a corner box plus its centre point.
    for item in data:
        left, top, box_w, box_h = map(int, ast.literal_eval(item[location_field]))
        right, bottom = left + box_w, top + box_h
        item[box_field] = (left, top, right, bottom)
        item["x_center"] = (left + right) / 2
        item["y_center"] = (top + bottom) / 2

    if not column_grouping:
        ordered = sorted(data, key=lambda item: (item["y_center"], item["x_center"]))
    else:
        # An element is "isolated" (col == -1) when no other element overlaps
        # it vertically; all remaining elements are column candidates.
        for idx, item in enumerate(data):
            has_neighbor = any(
                is_y_overlap(item[box_field], other[box_field], min_overlap_ratio=0.2)
                for other_idx, other in enumerate(data)
                if other_idx != idx
            )
            item["col"] = None if has_neighbor else -1

        # Isolated elements are excluded; the rest are binned into n_col
        # equal-width columns spanning the candidates' x centres.
        candidates = [item for item in data if item["col"] is None]
        if candidates:
            x_centers = np.array([item["x_center"] for item in candidates])
            thresholds = np.linspace(x_centers.min(), x_centers.max(), n_col + 1)
            last_col = n_col - 1
            for item in candidates:
                xc = item["x_center"]
                for i in range(n_col):
                    lo, hi = thresholds[i], thresholds[i + 1]
                    # The last bin is closed on the right so the maximum fits.
                    if lo <= xc < hi or (i == last_col and lo <= xc <= hi):
                        item["col"] = i
                        break

        # Order: isolated elements (top to bottom), then each column from left
        # to right, top to bottom inside a column.
        ordered = sorted((item for item in data if item["col"] == -1), key=y_key)
        for col in range(n_col):
            ordered += sorted((item for item in data if item.get("col") == col), key=y_key)

    for position, item in enumerate(ordered, 1):
        item[order_field] = position

    return ordered


In [None]:
from sklearn.cluster import KMeans

def assign_reading_order_kmeans(
    data,
    n_col=2,       # number of columns
    location_field="location",
    box_field="box",
    order_field="reading_order",
    min_col_size=2 # minimum column-group size (smaller groups are merged)
):
    """Assign a 1-based reading order using KMeans column clustering.

    Every dict in ``data`` is updated in place with ``box_field`` (a
    ``(x0, y0, x1, y1)`` tuple), ``"x_center"``, ``"y_center"``, ``"col"`` and
    ``order_field``. Relies on ``is_y_overlap`` and ``KMeans`` being available
    in the notebook namespace.

    Args:
        data: List of dicts whose ``location_field`` is a string literal of
            ``x, y, w, h``.
        n_col: Number of KMeans clusters (columns).
        location_field: Key holding the location string.
        box_field: Key that receives the parsed corner box.
        order_field: Key that receives the reading order.
        min_col_size: Columns with fewer members than this are merged into
            another column.

    Returns:
        All elements as a new list in reading order: isolated elements first
        (top to bottom), then each column from left to right, then any
        element that could not be assigned to a column.
    """
    from collections import Counter

    def y_key(d):
        return d["y_center"]

    # 1. Parse the coordinates and compute the centre points.
    for d in data:
        x0, y0, w, h = map(int, ast.literal_eval(d[location_field]))
        x1, y1 = x0 + w, y0 + h
        d[box_field] = (x0, y0, x1, y1)
        d["x_center"] = (x0 + x1) / 2
        d["y_center"] = (y0 + y1) / 2

    # 2. Mark elements without any vertically overlapping neighbour as
    #    isolated (col == -1). The debug print that used to sit in this loop
    #    read d['box'] and so failed for any custom `box_field`.
    for idx, d in enumerate(data):
        has_neighbor = any(
            is_y_overlap(d[box_field], d2[box_field], min_overlap_ratio=0.2)
            for i2, d2 in enumerate(data)
            if i2 != idx
        )
        d["col"] = None if has_neighbor else -1

    # 3. Column grouping (KMeans on the x centres of the remaining elements).
    col_group_candidates = [d for d in data if d["col"] is None]
    if col_group_candidates and len(col_group_candidates) >= n_col:
        x_centers = np.array([[d["x_center"]] for d in col_group_candidates])
        kmeans = KMeans(n_clusters=n_col, random_state=42, n_init=10).fit(x_centers)
        # Renumber the clusters by ascending centroid so that column 0 is the
        # left-most one.
        centroid_order = np.argsort(kmeans.cluster_centers_.flatten())
        old2new_col = {int(old): new for new, old in enumerate(centroid_order)}
        for d, lbl in zip(col_group_candidates, kmeans.labels_):
            d["col"] = old2new_col[int(lbl)]
        # Fallback for columns with too few boxes: merge each into the column
        # with the most members, ties broken by the closest column index.
        # The counts are taken once, before any merge.
        col_counts = Counter(d["col"] for d in col_group_candidates)
        for col, count in col_counts.items():
            if count < min_col_size:
                main_col = max(col_counts, key=lambda k: (col_counts[k], -abs(k - col)))
                for d in col_group_candidates:
                    if d["col"] == col:
                        d["col"] = main_col

    # 4. Isolated elements first, then every column top to bottom.
    data_sorted = sorted([d for d in data if d["col"] == -1], key=y_key)
    for col in range(n_col):
        data_sorted.extend(sorted([d for d in data if d.get("col") == col], key=y_key))
    # Elements still without a column (fewer candidates than n_col, so no
    # clustering ran) used to be dropped from the result; they are now
    # appended in top-to-bottom, left-to-right order.
    data_sorted.extend(
        sorted(
            [d for d in data if d["col"] is None],
            key=lambda d: (d["y_center"], d["x_center"]),
        )
    )

    # 5. Assign reading_order.
    for i, d in enumerate(data_sorted, 1):
        d[order_field] = i

    return data_sorted

In [None]:
# Maps the prefix of a section's paragraph_id to the label shown on the image.
class_map = {
    "para": "text_plane",
    "title": "title",
}
json_files = glob(f"{json_dirpath}/ST_0008_0086731.json")
img = Image.open("pptx_images/ST_0008_0086731.jpeg")
draw = ImageDraw.Draw(img)
# Image size and label font do not change per section, so they are set once.
width, height = img.size
label_font = ImageFont.load_default(size=30)
for json_path in tqdm(json_files, desc="Processing JSON files"):
    pptx_path = json_path.replace(json_dirpath, pptx_dirpath).replace('.json', '.pptx')
    # PPTX file information
    pptx = Presentation(pptx_path)
    pptx_slide = pptx.slides[0]
    pptx_width = pptx.slide_width
    pptx_height = pptx.slide_height
    # print("PPTX:", pptx_width, pptx_height)


    # JSON file information
    with open(json_path, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    section_info = json_data['training_data_info']['section_info']
    label = {
        "page": 1,
        "reading_order": True,
        "elements": []
    }
    #section_info = sorted(section_info, key=lambda x: eval(x['summary_cnt']))
    #section_info = assign_reading_order(section_info,n_col=3)
    #section_info = assign_reading_order_kmeans(section_info, n_col=3)
    random_colors = np.random.randint(0, 256, size=(len(section_info), 3))
    for idx, section in enumerate(section_info):
        # `location` is a string literal of four numbers, used below as
        # x, y, w, h. ast.literal_eval parses it without executing arbitrary
        # code, matching the reading-order helpers in this notebook.
        bbox = ast.literal_eval(section['location'])
        class_name = section['paragraph_id'].split("_")[0]  # e.g. "para" from "para_1"
        class_name = class_map.get(class_name, None)
        # if class_name is None:
        #     continue
        # Normalise by the slide size, then scale to image pixels.
        x, y, w, h= bbox[0]/ pptx_width, bbox[1]/pptx_height, bbox[2]/pptx_width, bbox[3]/pptx_height

        bbox = (x, y, x + w, y + h)
        bbox = (int(bbox[0] * width), int(bbox[1] * height),
                int(bbox[2] * width), int(bbox[3] * height))
        # tolist() yields plain Python ints for the RGB outline colour.
        draw.rectangle(bbox, outline=tuple(random_colors[idx].tolist()), width=5)
        # draw.text((bbox[0], bbox[1]), class_name, fill=(255, 0, 0), font=ImageFont.load_default(size=30))
        draw.text((bbox[0], bbox[1]-20), f"{idx} {class_name}",fill=(255, 0, 0), font=label_font)

In [None]:
# Display the image with the bounding boxes drawn in the previous cell.
img

In [None]:
# Inspect the annotation JSON loaded for the last processed file.
json_data

In [None]:
# Slide dimensions read from the PPTX (pptx.slide_width, pptx.slide_height).
pptx_width, pptx_height

In [None]:
# Size (width, height) of the current image. Read from `img` directly because
# `img_width` / `img_height` are only assigned by later cells, which made this
# cell fail on a fresh kernel.
img.size

In [None]:
import numpy as np
img = Image.open("SS_0025_0032078.jpeg")
img_width, img_height = img.size
print(f"Image Size: {img_width}, {img_height}")
with open("SS_0025_0032078.json", 'r', encoding='utf-8') as f:
    data = json.load(f)[0]  # only the first slide's data is used
boxes = data['boxes']
draw = ImageDraw.Draw(img)

# One random RGB colour per box.
random_colors = np.random.randint(0, 256, size=(len(boxes), 3))
for idx, box_info in tqdm(enumerate(boxes), desc="Processing sections"):
    sx, sy, sw, sh = box_info['box']
    value = box_info['value']
    cls = box_info['class']
    # The box values are multiplied by the image size to obtain pixel corners.
    left, top = sx * img_width, sy * img_height
    right, bottom = (sx + sw) * img_width, (sy + sh) * img_height
    box = (left, top, right, bottom)
    # 6. Draw the bbox on the image (PIL)
    draw.rectangle(box, outline=tuple(random_colors[idx]), width=5)
    draw.text((left, top), f"{idx}", fill=(255, 0, 0), font=ImageFont.load_default(size=30))


# 7. Resize (longest side becomes 1280)
max_len = 1280
ratio = max_len / max(img_width, img_height)
new_size = (int(img_width * ratio), int(img_height * ratio))
resized_img = img.resize(new_size, Image.LANCZOS)

In [None]:
import numpy as np
# NOTE(review): `slide_image` is not assigned anywhere in the visible notebook;
# it must hold the path of the rendered slide image before this cell runs.
img = Image.open(slide_image)
img_width, img_height = img.size
print(f"Image Size: {img_width}, {img_height}")

# Factors that map PPTX slide units onto image pixels.
scale_x = img_width / pptx_width
scale_y = img_height / pptx_height
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
draw = ImageDraw.Draw(img)

section_info = data['training_data_info']['section_info']

random_colors = np.random.randint(0, 256, size=(len(section_info), 3))
label_font = ImageFont.load_default(size=30)  # loaded once instead of once per box
for idx, section in tqdm(enumerate(section_info), total=len(section_info), desc="Processing sections"):
    # `location` is a string literal of four numbers, used below as x, y, w, h.
    # ast.literal_eval parses it without executing arbitrary code.
    bbox = ast.literal_eval(section['location'])
    class_name = section['paragraph_id']
    # Convert slide units to pixels
    x, y, w, h= [int(bbox[0]*scale_x), int(bbox[1]*scale_y),
                      int(bbox[2]*scale_x), int(bbox[3]*scale_y)]
    bbox = (x, y, x + w, y + h)
    # 6. Draw the bbox on the image (PIL)
    draw.rectangle(bbox, outline=tuple(random_colors[idx].tolist()), width=5)
    draw.text((x, y), class_name, fill=(255, 0, 0), font=label_font)

# 7. Resize (longest side becomes 1280)
max_len = 1280
ratio = min(max_len / img_width, max_len / img_height)
resized_img = img.resize((int(img_width * ratio), int(img_height * ratio)), Image.LANCZOS)

# 8. The bboxes scale with the image (the boxes drawn above are resized with it)

In [None]:
# Display the resized image produced by the previous cell.
resized_img

In [None]:
# Path of the JSON annotation file currently bound to `json_path`.
json_path

In [None]:
def scale_bbox(bbox, scale_x, scale_y):
    """Scale a four-value box and truncate each value to an int.

    Args:
        bbox: Indexable with at least four numbers; indices 0 and 2 are
            scaled horizontally, indices 1 and 3 vertically.
        scale_x: Factor applied to indices 0 and 2.
        scale_y: Factor applied to indices 1 and 3.

    Returns:
        A new list of four ints.
    """
    factors = (scale_x, scale_y, scale_x, scale_y)
    return [int(bbox[i] * factors[i]) for i in range(4)]

# Fit an example box to the slide range (e.g. 9144000 × 6858000)
image = Image.open(slide_image)
scale_x = image.width / pptx_width
scale_y = image.height / pptx_height
bbox = scale_bbox(
    [1715509, 11773448, 1715509 + 26691962, 11773448 + 3318171],  # example bbox
    scale_x,
    scale_y,
)
draw = ImageDraw.Draw(image)
draw.rectangle(bbox, outline="red", width=5)
image

In [None]:
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw
import os
import json

def extract_tables_with_thead(html):
    """Return every ``<table>`` element found in ``html`` as an HTML string.

    Args:
        html: HTML markup to parse with BeautifulSoup's ``html.parser``.

    Returns:
        A list with ``str(table)`` for each table returned by ``find_all``;
        empty when the markup contains no table.
    """
    parsed = BeautifulSoup(html, "html.parser")
    return [str(table) for table in parsed.find_all("table")]

# 3. Load the converted image
# Derive the image file name; the cell assumes the first slide was exported as
# '<pptx basename>.png' in the working directory.
# NOTE(review): confirm this matches where the conversion step writes its output.
pptx_basename = os.path.basename(pptx_path).rsplit('.', 1)[0]
img_path = f"{pptx_basename}.png"  # first PPTX slide converted to PNG

# Stop with a clear error when the image is missing; the earlier version only
# printed this message and then failed inside Image.open().
# convert_pptx_to_image from the earlier cell can be used to create the file.
if not os.path.exists(img_path):
    raise FileNotFoundError(f"경고: {img_path} 파일이 없습니다. PPTX를 PNG로 먼저 변환하세요.")

img = Image.open(img_path)
img_width, img_height = img.size
print(f"Image Size: {img_width}, {img_height}")

# Factors that map PPTX slide units onto image pixels.
scale_x = img_width / pptx_width
scale_y = img_height / pptx_height
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

draw = ImageDraw.Draw(img)  # one drawing context for all boxes
for section in data['training_data_info']['section_info']:
    # `location` is a string literal of x, y, w, h, as in the other cells and
    # the reading-order helpers of this notebook. It was drawn here as if it
    # were (x1, y1, x2, y2), which misplaced the right/bottom edges.
    # ast.literal_eval parses it without executing arbitrary code.
    x, y, w, h = ast.literal_eval(section['location'])
    # Convert slide units to pixels
    x1, y1 = int(x*scale_x), int(y*scale_y)
    x2, y2 = x1 + int(w*scale_x), y1 + int(h*scale_y)

    # 6. Draw the bbox on the image (PIL)
    draw.rectangle([x1, y1, x2, y2], outline='red', width=3)

# 7. Resize (longest side becomes 1280)
max_len = 1280
ratio = min(max_len / img_width, max_len / img_height)
resized_img = img.resize((int(img_width * ratio), int(img_height * ratio)), Image.LANCZOS)

# 8. The bboxes scale with the image (the boxes drawn above are resized with it)

# 9. Save
resized_img.save('slide_with_bbox.png')
print("완료: slide_with_bbox.png")

In [None]:
# Collect every HTML table found in the paragraph contexts of the JSON files.
records = []
for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in tqdm(data['data']):
        for paragraph in item['paragraphs']:
            html = paragraph['context']
            tables = extract_tables_with_thead(html)
            # One record per table; paragraphs without tables add nothing.
            records.extend({"html": table} for table in tables)
# Naming the column explicitly keeps `html` present even when no table was
# found, so the de-duplication step that selects df['html'] still works.
df = pd.DataFrame(records, columns=["html"])

In [None]:
# Remove rows with a duplicated `html` value from df and report the counts.
df_unique = df.drop_duplicates(subset=['html'])
unique_count, total_count = len(df_unique), len(df)
print(f"Total unique HTML tables: {unique_count}")
print(f"Total HTML tables: {total_count}")

In [None]:
# Write the de-duplicated tables to a parquet file, without the index column.
df_unique.to_parquet("admindocs_mrc_tables.parquet", index=False)

In [None]:
# Scratch check: a raw string literal holding two backslash characters.
r"\\"

In [None]:
# Only the rows of df_unique['html'] that contain a backslash.
has_backslash = df_unique['html'].str.contains("\\", regex=False)
df_unique[has_backslash]

In [None]:
import asyncio
from playwright.async_api import async_playwright
async def render_html_and_get_bboxes(html, tags=("table", "tr", "td", "p", "h1", "h2")):
    """Render HTML in headless Chromium and collect element bounding boxes.

    Args:
        html: Markup to render; non-string values are converted with ``str``.
        tags: CSS selectors (tag names by default) whose matching elements
            are measured.

    Returns:
        A tuple ``(img, bboxes)``. ``img`` is the full-page JPEG screenshot as
        returned by ``page.screenshot``. ``bboxes`` is a list of dicts with
        the keys ``tag``, ``idx``, ``text``, ``x``, ``y``, ``width`` and
        ``height``, grouped by selector in the order given by ``tags``.
    """
    if not isinstance(html, str):
        html = str(html)
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        # try/finally closes the browser even when rendering or measuring fails.
        try:
            page = await browser.new_page(viewport={"width": 1280, "height": 1280})
            await page.set_content(html)
            # Inject a basic table style so cell borders are visible.
            await page.evaluate("""
            () => {
                // 기본 스타일 강제 적용
                let style = document.createElement('style');
                style.textContent = `
                    table { border-collapse: collapse; }
                    table, th, td { border: 1px solid #333; }
                    th, td { padding: 4px; }
                `;
                document.head.appendChild(style);
            }
        """)
            await page.wait_for_timeout(100)  # let the rendering settle
            # Page-absolute position and size of every element matching `tags`.
            bboxes = await page.evaluate(
                """
            (tags) => {
                let results = [];
                tags.forEach(tag => {
                    document.querySelectorAll(tag).forEach((el, idx) => {
                        let rect = el.getBoundingClientRect();
                        results.push({
                            tag: tag,
                            idx: idx,
                            text: el.innerText,
                            x: rect.left + window.scrollX,
                            y: rect.top + window.scrollY,
                            width: rect.width,
                            height: rect.height
                        });
                    });
                });
                return results;
            }
            """,
                list(tags)
            )
            img = await page.screenshot(full_page=True, type='jpeg')
        finally:
            await browser.close()
        return img, bboxes

In [None]:
# img_bytes, bboxes = await render_html_and_get_bboxes(str(tables[0]))

In [None]:
# from PIL import Image, ImageDraw, ImageFont
# import io
# img = Image.open(io.BytesIO(img_bytes))
# draw = ImageDraw.Draw(img)
# colors = [f"#{i:02x}{j:02x}00" for i in range(0, 256, 51) for j in range(0, 256, 51)]
# for bbox in bboxes:
#     tag = bbox['tag']
#     text = bbox['text']
#     x = bbox['x']
#     y = bbox['y']
#     width = bbox['width']
#     height = bbox['height']
#     # random outline color
#     color = colors[bbox['idx'] % len(colors)]
#     draw.rectangle([x, y, x + width, y + height], outline=color, width=3)
#     # draw.text((x + 5, y + 5), f"{tag}", fill="blue", font=ImageFont.load_default(size=20))

In [None]:
# import ipywidgets as widgets
# from PIL import Image, ImageDraw, ImageFont
# import matplotlib.pyplot as plt
# from io import BytesIO

# img = Image.open(io.BytesIO(img_bytes))
# bboxes = sorted(bboxes, key=lambda x: (x['y'], x['x']))  # y, x 순으로 정렬
# def show_bbox(idx):
#     im = img.copy()
#     draw = ImageDraw.Draw(im)
#     bbox = bboxes[idx]
#     tag = bbox['tag']
#     x = bbox['x']
#     y = bbox['y']
#     width = bbox['width']
#     height = bbox['height']
#     color = "red"
#     draw.rectangle([x, y, x + width, y + height], outline=color, width=3)
#     display(im)


In [None]:

# # 슬라이더 위젯
# slider = widgets.IntSlider(value=0, min=0, max=len(bboxes)-1, step=1, description='BBox')

# widgets.interact(show_bbox, idx=slider)