In [None]:
from pptx import Presentation, enum
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER_TYPE
import json
import os

In [None]:
# Display every shape type and every placeholder type, each ordered by its numeric value.
(
    sorted(MSO_SHAPE_TYPE, key=lambda member: member.value),
    sorted(PP_PLACEHOLDER_TYPE, key=lambda member: member.value),
)

In [None]:

# Path of the PPTX file to analyse
pptx_path = "pptx_files/ST_0008_0086731.pptx"

# Open the presentation
pptx = Presentation(pptx_path)

# Placeholder type -> layout class (extend as needed)
PLACEHOLDER_CLASS_MAP = {
    PP_PLACEHOLDER_TYPE.TITLE: "title",
    PP_PLACEHOLDER_TYPE.SUBTITLE: "subtitle",
    PP_PLACEHOLDER_TYPE.SLIDE_NUMBER: "slide_number",
    PP_PLACEHOLDER_TYPE.TABLE: "table",
    PP_PLACEHOLDER_TYPE.HEADER: "header",
    # ...
}  # TODO: more placeholder types still need to be added

# Shape type -> layout class, used for shapes that are not placeholders
MSD_SHAPE_TYPE_MAP = {
    MSO_SHAPE_TYPE.AUTO_SHAPE: "text",
    MSO_SHAPE_TYPE.GROUP: "text",
    MSO_SHAPE_TYPE.PICTURE: "figure",
    MSO_SHAPE_TYPE.CHART: "chart",
    MSO_SHAPE_TYPE.TABLE: "table",
    MSO_SHAPE_TYPE.TEXT_BOX: "text",
    # ...
}  # TODO: more shape types still need to be added

layout_data = []

# The slide size is the same for every slide, so read it once.
slide_width, slide_height = pptx.slide_width, pptx.slide_height

for page_num, slide in enumerate(pptx.slides, start=1):
    page_info = {
        "page": page_num,
        "layout": slide.slide_layout.name,
        "boxes": []
    }
    for shape in slide.shapes:
        geometry = (shape.left, shape.top, shape.width, shape.height)
        # Position/size can be None when a shape carries no geometry of its
        # own (e.g. a placeholder with nothing to inherit from); dividing
        # None would raise TypeError, so such shapes are skipped.
        if any(value is None for value in geometry):
            continue

        # Normalise to fractions of the slide size
        sx, sy, sw, sh = geometry
        sx /= slide_width
        sy /= slide_height
        sw /= slide_width
        sh /= slide_height

        # Text content ("" when the shape has no text frame)
        value = shape.text if shape.has_text_frame else ""

        # Class: placeholders use the placeholder map, everything else the shape-type map
        if shape.is_placeholder:
            ph_type = shape.placeholder_format.type
            cls = PLACEHOLDER_CLASS_MAP.get(ph_type, "text")
        else:
            cls = MSD_SHAPE_TYPE_MAP.get(shape.shape_type, "text")
        box_info = {
            "box": [sx, sy, sx + sw, sy + sh],
            "value": value,
            "class": cls
        }
        page_info["boxes"].append(box_info)
    layout_data.append(page_info)

# layout_data is a list of dicts, so it can be dumped straight to JSON.
# splitext swaps only the real extension, unlike str.replace which would
# also rewrite ".pptx" occurring elsewhere in the name.
output_path = os.path.splitext(os.path.basename(pptx_path))[0] + ".json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(layout_data, f, ensure_ascii=False, indent=2)


In [None]:
# PPTX 파일을 이미지로 변환하는 함수
import subprocess
import os

def convert_pptx_to_image(pptx_path, output_dir='./', format='pdf'):
    """Convert a PPTX file to ``format`` and return the path of the result.

    Two back ends are tried in order:

    1. LibreOffice in headless mode (needs a ``libreoffice`` executable on
       ``PATH``).
    2. PowerPoint automation through ``comtypes`` (only when ``os.name`` is
       ``'nt'``).

    Args:
        pptx_path: Path of the presentation to convert.
        output_dir: Directory that receives the converted file; it is created
            if it does not exist.
        format: Target format handed to LibreOffice's ``--convert-to``. An
            optional ``:filter`` suffix is passed through to LibreOffice but
            left out of the output file name.

    Returns:
        The path of the converted file, or ``None`` when the input file is
        missing or no back end produced an output.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Fail fast instead of launching an external program on a missing input.
    if not os.path.isfile(pptx_path):
        print(f"입력 파일을 찾을 수 없습니다: {pptx_path}")
        return None

    # File name without its extension
    filename = os.path.basename(pptx_path).rsplit('.', 1)[0]
    # "--convert-to" accepts "ext:FilterName"; only the extension names the file.
    extension = format.split(':', 1)[0]
    output_path = os.path.join(output_dir, f"{filename}.{extension}")

    # Method 1: LibreOffice
    cmd = [
        'libreoffice', '--headless',
        '--convert-to', format,
        '--outdir', output_dir,
        pptx_path
    ]
    try:
        result = subprocess.run(cmd, check=True, text=True, capture_output=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary (FileNotFoundError
        # and friends), which previously escaped and skipped the fallback.
        print("LibreOffice 변환 실패:", e)
        print("Windows나 다른 방법을 시도합니다...")
    else:
        # Guard against a zero exit status with no file actually written.
        if os.path.exists(output_path):
            print(f"변환 성공: {result.stdout}")
            return output_path
        print("LibreOffice 변환 실패:", result.stderr or result.stdout)
        print("Windows나 다른 방법을 시도합니다...")

    # Method 2: PowerPoint via COM (Windows only)
    if os.name == 'nt':
        # PpSaveAsFileType values; the original always passed 18 (PNG) even
        # though the default target format is PDF.
        save_types = {
            'gif': 16,
            'jpg': 17,
            'jpeg': 17,
            'png': 18,
            'bmp': 19,
            'tif': 21,
            'tiff': 21,
            'pdf': 32,
        }
        save_type = save_types.get(extension.lower())
        if save_type is None:
            print(f"Windows COM 변환 실패: 지원하지 않는 형식입니다 ({format})")
            print("수동으로 변환 후 진행해주세요.")
            return None
        try:
            import comtypes.client
            powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
            try:
                powerpoint.Visible = True
                presentation = powerpoint.Presentations.Open(os.path.abspath(pptx_path))
                try:
                    # Absolute path so the result does not depend on
                    # PowerPoint's own working directory.
                    presentation.SaveAs(os.path.abspath(output_path), save_type)
                finally:
                    presentation.Close()
            finally:
                # Always shut PowerPoint down, even when the export fails.
                powerpoint.Quit()
            return output_path

        except Exception as e:
            print("Windows COM 변환 실패:", e)
            print("수동으로 변환 후 진행해주세요.")
            return None

    print("호환되는 변환 방법을 찾을 수 없습니다. PDF로 변환 후 pdf2image를 사용해보세요.")
    return None

# Convert the PPTX chosen in the earlier cell (pptx_path) using the function's
# default target format, then print the path it returned (None on failure).
slide_image = convert_pptx_to_image(pptx_path)
print(f"변환된 이미지 경로: {slide_image}")

**PPTX(layout class 추출) -> PDF(정확한 text box 위치 추출) -> image**


In [None]:
import os
import subprocess
import json
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER_TYPE
import pdfplumber

# --- Step 1: extract shape data from the PPTX ---
def extract_pptx_shapes(pptx_path):
    """Collect one record per positioned top-level shape in a presentation.

    Args:
        pptx_path: Path of the PPTX file to read.

    Returns:
        A tuple ``(shape_list, pptx_width, pptx_height)``. ``shape_list`` holds
        one dict per shape with the keys ``page`` (1-based slide number),
        ``box`` (``[left, top, right, bottom]`` in the slide's own length
        units), ``class``, ``value`` (the shape text, ``""`` without a text
        frame), ``pptx_width`` and ``pptx_height``. The two trailing floats
        are the slide width and height.

    Shapes whose position or size is undefined are skipped, because they
    cannot be placed on the slide.
    """
    prs = Presentation(pptx_path)
    pptx_width, pptx_height = float(prs.slide_width), float(prs.slide_height)

    # Placeholder type -> layout class
    PLACEHOLDER_CLASS_MAP = {
        PP_PLACEHOLDER_TYPE.TITLE: "title",
        PP_PLACEHOLDER_TYPE.SUBTITLE: "subtitle",
        PP_PLACEHOLDER_TYPE.SLIDE_NUMBER: "slide_number",
        PP_PLACEHOLDER_TYPE.TABLE: "table",
        PP_PLACEHOLDER_TYPE.HEADER: "header",
        # ...
    }  # TODO: more placeholder types still need to be added

    # Shape type -> layout class, used for shapes that are not placeholders
    MSD_SHAPE_TYPE_MAP = {
        MSO_SHAPE_TYPE.AUTO_SHAPE: "text",
        MSO_SHAPE_TYPE.GROUP: "text",
        MSO_SHAPE_TYPE.PICTURE: "figure",
        MSO_SHAPE_TYPE.CHART: "chart",
        MSO_SHAPE_TYPE.TABLE: "table",
        MSO_SHAPE_TYPE.TEXT_BOX: "text",
        # ...
    }  # TODO: more shape types still need to be added

    shape_list = []
    for page_num, slide in enumerate(prs.slides, start=1):
        for shape in slide.shapes:
            geometry = (shape.left, shape.top, shape.width, shape.height)
            # Geometry can be None when a shape carries no transform of its
            # own (e.g. a placeholder with nothing to inherit from);
            # float(None) would raise TypeError, so skip those shapes.
            if any(value is None for value in geometry):
                continue
            sx, sy, sw, sh = (float(value) for value in geometry)

            value = shape.text if shape.has_text_frame else ""

            # Placeholders are classified by placeholder type (unmapped ->
            # "content"), all other shapes by shape type (unmapped -> "text").
            if shape.is_placeholder:
                ph_type = shape.placeholder_format.type
                cls = PLACEHOLDER_CLASS_MAP.get(ph_type, "content")
            else:
                cls = MSD_SHAPE_TYPE_MAP.get(shape.shape_type, "text")

            shape_list.append({
                "page": page_num,
                "box": [sx, sy, sx + sw, sy + sh],
                "class": cls,
                "value": value,
                "pptx_width": pptx_width,
                "pptx_height": pptx_height,
            })
    return shape_list, pptx_width, pptx_height

# --- Step 2: convert the PPTX to PDF ---
def pptx_to_pdf(pptx_path, output_dir="./"):
    """Convert ``pptx_path`` to PDF with headless LibreOffice.

    Args:
        pptx_path: Path of the presentation to convert.
        output_dir: Directory passed to LibreOffice as ``--outdir``.

    Returns:
        ``<output_dir>/<input base name>.pdf``, the path where the converted
        file is expected.

    Raises:
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero
            status (``check=True``).
    """
    stem, _ = os.path.splitext(os.path.basename(pptx_path))
    subprocess.run(
        ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", output_dir, pptx_path],
        check=True,
    )
    return os.path.join(output_dir, f"{stem}.pdf")

# --- Step 3: extract text bounding boxes from the PDF ---
def extract_pdf_words(pdf_path):
    """Collect one record per text line that pdfplumber finds in a PDF.

    Despite the name, the records come from ``page.extract_text_lines()``,
    so each one describes a line of text rather than a single word.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A tuple ``(word_list, pdf_width, pdf_height)``. ``word_list`` holds one
        dict per line with the keys ``page`` (1-based page number), ``text``,
        ``box`` (``[x0, top, x1, bottom]``), ``pdf_width`` and ``pdf_height``.
        The two trailing floats are the size of the first page.

    Raises:
        IndexError: If the PDF has no pages.
    """
    word_list = []
    with pdfplumber.open(pdf_path) as pdf:
        first_page = pdf.pages[0]
        pdf_width, pdf_height = float(first_page.width), float(first_page.height)
        for page_num, page in enumerate(pdf.pages, start=1):
            # Store each page's own size: stamping every record with the
            # first page's size would mis-normalise boxes on pages of a
            # different size.
            page_width, page_height = float(page.width), float(page.height)
            for line in page.extract_text_lines():
                word_list.append({
                    "page": page_num,
                    "text": line["text"],
                    "box": [float(line["x0"]), float(line["top"]), float(line["x1"]), float(line["bottom"])],
                    "pdf_width": page_width,
                    "pdf_height": page_height,
                })
    return word_list, pdf_width, pdf_height

# --- Step 4: coordinate normalisation ---
def normalize_box(box, w, h):
    """Scale an absolute ``[x0, y0, x1, y1]`` box to fractions of ``w`` x ``h``.

    Horizontal coordinates are divided by ``w`` and vertical ones by ``h``.
    """
    left, top, right, bottom = box
    divisors = (w, h, w, h)
    return [coord / divisor for coord, divisor in zip((left, top, right, bottom), divisors)]

# --- Step 5: spatial matching between shapes and PDF text ---
def box_in_box(inner, outer, threshold=0.85):
    """Tell whether ``inner`` lies (mostly) inside ``outer``.

    Both boxes are ``[x0, y0, x1, y1]``. The result is True when the share of
    ``inner``'s area that overlaps ``outer`` is strictly greater than
    ``threshold``. A zero-area ``inner`` never matches.
    """
    ix0, iy0, ix1, iy1 = inner
    ox0, oy0, ox1, oy1 = outer

    inner_area = (ix1 - ix0) * (iy1 - iy0)
    if inner_area == 0:
        return False

    # Overlap extent on each axis, clamped to 0 when the boxes do not meet.
    overlap_w = max(0, min(ix1, ox1) - max(ix0, ox0))
    overlap_h = max(0, min(iy1, oy1) - max(iy0, oy0))
    covered = overlap_w * overlap_h
    return covered / inner_area > threshold

def match_shapes_and_words(shape_list, word_list):
    """Assign each PDF text record to the first shape that contains it.

    Both lists are modified in place: every record gains a ``norm_box`` key
    holding its box normalised by the record's own page/slide size.

    Args:
        shape_list: Shape records with ``page``, ``box``, ``pptx_width`` and
            ``pptx_height`` keys.
        word_list: Text records with ``page``, ``box``, ``pdf_width`` and
            ``pdf_height`` keys.

    Returns:
        A dict mapping every index of ``shape_list`` to the list of
        ``word_list`` indices matched to that shape (possibly empty). A text
        record is matched to at most one shape; records contained in no shape
        on their page are left out.
    """
    # Normalise both coordinate systems to fractions of the page/slide size.
    # (The debug prints that used to follow these loops raised
    # UnboundLocalError whenever either list was empty.)
    for shape in shape_list:
        shape["norm_box"] = normalize_box(shape["box"], shape["pptx_width"], shape["pptx_height"])
    for word in word_list:
        word["norm_box"] = normalize_box(word["box"], word["pdf_width"], word["pdf_height"])

    # Matching
    shape2words = {si: [] for si in range(len(shape_list))}
    for wi, word in enumerate(word_list):
        for si, shape in enumerate(shape_list):
            if word["page"] != shape["page"]:
                continue
            if box_in_box(word["norm_box"], shape["norm_box"]):
                shape2words[si].append(wi)
                break  # the first containing shape wins
    return shape2words

# --- Step 6: run the whole pipeline ---
def pptx_shape_word_matching_pipeline(pptx_path, tmp_dir="./"):
    """Match the text found in a PPTX's PDF rendering back to its shapes.

    Args:
        pptx_path: Path of the presentation to process.
        tmp_dir: Directory in which the intermediate PDF is written.

    Returns:
        A list with one dict per extracted shape, holding ``page``,
        ``shape_class``, ``shape_value``, ``shape_box`` and ``words`` (the
        texts of the PDF records matched to that shape).
    """
    # 1. Shape records from the PPTX (the slide size is stored on each record)
    shape_list, _, _ = extract_pptx_shapes(pptx_path)
    # 2. PPTX -> PDF
    pdf_path = pptx_to_pdf(pptx_path, output_dir=tmp_dir)
    # 3. Text records with bounding boxes from the PDF
    word_list, _, _ = extract_pdf_words(pdf_path)
    # 4. Shape <-> text matching
    shape2words = match_shapes_and_words(shape_list, word_list)
    # 5. Assemble the result records
    return [
        {
            "page": shape_list[si]["page"],
            "shape_class": shape_list[si]["class"],
            "shape_value": shape_list[si]["value"],
            "shape_box": shape_list[si]["box"],
            "words": [word_list[wi]["text"] for wi in word_indices],
        }
        for si, word_indices in shape2words.items()
    ]

if __name__ == "__main__":
    # Input presentation (change the file name as needed) and scratch directory
    pptx_path = "HA_0032_0011165.pptx"
    tmp_dir = "./tmp"
    os.makedirs(tmp_dir, exist_ok=True)

    results = pptx_shape_word_matching_pipeline(pptx_path, tmp_dir)

    # Save the full result as JSON next to the input file
    output_json = pptx_path.replace(".pptx", "_matched_layout.json")
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Show the first two records as a sample
    print("완료. 결과 예시:")
    preview = results[:2]
    print(json.dumps(preview, ensure_ascii=False, indent=2))


In [None]:
# Debug cell: print the first page's size, then the raw text lines that
# pdfplumber extracts from every page. (The record-building variant of this
# loop lives in extract_pdf_words.)
with pdfplumber.open("HA_0032_0011165.pdf") as pdf:
    first_page = pdf.pages[0]
    pdf_width, pdf_height = first_page.width, first_page.height
    print(pdf_width, pdf_height)
    for page in pdf.pages:
        print(page.extract_text_lines())