In [1]:
from pptx import Presentation

In [19]:
# Path of the PowerPoint file to analyse (relative to the working directory).
pptx_path = "test.pptx"
# Load the presentation once; the cells below iterate over prs.slides.
prs = Presentation(pptx_path)

In [24]:
def extract_node_boxes(slide, slide_num):
    """
    Collect the non-empty text shapes of one slide, split off the respondent
    name, and return the remaining boxes ordered by position.

    Group shapes are walked recursively, so text nested inside a group is
    collected too. The respondent name is the text of the box with the
    smallest (top, left) position; that box is removed from the returned list.

    Args:
        slide: Slide object from python-pptx (anything exposing ``.shapes``).
        slide_num (int): Slide number, stored as the first field of every box.

    Returns:
        tuple: ``(node_boxes, respondent, node_count)`` where
            - node_boxes (list): ``(slide_num, top, left, right, bottom, text)``
              tuples sorted by (slide_num, top, left), respondent box excluded.
            - respondent (str | None): Text of the topmost-leftmost box, or
              None when the slide has no non-empty text shape.
            - node_count (int): Number of boxes left in ``node_boxes``.
    """
    # Integer value of MSO_SHAPE_TYPE.GROUP in python-pptx.
    group_shape_type = 6

    node_boxes = []
    respondent = None

    def collect_text(shape):
        """Append the shape's text box (if any), then descend into group members."""
        if shape.has_text_frame:
            text = shape.text.strip()
            if text:
                top, left = shape.top, shape.left
                right, bottom = left + shape.width, top + shape.height
                node_boxes.append((slide_num, top, left, right, bottom, text))

        # NOTE(review): positions of shapes inside a group are used as reported
        # by the child shape — confirm they are expressed in slide coordinates.
        if shape.shape_type == group_shape_type:
            for sub_shape in shape.shapes:
                collect_text(sub_shape)

    for shape in slide.shapes:
        collect_text(shape)

    # The respondent name is the topmost box; ties are broken by the leftmost.
    if node_boxes:
        name_box = min(node_boxes, key=lambda box: (box[1], box[2]))
        respondent = name_box[5]
        node_boxes.remove(name_box)  # keep the name out of the value nodes

    # Count the value nodes after the respondent box has been removed.
    node_count = len(node_boxes)

    # Order by (slide number, vertical position, horizontal position).
    node_boxes.sort(key=lambda box: (box[0], box[1], box[2]))

    return node_boxes, respondent, node_count

In [25]:
# Extract the respondent and the value boxes from every slide
all_nodes = []  # Value boxes of all slides, in slide order
resps = {}  # Slide number -> respondent (top-leftmost text of that slide)

for slide_num, slide in enumerate(prs.slides, start=1):
    # Split the slide into its value boxes and its respondent name
    node_boxes, respondent, node_count = extract_node_boxes(slide, slide_num)

    # Keep the boxes and the respondent for further processing
    all_nodes.extend(node_boxes)
    resps[slide_num] = respondent

    # Summary line for the slide
    print(f"Respondent: {respondent} has {node_count} values")

    # List the boxes of this slide; a separate name is used for the slide
    # number stored in each box so the outer loop variable is not rebound.
    for box_slide_num, _, _, _, _, text in node_boxes:
        print(f"Slide {box_slide_num}: {text}")

Respondent: Bleecker Alexander has 8 values
Slide 1: Serenity
Slide 1: General LoveFriend + Romantic
Slide 1: Humor
Slide 1: Intellectualism
Slide 1: Make a Difference
Slide 1: Accomplishment
Slide 1: Organization
Slide 1: Discipline
Respondent: Bleecker Alexander has 8 values
Slide 2: Serenity
Slide 2: General LoveFriend + Romantic
Slide 2: Humor
Slide 2: Intellectualism
Slide 2: Make a Difference
Slide 2: Accomplishment
Slide 2: Organization
Slide 2: Discipline
Respondent: Laura Tejeda has 0 values
Respondent: None has 0 values


In [3]:
def extract_textbox_positions(slide, slide_num):
    """
    Extract the position (top, left, right, bottom) of each non-empty text box on one slide.

    Only the shapes directly in ``slide.shapes`` are inspected; members of
    group shapes are not visited. Boxes are returned in the order the shapes
    are iterated, not sorted by position.

    Args:
        slide: Slide object from python-pptx (anything exposing ``.shapes``).
        slide_num (int): Slide number, stored as the first field of every tuple.

    Returns:
        list: A list of tuples ``(slide_num, top, left, right, bottom, text)``:
            - slide_num (int): The slide number passed in.
            - top (int): The top Y-coordinate of the text box.
            - left (int): The left X-coordinate of the text box.
            - right (int): The right X-coordinate (left + width).
            - bottom (int): The bottom Y-coordinate (top + height).
            - text (str): The stripped text content of the text box.
    """
    text_boxes = []

    for shape in slide.shapes:
        # Shapes without a text frame carry no text to report
        if not shape.has_text_frame:
            continue

        text = shape.text.strip()
        # Skip boxes that are empty or whitespace-only
        if not text:
            continue

        top, left = shape.top, shape.left
        text_boxes.append(
            (slide_num, top, left, left + shape.width, top + shape.height, text)
        )

    return text_boxes

In [9]:
# Gather the text boxes of every slide into one flat list
all_text_boxes = []

for slide_num, slide in enumerate(prs.slides, start=1):
    text_boxes = extract_textbox_positions(slide, slide_num)
    all_text_boxes += text_boxes

# Report every box together with its coordinates
for slide_num, top, left, right, bottom, text in all_text_boxes:
    print(f"Slide {slide_num}: {text} - top={top}, left={left}, right={right}, bottom={bottom}")

Slide 1: Serenity - top=295321, left=1123044, right=2009063, bottom=427306
Slide 1: Make a Difference - top=925145, left=1702184, right=2559754, bottom=1057130
Slide 1: Humor - top=611780, left=582506, right=1440076, bottom=743765
Slide 1: Discipline - top=1484708, left=1142346, right=1999916, bottom=1616693
Slide 1: General LoveFriend + Romantic - top=594001, left=1702183, right=2559753, bottom=778381
Slide 1: Bleecker Alexander - top=89047, left=26670, right=1007475, bottom=282936
Slide 1: Intellectualism - top=925142, left=582508, right=1440078, bottom=1057127
Slide 1: Accomplishment - top=1185064, left=582508, right=1440078, bottom=1317049
Slide 1: Organization - top=1185064, left=1702184, right=2559754, bottom=1317049
Slide 2: Serenity - top=295321, left=1123044, right=2009063, bottom=427306
Slide 2: Make a Difference - top=925145, left=1702184, right=2559754, bottom=1057130
Slide 2: Humor - top=611780, left=582506, right=1440076, bottom=743765
Slide 2: Discipline - top=1484708, 

# Plot the extracted boxes and lines to check the extraction

In [None]:
import matplotlib.pyplot as plt
from matplotlib import patches

In [None]:
from LinesShape import extract_lines_positions

In [None]:
def plot_boxes_and_lines_from_pptx(prs):
    """
    Plot the extracted text boxes and lines of every slide to check the extraction visually.

    One figure is drawn per slide that has at least one box or line. The
    y-axis is inverted so that, as on the slide, smaller ``top`` values are
    drawn higher up and the axis ticks stay in slide coordinates.

    Args:
        prs: The PowerPoint presentation object.
    """
    for slide_num, slide in enumerate(prs.slides, start=1):
        # extract_node_boxes returns (node_boxes, respondent, node_count);
        # the count is not needed for plotting.
        node_boxes, slide_name, _ = extract_node_boxes(slide, slide_num)
        # NOTE(review): extract_lines_positions comes from LinesShape; it is
        # used here as a mapping line_id -> (x1, y1, x2, y2) — confirm.
        lines = extract_lines_positions(slide)

        if not node_boxes and not lines:
            continue  # nothing to draw on this slide

        # Largest Y and X value over all boxes and line end points
        max_y = max([bottom for _, _, _, _, bottom, _ in node_boxes] +
                    [max(y1, y2) for _, y1, _, y2 in lines.values()],
                    default=1000)
        max_x = max([right for _, _, _, right, _, _ in node_boxes] +
                    [max(x1, x2) for x1, _, x2, _ in lines.values()],
                    default=1000)

        fig, ax = plt.subplots(figsize=(10, 6))

        # NOTE(review): box coordinates printed earlier in this notebook are
        # in the 1e5-1e6 range, so the +100 margin and the +5/+15 text offsets
        # below are visually negligible — consider scaling them.
        ax.set_xlim(0, max_x + 100)
        # Limits are passed in reverse order to flip the axis: the origin is
        # the top-left corner, like on the slide.
        ax.set_ylim(max_y + 100, 0)
        ax.set_title(f"Slide {slide_num}: {slide_name}" if slide_name else f"Slide {slide_num}")
        ax.set_xlabel("X Coordinate")
        ax.set_ylabel("Y Coordinate")

        # Draw each text box as a red outline with its text inside
        for _, top, left, right, bottom, text in node_boxes:
            rect = patches.Rectangle((left, top), right - left, bottom - top,
                                     edgecolor='red', facecolor='none', linewidth=1.5)
            ax.add_patch(rect)
            ax.text(left + 5, top + 15, text, fontsize=8, verticalalignment='top',
                    bbox=dict(facecolor='white', alpha=0.5))

        # Draw each line between its two end points
        for line_id, (x1, y1, x2, y2) in lines.items():
            ax.plot([x1, x2], [y1, y2], marker="o", label=f"Line {line_id}", linestyle="-", color="blue")

        ax.grid(True)
        if lines:
            # A legend is only meaningful when labelled lines were drawn
            ax.legend(loc="best", fontsize="small", markerscale=0.5)
        plt.show()
        plt.close(fig)  # release the figure before moving to the next slide

# Run the function to plot the extracted text boxes and lines
plot_boxes_and_lines_from_pptx(prs)