In [None]:
import cv2
import numpy as np
from paddleocr import PaddleOCR
from io import BytesIO
from PIL import Image

class ImageProcessor:
    """
    A class to extract text boxes and relationships from images inside a PPT slide.
    """

    def __init__(self, image):
        """
        Initializes the text box processor.
        
        :param image_path: Path to the image to process.
        """
        self.image = image

        # Convert to grayscale
        self.gray = cv2.cvtColor(self.image, cv2.COLOR_RGB2GRAY)
        
        # Initialize PaddleOCR
        self.ocr = PaddleOCR(use_angle_cls=True, lang="en")

        # Extract initial text boxes
        self.boxes, self.texts, self.scores = self._extract_text_boxes()

        # Mask text regions
        self.text_boxes = self.find_boxes()
        self.filtered_boxes = self.filter_boxes(self.text_boxes)

        # Extract text boxes and relationships
        self.text_boxes_dict = self.get_combined_text_dict()
        self.box_centers = self._calculate_box_centers()

        self.lines = self.find_lines()

        self.relationships = self.match_relationships()
    
    def _extract_text_boxes(self):
            """
            Extract text boxes, recognized texts, and confidence scores using PaddleOCR.

            :return: Tuple of (boxes, texts, scores).
            """
            result = self.ocr.ocr(self.image, cls=True)
            
            # Extract bounding boxes, recognized text, and confidence scores
            boxes = [line[0] for line in result[0]]  # Detected text regions
            texts = [line[1][0] for line in result[0]]  # Recognized texts
            scores = [line[1][1] for line in result[0]]  # Confidence scores

            return boxes, texts, scores

    def _boxes_distance(self, box1, box2):
        """
        Compute the Euclidean distance between the centers of two text boxes.

        :param box1: Coordinates of the first text box.
        :param box2: Coordinates of the second text box.
        :return: Euclidean distance between the two box centers.
        """
        center1 = np.mean(box1, axis=0)  # Center of first box
        center2 = np.mean(box2, axis=0)  # Center of second box

        return np.linalg.norm(center1 - center2)  # Euclidean distance

    # Get Text Dictionary
    def get_combined_text_dict(self):
        """
        Merges nearby text boxes and returns a dictionary mapping coordinates to merged text.

        :return: Dictionary {tuple(box_coordinates): merged_text}
        """
        distance_threshold = 50  # Maximum distance between two text boxes to be merged
        combined_dict = {}  # Store box coordinates and text mapping
        used = [False] * len(self.boxes)

        for i in range(len(self.boxes)):
            if used[i]:
                continue  # Skip already merged boxes

            # Initialize current box and text
            current_box = np.array(self.boxes[i], dtype=np.float32)
            current_text = [self.texts[i]]  # Store text as a list to append later
            used[i] = True  # Mark as merged

            for j in range(i + 1, len(self.boxes)):
                if used[j]:
                    continue

                # Compute the distance between boxes
                distance = self._boxes_distance(current_box, self.boxes[j])

                # Merge if within threshold
                if distance < distance_threshold:
                    # Combine box coordinates
                    combined_points = np.vstack((current_box, self.boxes[j]))
                    combined_points = np.array(combined_points, dtype=np.float32)

                    # Ensure valid bounding box
                    if len(combined_points) >= 2:
                        try:
                            rect = cv2.minAreaRect(combined_points)
                            current_box = cv2.boxPoints(rect)  # Get the updated box

                            # Merge text
                            current_text.append(self.texts[j])  # Append the text
                            used[j] = True
                        except Exception as e:
                            print(f"Error in cv2.minAreaRect: {e}")
                            continue

            # Convert NumPy array to a tuple (so it can be used as a dictionary key)
            box_tuple = tuple(map(tuple, current_box))  # Convert to ((x1, y1), (x2, y2), ...)
            combined_dict[box_tuple] = " ".join(current_text)  # Store as key-value pair

        return combined_dict
    
    # Get All Possible Text Boxes
    def find_boxes(self, min_area=1000):
        """
        Detects bounding boxes of large dark regions in the grayscale image.

        The image is inverse-thresholded at 128, contours are extracted from
        the result, and the axis-aligned bounding rectangle of every contour
        is kept when its area exceeds ``min_area``.

        :param min_area: Bounding rectangles whose ``w * h`` is not greater
                         than this are discarded. Defaults to 1000.
        :return: A list of bounding boxes in the format [(x1, y1, x2, y2), ...]
        """
        # Inverse binary threshold
        _, binary = cv2.threshold(self.gray, 128, 255, cv2.THRESH_BINARY_INV)

        # NOTE: a morphological open/close used to be computed *after* this
        # call and its result was never used, so it had no effect on the
        # output and has been dropped. If noise filtering is wanted, apply it
        # to `binary` before extracting contours.
        contours, _ = cv2.findContours(binary, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

        text_boxes = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            if w * h > min_area:  # Filter out small regions
                text_boxes.append((x, y, x + w, y + h))

        return text_boxes

    def iou(self, box1, box2):
        """
        Computes the Intersection over Union (IoU) between two bounding boxes.

        :param box1: First bounding box (x1, y1, x2, y2)
        :param box2: Second bounding box (x1, y1, x2, y2)
        :return: IoU score (value between 0 and 1)
        """
        x1, y1, x2, y2 = box1
        x1_, y1_, x2_, y2_ = box2

        inter_x1 = max(x1, x1_)
        inter_y1 = max(y1, y1_)
        inter_x2 = min(x2, x2_)
        inter_y2 = min(y2, y2_)

        inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
        box1_area = (x2 - x1) * (y2 - y1)
        box2_area = (x2_ - x1_) * (y2_ - y1_)

        return inter_area / float(box1_area + box2_area - inter_area)

    def filter_boxes(self, text_boxes):
        """
        Filters out overlapping bounding boxes.

        :param text_boxes: List of bounding boxes.
        :return: Filtered list of bounding boxes.
        """
        filtered_boxes = []
        for i, box1 in enumerate(text_boxes):
            keep = True
            for j, box2 in enumerate(text_boxes):
                if i != j and self.iou(box1, box2) > 0.4:  # Overlapping threshold
                    if (box1[2] - box1[0]) * (box1[3] - box1[1]) > (box2[2] - box2[0]) * (box2[3] - box2[1]):
                        keep = False  # Remove the larger box if overlap is high
            if keep:
                filtered_boxes.append(box1)

        return [box for box in filtered_boxes if not (box[0] == 0 and box[1] == 0)]

    # Filter out Detected Boxes to Get Lines
    def apply_mask(self):
        """
        Blanks out every filtered box region of the grayscale image.

        :return: Tuple (masked_gray, mask_edges). ``masked_gray`` is the
                 grayscale image with the filtered box regions zeroed;
                 ``mask_edges`` is the Canny edge map (thresholds 100 and 200)
                 of the mask itself.
        """
        keep_mask = np.full_like(self.gray, 255)
        for left, top, right, bottom in self.filtered_boxes:
            # Zero means "drop these pixels" in the bitwise_and below
            keep_mask[top:bottom, left:right] = 0

        masked_gray = cv2.bitwise_and(self.gray, self.gray, mask=keep_mask)
        mask_edges = cv2.Canny(keep_mask, 100, 200)
        return masked_gray, mask_edges

    @staticmethod
    def is_point_near_edge(x, y, mask_edges, neighborhood_size=5):
        """
        Checks if a point (x, y) is near an edge in the masked image.

        :param x: X-coordinate of the point.
        :param y: Y-coordinate of the point.
        :param mask_edges: Edge-detected version of the masked image.
        :param neighborhood_size: Search window size.
        :return: Boolean indicating whether the point is near an edge.
        """
        x_min = max(0, x - neighborhood_size // 2)
        x_max = min(mask_edges.shape[1] - 1, x + neighborhood_size // 2)
        y_min = max(0, y - neighborhood_size // 2)
        y_max = min(mask_edges.shape[0] - 1, y + neighborhood_size // 2)

        return np.any(mask_edges[y_min:y_max + 1, x_min:x_max + 1] == 255)

    @staticmethod
    def are_lines_similar(line1, line2, length_threshold=10, midpoint_threshold=10, angle_threshold=10):
        """
        Determines whether two lines are similar based on length, midpoint, and angle.

        :param line1: First line segment (x1, y1, x2, y2)
        :param line2: Second line segment (x1, y1, x2, y2)
        :return: Boolean indicating similarity.
        """
        def normalize_line(line):
            x0, y0, x1, y1 = line
            return (x0, y0, x1, y1) if x0 < x1 else (x1, y1, x0, y0)

        line1, line2 = normalize_line(line1), normalize_line(line2)

        def line_length(line):
            return np.linalg.norm([line[2] - line[0], line[3] - line[1]])

        def line_midpoint(line):
            return ((line[0] + line[2]) / 2, (line[1] + line[3]) / 2)

        def line_angle(line):
            dx, dy = line[2] - line[0], line[3] - line[1]
            return np.degrees(np.arctan2(dy, dx))

        length_diff = abs(line_length(line1) - line_length(line2))
        midpoint_dist = np.linalg.norm(np.array(line_midpoint(line1)) - np.array(line_midpoint(line2)))
        angle_diff = abs(line_angle(line1) - line_angle(line2))
        angle_diff = min(angle_diff, 360 - angle_diff)  # Handle circular angle differences

        return (length_diff < length_threshold and midpoint_dist < midpoint_threshold and angle_diff < angle_threshold)
    
    def find_lines(self, min_length=10):
        """
        Detects unique line segments in the masked image, filtering out
        redundant lines.

        A segment is skipped when both of its endpoints lie near an edge of
        the box mask, when it is not longer than ``min_length``, or when it is
        similar to a segment that was already kept.

        :param min_length: Segments must be strictly longer than this to be
                           kept. Defaults to 10.
        :return: List of detected lines in the format [(x1, y1, x2, y2), ...];
                 empty when the detector finds no segments.
        """
        masked_gray, mask_edges = self.apply_mask()

        # Detect lines using Line Segment Detector
        lsd = cv2.createLineSegmentDetector(0)
        detection = lsd.detect(masked_gray)

        # detect() returns a tuple whose first element holds the segments; that
        # element (not the tuple) is what is None when nothing was detected.
        segments = detection[0] if detection is not None else None
        if segments is None:
            return []

        unique_lines = []
        for segment in segments:
            x0, y0, x1, y1 = map(int, segment[0])

            if self.is_point_near_edge(x0, y0, mask_edges) and self.is_point_near_edge(x1, y1, mask_edges):
                continue  # Both endpoints sit on the mask boundary: ignore

            if np.linalg.norm([x1 - x0, y1 - y0]) > min_length:
                current_line = (x0, y0, x1, y1)

                if not any(self.are_lines_similar(current_line, line) for line in unique_lines):
                    unique_lines.append(current_line)

        return unique_lines

    def _calculate_box_centers(self):
            """
            Computes the center points for all text boxes.

            :return: Dictionary {(cx, cy): text}
                    - Key: Center coordinates (cx, cy)
                    - Value: Corresponding text inside the bounding box
            """
            centers_dict = {}
            for box, text in self.text_boxes_dict.items():
                # Compute the center of the bounding box
                cx = np.mean([p[0] for p in box])  # Average x-coordinates
                cy = np.mean([p[1] for p in box])  # Average y-coordinates
                centers_dict[(cx, cy)] = text  # Store as dictionary {center: text}
            return centers_dict

    def _find_nearest_textbox(self, x, y):
        """
        Finds the nearest text box center for a given point (x, y).

        :param x: X-coordinate of the point
        :param y: Y-coordinate of the point
        :return: The text of the nearest box and its center (cx, cy),
                 or None if no match is found
        """
        min_distance = float("inf")
        nearest_text = None
        nearest_center = None  # Store the nearest box center

        for (cx, cy), text in self.box_centers.items():
            distance = np.sqrt((cx - x) ** 2 + (cy - y) ** 2)
            if distance < min_distance:
                min_distance = distance
                nearest_text = text
                nearest_center = (cx, cy)  # Save the nearest center

        return nearest_text, nearest_center

    def match_relationships(self):
        """
        Matches text boxes based on line segment connections (from-bottom-to-top)
        and removes duplicates while ensuring correct ordering.

        :return: List of unique (from_text, to_text) relationships sorted bottom-to-top, left-to-right.
        """
        relationships = set()  # Use a set to remove duplicates

        for line in self.lines:
            x0, y0, x1, y1 = line  # Extract line segment endpoints

            # Find the nearest text box for both endpoints
            text_from, center_from = self._find_nearest_textbox(x0, y0)
            text_to, center_to = self._find_nearest_textbox(x1, y1)

            # Ensure valid matches and maintain "bottom-to-top" relationship
            if text_from and text_to and (text_from != text_to):
                if y0 > y1:  # Ensure "from" is at the bottom
                    relationships.add((text_from, text_to, center_from, center_to))
                else:  # Swap if necessary
                    relationships.add((text_to, text_from, center_to, center_from))

        # **Sorting rules**
        # 1. Sort primarily by `from_center[1]` in descending order (bottom-to-top)
        # 2. If `y` coordinates are the same, sort by `from_center[0]` in ascending order (left-to-right)
        sorted_relationships = sorted(
            relationships, key=lambda item: (-item[2][1], item[2][0])
        )

        # Return the final relationship list
        return [(from_text, to_text) for from_text, to_text, _, _ in sorted_relationships]

In [51]:
image_path = "image1.png"  # Can be swapped for an image extracted from a PPT slide
image = cv2.imread(image_path)
if image is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f"Could not read image: {image_path}")

In [52]:
# Build the processor; __init__ runs OCR, box detection, line detection and
# relationship matching, storing each result as an attribute.
processor = ImageProcessor(image)

[2025/03/19 17:29:36] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/jiazhengtian/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/jiazhengtian/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320',

In [53]:
# Show the (from_text, to_text) pairs; __init__ already stored the result of
# this same call in processor.relationships.
processor.match_relationships()

[('Strength', 'Discovery'),
 ('Strength', 'Bravery'),
 ('Strength', 'Ecstasy'),
 ('Strength', 'Calm'),
 ('Calm', 'Connectedto the universe'),
 ('Calm', 'Discovery'),
 ('Ecstasy', 'Connectedto the universe'),
 ('Bravery', 'Growth'),
 ('Discovery', 'Growth'),
 ('Connectedto the universe', 'Zen'),
 ('Growth', 'Zen')]

In [54]:
# Run line detection again on its own and show how many segments were kept.
lines = processor.find_lines()
len(lines)

13

In [44]:
# Show the mapping from merged OCR box corner coordinates to their text.
processor.get_combined_text_dict()

{((274.0, 24.0), (304.0, 24.0), (304.0, 39.0), (274.0, 39.0)): 'Zen',
 ((169.0, 126.0), (226.0, 126.0), (226.0, 144.0), (169.0, 144.0)): 'Growth',
 ((341.99997, 159.0),
  (341.99997, 119.0),
  (466.99997, 119.0),
  (466.99997, 159.0)): 'Connectedto the universe',
 ((30.0, 201.0), (104.0, 205.0), (103.0, 222.0), (29.0, 217.0)): 'Discovery',
 ((185.0, 213.0), (244.0, 216.0), (243.0, 234.0), (184.0, 230.0)): 'Bravery',
 ((304.0, 230.0), (359.0, 234.0), (358.0, 251.0), (302.0, 248.0)): 'Ecstasy',
 ((492.0, 243.0), (530.0, 243.0), (530.0, 258.0), (492.0, 258.0)): 'Calm',
 ((253.0, 339.0), (319.0, 339.0), (319.0, 356.0), (253.0, 356.0)): 'Strength'}