In [1]:
import fitz

In [None]:
import ImageProcessor

In [115]:
from PIL import Image
import io
import numpy as np

# PDF Extraction Module

In [122]:
class PDFTextExtractor:
    """
    Extract text blocks, respondent names and image-derived relationships
    from a PDF file.

    The document is opened once in the constructor and stays open for the
    lifetime of the instance. Call ``close()`` when done, or use the instance
    as a context manager, to release the underlying document::

        with PDFTextExtractor("file.pdf") as extractor:
            names = extractor.all_respondents()
    """

    # Text of a first block for which ``extract_respondent`` reports that the
    # page has no respondent.
    NON_RESPONDENT_HEADING = "Our Cluster's Values"

    def __init__(self, pdf_path):
        """
        Initializes the PDFTextExtractor with the path to the PDF file.

        Args:
            pdf_path (str): The path to the PDF file.
        """
        self.pdf_path = pdf_path
        self.pdf_doc = fitz.open(pdf_path)
        self.pages = self.pdf_doc.page_count

    def close(self):
        """
        Closes the underlying PDF document.

        Safe to call more than once. After closing, the extraction methods
        report an error and return None.
        """
        doc = getattr(self, 'pdf_doc', None)
        if doc is not None:
            doc.close()
            # Drop the reference so a second close() is a no-op instead of
            # calling close() on an already closed document.
            self.pdf_doc = None

    def __enter__(self):
        """Returns the extractor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes the document on leaving a ``with`` block; never suppresses exceptions."""
        self.close()

    def extract_text(self, page_number):
        """
        Extracts text and bounding boxes from a specific page of the PDF.

        Args:
            page_number (int): The zero-based index of the page to extract text from.

        Returns:
            list: A list of dictionaries, one per text block, each with the keys
                  'text' and 'bbox'. 'text' is the block text with every run of
                  whitespace (including newlines) collapsed to a single space
                  and leading/trailing whitespace removed. 'bbox' is a tuple
                  (x0, y0, x1, y1): the top-left and bottom-right corners of the
                  block. Returns None if an error occurs (the error is printed).
        """
        try:
            page = self.pdf_doc.load_page(page_number)
            # Each block is a tuple: (x0, y0, x1, y1, text, block_no, block_type)
            blocks = page.get_text('blocks')
            return [
                {
                    # split() + join collapses runs of any length; a single
                    # replace('  ', ' ') pass would leave double spaces behind
                    # whenever three or more whitespace characters were adjacent.
                    'text': ' '.join(block[4].split()),
                    'bbox': block[:4],
                }
                for block in blocks
            ]
        except Exception as e:
            print(f"Error extracting text from page {page_number}: {e}")
            return None

    def extract_respondent(self, page_number):
        """
        Extracts the respondent from a specific page of the PDF.

        The respondent's name is assumed to be the text of the first block on
        the page.

        Args:
            page_number (int): The zero-based index of the page to extract text from.

        Returns:
            str: The text of the first block. Returns None if text extraction
                 failed, the page has no text blocks, the first block is empty,
                 or the first block equals ``NON_RESPONDENT_HEADING``.
        """
        page_blocks = self.extract_text(page_number)
        if not page_blocks:
            # None: extraction failed and was already reported by extract_text.
            # []: the page simply has no text, which is not an error.
            return None

        first_text = page_blocks[0]['text']
        if first_text == self.NON_RESPONDENT_HEADING:
            return None
        return first_text or None

    def all_respondents(self):
        """
        Extracts all respondents from all pages of the PDF.

        Returns:
            list: The respondents found, in page order. Pages for which
                  ``extract_respondent`` returns None are skipped.
        """
        candidates = (
            self.extract_respondent(page_number)
            for page_number in range(self.pages)
        )
        return [name for name in candidates if name]

    def relationships_from_image(self, page_number):
        """
        Extracts relationships from the image on a specific page of the PDF.

        Only the first image listed for the page is processed.

        Args:
            page_number (int): The zero-based index of the page to extract relationships from.

        Returns:
            The result of ``ImageProcessor.match_relationships()`` for the
            page's first image. Returns None if the page has no images or an
            error occurs (the error is printed).
        """
        try:
            page = self.pdf_doc.load_page(page_number)
            image_list = page.get_images(full=True)
            if not image_list:
                return None

            # The first item of an image entry is its xref, which
            # extract_image needs to fetch the raw image bytes.
            xref = image_list[0][0]
            base_image = self.pdf_doc.extract_image(xref)
            image = Image.open(io.BytesIO(base_image["image"]))

            # Process the image to extract relationships
            image_processor = ImageProcessor.ImageProcessor(image)
            return image_processor.match_relationships()
        except Exception as e:
            print(f"Error extracting relationships from page {page_number}: {e}")
            return None

# Example Usage

In [None]:
if __name__ == "__main__":
    # Demo: walk every page of the sample PDF and, for each page that yields
    # a respondent, print the respondent and the relationships read from the
    # page's image.
    pdf_file_path = "pdftest.pdf"
    extractor = PDFTextExtractor(pdf_file_path)
    num_pages = extractor.pages

    for page_number in range(num_pages):
        respondent = extractor.extract_respondent(page_number)
        if not respondent:
            # No respondent on this page: nothing to report.
            continue

        # Pages are reported one-based for readability.
        print(f"----- Page {page_number + 1} -----")
        print(f"Respondent: {respondent}")

        relations = extractor.relationships_from_image(page_number)
        print(f"Relationships: {relations}")

----- Page 1 -----
Respondent: Chike Agba
[2025/04/25 00:27:24] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/jiazhengtian/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/jiazhengtian/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_i

In [None]:
# Display the relationships from the most recent page that had a respondent
# (the value left in `relations` by the example loop above).
relations

[('Critical Thinking', 'Discipline'),
 ('Discipline', 'Communication'),
 ('Discipline', 'Humor'),
 ('Communication', 'Humor'),
 ('Humor', 'Teamwork'),
 ('Teamwork', 'Communication'),
 ('Teamwork', 'Perseverance'),
 ('Perseverance', 'Quality'),
 ('Quality', 'Fulfillment')]

In [131]:
# Display the value left in `respondent` by the example loop's final
# iteration (None when the last page yielded no respondent).
respondent