In [None]:
import fitz  # PyMuPDF

def _looks_two_column(blocks, page_width, ratio):
    """Return True when the block distribution suggests a two-column page.

    Each block is classified by the horizontal centre of its bounding box
    (the first four items of a block are x0, y0, x1, y1). The page is treated
    as two-column when the number of right-half blocks exceeds ``ratio``
    times the number of left-half blocks.
    """
    midline = page_width / 2
    right_count = sum(
        1 for block in blocks if (block[0] + block[2]) / 2 >= midline
    )
    left_count = len(blocks) - right_count
    # With no blocks at all this is ``0 > 0`` -> False (single-column path).
    return right_count > left_count * ratio


def extract_text_from_pdf(pdf_path, *, two_column_ratio=0.3):
    """Extract plain text from every page of a PDF.

    For each page the layout is guessed from the positions of its text
    blocks. If the page looks two-column, the left half is read first and
    the right half second; otherwise the page is read as a whole. In both
    cases newlines are replaced with spaces.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.
        two_column_ratio: A page is treated as two-column when the count of
            blocks centred in the right half is greater than this fraction
            of the count centred in the left half. Defaults to 0.3.

    Returns:
        A list with one string per page, in page order.
    """
    pages_text = []

    # The context manager closes the document even if a page fails to parse;
    # previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)

            page_width = page.rect.width
            page_height = page.rect.height

            blocks = page.get_text("blocks")

            if _looks_two_column(blocks, page_width, two_column_ratio):
                # Clip to each half in turn so the left column's text comes
                # entirely before the right column's.
                left_rect = fitz.Rect(0, 0, page_width / 2, page_height)
                right_rect = fitz.Rect(page_width / 2, 0, page_width, page_height)

                left_text = page.get_text("text", clip=left_rect).replace('\n', ' ')
                right_text = page.get_text("text", clip=right_rect).replace('\n', ' ')

                page_text = left_text + " " + right_text
            else:
                page_text = page.get_text("text").replace('\n', ' ')

            pages_text.append(page_text)

    return pages_text  # One entry per page; pages are not merged.