In [None]:
!pip install pymupdf pdfplumber -q

In [None]:
import logging
import os
from typing import List

import fitz
import pdfplumber

class NativePdfExtractor:
    """Extract the embedded (native) text layer of a PDF, page by page.

    Two backends are available: PyMuPDF (``fitz``) and pdfplumber. The
    ``prefer_plumber`` flag chosen at construction time selects which one
    :meth:`extract` uses.
    """

    def __init__(self, prefer_plumber: bool = False):
        """Store the backend preference.

        Args:
            prefer_plumber: When true, :meth:`extract` reads the PDF with
                pdfplumber; otherwise it uses PyMuPDF.
        """
        self.prefer_plumber = prefer_plumber

    def extract_with_pymupdf(self, path: str) -> List[str]:
        """Return the plain text of every page, read with PyMuPDF.

        Args:
            path: Location of the PDF file.

        Returns:
            One string per page, in document order.
        """
        # Open the document in a ``with`` block so it is closed again even
        # when reading a page raises.
        with fitz.open(path) as doc:
            return [page.get_text("text") for page in doc]

    def extract_with_pdfplumber(self, path: str) -> List[str]:
        """Return the plain text of every page, read with pdfplumber.

        Args:
            path: Location of the PDF file.

        Returns:
            One string per page, in document order. A falsy result from
            ``extract_text()`` is stored as an empty string.
        """
        with pdfplumber.open(path) as pdf:
            return [page.extract_text() or "" for page in pdf.pages]

    def extract(self, path: str) -> List[str]:
        """Extract page texts with the preferred backend, best effort.

        Args:
            path: Location of the PDF file.

        Returns:
            The list of page texts, or an empty list when the backend
            raised or when fewer than ``max(1, len(pages) // 2)`` pages
            contain non-whitespace text.
        """
        backend = (
            self.extract_with_pdfplumber
            if self.prefer_plumber
            else self.extract_with_pymupdf
        )

        try:
            pages = backend(path)
        except Exception:
            # Best-effort contract: callers get [] instead of an exception,
            # but the failure is logged rather than silently discarded.
            logging.getLogger(__name__).warning(
                "Native text extraction failed for %r", path, exc_info=True
            )
            return []

        non_empty_pages = sum(1 for text in pages if text.strip())
        # Too few pages carry text to treat the result as usable.
        # NOTE(review): presumably this signals a scanned PDF - confirm.
        if non_empty_pages < max(1, len(pages) // 2):
            return []

        return pages

    def extract_to_file(self, pdf_path: str, output_path: str) -> bool:
        """Extract the PDF text and write it to a UTF-8 text file.

        Pages are joined with a blank line between them. The parent
        directory of ``output_path`` is created when it does not exist.

        Args:
            pdf_path: Location of the PDF file.
            output_path: Destination of the text file.

        Returns:
            ``True`` when text was written, ``False`` when :meth:`extract`
            returned no pages (nothing is written in that case).
        """
        pages = self.extract(pdf_path)
        if not pages:
            return False

        full_text = "\n\n".join(pages)

        # dirname is "" for a bare file name; fall back to the current dir.
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(full_text)
        return True


In [None]:
# Extract the PDF's text into /content/resultado/texto.txt. As the last
# expression of the cell, the boolean returned by extract_to_file is displayed.
NativePdfExtractor().extract_to_file(
    pdf_path="/content/RESOLUCAO (COPP) n 1035, de 23-06-2025..pdf",
    output_path="/content/resultado/texto.txt",
)

True