In [1]:
import os
import PyPDF2
import json
import difflib
import os

In [None]:
class PDFParser:
    """Extract page text from a PDF file and strip table content out of it.

    Attributes set by ``__init__``:
        file_path: the path passed in by the caller.
        file_name: the basename of ``file_path``.
    """

    # Minimum difflib similarity for a text line to count as a table line.
    _MATCH_CUTOFF = 0.9

    def __init__(self, file_path):
        self.file_path = file_path
        self.file_name = os.path.basename(file_path)

    def is_scanned_pdf(self):
        """Return True when no page of the PDF yields any non-blank text.

        Stops at the first page that has extractable text.
        """
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text = page.extract_text()
                if text and text.strip():
                    return False
        return True

    def parse_page_to_json(self, page_number, table_extractor):
        """Parse one page of the PDF into a dict, with table text removed.

        Args:
            page_number: 1-based page number.
            table_extractor: object exposing
                ``extract_table_text(file_path, page_index)``; it is called
                with ``page_number - 1`` and its result is iterated as a
                collection of table text strings.

        Returns:
            A dict with keys ``raw_text``, ``file_path``, ``doc_type``,
            ``filename`` and ``page_number``, or an empty dict when the page
            has no extractable text.

        Raises:
            ValueError: if ``page_number`` is outside ``1..page count``.
        """
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)

            if not 1 <= page_number <= len(reader.pages):
                raise ValueError(f"Page number {page_number} is out of range.")

            raw_text = reader.pages[page_number - 1].extract_text()

        if not raw_text or not raw_text.strip():
            return {}

        tables_ocr_text = table_extractor.extract_table_text(self.file_path, page_number - 1)
        clean_text = self.remove_tables_from_text(raw_text, tables_ocr_text)

        return {
            "raw_text": clean_text.strip(),
            "file_path": self.file_path,
            "doc_type": "text",
            "filename": self.file_name,
            "page_number": page_number,
        }

    def remove_tables_from_text(self, raw_text, tables):
        """Remove every table that can be located from ``raw_text``.

        Each entry of ``tables`` is the text of one table. Its first and last
        non-blank lines are fuzzy-matched against the non-blank lines of
        ``raw_text``; when both are found, the span between them is cut out.
        Tables that cannot be located (or that contain no non-blank line)
        are ignored.

        Returns:
            ``raw_text`` with the located table spans removed.
        """
        # The page lines do not depend on the table, so compute them once.
        text_lines = [line for line in raw_text.splitlines() if line.strip()]

        spans = []
        for table in tables:
            table_lines = [line for line in table.splitlines() if line.strip()]
            start_idx, end_idx = self.find_table_boundaries(raw_text, text_lines, table_lines)
            if start_idx is not None and end_idx is not None:
                spans.append((start_idx, end_idx))

        return self.remove_text_by_indices(raw_text, spans)

    def remove_text_by_indices(self, raw_text, indices):
        """Return ``raw_text`` without the character spans listed in ``indices``.

        Args:
            raw_text: the text to cut from.
            indices: iterable of ``(start, end)`` pairs (end exclusive).
                Pairs containing ``None`` and pairs with ``start >= end`` are
                skipped. The iterable is not modified.

        Overlapping or nested spans are merged, so text is never duplicated.
        """
        # Drop unusable pairs *before* sorting: None does not order against int.
        spans = sorted(
            (start, end)
            for start, end in indices
            if start is not None and end is not None and start < end
        )
        if not spans:
            return raw_text

        kept = []
        cursor = 0
        for start, end in spans:
            if cursor < start:
                kept.append(raw_text[cursor:start])
            # Never move backwards when a span is nested inside a previous one.
            cursor = max(cursor, end)
        kept.append(raw_text[cursor:])

        return ''.join(kept)

    def find_table_boundaries(self, raw_text, text_lines, table_lines):
        """Locate a table inside ``raw_text``.

        Args:
            raw_text: the full page text.
            text_lines: lines of ``raw_text``, in document order.
            table_lines: lines of the table text.

        Returns:
            ``(start_idx, end_idx)`` character offsets into ``raw_text``:
            the start of the line matching the table's first line and the end
            of the line matching the table's last line. Either value is
            ``None`` when no matching line is found; both are ``None`` when
            ``table_lines`` is empty.
        """
        if not table_lines:
            return None, None

        offsets = self._line_offsets(raw_text, text_lines)

        start_idx = None
        start_line = self._first_matching_line(text_lines, table_lines[0])
        if start_line is not None:
            start_idx = offsets[start_line]

        # The table cannot end before it starts, so resume from the start line.
        resume_at = 0 if start_line is None else start_line
        end_idx = None
        end_line = self._first_matching_line(text_lines, table_lines[-1], resume_at)
        if end_line is not None and offsets[end_line] is not None:
            end_idx = offsets[end_line] + len(text_lines[end_line])

        return start_idx, end_idx

    @staticmethod
    def _line_offsets(raw_text, text_lines):
        """Map each entry of ``text_lines`` to its offset in ``raw_text``.

        Lines are searched sequentially, so a repeated line resolves to its
        own occurrence rather than the first one. A line that cannot be found
        maps to ``None``.
        """
        offsets = []
        cursor = 0
        for line in text_lines:
            position = raw_text.find(line, cursor)
            if position == -1:
                offsets.append(None)
            else:
                offsets.append(position)
                cursor = position + len(line)
        return offsets

    @classmethod
    def _first_matching_line(cls, text_lines, table_line, first=0):
        """Return the index of the first line at or after ``first`` that fuzzy-matches ``table_line``.

        Both strings are truncated to the shorter length before comparison.
        Returns ``None`` when no line matches.
        """
        for index in range(first, len(text_lines)):
            line = text_lines[index]
            width = min(len(line), len(table_line))
            if difflib.get_close_matches(
                line[:width], [table_line[:width]], n=1, cutoff=cls._MATCH_CUTOFF
            ):
                return index
        return None

In [None]:
if __name__ == "__main__":
    # Demo run: report whether the sample PDF has a text layer, then dump
    # one page (with tables stripped) as pretty-printed JSON.
    file_path = "example.pdf"
    output_dir = "extracted_tables"

    parser = PDFParser(file_path)
    table_extractor = PDFTableExtractor(output_dir)

    print("The PDF is scanned." if parser.is_scanned_pdf() else "The PDF contains text.")

    page_number = 12
    page_data = parser.parse_page_to_json(page_number, table_extractor)
    print(json.dumps(page_data, indent=4, ensure_ascii=False))