In [1]:
import json
import logging
import sys
from pathlib import Path

# Treat the notebook's current working directory as the project root and put it
# at the front of the import path (presumably so the project-local `config` and
# `src` packages resolve regardless of how the kernel was launched).
project_root = Path.cwd()
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Root logger: INFO and above, each record prefixed with timestamp, logger name and level.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

from config.settings import INPUT_DIR, OUTPUT_DIR, EXTRACTOR_CONFIG
from src.managers.file_manager import FileTypeManager
from src.extractors.pdf_extractor import PDFTextExtractor
from src.extractors.docx_extractor import DocxExtractor




# Lookup table from file extension (including the leading dot) to the extractor
# class that handles it. Only extensions present here can be registered.
# TODO: add ('.xlsx', XLSXExtractor) once that extractor exists.
AVAILABLE_EXTRACTORS = dict([
    ('.pdf', PDFTextExtractor),
    ('.docx', DocxExtractor),
])

# Single manager instance shared by the rest of the notebook; extractors are
# registered on it before any file is processed.
manager = FileTypeManager()


In [2]:
# Register one extractor class per extension enabled in the configuration.
logging.info(f"Registrando extratores suportados pela configuração: {EXTRACTOR_CONFIG.supported_extensions}")

for extension in EXTRACTOR_CONFIG.supported_extensions:
    extractor_class = AVAILABLE_EXTRACTORS.get(extension)
    if not extractor_class:
        # Enabled in the configuration but no class is mapped for it: warn and move on.
        logging.warning(f"A extensão '{extension}' é suportada na configuração, mas não há uma classe de extrator mapeada para ela em AVAILABLE_EXTRACTORS.")
        continue
    manager.register_extractor(extension, extractor_class)

# Visual separator that closes the registration section of the log.
separator = "=" * 50
logging.info(separator + "\n")

2025-06-15 19:13:42,663 - root - INFO - Registrando extratores suportados pela configuração: ['.pdf', '.docx', '.xlsx', '.csv']
2025-06-15 19:13:42,663 - FileTypeManager - INFO - Registering extractor 'PDFTextExtractor' for extension '.pdf'
2025-06-15 19:13:42,663 - FileTypeManager - INFO - Registering extractor 'DocxExtractor' for extension '.docx'



In [3]:
# Run every regular file found directly inside INPUT_DIR (no recursion) through
# the manager, passing OUTPUT_DIR as the destination, and report per-file status.
logging.info(f"Iniciando processamento de arquivos do diretório: {INPUT_DIR}")

# List the directory exactly once so the emptiness check and the processing loop
# work on the same snapshot (the original scanned it twice).
input_entries = sorted(INPUT_DIR.iterdir())
input_files = [entry for entry in input_entries if entry.is_file()]

if not input_entries:
    logging.warning(f"O diretório de entrada '{INPUT_DIR}' está vazio.")
elif not input_files:
    # Only sub-directories / non-file entries are present: previously this case
    # produced no diagnostic at all.
    logging.warning(f"O diretório de entrada '{INPUT_DIR}' não contém arquivos para processar.")

for file_path in input_files:
    logging.info(f"--- Processando: {file_path.name} ---")

    # The manager receives the source file and the output directory; a truthy
    # return value is treated as success, anything else as failure/skip.
    success = manager.process_file(file_path, OUTPUT_DIR)

    if success:
        print(f"✅ Sucesso ao processar {file_path.name}")
    else:
        print(f"❌ Falha ou arquivo pulado: {file_path.name}")
    print("-" * 30 + "\n")

logging.info(f"Processamento concluído. Verifique os resultados em: {OUTPUT_DIR}")


2025-06-15 19:13:42,780 - root - INFO - Iniciando processamento de arquivos do diretório: C:\Users\osono\PycharmProjects\Pymupdf\data\input
2025-06-15 19:13:42,780 - root - INFO - --- Processando: image_ocr_test.pdf ---
2025-06-15 19:13:42,780 - FileTypeManager - INFO - Processing 'image_ocr_test.pdf' with 'PDFTextExtractor'
2025-06-15 19:13:42,780 - PDFTextExtractor - INFO - 'image_ocr_test.pdf' has 1 pages. Extracting all content.
2025-06-15 19:13:42,786 - PDFTextExtractor - INFO - 🔍 Poor quality detected for 'image_ocr_test.pdf'. Applying OCR...
2025-06-15 19:13:45,659 - src.utils.ocr_processor - INFO - 🔍 Iniciando OCR em 'image_ocr_test.pdf' (1 páginas)
2025-06-15 19:13:45,675 - src.utils.ocr_processor - ERROR - ❌ Erro durante OCR de 'C:\Users\osono\PycharmProjects\Pymupdf\data\input\image_ocr_test.pdf': code=2: cannot remove file 'C:\Users\osono\AppData\Local\Temp\tmpeu1buutm.png': Permission denied
2025-06-15 19:13:45,675 - PDFTextExtractor - INFO - 📊 Final quality for 'image_ocr

✅ Sucesso ao processar image_ocr_test.pdf
------------------------------

✅ Sucesso ao processar Introduction to neural networks.docx
------------------------------

✅ Sucesso ao processar lec21_history_neural_networks_typednotes.pdf
------------------------------

✅ Sucesso ao processar pdf_para_ocr.pdf
------------------------------



In [4]:
# Sanity check on one extraction result: count the characters stored under the
# 'content' key of the JSON written for the DOCX sample.
#
# The path is derived from OUTPUT_DIR instead of an absolute, machine-specific
# location so the cell runs on any checkout.
# NOTE(review): assumes OUTPUT_DIR is the project's data/output directory that
# the previous hardcoded path pointed to — confirm against config.settings.
json_path = Path(OUTPUT_DIR) / 'Introduction to neural networks.json'

with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# A missing or null 'content' counts as an empty string, so len() cannot fail on None.
content = data.get('content') or ''
num_chars = len(content)

print(f"Número de caracteres em 'content': {num_chars}")

Número de caracteres em 'content': 12092
