In [5]:
import sys
import os
# Put the parent of the current working directory on the import path before
# the project-local import below. ".." is resolved against the working
# directory at run time, not against this file's location.
# NOTE(review): presumably the Preprocessing_data package lives in that
# parent directory — confirm.
sys.path.append(os.path.abspath(".."))
from Preprocessing_data.pdf_loader_pre import CustomPDFLoader

In [6]:
import logging
from langchain.schema import Document

class ModulePDFLoader:
    """Collect documents from every PDF found under a module directory.

    Each PDF is handed to ``CustomPDFLoader``; the documents it returns are
    tagged with the path of the PDF they came from and accumulated in
    ``self.documents``, from where ``save_all_texts`` can dump them to disk.
    """

    def __init__(self, module_directory, ocr_lang="eng"):
        """
        Initialize the module PDF loader.

        Args:
            module_directory (str): Path to the module directory containing PDFs.
            ocr_lang (str): OCR language passed through to ``CustomPDFLoader``.
        """
        self.module_directory = module_directory
        self.ocr_lang = ocr_lang
        # Filled by load_module_pdfs(); read by save_all_texts().
        self.documents = []

    def load_module_pdfs(self):
        """
        Walk the module directory recursively and load every ``.pdf`` file.

        Files are matched case-insensitively on their extension. Directories
        and files are visited in sorted order so the resulting document order
        does not depend on the filesystem. A PDF that fails to load is logged
        (with traceback) and skipped; it does not abort the walk.

        Loaded documents are appended to ``self.documents``, so calling this
        method again adds to what is already there.

        Returns:
            list: ``self.documents`` (the accumulated documents).
        """
        logging.info("Starting module PDF loading with advanced NLP preprocessing...")
        pdf_count = 0

        for root, dirs, files in os.walk(self.module_directory):
            # Sorting in place makes os.walk descend in a deterministic order.
            dirs.sort()
            for file in sorted(files):
                if not file.lower().endswith(".pdf"):
                    continue

                pdf_path = os.path.join(root, file)
                logging.info("➡️ Processing PDF: %s", pdf_path)

                try:
                    pdf_loader = CustomPDFLoader(pdf_path, ocr_lang=self.ocr_lang)
                    documents = pdf_loader.load()

                    # Enrich rather than replace: keep whatever metadata the
                    # loader produced and (re)set "source" to the PDF path.
                    for doc in documents:
                        doc.metadata = {**(doc.metadata or {}), "source": pdf_path}
                except Exception as e:
                    # Best-effort: one broken PDF must not stop the others.
                    logging.exception("Failed to process %s: %s", pdf_path, e)
                else:
                    self.documents.extend(documents)
                    pdf_count += 1

        logging.info("✅ Completed loading %s PDF(s) with NLP preprocessing.", pdf_count)
        return self.documents

    def save_all_texts(self, output_dir="module_extracted_texts"):
        """
        Save all extracted documents to individual text files.

        The file name is derived from the base name of the document's
        ``"source"`` metadata (or ``doc_<index>`` when there is none):
        ``<base>_extracted.txt``. When several documents map to the same
        name (e.g. several documents from one PDF, or PDFs with the same
        name in different subdirectories), later ones get a numeric suffix
        (``<base>_1_extracted.txt``, ``<base>_2_extracted.txt``, ...) instead
        of overwriting the earlier file.

        Args:
            output_dir (str): Directory where extracted texts will be saved.
                Created if it does not exist.
        """
        os.makedirs(output_dir, exist_ok=True)

        # File names already written during this call, to avoid clobbering.
        used_names = set()

        for i, doc in enumerate(self.documents):
            source = doc.metadata.get("source", f"doc_{i}")
            base_name = os.path.splitext(os.path.basename(source))[0]

            file_name = f"{base_name}_extracted.txt"
            suffix = 1
            while file_name in used_names:
                file_name = f"{base_name}_{suffix}_extracted.txt"
                suffix += 1
            used_names.add(file_name)

            output_path = os.path.join(output_dir, file_name)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(doc.page_content)

            logging.info("📝 Saved: %s", output_path)

