In [1]:
import pickle
import json
import os
import logging
from typing import List
from document_loader import load_document_with_unstructured, split_document_with_unstructured
from langchain_core.documents import Document

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
PDF_PATH = "data/BD_PuppiesForDummies.pdf"
PUPPIES_START_PAGE = 26
PUPPIES_END_PAGE = 403
OUTPUT_FILE_PKL = "preprocessed_chunks.pkl"
OUTPUT_FILE_JSON = "preprocessed_chunks.json"

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# preprocess_and_save.py
import os
import re
import logging
import pickle
import json
from typing import List, Optional
from uuid import uuid4

from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.basic import chunk_elements
from unstructured.documents.elements import Text, ElementMetadata

from langchain_core.documents import Document

# Logging configuration
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configuration
PDF_PATH = "data/BD_PuppiesForDummies.pdf"
PUPPIES_START_PAGE = 26
PUPPIES_END_PAGE = 403
OUTPUT_FILE_PKL = "preprocessed_chunks.pkl"
OUTPUT_FILE_JSON = "preprocessed_chunks.json"

def load_document_with_unstructured(pdf_path: str, start_page: Optional[int] = None, end_page: Optional[int] = None) -> List[Document]:
    """ Loads a PDF document with Unstructured.io and converts it to LangChain format """
    logger.info(f"Loading PDF with Unstructured.io: {pdf_path}, pages {start_page} to {end_page}")

    page_range = None
    if start_page is not None and end_page is not None:
        page_range = list(range(start_page, end_page + 1))

    try:
        extracted_elements = partition_pdf(
            filename=pdf_path,
            strategy="fast",
            include_page_breaks=False,
        )
    except Exception as e:
        logger.error(f"Error while partitioning PDF {pdf_path}: {e}")
        return []

    logger.info(f"Extraction completed: {len(extracted_elements)} raw elements extracted")

    documents = []
    for element in extracted_elements:
        # Ignore empty or non-textual elements
        if not hasattr(element, 'text') or not element.text.strip():
            continue

        current_page_number = getattr(element.metadata, 'page_number', None)

        # Page filtering
        if start_page is not None and current_page_number is not None and current_page_number < start_page:
            continue
        if end_page is not None and current_page_number is not None and current_page_number > end_page:
            continue
        
        # Build metadata for LangChain Document
        metadata = {
            "source": pdf_path,
            "page": current_page_number,
            "category": str(type(element).__name__),
            "id": str(element.id) if hasattr(element, "id") else str(uuid4()),
        }
        if hasattr(element.metadata, 'filename'):
            metadata["filename"] = element.metadata.filename
        if hasattr(element.metadata, 'filetype'):
            metadata["filetype"] = element.metadata.filetype
        if hasattr(element.metadata, 'parent_id') and element.metadata.parent_id is not None:
            metadata["parent_id"] = str(element.metadata.parent_id)

        documents.append(Document(
            page_content=element.text,
            metadata=metadata
        ))

    logger.info(f"Conversion completed: {len(documents)} LangChain documents created after filtering")
    return documents


def split_document_with_unstructured(documents: List[Document]) -> List[Document]:
    """ Splits documents into chunks using Unstructured.io features """
    if not documents:
        logger.warning("No documents to split.")
        return []
    
    logger.info(f"Splitting {len(documents)} documents into chunks with Unstructured.io")

    chunked_langchain_documents = []

    # Recreate Unstructured elements from LangChain documents
    unstructured_elements_for_chunking = []

    valid_categories = ["NarrativeText", "ListItem", "Title"]

    for doc in documents:
        if doc.metadata.get("category") not in valid_categories:
            continue

        if len(doc.page_content.strip()) < 50:
            continue
            
        # Clean the text
        cleaned_text = doc.page_content
        # Replace references like "FIGURE XX-X:" with an empty string
        cleaned_text = re.sub(r'FIGURE \d+-\d+:', '', cleaned_text)
        # Replace \xad (soft hyphen) with an empty string to fix broken text
        cleaned_text = cleaned_text.replace('\xad', ' ')
        # Reassign the cleaned text to the document
        doc.page_content = cleaned_text
        
        # Create an ElementMetadata object from the metadata dictionary
        element_meta = ElementMetadata()
        if doc.metadata.get("filename"):
            element_meta.filename = doc.metadata.get("filename")
        if doc.metadata.get("filetype"):
            element_meta.filetype = doc.metadata.get("filetype")
        if doc.metadata.get("page"):
            element_meta.page_number = doc.metadata.get("page")
        
        # Create the Text element with appropriate metadata
        element = Text(
            text=doc.page_content, 
            metadata=element_meta, 
            element_id=doc.metadata.get("id", str(uuid4()))
        )
        unstructured_elements_for_chunking.append(element)

    if not unstructured_elements_for_chunking:
        logger.warning("No Unstructured elements could be created from LangChain documents.")
        return []

    try:
        # Apply standard chunking
        chunks = chunk_elements(
            elements=unstructured_elements_for_chunking,
            max_characters=1800,
            new_after_n_chars=1500,  # To avoid chunks that are too long
            overlap=400,  # 400 characters of overlap
        )
    except Exception as e:
        logger.error(f"Error while chunking elements: {str(e)}. Returning unsplit documents.")
        return documents

    # Convert chunks to LangChain Documents
    for i, chunk_element in enumerate(chunks):
        # Chunks are Element objects (or CompositeElement)
        page = getattr(chunk_element.metadata, 'page_number', 0)
        
        metadata = {
            "source": documents[0].metadata.get("source", ""),
            "page": page,
            "chunk_index": i,
            "id": str(chunk_element.id) if hasattr(chunk_element, "id") else str(uuid4()),
            "word_count": len(chunk_element.text.split()) if hasattr(chunk_element, 'text') else 0,
        }
        # Estimate the number of tokens
        metadata["token_count_approx"] = int(metadata["word_count"] * 1.3)
        
        chunked_langchain_documents.append(Document(
            page_content=chunk_element.text if hasattr(chunk_element, 'text') else "",
            metadata=metadata
        ))

    logger.info(f"Splitting completed: {len(chunked_langchain_documents)} chunks created")
    return chunked_langchain_documents

def preprocess_and_save():
    """Prétraite le document et sauvegarde les chunks"""
    # Charger le document
    documents = load_document_with_unstructured(
        pdf_path=PDF_PATH,
        start_page=PUPPIES_START_PAGE,
        end_page=PUPPIES_END_PAGE
    )
    
    # Diviser en chunks
    chunks = split_document_with_unstructured(documents)
    logger.info(f"Nombre de chunks générés: {len(chunks)}")
    
    # Conversion des Documents en dictionnaires sérialisables
    serializable_chunks = []
    for chunk in chunks:
        serializable_chunks.append({
            "page_content": chunk.page_content,
            "metadata": chunk.metadata
        })
    
    # Sauvegarde en format pickle
    logger.info(f"Sauvegarde au format pickle dans {OUTPUT_FILE_PKL}...")
    with open(OUTPUT_FILE_PKL, 'wb') as f:
        pickle.dump(serializable_chunks, f)
    
    # Sauvegarde en format JSON (plus portable)
    logger.info(f"Sauvegarde au format JSON dans {OUTPUT_FILE_JSON}...")
    with open(OUTPUT_FILE_JSON, 'w', encoding='utf-8') as f:
        json.dump(serializable_chunks, f, ensure_ascii=False, indent=2)
    
    logger.info("Prétraitement terminé avec succès!")
    
    # Afficher des statistiques
    total_tokens = sum(chunk["metadata"].get("token_count_approx", 0) for chunk in serializable_chunks)
    logger.info(f"Statistiques:")
    logger.info(f"- Nombre total de chunks: {len(serializable_chunks)}")
    logger.info(f"- Nombre approximatif de tokens: {total_tokens}")
    logger.info(f"- Taille du fichier pickle: {os.path.getsize(OUTPUT_FILE_PKL) / (1024*1024):.2f} MB")
    logger.info(f"- Taille du fichier JSON: {os.path.getsize(OUTPUT_FILE_JSON) / (1024*1024):.2f} MB")

if __name__ == "__main__":
    preprocess_and_save()

2025-05-11 21:26:00,869 - INFO - Loading PDF with Unstructured.io: data/BD_PuppiesForDummies.pdf, pages 26 to 403
2025-05-11 21:26:09,417 - INFO - Extraction completed: 6089 raw elements extracted
2025-05-11 21:26:09,426 - INFO - Conversion completed: 4129 LangChain documents created after filtering
2025-05-11 21:26:09,431 - INFO - Splitting 4129 documents into chunks with Unstructured.io
2025-05-11 21:26:09,470 - INFO - Splitting completed: 461 chunks created
2025-05-11 21:26:09,471 - INFO - Nombre de chunks générés: 461
2025-05-11 21:26:09,471 - INFO - Sauvegarde au format pickle dans preprocessed_chunks.pkl...
2025-05-11 21:26:09,472 - INFO - Sauvegarde au format JSON dans preprocessed_chunks.json...
2025-05-11 21:26:09,476 - INFO - Prétraitement terminé avec succès!
2025-05-11 21:26:09,477 - INFO - Statistiques:
2025-05-11 21:26:09,477 - INFO - - Nombre total de chunks: 461
2025-05-11 21:26:09,477 - INFO - - Nombre approximatif de tokens: 164067
2025-05-11 21:26:09,477 - INFO - - T