In [7]:
import os
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader

try:
    # Current LangChain releases ship the text splitters in their own package.
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    # Fall back to the legacy import path used by older LangChain releases.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

def process_all_pdfs(pdf_directory):
    """Load every PDF found under a directory, including subdirectories.

    Each PDF is read with ``PyPDFLoader``. Every document the loader returns
    gets two extra metadata keys: ``source_file`` (the PDF's file name) and
    ``file_type`` (always ``'pdf'``). A file that fails to load is reported
    and skipped so that one bad PDF does not abort the whole run.

    Args:
        pdf_directory: Directory to search (``str`` or ``Path``).

    Returns:
        A flat list with the documents of all PDFs that loaded successfully,
        in sorted file-path order. Empty when no PDF was found or loaded.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # rglob("*.pdf") descends into subdirectories. A pattern such as "**.pdf"
    # does not: "**" is only recursive when it is an entire path segment, and
    # older Python versions reject it outright with ValueError.
    # Directories that merely end in ".pdf" are filtered out, and the result
    # is sorted so the processing order does not depend on the filesystem.
    pdf_files = sorted(path for path in pdf_dir.rglob("*.pdf") if path.is_file())

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(documents)
            print(f"  ✓ Loaded {len(documents)} pages")

        except Exception as e:
            # Deliberately best-effort: report the failure and keep going
            # with the remaining files.
            print(f"  ✗ Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Load the PDFs from the data directory.
pdf_document = process_all_pdfs(pdf_directory="../data")
# pdf_document now holds the list of loaded documents, each tagged with metadata.


ModuleNotFoundError: No module named 'langchain.text_splitter'

In [5]:
# Preview only the first loaded document; echoing the whole list floods the output.
pdf_document[:1]

[Document(metadata={'producer': 'Skia/PDF m143', 'creator': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/143.0.0.0 Safari/537.36', 'creationdate': '2025-12-23T12:18:13+00:00', 'title': 'Odysseus - Wikipedia', 'moddate': '2025-12-23T12:18:13+00:00', 'source': '..\\data\\Odysseus.pdf', 'total_pages': 21, 'page': 0, 'page_label': '1', 'source_file': 'Odysseus.pdf', 'file_type': 'pdf'}, page_content='Odysseus\nHead of Odysseus from a Roman period\nHellenistic marble group representing\nOdysseus blinding Polyphemus, found at the\nvilla of Tiberius at Sperlonga, Italy\nIn-universe information\nTitle King of Ithaca\nSpouse Penelope\nChildren Telemachus, Telegonus,\nCassiphone, Agrius, Anteias,\nArdeas, Rhomos, Poliporthes,\nLatinus, Nausinous, Nausithous,\nEuryalus\nRelatives Laertes (father)\nAnticlea (mother)\nCtimene (sister)\nNationalityGreek\nOdysseus\nIn Greek and Roman mythology, Odysseus\n(/əˈdɪsiəs/ ⓘ ə -DISS-ee- ə s;[1] Ancient Greek:\nὈ δυσ

In [6]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into chunks with a ``RecursiveCharacterTextSplitter``.

    Prints how many chunks were produced and, when there is at least one,
    a short preview of the first chunk's content and metadata.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter; lengths are measured with ``len``.
        chunk_overlap: Forwarded to the splitter as the chunk overlap.

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Separator list, from paragraph breaks down to single characters.
        "separators": ["\n\n", "\n", " ", ""],
    }
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not pieces:
        return pieces

    # Print the first chunk as a quick sanity check.
    sample = pieces[0]
    print(f"\nExample chunk:")
    print(f"Content: {sample.page_content[:200]}...")
    print(f"Metadata: {sample.metadata}")
    return pieces
chunks = split_documents(pdf_document)
# Show a small preview instead of echoing every chunk into the notebook output.
chunks[:3]

NameError: name 'RecursiveCharacterTextSplitter' is not defined