In [1]:
import os 
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain.document_loaders import DirectoryLoader  
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
### Read all pdfs from the directory:
def proces_all_pdfs(pdf_directory):
    """Load every ``*.pdf`` file found directly inside ``pdf_directory``.

    Each file is read with ``PyMuPDFLoader`` and every document it returns is
    tagged with two extra metadata keys: ``source_file`` (the file name without
    its directory) and ``file_type`` (always ``'pdf'``).

    Args:
        pdf_directory: Path (``str`` or ``Path``) of the directory to scan.
            Only the top level is searched; sub-directories are not visited.

    Returns:
        A single flat list with the documents of all files, grouped by file
        and ordered by file path.
    """
    pdf_dir = Path(pdf_directory)
    # Path.glob yields entries in arbitrary (filesystem-dependent) order.
    # Sorting keeps the document order, and therefore every downstream chunk
    # index, identical from run to run and machine to machine.
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in directory {pdf_directory}")

    all_docs = []
    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file}")
        # The loader splits the file into one document per page.
        documents = PyMuPDFLoader(str(pdf_file)).load()
        # Record provenance on every page so it can be traced back later.
        for doc in documents:
            doc.metadata['source_file'] = pdf_file.name
            doc.metadata['file_type'] = 'pdf'
        all_docs.extend(documents)
        print(f"Loaded {len(documents)} documents from {pdf_file}")

    print(f"Total documents loaded from all PDFs: {len(all_docs)}")
    return all_docs

# Load all PDFs once and keep the result for the later cells.
# "../Data/" is relative, so it resolves against the kernel's working directory.
all_documents = proces_all_pdfs("../Data/")

Found 2 PDF files in directory ../Data/
Processing file: ../Data/SystemDesignInterview-v1-alex-xu.pdf
Loaded 269 documents from ../Data/SystemDesignInterview-v1-alex-xu.pdf
Processing file: ../Data/SystemDesignInterview-v2-alex-xu.pdf
Loaded 427 documents from ../Data/SystemDesignInterview-v2-alex-xu.pdf
Total documents loaded from all PDFs: 696


In [3]:
### text splitting into chunks
def split_documents(documents, chunk_size=1000, chunk_overlap=300):
    """Split loaded documents into overlapping text chunks.

    Builds a ``RecursiveCharacterTextSplitter`` and runs its
    ``split_documents`` method over ``documents``, then prints a short summary
    plus the metadata and first 500 characters of the first chunk.

    Args:
        documents: Sequence of loaded documents to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, in the same unit
            as ``chunk_size``.

    Returns:
        The list of chunks produced by the splitter (empty if it produced
        none).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators from coarsest to finest: blank line, newline, space, "".
        separators=["\n\n", "\n", " ", ""],
    )
    # Named `chunks` rather than `split_documents` so the local does not
    # shadow this function's own name.
    chunks = text_splitter.split_documents(documents)
    print(f"Total documents after splitting: {len(chunks)}")
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    if chunks:
        print("Sample split document metadata and content:")
        sample_doc = chunks[0]
        print(f"Metadata: {sample_doc.metadata}")
        print(f"Content (first 500 chars): {sample_doc.page_content[:500]}")

    # Without this return the function yields None, and callers doing
    # `chunks = split_documents(...)` lose the result.
    return chunks

In [4]:
# Run the splitter over every loaded document with its default settings and
# bind whatever it returns to `chunks`.
chunks = split_documents(all_documents)
# A bare name on the last line makes the notebook display that value.
chunks


Total documents after splitting: 450
Split 696 documents into 450 chunks.
Sample split document metadata and content:
Metadata: {'producer': 'macOS Version 15.3.1 (Build 24D70) Quartz PDFContext, AppendMode 1.1', 'creator': 'calibre 3.9.0 [https://calibre-ebook.com]', 'creationdate': '2020-10-16T23:12:01+00:00', 'source': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'file_path': '../Data/SystemDesignInterview-v1-alex-xu.pdf', 'total_pages': 269, 'format': 'PDF 1.4', 'title': "System Design Interview – An insider's guide, Second Edition: Step by Step Guide, Tips and 15 System Design Interview Questions with Detailed Solutions", 'author': 'Alex Xu', 'subject': '', 'keywords': '', 'moddate': "D:20250310164445Z00'00'", 'trapped': '', 'modDate': "D:20250310164445Z00'00'", 'creationDate': "D:20201016231201+00'00'", 'page': 1, 'source_file': 'SystemDesignInterview-v1-alex-xu.pdf', 'file_type': 'pdf'}
Content (first 500 chars): System Design Interview: An Insider’s Guide
All rights reserved