In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import TokenTextSplitter
from typing import List, Dict, Any, Set

In [3]:
def load_pdf(pdf_path: str) -> List[Document]:
    """Load a PDF through ``PyPDFLoader`` and print a summary line per loaded document.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The list returned by ``PyPDFLoader(pdf_path).load()``. If loading or
        the summary printing raises, the error is printed and an empty list
        is returned instead.
    """
    try:
        pages = PyPDFLoader(pdf_path).load()

        # Report the text length and metadata of each loaded document, numbered from 1.
        for idx, doc in enumerate(pages):
            print(f"Page {idx+1}: {len(doc.page_content)} characters, Metadata: {doc.metadata}")
    except Exception as e:
        # Best-effort: report the failure and return an empty result rather than raising.
        print(f"Error loading PDF: {e}")
        return []
    else:
        return pages
    
# Location of the PDF to load. Set the PDF_PATH environment variable to point
# the notebook at a different file; when it is unset or empty, the original
# sample path is used so existing runs behave exactly as before.
pdf_path = os.environ.get("PDF_PATH") or (
    '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf'
)
documents = load_pdf(pdf_path)
# Dump the loaded documents for inspection (an empty list means loading failed).
print(documents)

Page 1: 1865 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}
Page 2: 3041 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 1, 'page_label': '2'}
Page 3: 1750 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/docu

In [None]:
def token_text_splitter(
    documents: List[Document],
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    encoding_name: str = "cl100k_base",
) -> List[Document]:
    """Split documents into chunks using ``TokenTextSplitter``.

    Args:
        documents: Documents to split, e.g. the list returned by ``load_pdf``.
        chunk_size: Forwarded to ``TokenTextSplitter`` as ``chunk_size``.
        chunk_overlap: Forwarded to ``TokenTextSplitter`` as ``chunk_overlap``.
        encoding_name: Forwarded to ``TokenTextSplitter`` as ``encoding_name``.
            Defaults to ``"cl100k_base"``, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The chunks returned by ``split_documents``. If constructing the
        splitter or splitting raises, the error is printed and an empty list
        is returned instead.
    """
    try:
        text_splitter = TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            encoding_name=encoding_name,
        )

        chunks = text_splitter.split_documents(documents=documents)

        # Report how many chunks were produced, then dump them for inspection.
        print(f"Token splitter: Created {len(chunks)} chunks")
        print(chunks)

        return chunks
    except Exception as e:
        # Best-effort: report the failure and return an empty result rather than raising.
        print(f"Text splitter failed: {e}")
        return []
    
# Split the loaded documents using the splitter's default settings.
chunks = token_text_splitter(documents)

# Print every chunk under a 1-based label so the chunk boundaries can be inspected.
for i, chunk in enumerate(chunks):
    message = f"\nChunk number: Chunk {i+1}, \nChunk content: {chunk}\n"
    print(message)

Token splitter: Created 6 chunks
[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst  wishes  to  participate  in  the  Programme  and  to  be  placed  with  a  Host  \nBusiness\