In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List, Dict, Any

In [10]:
separators = ["\n\n", '\n', '. ', '! ', '? ', ' ', '']

In [8]:
def load_pdf_with_langchain(pdf_path: str) -> List[Document]:
    '''
    Load PDF using LangChain's PyPDFLoader with Error handling
    '''
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        
        print(f"Loaded {len(documents)} pages.")
        
        for idx, doc in enumerate(documents):
            print(f"Page {idx+1} : {len(doc.page_content)} characters, Metadata: {doc.metadata}")
            
        return documents
    except Exception as e: 
        print(f"Error loading PDF: {e}")
        return []
    
pdf_path = '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf'
documents = load_pdf_with_langchain(pdf_path)
print(documents)
documents[0]

Loaded 4 pages.
Page 1 : 1865 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}
Page 2 : 3041 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 1, 'page_label': '2'}
Page 3 : 1750 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_load

Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst  wishes  to  participate  in  the  Programme  and  to  be  placed  with  a  Host  \nBusiness\n \nfor\n \na\n \nﬁxed\n \ntwelve-

In [16]:
def recursive_text_splitter(
    documents: List[Document],
    chunk_size: int = 200,
    chunk_overlap: int = 200
) -> List[Document]:
    '''
    Recursive character text splitter - Most commonly used and Effective
    '''
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=separators,
    )
    
    chunks = text_splitter.split_documents(documents)
    
    print(f"Recursive character text splitter: Created {len(chunks)} chunks")
    print(f"Chunk size: {chunk_size}, Overlap: {chunk_overlap}")
    
    for idx, chunk in enumerate(chunks[:2]):
        content_preview = chunk.page_content[:100].replace("\n", " ") + "..."
        print(f"Sample {idx+1}: {content_preview}")
        print(idx, chunks)
        
    return chunks

chunks = recursive_text_splitter(documents=documents, chunk_size=800, chunk_overlap=150)
print(chunks)
chunks[0]

Recursive character text splitter: Created 12 chunks
Chunk size: 800, Overlap: 150
Sample 1: PromptBI  Analyst  Agreement   This  Agreement  is  made  between:   ●  PromptBI  Ltd  (“PromptBI”),...
0 [Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \

Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst  wishes  to  participate  in  the  Programme  and  to  be  placed  with  a  Host  \nBusiness\n \nfor\n \na\n \nﬁxed\n \ntwelve-

#### Use recursive text splitter when you have more complex text that requires numerous fallbacks.

In [5]:
def split_text_into_words(text, chunk_max_words):
    words = text.split()
    chunks = []
    current_chunk = []
    current_length = 0
    
    for word in words:
        if current_length + len(word) + (1 if current_chunk else 0) <= chunk_max_words:
            current_chunk.append(word)
            current_length += len(word) + (1 if current_chunk else 0)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

text = "This is a sample text that will be split into chunks based on the maximum number of words allowed in each chunk."
chunk_max_words = 10
split_text_into_words(text, chunk_max_words)

['This is a',
 'sample',
 'text that',
 'will be',
 'split into',
 'chunks',
 'based on',
 'the',
 'maximum',
 'number of',
 'words',
 'allowed in',
 'each',
 'chunk.']

In [6]:
def split_text_into_chunks(text, max_words):
    """Splits text into chunks with a maximum number of words, ensuring sentences are not broken."""
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    chunks = []
    current_chunk = []
    current_word_count = 0
    
    for sentence in sentences:
        sentence_word_count = len(sentence.split())
        if current_word_count + sentence_word_count <= max_words:
            current_chunk.append(sentence)
            current_word_count += sentence_word_count
        else:
            if current_chunk:
                chunks.append('. '.join(current_chunk) + '.')
            current_chunk = [sentence]
            current_word_count = sentence_word_count
    if current_chunk:
        chunks.append('. '.join(current_chunk) + '.')
        
    return chunks

text = """Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos.
    Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos.
    Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos.
    Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos."""
max_words = 10
split_text_into_chunks(text, 100)

['Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos. Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor.',
 'Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos. Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In

In [7]:
def split_text_into_chunks(text, max_words):
    # Split text into sentences by full stop (keeping the dot)
    sentences = [s.strip() + '.' for s in text.split('.') if s.strip()]
    chunks = []
    current_chunk = []
    current_word_count = 0

    for sentence in sentences:
        words_in_sentence = len(sentence.split())
        # If adding this sentence exceeds the limit
        if current_word_count + words_in_sentence > max_words:
            # Save the current chunk
            chunks.append(" ".join(current_chunk).strip())
            # Start a new chunk
            current_chunk = [sentence]
            current_word_count = words_in_sentence
        else:
            # Add sentence to current chunk
            current_chunk.append(sentence)
            current_word_count += words_in_sentence
    # Add the last chunk if not empty
    if current_chunk:
        chunks.append(" ".join(current_chunk).strip())

    return chunks


# Example usage
if __name__ == "__main__":
    sample_text = """Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos.
    Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos.
    Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos.
    Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos."""
    
    chunks = split_text_into_chunks(sample_text, max_words=100)
    
    for i, chunk in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ({len(chunk.split())} words) ---\n")
        print(chunk)


--- Chunk 1 (96 words) ---

Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor. Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos. Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque faucibus ex sapien vitae pellentesque sem placerat. In id cursus mi pretium tellus duis convallis. Tempus leo eu aenean sed diam urna tempor.

--- Chunk 2 (96 words) ---

Pulvinar vivamus fringilla lacus nec metus bibendum egestas. Iaculis massa nisl malesuada lacinia integer nunc posuere. Ut hendrerit semper vel class aptent taciti sociosqu. Ad litora torquent per conubia nostra inceptos himenaeos. Lorem ipsum dolor sit amet consectetur adipiscing elit. Quisque fa