In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List, Dict, Any, Tuple, Optional
from enum import Enum
import tiktoken
from copy import deepcopy

In [6]:
def load_pdf(pdf_path: str) -> List[Document]:
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        
        for i, doc in enumerate(documents):
            print(f"Page {i+1}: {len(doc.page_content)} characters, Metadata: {doc.metadata}")
            
        return documents
    except Exception as e: 
        print(f"Error loading PDF: {e}")
        return []
    
pdf_path = '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf'
documents = load_pdf(pdf_path)
print(documents)

Page 1: 1865 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}
Page 2: 3041 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 1, 'page_label': '2'}
Page 3: 1750 characters, Metadata: {'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/docu

In [None]:
# dynamic separators
academic_docs_sep = ["\n\n", "\n", ". ", "? ", "! ", ", ", "; ", " ", ""]
legal_docs_sep = ["\n\n", "\n##", "\n#", "\n", ". ", " ", ""]
general_docs_sep = ["\n\n", ". ", " ", ""]

class DocumentType(str, Enum):
    academic = "academic"
    legal = "legal"
    general = "general"


def advanced_recursive_text_splitter(
    documents: List[Document],
    chunk_size: int = 7000,
    chunk_overlap: int = 140,
    document_type: str = DocumentType
) -> List[Document]:
    try:
        if document_type == DocumentType.academic:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=academic_docs_sep,
                length_function=len
            )
        elif document_type == DocumentType.legal:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=legal_docs_sep,
                length_function=len
            )
        else:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=general_docs_sep,
                length_function=len
            )
            
        all_splitted_docs: List = []
        for doc in documents:
            splitted_docs = text_splitter.split_documents(documents=[doc])
            all_splitted_docs.append(splitted_docs)
            print(f"Original document length: {len(doc.page_content)}, Split into: {len(splitted_docs)} chunks")
            
        return all_splitted_docs
    except Exception as e: 
        print(f"Failed to split document: {e}")
        return []
    
    
chunks = advanced_recursive_text_splitter(documents=documents, document_type="legal")
print(chunks)

Original document length: 1865, Split into: 1 chunks
Original document length: 3041, Split into: 1 chunks
Original document length: 1750, Split into: 1 chunks
Original document length: 106, Split into: 1 chunks
[[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nst

In [21]:
class DocumentType(str, Enum):
    academic = "academic"
    legal = "legal"
    general = "general"


def advanced_recursive_text_splitter(
    documents: List[Document],
    chunk_size: int = 7000,
    chunk_overlap: int = 140,
    document_type: DocumentType = DocumentType.general,
) -> Tuple[List[Document], Dict]:

    try:
        if document_type == DocumentType.academic:
            separators = academic_docs_sep
        elif document_type == DocumentType.legal:
            separators = legal_docs_sep
        else:
            separators = general_docs_sep

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            length_function=len,
        )

        all_chunks: List[Document] = []
        page_info: Dict = {}

        for idx, doc in enumerate(documents):
            splits = text_splitter.split_documents([doc])

            # store stats
            page_info[idx] = {
                "original_length": len(doc.page_content),
                "num_chunks": len(splits),
                "chunk_lengths": [len(s.page_content) for s in splits],
            }

            print(
                f"Doc {idx}: len={page_info[idx]['original_length']} "
                f"→ chunks={page_info[idx]['num_chunks']}"
            )

            all_chunks.extend(splits)

        return all_chunks, page_info

    except Exception as e:
        print(f"Failed to split document: {e}")
        return [], {}


chunks, page_info = advanced_recursive_text_splitter(documents=documents, document_type=DocumentType.general)
print("all_chunks:", chunks)
print("page_info:", page_info)

Doc 0: len=1865 → chunks=1
Doc 1: len=3041 → chunks=1
Doc 2: len=1750 → chunks=1
Doc 3: len=106 → chunks=1
all_chunks: [Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst  wishes

PAGE INFO

{
    0: {
        'original_length': 1865, 
        'num_chunks': 1, 
        'chunk_lengths': [1865]
    }, 
    1: {
        'original_length': 3041, 
        'num_chunks': 1, 
        'chunk_lengths': [3041]
    }, 
    2: {
        'original_length': 1750, 
        'num_chunks': 1, 
        'chunk_lengths': [1750]
    }, 
    3: {
        'original_length': 106, 
        'num_chunks': 1, 
        'chunk_lengths': [106]
    }
}

In [23]:
academic_docs_sep = ["\n\n", "\n", ". ", "? ", "! ", ", ", "; ", " ", ""]
legal_docs_sep = ["\n\n", "\n##", "\n#", "\n", ". ", " ", ""]
general_docs_sep = ["\n\n", ". ", " ", ""]


class DocumentType(str, Enum):
    academic = "academic"
    legal = "legal"
    general = "general"


# ✅ tokenizer helper
encoding = tiktoken.get_encoding("cl100k_base")

def token_len(text: str) -> int:
    return len(encoding.encode(text))


def find_offsets(full_text: str, chunks: List[str]) -> List[Tuple[int, int]]:
    """Compute start/end offsets for chunks inside original text."""
    offsets = []
    cursor = 0

    for chunk in chunks:
        start = full_text.find(chunk, cursor)
        if start == -1:
            start = cursor  # fallback
        end = start + len(chunk)
        offsets.append((start, end))
        cursor = end

    return offsets


def advanced_recursive_text_splitter(
    documents: List[Document],
    chunk_size: int = 7000,
    chunk_overlap: int = 140,
    document_type: DocumentType = DocumentType.general,
) -> Tuple[List[Document], Dict]:

    if document_type == DocumentType.academic:
        separators = academic_docs_sep
    elif document_type == DocumentType.legal:
        separators = legal_docs_sep
    else:
        separators = general_docs_sep

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
        length_function=len,  # splitting still char-based
    )

    all_chunks: List[Document] = []
    page_info: Dict = {}

    for doc_idx, doc in enumerate(documents):

        split_docs = splitter.split_documents([doc])
        chunk_texts = [d.page_content for d in split_docs]

        # ✅ offsets
        offsets = find_offsets(doc.page_content, chunk_texts)

        chunk_token_counts = [token_len(t) for t in chunk_texts]

        # ✅ enrich chunk metadata
        enriched_chunks = []
        for i, (chunk_doc, (start, end), tok_count) in enumerate(
            zip(split_docs, offsets, chunk_token_counts)
        ):
            meta = dict(chunk_doc.metadata or {})
            meta.update({
                "page_index": doc_idx,
                "chunk_index": i,
                "start_char": start,
                "end_char": end,
                "token_count": tok_count,
            })
            enriched_chunks.append(
                Document(page_content=chunk_doc.page_content, metadata=meta)
            )

        # ✅ page stats
        page_info[doc_idx] = {
            "original_char_length": len(doc.page_content),
            "original_token_length": token_len(doc.page_content),
            "num_chunks": len(enriched_chunks),
            "chunk_token_counts": chunk_token_counts,
            "chunk_offsets": offsets,
        }

        all_chunks.extend(enriched_chunks)

    return all_chunks, page_info


chunks, page_info = advanced_recursive_text_splitter(documents=documents, document_type=DocumentType.general)
print("all_chunks:", chunks)
print("page_info:", page_info)

all_chunks: [Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'page_index': 0, 'chunk_index': 0, 'start_char': 0, 'end_char': 1865, 'token_count': 539}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst  wishes  to  participate

In [25]:
academic_docs_sep = ["\n\n", "\n", ". ", "? ", "! ", ", ", "; ", " ", ""]
legal_docs_sep = ["\n\n", "\n##", "\n#", "\n", ". ", " ", ""]
general_docs_sep = ["\n\n", ". ", " ", ""]


class DocumentType(str, Enum):
    academic = "academic"
    legal = "legal"
    general = "general"


encoding = tiktoken.get_encoding("cl100k_base")

def token_len(text: str) -> int:
    return len(encoding.encode(text))


def find_offsets(full_text: str, chunks: List[str]):
    offsets = []
    cursor = 0
    for chunk in chunks:
        start = full_text.find(chunk, cursor)
        if start == -1:
            start = cursor
        end = start + len(chunk)
        offsets.append((start, end))
        cursor = end
    return offsets


def inherit_metadata(
    parent_meta: Dict,
    extra_meta: Dict,
    include_keys: Optional[List[str]] = None,
    exclude_keys: Optional[List[str]] = None,
) -> Dict:
    """Controlled metadata inheritance."""
    meta = deepcopy(parent_meta or {})

    if include_keys:
        meta = {k: v for k, v in meta.items() if k in include_keys}

    if exclude_keys:
        for k in exclude_keys:
            meta.pop(k, None)

    meta.update(extra_meta)
    return meta


def advanced_recursive_text_splitter(
    documents: List[Document],
    chunk_size: int = 7000,
    chunk_overlap: int = 140,
    document_type: DocumentType = DocumentType.general,
) -> Tuple[List[Document], Dict]:

    separators = {
        DocumentType.academic: academic_docs_sep,
        DocumentType.legal: legal_docs_sep,
        DocumentType.general: general_docs_sep,
    }[document_type]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
        length_function=len,
    )

    all_chunks: List[Document] = []
    page_info: Dict = {}

    for doc_idx, doc in enumerate(documents):

        splits = splitter.split_documents([doc])
        chunk_texts = [d.page_content for d in splits]
        offsets = find_offsets(doc.page_content, chunk_texts)
        token_counts = [token_len(t) for t in chunk_texts]

        enriched = []

        for chunk_idx, (split_doc, (start, end), tok_count) in enumerate(
            zip(splits, offsets, token_counts)
        ):

            chunk_meta = inherit_metadata(
                parent_meta=doc.metadata,
                extra_meta={
                    "parent_doc_index": doc_idx,
                    "chunk_index": chunk_idx,
                    "chunk_id": f"{doc_idx}_{chunk_idx}",
                    "start_char": start,
                    "end_char": end,
                    "token_count": tok_count,
                },
                exclude_keys=None,  # optional filter hook
            )

            enriched.append(
                Document(
                    page_content=split_doc.page_content,
                    metadata=chunk_meta,
                )
            )

        page_info[doc_idx] = {
            "original_tokens": token_len(doc.page_content),
            "num_chunks": len(enriched),
            "chunk_tokens": token_counts,
            "offsets": offsets,
            "inherited_keys": list(doc.metadata.keys()) if doc.metadata else [],
        }

        all_chunks.extend(enriched)

    return all_chunks, page_info


chunks, page_info = advanced_recursive_text_splitter(documents=documents, document_type=DocumentType.general)
print("all_chunks:", chunks)
print("page_info:", page_info)

all_chunks: [Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'parent_doc_index': 0, 'chunk_index': 0, 'chunk_id': '0_0', 'start_char': 0, 'end_char': 1865, 'token_count': 539}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst

In [26]:
academic_docs_sep = ["\n\n", "\n", ". ", "? ", "! ", ", ", "; ", " ", ""]
legal_docs_sep = ["\n\n", "\n##", "\n#", "\n", ". ", " ", ""]
general_docs_sep = ["\n\n", ". ", " ", ""]


class DocumentType(str, Enum):
    academic = "academic"
    legal = "legal"
    general = "general"


enc = tiktoken.get_encoding("cl100k_base")

def token_len(text: str) -> int:
    return len(enc.encode(text))


def find_offsets(full_text: str, chunks: List[str]):
    offsets = []
    cursor = 0
    for c in chunks:
        start = full_text.find(c, cursor)
        if start == -1:
            start = cursor
        end = start + len(c)
        offsets.append((start, end))
        cursor = end
    return offsets


def advanced_recursive_text_splitter(
    documents: List[Document],
    chunk_size: int = 7000,
    chunk_overlap: int = 140,
    document_type: DocumentType = DocumentType.general,
) -> Tuple[List[Document], Dict, Dict]:

    separators = {
        DocumentType.academic: academic_docs_sep,
        DocumentType.legal: legal_docs_sep,
        DocumentType.general: general_docs_sep,
    }[document_type]

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
        length_function=len,
    )

    all_chunks: List[Document] = []
    page_info: Dict = {}
    chunk_reverse_map: Dict = {}

    for page_idx, doc in enumerate(documents):

        splits = splitter.split_documents([doc])
        texts = [d.page_content for d in splits]
        offsets = find_offsets(doc.page_content, texts)
        token_counts = [token_len(t) for t in texts]

        page_info[page_idx] = {
            "source": doc.metadata.get("source"),
            "original_tokens": token_len(doc.page_content),
            "num_chunks": len(splits),
        }

        for chunk_idx, (split_doc, (start, end), tok_count) in enumerate(
            zip(splits, offsets, token_counts)
        ):
            chunk_id = f"{page_idx}_{chunk_idx}"

            # ✅ inherit + enrich metadata
            meta = deepcopy(doc.metadata or {})
            meta.update({
                "chunk_id": chunk_id,
                "page_index": page_idx,
                "chunk_index": chunk_idx,
                "start_char": start,
                "end_char": end,
                "token_count": tok_count,
            })

            chunk_doc = Document(
                page_content=split_doc.page_content,
                metadata=meta,
            )

            all_chunks.append(chunk_doc)

            # ✅ reverse lookup table
            chunk_reverse_map[chunk_id] = {
                "page_index": page_idx,
                "chunk_index": chunk_idx,
                "start_char": start,
                "end_char": end,
                "token_count": tok_count,
                "source": meta.get("source"),
                "page_number": meta.get("page"),
                "doc_id": meta.get("doc_id"),
            }

    return all_chunks, page_info, chunk_reverse_map


chunks, page_info, chunk_reverse_map = advanced_recursive_text_splitter(documents=documents, document_type=DocumentType.general)
print("all_chunks:", chunks)
print("page_info:", page_info)
print("chunk_reverse_map:", chunk_reverse_map)

all_chunks: [Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Copy of 251102 PromptBI Analysts Training Agreement Draft v.2', 'source': '/home/jeremy/Documents/Work/Learning/fastapi/llm/4-document_loader/sample_data/document.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1', 'chunk_id': '0_0', 'page_index': 0, 'chunk_index': 0, 'start_char': 0, 'end_char': 1865, 'token_count': 539}, page_content='PromptBI  Analyst  Agreement  \nThis  Agreement  is  made  between:  \n●  PromptBI  Ltd  (“PromptBI”),  and  \n ●  [Jeremiah  Katumo  Kurwa]  (“Analyst”)  \n \nEffective  Date:   \n \n1.  Purpose  \n1.1  PromptBI  operates  a  training  and  placement  programme  (the  “Programme”)  designed  to  \ndevelop\n \nprofessional\n \nskills\n \nin\n \nprompt\n \nengineering,\n \nAI-human\n \ncollaboration,\n \napplied\n \nanalytics,\n \nstakeholder\n \nmanagement,\n \nand\n \nrelated\n \ncompetencies.\n \n1.2  The  Analyst  wish