In [1]:
from abc import ABC, abstractmethod
import fitz
import re
from typing import TypedDict, Optional

In [2]:
class Page(TypedDict):
    page_number: int
    text: str


class ChunkMetadata(TypedDict):
    page: int
    unit: Optional[str]
    section: Optional[str]
    section_title: Optional[str]


class Chunk(TypedDict):
    text: str
    metadata: ChunkMetadata

In [3]:
def load_pdf_with_pages(pdf_path: str) -> list[Page]:
    docs = fitz.open(pdf_path)
    pages: list[Page] = []

    for page_num in range(len(docs)):
        pages.append(
            {
                'page_number': page_num + 1,
                'text': docs[page_num].get_text('text').strip()
            }
        )
    return pages

In [4]:
class StructureDetector(ABC):
    @abstractmethod
    def detect(self, line: str):
        pass

In [5]:
class NumberedSectionDetector(StructureDetector):
    pattern = re.compile(r"^(\d+(\.\d+)*)\s+(.*)")

    def detect(self, line: str):
        match = self.pattern.match(line.strip())
        if match: return {
            'section': match.group(1),
            'section_title': match.group(3).strip()
        }
        return None

In [6]:
class UnitDetector(StructureDetector):
    pattern = re.compile(r"^(UNIT|CHAPTER)\s+([IVXLC]+)", re.IGNORECASE)

    def detect(self, line: str):
        match = self.pattern.match(line.strip())
        if match: return {
            'unit': match.group(2).upper()
        }
        return None

In [7]:
class StructurePipeline:
    def __init__(self, detectors: list[StructureDetector]):
        self.detectors = detectors
        self.state = {
            'unit': None,
            'section': None,
            'section_title': None,
        }

    def process_line(self, line: str):
        updated = False

        for detector in self.detectors:
            result = detector.detect(line)
            if result:
                self.state.update(result)
                updated = True
        return updated, self.state.copy()

In [8]:
def structured_chunker(pages: list[Page], detectors: list[StructureDetector], max_words: int = 350, overlap: int = 50) -> list[Chunk]:
    pipeline = StructurePipeline(detectors)
    chunks: list[Chunk] = []

    buffer: list[str] = []
    current_metadata: ChunkMetadata = {}
    last_structure = None

    def flush(page_number: int, forced_reset: bool = False):
        nonlocal buffer
        if not buffer: return

        chunks.append(
            {
                'text': ' '.join(buffer).strip(),
                'metadata': {
                    'page': page_number,
                    **current_metadata
                }
            }
        )
        if forced_reset or overlap == 0: buffer = []
        else: buffer = buffer[-overlap:]

    for page in pages:
        page_number = page['page_number']

        for line in page['text'].splitlines():
            line = line.strip()
            if not line: continue
            structure_changed, state = pipeline.process_line(line)
            new_structure = (
                state.get('unit'),
                state.get('section'),
                state.get('section_title')
            )

            if structure_changed and new_structure != last_structure:
                flush(page_number, forced_reset=True)
                current_metadata = {
                    'unit': state['unit'],
                    'section': state['section'],
                    'section_title': state['section_title']
                }
                last_structure = new_structure
            buffer.extend(line.split())

            if len(buffer) >= max_words: flush(page_number)
        flush(page_number)
    return chunks

In [9]:
pdf_pages = load_pdf_with_pages(r'C:\Users\ASUS\Desktop\Git\TextBook-Assistant\rag_pipeline\data\ML.pdf')

In [10]:
len(pdf_pages)

115

In [11]:
pdf_pages[0]

{'page_number': 1,
 'text': '1 \n \nUNIT I  \nIntroduction to Machine Learning \n1. Introduction \n \n1.1 What Is Machine Learning?  \nMachine learning is programming computers to optimize a performance criterion using example \ndata or past experience. We have a model defined up to some parameters, and learning is the \nexecution of a computer program to optimize the parameters of the model using the training data or \npast experience. The model may be predictive to make predictions in the future, or descriptive to gain \nknowledge from data, or both. \nArthur Samuel, an early American leader in the field of computer gaming and artificial intelligence, \ncoined the term “Machine Learning” in 1959 while at IBM. He defined machine learning as “the field of \nstudy that gives computers the ability to learn without being explicitly programmed.” However, there is \nno universally accepted definition for machine learning. Different authors define the term differently. \n \nDefinition of lea

In [12]:
chunks = structured_chunker(pdf_pages, detectors=[
    UnitDetector(),
    NumberedSectionDetector()
])

In [13]:
for c in chunks[:5]:
    print(c["metadata"])
    print(c["text"][:120])
    print("-" * 50)

{'page': 1}
1
--------------------------------------------------
{'page': 1, 'unit': 'I', 'section': None, 'section_title': None}
UNIT I Introduction to Machine Learning 1. Introduction
--------------------------------------------------
{'page': 1, 'unit': 'I', 'section': '1.1', 'section_title': 'What Is Machine Learning?'}
1.1 What Is Machine Learning? Machine learning is programming computers to optimize a performance criterion using exampl
--------------------------------------------------
{'page': 1, 'unit': 'I', 'section': '1.2', 'section_title': 'Components of Learning'}
1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, c
--------------------------------------------------
{'page': 2, 'unit': 'I', 'section': '1.2', 'section_title': 'Components of Learning'}
1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, c
-------------------------------------