In [1]:
from abc import ABC, abstractmethod
import faiss
import fitz
import json
import numpy as np
import re
import subprocess
from sentence_transformers import SentenceTransformer
from typing import TypedDict, Optional

In [2]:
class Page(TypedDict):
    page_number: int
    text: str


class ChunkMetadata(TypedDict):
    page: int
    unit: Optional[str]
    section: Optional[str]
    section_title: Optional[str]


class Chunk(TypedDict):
    text: str
    metadata: ChunkMetadata

In [12]:
def load_pdf_with_pages(pdf_path: str) -> list[Page]:
    docs = fitz.open(pdf_path)
    pages: list[Page] = []

    for page_num in range(len(docs)):
        pages.append(
            {
                'page_number': page_num + 1,
                'text': docs[page_num].get_text('text').strip()
            }
        )
    return pages

In [13]:
class StructureDetector(ABC):
    @abstractmethod
    def detect(self, line: str):
        pass

In [14]:
class NumberedSectionDetector(StructureDetector):
    pattern = re.compile(r"^(\d+(\.\d+)*)\s+(.*)")

    def detect(self, line: str):
        match = self.pattern.match(line.strip())
        if match: return {
            'section': match.group(1),
            'section_title': match.group(3).strip()
        }
        return None

In [15]:
class UnitDetector(StructureDetector):
    pattern = re.compile(r"^(UNIT|CHAPTER)\s+([IVXLC]+)", re.IGNORECASE)

    def detect(self, line: str):
        match = self.pattern.match(line.strip())
        if match: return {
            'unit': match.group(2).upper()
        }
        return None

In [16]:
class StructurePipeline:
    def __init__(self, detectors: list[StructureDetector]):
        self.detectors = detectors
        self.state = {
            'unit': None,
            'section': None,
            'section_title': None,
        }

    def process_line(self, line: str):
        updated = False

        for detector in self.detectors:
            result = detector.detect(line)
            if result:
                self.state.update(result)
                updated = True
        return updated, self.state.copy()

In [17]:
def structured_chunker(pages: list[Page], detectors: list[StructureDetector], max_words: int = 350, overlap: int = 50) -> list[Chunk]:
    pipeline = StructurePipeline(detectors)
    chunks: list[Chunk] = []

    buffer: list[str] = []
    current_metadata: ChunkMetadata = {}
    last_structure = None

    def flush(page_number: int, forced_reset: bool = False):
        nonlocal buffer
        if not buffer: return

        chunks.append(
            {
                'text': ' '.join(buffer).strip(),
                'metadata': {
                    'page': page_number,
                    **current_metadata
                }
            }
        )
        if forced_reset or overlap == 0: buffer = []
        else: buffer = buffer[-overlap:]

    for page in pages:
        page_number = page['page_number']

        for line in page['text'].splitlines():
            line = line.strip()
            if not line: continue
            structure_changed, state = pipeline.process_line(line)
            new_structure = (
                state.get('unit'),
                state.get('section'),
                state.get('section_title')
            )

            if structure_changed and new_structure != last_structure:
                flush(page_number, forced_reset=True)
                current_metadata = {
                    'unit': state['unit'],
                    'section': state['section'],
                    'section_title': state['section_title']
                }
                last_structure = new_structure
            buffer.extend(line.split())

            if len(buffer) >= max_words: flush(page_number)
        flush(page_number)
    return chunks

In [18]:
pdf_pages = load_pdf_with_pages(r'C:\Users\ASUS\Desktop\Git\TextBook-Assistant\rag_pipeline\data\ML.pdf')

In [19]:
len(pdf_pages)

115

In [20]:
pdf_pages[0]

{'page_number': 1,
 'text': '1 \n \nUNIT I  \nIntroduction to Machine Learning \n1. Introduction \n \n1.1 What Is Machine Learning?  \nMachine learning is programming computers to optimize a performance criterion using example \ndata or past experience. We have a model defined up to some parameters, and learning is the \nexecution of a computer program to optimize the parameters of the model using the training data or \npast experience. The model may be predictive to make predictions in the future, or descriptive to gain \nknowledge from data, or both. \nArthur Samuel, an early American leader in the field of computer gaming and artificial intelligence, \ncoined the term “Machine Learning” in 1959 while at IBM. He defined machine learning as “the field of \nstudy that gives computers the ability to learn without being explicitly programmed.” However, there is \nno universally accepted definition for machine learning. Different authors define the term differently. \n \nDefinition of lea

In [12]:
chunks = structured_chunker(pdf_pages, detectors=[
    UnitDetector(),
    NumberedSectionDetector()
])

In [13]:
for c in chunks[:5]:
    print(c['metadata'])
    print(c['text'][:120])
    print('*' * 50)

{'page': 1}
1
**************************************************
{'page': 1, 'unit': 'I', 'section': None, 'section_title': None}
UNIT I Introduction to Machine Learning 1. Introduction
**************************************************
{'page': 1, 'unit': 'I', 'section': '1.1', 'section_title': 'What Is Machine Learning?'}
1.1 What Is Machine Learning? Machine learning is programming computers to optimize a performance criterion using exampl
**************************************************
{'page': 1, 'unit': 'I', 'section': '1.2', 'section_title': 'Components of Learning'}
1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, c
**************************************************
{'page': 2, 'unit': 'I', 'section': '1.2', 'section_title': 'Components of Learning'}
1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, c
*************************************

In [14]:
MODEL_PATH = r'C:\Users\ASUS\.hf_models\all-MiniLM-L6-v2'

In [15]:
embedder = SentenceTransformer(
    MODEL_PATH,
    device='cpu'
)
embedder.get_sentence_embedding_dimension()

384

In [16]:
len(chunks)

224

In [17]:
chunks[1]['metadata']

{'page': 1, 'unit': 'I', 'section': None, 'section_title': None}

In [18]:
chunks[1]['text'][:100]

'UNIT I Introduction to Machine Learning 1. Introduction'

In [19]:
texts = [c['text'] for c in chunks]

In [20]:
embeddings = embedder.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    normalize_embeddings=True
)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [21]:
embeddings = np.asarray(embeddings, dtype='float32')
embeddings.shape

(224, 384)

In [22]:
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)

In [23]:
index.add(embeddings)

In [24]:
index.ntotal

224

In [25]:
def retrieve_chunks(query: str, k: int = 5) -> list[Chunk]:
    q_emd = embedder.encode(
        [query],
        normalize_embeddings=True
    ).astype('float32')

    scores, indices = index.search(q_emd, k)

    results = []
    for i in indices[0]: results.append(chunks[i])

    return results

In [26]:
results = retrieve_chunks(
    'What are the components of learning?'
)
results

[{'text': '1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, can be divided into four components, namely, data storage, abstraction, generalization and evaluation. Figure 1.1 illustrates the variouscomponents and the steps involved in the learning process.',
  'metadata': {'page': 1,
   'unit': 'I',
   'section': '1.2',
   'section_title': 'Components of Learning'}},
 {'text': '1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, can be divided into four components, namely, data storage, abstraction, generalization and evaluation. Figure 1.1 illustrates the variouscomponents and the steps involved in the learning process. 2 1. Data storage Facilities for storing and retrieving huge amounts of data are an important component of the learning process. Humans and computers alike utilize data storage as a foundation for advanced reasoning. • In a human being

In [27]:
for r in results:
    print(r['metadata'])
    print(r['text'][:120])
    print('*' * 40)

{'page': 1, 'unit': 'I', 'section': '1.2', 'section_title': 'Components of Learning'}
1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, c
****************************************
{'page': 2, 'unit': 'I', 'section': '1.2', 'section_title': 'Components of Learning'}
1.2 Components of Learning Basic components of learning process The learning process, whether by a human or a machine, c
****************************************
{'page': 7, 'unit': 'I', 'section': '1.4', 'section_title': 'Designing a Learning System'}
1.4 Designing a Learning System For any learning system, we must be knowing the three elements — T (Task), P (Performanc
****************************************
{'page': 15, 'unit': 'I', 'section': '1.6', 'section_title': 'PERSPECTIVES AND ISSUES IN MACHINE LEARNING'}
learner's hypothesis space?  When and how can prior knowledge held by the learner guide the process of generalizing fro
**********************

In [31]:
def aggregate_pages(output: list[Chunk]):
    pages = sorted({r['metadata']['page'] for r in output})
    return pages[0], pages[-1]

In [32]:
start, end = aggregate_pages(results)

In [34]:
def extract_section(results: list[Chunk]):
    sections = {
        r['metadata'].get('section_title')
        for r in results
        if r['metadata'].get('section_title')
    }
    return sections.pop() if len(sections) == 1 else None

In [35]:
def build_response(results: list[Chunk]) -> str:
    start, end = aggregate_pages(results)
    section = extract_section(results)

    if section:
        return (
            f"The topic '{section}' is discussed on pages {start}–{end} of the textbook."
        )
    else:
        return (
            f"Relevant content for this question can be found on pages {start}–{end} of the textbook."
        )


In [42]:
def polish_sentence(raw_text: str) -> str:
    prompt = f'Rephrase the following sentence in a clear academic tone:\n{raw_text}'

    result = subprocess.run(
        [
            'ollama',
            'run',
            'llama3.2:3b'
        ],
        input=prompt,
        text=True,
        capture_output=True
    )

    return result.stdout.strip()

In [43]:
build_response(results)

'Relevant content for this question can be found on pages 1–15 of the textbook.'

In [44]:
raw = build_response(results)

In [45]:
polished = polish_sentence(raw)
polished

'Here is a rephrased version of the sentence in a clear academic tone:\n\n"The relevant information for this inquiry can be located in chapters 1-15 of the specified text."'