## 1. Load PDF

In [2]:
from langchain_community.document_loaders import PyPDFLoader



In [3]:
# Location of the source PDF, relative to the notebook's working directory.
file_path = "./dataset/data_pdf.pdf"

# Loader object used by the following cells to read the PDF.
loader = PyPDFLoader(file_path=file_path)

In [4]:
# Load the whole PDF eagerly into a list of Document objects.
docs = loader.load()

# Display the entry at index 1 to sanity-check its content and metadata.
docs[1]

Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'PyPDF', 'creationdate': '2019-09-05T17:40:31+05:30', 'moddate': '2019-09-06T18:51:55+05:30', 'source': './dataset/data_pdf.pdf', 'total_pages': 199, 'page': 1, 'page_label': 'i'}, page_content='The IT Support Handbook\nA How-To Guide to Providing Effective \nHelp and Support to IT Users\nMike\xa0Halsey')

In [5]:
import pprint

# Pretty-print the metadata dictionary attached to the entry at index 1.
second_doc_metadata = docs[1].metadata
pprint.pp(second_doc_metadata)

{'producer': 'Adobe PDF Library 10.0.1',
 'creator': 'PyPDF',
 'creationdate': '2019-09-05T17:40:31+05:30',
 'moddate': '2019-09-06T18:51:55+05:30',
 'source': './dataset/data_pdf.pdf',
 'total_pages': 199,
 'page': 1,
 'page_label': 'i'}


## 2. Load pages data

### 2.1 Extract text page by page

In [6]:
# Stream the documents from the loader one at a time, printing a short
# preview of each and collecting them all in `pages`.
pages = []
page_stream = loader.lazy_load()
for doc in page_stream:
    preview = doc.page_content[:100]  # first 100 characters only
    print(preview)
    pages.append(doc)

# Number of documents collected (displayed as the cell result).
len(pages)


The IT Support 
Handbook
A How-To Guide to Providing Effective  
Help and Support to IT Users
—
Mike
The IT Support Handbook
A How-To Guide to Providing Effective 
Help and Support to IT Users
Mike Hal
The IT Support Handbook
ISBN-13 (pbk): 978-1-4842-5132-4   ISBN-13 (electronic): 978-1-4842-5133-1
h
For my father, James Halsey, who taught me the value of hard work 
and dedication, and who first int
v
Part I:  IT Support Fundamentals �����������������������������������������������������������������
vi
Chapter 3: Understanding Your Users: How Much Do They Know? ������������������������23
How to Com
vii
Take the User with You on the Journey ����������������������������������������������������������
viii
Chapter 8: The Human Factor �������������������������������������������������������������������
ix
Part IV:  Documentation and Reporting �������������������������������������������������������103

x
Chapter 14: Harnessing System and Error Reporting in Windows �����������������������133
R

199

In [7]:
# pprint.pp() writes the formatted mapping to stdout itself and returns None,
# so it must not be wrapped in print(): doing so appended a stray "None" line.
pprint.pp(pages[12].metadata)

{'producer': 'Adobe PDF Library 10.0.1',
 'creator': 'PyPDF',
 'creationdate': '2019-09-05T17:40:31+05:30',
 'moddate': '2019-09-06T18:51:55+05:30',
 'source': './dataset/data_pdf.pdf',
 'total_pages': 199,
 'page': 12,
 'page_label': 'xv'}
None


### 2.2 Chunking (long text -> small text)

**Convert the PDF into Markdown**

In [8]:
import re
import pymupdf4llm
from pathlib import Path

# 1. Convert the PDF to Markdown as a list of per-page chunks.
# page_chunks=True gives a list: [{'text': '...', 'metadata': {...}}, ...]
# `file_path` is the PDF path defined in the "Load pdf" section, reused here so
# the location of the source document is declared in a single place.
pages = pymupdf4llm.to_markdown(file_path, page_chunks=True)

final_md_list = []

# 2. Loop through the pages to clean them and add page markers.
# The page number is taken from the chunk's position (starting at 1) instead of
# metadata['page'] + 1. NOTE(review): pymupdf4llm appears to report
# metadata['page'] as an already 1-based number, in which case the "+ 1"
# labelled the first page "PAGE 2". Positional numbering is correct either way
# because every page of the document is converted, in order.
for page_num, page in enumerate(pages, start=1):
    text = page['text']

    # Remove U+FFFD replacement characters (rendered as "black diamonds").
    text = text.replace('\ufffd', '')
    # Collapse runs of spaces into a single space.
    text = re.sub(r' +', ' ', text)

    # Prefix each page with a clear header so downstream chunks keep their context.
    final_md_list.append(f"\n\n## --- PAGE {page_num} ---\n\n{text}")

# 3. Join everything into one clean string.
full_clean_text = "".join(final_md_list)

# 4. Save to the dataset folder.
output_path = Path("./dataset/output.md")
output_path.parent.mkdir(parents=True, exist_ok=True)  # Ensure the folder exists
output_path.write_text(full_clean_text, encoding="utf-8")

print(f"Success! Saved to {output_path}")

Consider using the pymupdf_layout package for a greatly improved page layout analysis.
Success! Saved to dataset/output.md


**Summarize Figures**

**Recursive Character Splitting**

In [10]:
import pymupdf
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1. Extract the raw text page by page.
# The context manager closes the PDF as soon as extraction is finished, so the
# file handle is not left open for the rest of the kernel session.
# `file_path` is the PDF path defined in the "Load pdf" section.
with pymupdf.open(file_path) as doc:
    pages_textes = [
        {
            # Raw page text with U+FFFD replacement characters removed.
            "texte": page.get_text().replace('\ufffd', ''),
            # 1-based page number.
            "page": page_number,
        }
        for page_number, page in enumerate(doc, start=1)
    ]

# 2. Define the splitter (chunks of up to 250 characters, 30 characters of overlap).
# The trailing "" separator lets the splitter fall back to character-level cuts,
# so a run of text containing none of the other separators cannot exceed chunk_size.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=30,
    separators=["\n\n", "\n", ".", " ", ""],
)

# 3. Split every page and store each chunk together with its page number.
chunks_finaux = [
    {"page": item["page"], "contenu": chunk.strip()}
    for item in pages_textes
    for chunk in splitter.split_text(item["texte"])
]

# Show the result (example: the first 3 chunks).
for c in chunks_finaux[:3]:
    print(f"PAGE: {c['page']} | TEXTE: {c['contenu']}\n")

PAGE: 1 | TEXTE: The IT Support 
Handbook
A How-To Guide to Providing Effective  
Help and Support to IT Users
—
Mike Halsey

PAGE: 2 | TEXTE: The IT Support Handbook
A How-To Guide to Providing Effective 
Help and Support to IT Users
Mike Halsey

PAGE: 3 | TEXTE: The IT Support Handbook
ISBN-13 (pbk): 978-1-4842-5132-4	
	
	
ISBN-13 (electronic): 978-1-4842-5133-1
https://doi.org/10.1007/978-1-4842-5133-1
Copyright © 2019 by Mike Halsey

