# Loading PDF File

In [None]:
# The three loaders below can be used to load PDF and Word documents.
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
)

### Loading a book named as 'A BRIEF HISTORY OF PAKISTAN'

In [3]:
## pypdfloader is the most basic one. It works well for simple PDFs but struggles with more complex layouts.

try:
    py_pdf_loader = PyPDFLoader("/home/hammadali08/Personal/FYP Datasets/A Brief History of Pakistan By JAMES WYNBRANDT.pdf")
    py_pdf_documents = py_pdf_loader.load()
    print(f"PyPDFLoader found {len(py_pdf_documents)} document(s).")
    
except Exception as e:
    print(f"PyPDFLoader error: {e}")

PyPDFLoader found 337 document(s).


In [4]:
# Loading certain page of the document
print(py_pdf_documents[18])

page_content='A BRIEF HISTORY OF PAKISTAN
6
The Safed Koh Range, which runs east–west, has peaks averag-
ing about 12,000 feet (3,657 m). The Khyber Pass, the most famous 
of the high-elevation gateways to the subcontinent, cuts through its 
mountains. About 33 miles (53 km) in length, the pass extends from 
Jamrud, some 10 miles (16 km) from Peshawar, Pakistan, to Dakka 
in Afghanistan. South of the range is the Kurram River. The Kurram 
Pass, which goes through Parachinar, Thal, and Kohat, has long been 
another favored route to Afghanistan. To the south, the Waziristan 
Hills lie between the Kurram and Gomal Rivers. The Gomal Pass, 
named for the Gomal River, which feeds into the Indus, has been an 
important trade route between Afghanistan and Pakistan for nomadic 
tribes known as the Powindahs. (Today their entry into Pakistan is 
restricted.)
South of the Gomal River the Sulaiman Mountains extend for 300 
miles (483 km). The main peak, Takht-i-Sulaiman, is 11,100 feet (3,383 
m).

In [5]:
# Seeing the metadata of the document
print(py_pdf_documents[1].metadata)

{'producer': 'Acrobat Distiller 7.0.5 for Macintosh', 'creator': 'Adobe InDesign CS2 (4.0.5)', 'creationdate': '2008-10-28T11:49:49-04:00', 'title': 'Brief History of Pakistan', 'author': 'Wynbrandt, James.', 'moddate': '2009-07-28T15:09:17-05:00', 'ebx_publisher': 'Facts on File, Inc.', 'source': '/home/hammadali08/Personal/FYP Datasets/A Brief History of Pakistan By JAMES WYNBRANDT.pdf', 'total_pages': 337, 'page': 1, 'page_label': 'i'}


### PyMuPDF_Loader

In [6]:
# pymupdfloader is more advanced and can handle complex layouts better. It also provides more metadata about the document.
try:
    pymupdf_loader = PyMuPDFLoader("/home/hammadali08/Personal/FYP Datasets/A Brief History of Pakistan By JAMES WYNBRANDT.pdf")
    pymupdf_documents = pymupdf_loader.load()
    print(f"PyMuPDFLoader found {len(pymupdf_documents)} document(s).")
except Exception as e:
    print(f"PyMuPDFLoader error: {e}")

PyMuPDFLoader found 337 document(s).


In [7]:
print(pymupdf_documents[9])
print('Metadata of the page:')
print(pymupdf_documents[9].metadata)

page_content='ix
Foreword
S
ince September 11, 2001, Pakistan has emerged as a pivotal front in 
the U.S. war on terrorism. Its very political destiny is distorted by 
the unfolding global struggle against al-Qaeda and other militants, such 
as the Taliban, who have found a home in Pakistan. With the excep-
tion of Iraq, the global confrontation against jihadists and their Taliban 
allies is playing itself out on the streets of Pakistan’s crowded urban 
centers and tribal areas more than in any other country.
From a U.S. perspective, Pakistan’s active participation in the ﬁ ght 
against terrorism dwarfs everything else in importance, including 
human rights, socioeconomic equity, and democracy; like its other 
Arab and Muslim neighbors, Pakistan has become important for the 
wrong reasons.
A ﬂ ood of publications and media commentary on Pakistan focuses 
almost exclusively on Pakistan’s commitment to the war on terrorism 
and the security of its nuclear arsenal. Little is being written

### Cleaning that RAW DATA

In [8]:
raw_text=''' Pakistan: A Brief History


by James Wynbrandt

Pakistan is really an impressive country. It is the sixth most populous nation on earth, with a population of more than 220 million people. It has the world’s second-largest Muslim population (after Indonesia) and is home to a stunning


                    population is 26 million.
                    

It is growing at a high pace.

page 2 of 216

'''

# Cleaning that RAW DATA
def clean_text(text):
    # Remove leading/trailing whitespace
    text = text.strip()
    
    # Replace multiple spaces with a single space
    text = ' '.join(text.split())
    
    # Remove unwanted characters (e.g., non-printable characters)
    text = ''.join(char for char in text if char.isprintable())
    
    return text

In [9]:
raw_text

' Pakistan: A Brief History\n\n\nby James Wynbrandt\n\nPakistan is really an impressive country. It is the sixth most populous nation on earth, with a population of more than 220 million people. It has the world’s second-largest Muslim population (after Indonesia) and is home to a stunning\n\n\n                    population is 26 million.\n\n\nIt is growing at a high pace.\n\npage 2 of 216\n\n'

In [10]:
print(clean_text(raw_text))

Pakistan: A Brief History by James Wynbrandt Pakistan is really an impressive country. It is the sixth most populous nation on earth, with a population of more than 220 million people. It has the world’s second-largest Muslim population (after Indonesia) and is home to a stunning population is 26 million. It is growing at a high pace. page 2 of 216


In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

In [None]:
from typing import List, Dict, Optional
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

class SmartPDFProcessor:
    """Advanced PDF processing with error handling + custom metadata injection"""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 10):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" "],
        )

    def process_pdf(
        self, 
        pdf_path: str, 
        custom_metadata: Optional[Dict[str, str]] = None
    ) -> List[Document]:
        """Process PDF with smart chunking and metadata enhancement"""

        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()

        processed_chunks = []

        for page_num, page in enumerate(pages):
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages
            if len(cleaned_text.strip()) < 50:
                continue

            # Step 1: Page details FIRST
            metadata = {
                "page": page_num + 1,
                "total_pages": len(pages),
                "chunk_method": "smart_pdf_processor",
                "char_count": len(cleaned_text),
            }

            # Step 2: Add PDF metadata (if any)
            if page.metadata:
                metadata.update(page.metadata)

            # Step 3: Add custom metadata (overrides everything if same key)
            if custom_metadata:
                metadata.update(custom_metadata)

            # Create chunks
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[metadata]
            )

            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
    # Remove excessive whitespace
        text = " ".join(text.split())
    
    # Fix common PDF extraction issues
        text = text.replace("ﬁ", "fi")
        text = text.replace("ﬂ", "fl")
    
        return text


In [14]:
Preprocessor=SmartPDFProcessor()

In [15]:
# Loading and processing the PDF
try:
    processed_chunks=Preprocessor.process_pdf("/home/hammadali08/Personal/FYP Datasets/Pakistan-A-Hard-Country-by-Anatol-Lieven-.pdf"
                                             ,custom_metadata={"source":"FYP Dataset","author":"Anatol Lieven"})
    print(f"Processed {len(processed_chunks)} chunks from the PDF.")
    
    # Show enhanced metadata
    if processed_chunks:
        print("\nSample chunk metadata:")
        for key, value in processed_chunks[0].metadata.items():
            print(f"  {key}: {value}")
except Exception as e:
    print(f"Error processing PDF: {e}")

Processed 3678 chunks from the PDF.

Sample chunk metadata:
  page: 1
  total_pages: 465
  chunk_method: smart_pdf_processor
  char_count: 53
  producer: Nuance PDF Create 8
  creator: Microsoft Word - Final.docx
  creationdate: 2019-11-20T01:55:25-08:00
  moddate: 2019-11-21T00:00:05-08:00
  author: Anatol Lieven
  title: Microsoft Word - Final.docx
  source: FYP Dataset
  page_label: 2


In [16]:
print(processed_chunks[134])

page_content='n ly i n t er m s o f p r o p er t y an dbus i n es s ,but als o fr o m r es p ec t abi li t y ( p r i m ar i ly exp r es s ed by anashraf [ ‘ n o ble’ ] li fes t y le) . O n e elem en to f bei n g c o n s i d er ed a m an w o r t hy o f r es p ec td er i v es fr o m hav i n g a r ep ut at i o n as bei n g s o m eo n e w hoho n o r s hi s o bli g at i o n s t ok i n .Co us i n m ar r i ag e i s o n e o f t he m o s ti m p o r t an texp r es s i o n s o f t hi s o bli g at i o n .The m ajo r i t' metadata={'page': 22, 'total_pages': 465, 'chunk_method': 'smart_pdf_processor', 'char_count': 3863, 'producer': 'Nuance PDF Create 8', 'creator': 'Microsoft Word - Final.docx', 'creationdate': '2019-11-20T01:55:25-08:00', 'moddate': '2019-11-21T00:00:05-08:00', 'author': 'Anatol Lieven', 'title': 'Microsoft Word - Final.docx', 'source': 'FYP Dataset', 'page_label': '23'}
