### Ingesting & Parsing PDFs with Document Loaders

In [9]:
from langchain_community.document_loaders import (
    PyPDFLoader, PyMuPDFLoader
)

#### PyPDFLoader

In [6]:
# Demo: read the sample paper with PyPDFLoader and inspect the first Document.
print("PyPDFLoader \n")

try:
    pypdf_loader = PyPDFLoader("data/pdfs/attention.pdf")
    pdf_documents = pypdf_loader.load()
    print(f"No of PDF Documents: {len(pdf_documents)}")

    # Index 0 is the first Document returned by the loader.
    pypdf_first_document = pdf_documents[0]
    print(f"Preview of content from Document 1 : {pypdf_first_document.page_content[:100]}")
    print(f"Metadata from Document 1: {pypdf_first_document.metadata}")
except Exception as load_error:
    # Best-effort demo cell: report the failure instead of raising.
    print(f"Error occurred: {load_error}")

PyPDFLoader 

No of PDF Documents: 15
Preview of content from Document 1 : Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and
Metadata from Document 1: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'author': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/pdfs/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}


#### PyMuPDFLoader

In [8]:
# Demo: read the same paper with PyMuPDFLoader and inspect the first Document.
print("PyMuPDFLoader \n")

try:
    pymupdf_loader = PyMuPDFLoader("data/pdfs/attention.pdf")
    pdf_documents = pymupdf_loader.load()
    print(f"No of PDF Documents: {len(pdf_documents)}")

    # Index 0 is the first Document returned by the loader.
    pymupdf_first_document = pdf_documents[0]
    print(f"Preview of content from Document 1 : {pymupdf_first_document.page_content[:100]}")
    print(f"Metadata from Document 1: {pymupdf_first_document.metadata}")
except Exception as load_error:
    # Best-effort demo cell: report the failure instead of raising.
    print(f"Error occurred: {load_error}")

PyMuPDFLoader 

No of PDF Documents: 15
Preview of content from Document 1 : Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and
Metadata from Document 1: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-08-03T00:07:29+00:00', 'source': 'data/pdfs/attention.pdf', 'file_path': 'data/pdfs/attention.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2023-08-03T00:07:29+00:00', 'trapped': '', 'modDate': 'D:20230803000729Z', 'creationDate': 'D:20230803000729Z', 'page': 0}


#### Handling common PDF issues

In [None]:
from  langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document

# Advanced PDF processing: text cleaning, near-empty page filtering, and metadata enrichment (errors are handled by the calling cell)

class PDFProcessor:
    """Load a PDF, normalise each page's text and split it into chunks.

    Every chunk is a ``Document`` carrying the metadata of the page it came
    from, plus the extra fields added in :meth:`process_pdf`.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        """Store the settings and build the text splitter.

        Args:
            chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            chunk_overlap: ``chunk_overlap`` passed to ``RecursiveCharacterTextSplitter``.
            min_page_chars: Pages whose cleaned text is shorter than this many
                characters are skipped by :meth:`process_pdf`.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        # _clean_text collapses all whitespace to single spaces, so a space is
        # the only natural boundary left in the text. The trailing "" lets the
        # splitter fall back to character-level splitting for a single token
        # longer than chunk_size, which would otherwise come out as an
        # oversized chunk.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[" ", ""],
        )

    def process_pdf(self, pdf_path) -> List[Document]:
        """Process PDF with smart chunking and metadata enhancement.

        Args:
            pdf_path: Location of the PDF, handed unchanged to ``PyPDFLoader``.

        Returns:
            A flat list of chunk ``Document`` objects in page order. Pages whose
            cleaned text is shorter than ``self.min_page_chars`` contribute no
            chunks, so the list may be empty.

        Any exception raised by the loader or the splitter propagates to the
        caller; nothing is caught here.
        """
        pages = PyPDFLoader(pdf_path).load()
        total_pages = len(pages)  # loop-invariant, computed once

        processed_chunks = []
        for page_number, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages. _clean_text already removes leading and
            # trailing whitespace, so no extra strip() is needed here.
            if len(cleaned_text) < self.min_page_chars:
                continue

            # Keys listed after **page.metadata override same-named loader keys.
            page_metadata = {
                **page.metadata,
                # 1-based position of the page in the loaded list.
                "page": page_number,
                "total_pages": total_pages,
                "chunk_method": "pdf_processor",
                # Length of the whole cleaned page, not of the individual chunk.
                "char_count": len(cleaned_text),
            }

            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[page_metadata],
            )
            processed_chunks.extend(chunks)

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace (newlines included) into one space.

        Leading and trailing whitespace is removed as a side effect of
        ``str.split()`` with no arguments.
        """
        return " ".join(text.split())



In [26]:
# Build the processor with its default chunking settings spelled out.
processor = PDFProcessor(chunk_size=1000, chunk_overlap=100)

In [30]:
# Run the processor on the sample paper and inspect the first chunk's metadata.

try:
    processed_chunks = processor.process_pdf(pdf_path='data/pdfs/attention.pdf')
    print(f"Processed into {len(processed_chunks)} chunks")

    # Only peek at metadata when at least one chunk was produced.
    if len(processed_chunks) > 0:
        print("\nSample chunk metadata")
        sample_metadata = processed_chunks[0].metadata
        for field_name in sample_metadata:
            print(f" {field_name}: {sample_metadata[field_name]}")
except Exception as processing_error:
    # Best-effort demo cell: report the failure instead of raising.
    print(f"Error occurred: {processing_error}")

Processed into 49 chunks

Sample chunk metadata
 producer: pdfTeX-1.40.25
 creator: LaTeX with hyperref
 creationdate: 2023-08-03T00:07:29+00:00
 author: 
 keywords: 
 moddate: 2023-08-03T00:07:29+00:00
 ptex.fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5
 subject: 
 title: 
 trapped: /False
 source: data/pdfs/attention.pdf
 total_pages: 15
 page: 1
 page_label: 1
 chunk_method: pdf_processor
 char_count: 2857


Note:

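Enrich each chunk's metadata with fields that are relevant to your chunks (for example source, page number, and chunking method); richer metadata makes retrieval more efficient and lets you filter results more precisely.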