# PDF loader


In [3]:
from langchain_community.document_loaders import(
    PyPDFLoader,
    UnstructuredPDFLoader,
    PyMuPDFLoader
)
from typing import List, Dict, Any
from langchain_core.documents import Document

In [4]:
# PyPDFLoader: load the invoice PDF into a list of Document objects.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; a backslash-separated path would only resolve on Windows.

try:
    pypdf_loader = PyPDFLoader("data/pdf_file/KBC_2_Invoice_1.pdf")
    pypdf_docs = pypdf_loader.load()
    print(pypdf_docs)
except Exception as e:
    # Best-effort demo cell: report the failure instead of aborting the run.
    print(f"Exception {e}")





[Document(metadata={'producer': 'TradeDesign PDF 2.0', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data\\pdf_file\\KBC_2_Invoice_1.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}, page_content="ARCELORMITTAL INTERN. LUXEMBOURG SA\nBOULEVARD D'AVRANCHES 24-26\nL-1160 LUXEMBOURG\nLuxembourg\nAntwerpen, 30 June 2025\nAdvice of Financing\nYour Reference: B073384001\nOur Reference: ANTWLE0040018 & ANTWBE0040018/01\nCounterparty: FIERCTC SIBEL SRL\nReceivable amount: USD 2.918.936,80\nFinanced amount: USD 2.918.936,80\nStart Date: 30 June 2025\nDue Date: 24 September 2025\nDear Sir/Madam,\nWe have checked above documentary remittance. The documents are taken up and forfaited without \nrecourse as follows:\nFinanced amount USD 2.918.936,80 value date 30 June 2025 as per details below\nTo the credit of your account with us, no.  IBAN BE68733012009034\nDetails of settlement:\nAmount Financed 2.918.936,80 USD\nInterest USD 30.860,92 value date 30 June 2025 as per details below\nTo t

In [5]:
# PyMuPDFLoader: load the same invoice PDF and show the first document's text.
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; a backslash-separated path would only resolve on Windows.

try:
    pymupdf_loader = PyMuPDFLoader("data/pdf_file/KBC_2_Invoice_1.pdf")
    pymupdf_docs = pymupdf_loader.load()
    print(pymupdf_docs[0].page_content)
except Exception as e:
    # Best-effort demo cell: report the failure (including an empty result,
    # which surfaces here as an IndexError) instead of aborting the run.
    print(f"Exception {e}")

ARCELORMITTAL INTERN. LUXEMBOURG SA
BOULEVARD D'AVRANCHES 24-26
L-1160 LUXEMBOURG
Luxembourg
Antwerpen, 30 June 2025
Advice of Financing
Your Reference:
B073384001
Our Reference:
ANTWLE0040018 & ANTWBE0040018/01
Counterparty:
FIERCTC SIBEL SRL
Receivable amount:
USD 2.918.936,80
Financed amount:
USD 2.918.936,80
Start Date:
30 June 2025
Due Date:
24 September 2025
Dear Sir/Madam,
We have checked above documentary remittance. The documents are taken up and forfaited without 
recourse as follows:
Financed amount USD 2.918.936,80 value date 30 June 2025 as per details below
To the credit of your account with us, no.  IBAN BE68733012009034
Details of settlement:
Amount Financed
2.918.936,80 USD
Interest USD 30.860,92 value date 30 June 2025 as per details below
To the debit of your account with us, no. IBAN BE68733012009034
Details of settlement:
Interest on 87 days at 4,374893 %
30.860,92 USD
Charges USD 3.880,70 value date 30 June 2025 as per details below
To the debit of your account wi

In [6]:
# Sample text showing typical problems in extracted PDF text: hard line
# breaks, stray blank lines, and a trailing page marker ("Page 1").

raw_text = """
Cricket is one of the most popular sports in the world, loved by millions of fans.
It is a game played between two teams of eleven players each.

The sport requires a mix of strategy, skill, and endurance.
It is especially popular in countries like India, Australia, and England.
Cricket has formats like Test matches, ODIs, and T20s, each with its own excitement.


Page 1
"""

In [7]:
# Cleaning text: collapse every run of whitespace (spaces, tabs, newlines) into a single space

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Splitting on whitespace discards leading/trailing whitespace and empty
    tokens, so the re-joined result has no newlines, tabs or double spaces.
    """
    return " ".join(text.split())


In [8]:

# Show the raw sample followed by its whitespace-normalized form, one per line.
print(
    f"raw pdf text {raw_text} ",
    f"clean pdf text {clean_text(raw_text)}",
    sep="\n",
)


raw pdf text 
Cricket is one of the most popular sports in the world, loved by millions of fans.
It is a game played between two teams of eleven players each.

The sport requires a mix of strategy, skill, and endurance.
It is especially popular in countries like India, Australia, and England.
Cricket has formats like Test matches, ODIs, and T20s, each with its own excitement.


Page 1
 
clean pdf text Cricket is one of the most popular sports in the world, loved by millions of fans. It is a game played between two teams of eleven players each. The sport requires a mix of strategy, skill, and endurance. It is especially popular in countries like India, Australia, and England. Cricket has formats like Test matches, ODIs, and T20s, each with its own excitement. Page 1


In [32]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


class SmartPdfProcessor:
    """Load a PDF, normalize each page's whitespace, and split it into chunks.

    Every chunk produced by :meth:`process_pdf` carries the loader's page
    metadata plus a 1-based ``page`` number, ``total_pages``,
    ``chunk_method`` and the cleaned page's ``char-count``.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 100,
        min_page_chars: int = 50,
    ) -> None:
        """Configure the text splitter and the empty-page threshold.

        Args:
            chunk_size: Chunk size handed to the splitter (measured with ``len``).
            chunk_overlap: Chunk overlap handed to the splitter.
            min_page_chars: Pages whose cleaned text is shorter than this many
                characters are skipped (near-empty pages). Defaults to 50.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            # Pages are whitespace-normalized before splitting, so a single
            # space is the only whitespace separator left in the text.
            separators=[" "],
        )

    def process_pdf(self, pdf_path: str) -> List[Document]:
        """Load ``pdf_path``, clean every page, and return its chunks.

        Args:
            pdf_path: Path to the PDF file, passed straight to ``PyPDFLoader``.

        Returns:
            The chunks of all pages in page order. Pages whose cleaned text is
            shorter than ``min_page_chars`` contribute no chunks.
        """
        pages = PyPDFLoader(pdf_path).load()
        total_pages = len(pages)

        processed_chunks = []
        # Number pages from 1 over *all* loaded pages, so a skipped page does
        # not shift the page numbers of the pages after it.
        for page_number, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            if len(cleaned_text) < self.min_page_chars:
                continue

            page_metadata = {
                **page.metadata,
                # These keys deliberately override same-named loader metadata.
                "page": page_number,
                "total_pages": total_pages,
                "chunk_method": "smart_pdf_processor",
                "char-count": len(cleaned_text),
            }
            processed_chunks.extend(
                self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[page_metadata],
                )
            )
        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse every run of whitespace in ``text`` into a single space."""
        # split() without arguments already drops leading/trailing whitespace,
        # so no additional strip() is needed.
        return " ".join(text.split())



In [33]:
# Build a processor with the default settings (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPdfProcessor()

In [37]:
# Chunk the invoice PDF. The relative path uses forward slashes so it resolves
# on Windows, Linux and macOS alike; a backslash-separated path would only
# resolve on Windows.
smart_chunks = preprocessor.process_pdf("data/pdf_file/KBC_2_Invoice_1.pdf")
print(smart_chunks)
print(f"length of smart chunks: {len(smart_chunks)}")

# Inspect the metadata attached to the first chunk, if any chunk was produced.
if smart_chunks:
    for key, value in smart_chunks[0].metadata.items():
        print(key, value)


[Document(metadata={'producer': 'TradeDesign PDF 2.0', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data\\pdf_file\\KBC_2_Invoice_1.pdf', 'total_pages': 2, 'page': 1, 'page_label': '1', 'chunk_method': 'smart_pdf_processor', 'char-count': 1450}, page_content="ARCELORMITTAL INTERN. LUXEMBOURG SA BOULEVARD D'AVRANCHES 24-26 L-1160 LUXEMBOURG Luxembourg Antwerpen, 30 June 2025 Advice of Financing Your Reference: B073384001 Our Reference: ANTWLE0040018 & ANTWBE0040018/01 Counterparty: FIERCTC SIBEL SRL Receivable amount: USD 2.918.936,80 Financed amount: USD 2.918.936,80 Start Date: 30 June 2025 Due Date: 24 September 2025 Dear Sir/Madam, We have checked above documentary remittance. The documents are taken up and forfaited without recourse as follows: Financed amount USD 2.918.936,80 value date 30 June 2025 as per details below To the credit of your account with us, no. IBAN BE68733012009034 Details of settlement: Amount Financed 2.918.936,80 USD Interest USD 30.860,92 value date 30