In [34]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader
)
from pathlib import Path
from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    CharacterTextSplitter,
    TokenTextSplitter
)
from typing import List

In [23]:
print("PyPDFLoader")

try:
    path_var=Path(__name__).cwd().parent/"data/raw/amtg_handbook_big.pdf"
    pypdfloader=PyPDFLoader(path_var)
    pypdf_docs=pypdfloader.load()
    print(pypdf_docs)
except Exception as e:
    print(f"Error:{e}")

PyPDFLoader


In [24]:
print(pypdf_docs)



In [27]:
print("PyMuPDFLoader")

try:
    path_var=Path(__name__).cwd().parent/"data/raw/amtg_handbook_big.pdf"
    pymupdfloader=PyMuPDFLoader(path_var)
    pymupdf_docs=pymupdfloader.load()
    print(pymupdf_docs)
except Exception as e:
    print(f"Error:{e}")

PyMuPDFLoader


In [32]:
print(pypdf_docs[49].page_content[:1000])

2-1
Regulations, Maintenance Forms, 
Records, & Publications
Chapter 2
Overview — Title 14 of the Code of Federal 
Regulations (14 CFR)
Aviation-related regulations that have occurred from 1926–
1966 are reflected in Figure 2-1. Just as aircraft continue to 
evolve with ever improving technology, so do the regulations, 
publications, forms, and records required to design, build, 
and maintain them.
The Federal Aviation Administration (FAA) regulations that 
govern today’s aircraft are found in Title 14 of the Code 
of Federal Regulations (14 CFR). [Figure 2-2] There are 
five volumes under Title 14, Aeronautics and Space. The 
first three volumes containing 75 active regulations address 
the Federal Aviation Administration. The fourth volume 
deals with the Office of the Secretary of the Department 
of Transportation (Aviation Proceedings) and Commercial 
Space Transportation, while the fifth volume addresses the 
National Aeronautics and Space Administration (NASA) and 
Air Transporta

In [78]:
import re
class DataPreProcessor:
    """ Preprocessing pdf content along with error handling"""
    def __init__(self,chunk_size=1000,chunk_overlap=10):
        self.chunk_size=chunk_size
        self.chunk_overlap=chunk_overlap
        self.text_splitter=RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=[" "]
        )
    def clean_text(self,text):
        lines = text.split("\n")
        cleaned = []
        for line in lines:
            line = line.strip()
            # skip empty lines, headers, or footers
            if not line or line.isdigit():  # remove page numbers
                continue
            if "aviation" in line.lower() and "manual" in line.lower():
                continue  # remove repeated headers
            cleaned.append(line)
        return " ".join(cleaned)

    
    def preproces(self,pdf_path:str)->List[Document]:
        """Preprocessing the text"""
        #Step 1: Loading the PDF
        #path_var=Path(__name__).cwd().parent/"data/raw/Aviation Maintenance Manual Handbook.pdf"
        loader=PyMuPDFLoader(pdf_path)
        pages=loader.load()
        
        #Step 2: Processing each page
        processed_chunks=[]
        
        for page_num,page in enumerate(pages):
            # basic cleaning of text in each page
            clean_text=self.clean_text(page.page_content)
            
            #skipping empty pages
            if(len(clean_text.strip())<50):
                continue
            
            #Normalizing the text
            norm_text = re.sub(r'\s+', ' ', clean_text).strip()
            
            #Merge Hyphenated lines
            text=norm_text.replace("-\n","")
            text=text.replace("\xad","")
            text = re.sub(r'(\w+)-\s+(\w+)', r'\1\2', text)
            
            #Merge wrapped lines
            text=text.replace("\n"," ")
            
            #Creating chunks with enhance metadata
            chunks=self.text_splitter.create_documents(
                texts=[text],
                metadatas=[{
                    **page.metadata,
                    "page":page_num + 1,
                    "total_pages": len(pages),
                    "chunk_method": "pdf_data_processor",
                    "chunk_count": len(text)
                }]
            )
        
            processed_chunks.extend(chunks)
        return processed_chunks
            
            
                
            
        

In [79]:
preprocess=DataPreProcessor()

In [80]:
path_var=Path(__name__).cwd().parent/"data/raw/Aviation Maintenance Manual Handbook.pdf"
#path_var=Path(__name__).cwd().parent/"data/raw/amtg_handbook_big.pdf"
try:
    chunks=preprocess.preproces(path_var)
    print(f"Processed into {len(chunks)} chunks")
    
    if(chunks):
        print("\n Chunks Metadata:")
        for key,value in chunks[0].metadata.items():
            print(f"{key}: {value}")
except Exception as e:
    print(f"Processing Error:{e}")

Processed into 174 chunks

 Chunks Metadata:
producer: Adobe PDF Library 17.0
creator: Adobe InDesign 18.3 (Windows)
creationdate: 2023-07-28T11:58:10-07:00
source: c:\Users\asus\OneDrive\Documents\Projects\llm_aircraft_MX_chatbot\data\raw\Aviation Maintenance Manual Handbook.pdf
file_path: c:\Users\asus\OneDrive\Documents\Projects\llm_aircraft_MX_chatbot\data\raw\Aviation Maintenance Manual Handbook.pdf
total_pages: 35
format: PDF 1.7
title: Aviation Maintenance Technician Handbook—General
author: FAA
subject: 
keywords: 
moddate: 2023-07-28T12:00:43-07:00
trapped: 
modDate: D:20230728120043-07'00'
creationDate: D:20230728115810-07'00'
page: 2
chunk_method: pdf_data_processor
chunk_count: 146


In [83]:
chunks[:3]

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.3 (Windows)', 'creationdate': '2023-07-28T11:58:10-07:00', 'source': 'c:\\Users\\asus\\OneDrive\\Documents\\Projects\\llm_aircraft_MX_chatbot\\data\\raw\\Aviation Maintenance Manual Handbook.pdf', 'file_path': 'c:\\Users\\asus\\OneDrive\\Documents\\Projects\\llm_aircraft_MX_chatbot\\data\\raw\\Aviation Maintenance Manual Handbook.pdf', 'total_pages': 35, 'format': 'PDF 1.7', 'title': 'Aviation Maintenance Technician Handbook—General', 'author': 'FAA', 'subject': '', 'keywords': '', 'moddate': '2023-07-28T12:00:43-07:00', 'trapped': '', 'modDate': "D:20230728120043-07'00'", 'creationDate': "D:20230728115810-07'00'", 'page': 2, 'chunk_method': 'pdf_data_processor', 'chunk_count': 146}, page_content='Aviation Maintenance Technician Handbook–General U.S. Department of Transportation FEDERAL AVIATION ADMINISTRATION Flight Standards Service SAMPLE'),
 Document(metadata={'producer': 'Adobe PDF Library 17.0