In [1]:
import fitz
import os
from tqdm import tqdm
import camelot

In [2]:
HEADER_LIMIT = 50
FOOTER_LIMIT = 570
pdf_path = "Boeing B737 Manual.pdf"
clipped_pdf_path = "Boeing_B737_Clipped.pdf"
poppler_bin_path = r"C:\poppler-25.12.0\Library\bin"
llm_responses_path = "llm_responses.json"
collection_name = "my_rag_collection"
qdrant_path = "./qdrant_db_data"

In [3]:
def clean_and_prep_pdf(input_path, output_path, header_limit, footer_limit):
    # 1. Open the file
    doc = fitz.open(input_path)
    
    # 2. Iterate through pages to apply Redactions (Safest method)
    for page in doc:
        w = page.rect.width
        h = page.rect.height
        
        # Define areas to REMOVE (White box over header/footer)
        # Top strip (Header)
        header_rect = fitz.Rect(0, 0, w, header_limit)
        # Bottom strip (Footer)
        footer_rect = fitz.Rect(0, footer_limit, w, h)
        
        # Add redaction annotations
        page.add_redact_annot(header_rect, fill=(1, 1, 1)) 
        page.add_redact_annot(footer_rect, fill=(1, 1, 1))
        
        # 3. Apply the redactions
        # images=0 prevents it from trying to re-compress images (saves time/errors)
        # graphics=0 prevents it from messing with complex vector graphics (avoids P1 error)
        page.apply_redactions(images=0, graphics=0, text=1)

    # 4. Save with "clean=True" and "garbage=4"
    # garbage=4: Remove unused objects (cleans the corruption)
    # clean=True: Re-writes the syntax to fix "wrong pointing objects"
    doc.save(output_path, garbage=4, clean=True)
    doc.close()
    return output_path

In [4]:
clean_and_prep_pdf(pdf_path, clipped_pdf_path, HEADER_LIMIT, FOOTER_LIMIT)

'Boeing_B737_Clipped.pdf'

In [5]:
import pickle
if "tables_images_locations_saved.pkl" not in os.listdir():   
    tables_and_images = camelot.read_pdf(clipped_pdf_path, flavor="lattice", pages="all")
else:
    with open("tables_images_locations_saved.pkl", "rb") as f:
        tables_and_images = pickle.load(f)

In [6]:
for i, table in enumerate(tables_and_images):
    print(f"Table {i} found on Page: {table.page}")

Table 0 found on Page: 6
Table 1 found on Page: 7
Table 2 found on Page: 33
Table 3 found on Page: 39
Table 4 found on Page: 40
Table 5 found on Page: 41
Table 6 found on Page: 42
Table 7 found on Page: 43
Table 8 found on Page: 45
Table 9 found on Page: 46
Table 10 found on Page: 47
Table 11 found on Page: 51
Table 12 found on Page: 52
Table 13 found on Page: 53
Table 14 found on Page: 54
Table 15 found on Page: 55
Table 16 found on Page: 56
Table 17 found on Page: 74
Table 18 found on Page: 81
Table 19 found on Page: 81
Table 20 found on Page: 82
Table 21 found on Page: 82
Table 22 found on Page: 83
Table 23 found on Page: 83
Table 24 found on Page: 84
Table 25 found on Page: 84
Table 26 found on Page: 85
Table 27 found on Page: 85
Table 28 found on Page: 86
Table 29 found on Page: 86
Table 30 found on Page: 87
Table 31 found on Page: 87
Table 32 found on Page: 87
Table 33 found on Page: 88
Table 34 found on Page: 89
Table 35 found on Page: 89
Table 36 found on Page: 89
Table 37 foun

In [7]:
image_page_list = [table.page for table in tables_and_images]

# 1.Data Ingestion

In [8]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
from pdf2image import convert_from_path

load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
system_instruction = """
You are an expert Aviation Technical Documentation Specialist.
Your input will be images from a Boeing B737 manual (tables, text, or diagrams).
Your output will be used to populate a Vector Database for a RAG system.
Your goal is to extract information in a way that answers "Where is X?", "How does X work?", or "What is the procedure for X?".

Strictly follow these rules:

### 1. GENERAL TEXT INTEGRITY (Top Priority)
- **Introductory Text:** Extract any introductory text blocks
- **verbatim** (word-for-word). Do not paraphrase or summarize these sections.
- **Labels:** Use the exact terminology found in the image labels.

### 2. FOR TABLES
- **Compact Format:** Use standard `|---|` syntax. No pretty-printing.
- **Consistent Column Count:** Markdown does not support merged cells (colspan). 
  - **Rule:** You MUST ensure that **every row** in the table (including headers) has the **same number of columns**.
  - **Handling Merged Headers:** If a header spans multiple columns (e.g., "LIMIT WEIGHT"), repeat it in empty cells OR flatten the header structure so it aligns with the data columns below.
  - **Example:** If the data has 6 columns, the header row must also have 6 columns (using empty cells `| |` if necessary to maintain alignment).
- **Line Breaks:** Use `<br>` for line breaks inside cells.
- **Content:** Do not summarize.

### 3. FOR FLOWCHARTS & PROCESS DIAGRAMS
-Identify specific components using standard aviation acronyms (e.g., CDU, MCP, EFIS, Throttle Quadrant) whenever visually recognizable.
- **Follow the Numbers:** If the diagram contains numbered steps (e.g., [1], [2], [3]), **strictly follow that numerical sequence** in your description, regardless of standard operating procedures. Describe step [1] first, then [2], etc.
- **Describe Sequence:** Use "First," "Then," "Next."
- **Example:** "Step [1]: The scan begins at the side panels... Step [2]: The flow moves to the overhead panel..."

### 4. FOR SCHEMATICS & LOCATION DIAGRAMS
- **Do not** use passive visual descriptions like "There is a line pointing to the wing."
- **Instead, interpret the visual relation** into definitive technical statements.
- Map labels to their physical location on the aircraft (Forward, Aft, Fuselage, Wing, Ceiling, Floor).
- **Combine labels with their descriptive text.** If a label has sub-text, merge it into a coherent sentence.
- **Example:** "Integral Slide Lighting is installed at the forward and aft service doors to illuminate the deployment area."

### 5. OUTPUT FORMAT
- Structure your response into two distinct sections:
  **[Context Text]**: The verbatim introductory text found on the page.
  **[Visual Analysis]**: The structured description of the diagram, table, or schematic.
"""

In [10]:
image_model  = genai.GenerativeModel(
    model_name="gemini-2.5-flash",
    system_instruction=system_instruction
)

In [11]:
from time import time
from google.api_core.exceptions import ResourceExhausted
def extract_data_from_images(image_page_list,input_path, image_model, llm_responses_path):
    import json
    extracted_data = []
    failed_pages = []

    if os.path.exists(llm_responses_path):
        try:
            with open(llm_responses_path, "r", ) as f:
                extracted_data = json.load(f)
        except json.JSONDecodeError:
            print(f"warning: '{llm_responses_path}' file is empty or broken new file is starting.")
            extracted_data = []


    processed_pages = {item['page'] for item in extracted_data}

    for page_num in tqdm(image_page_list,desc="Processing Pages"):

        if page_num in processed_pages:
            continue
        
        
        try:
            images = convert_from_path(input_path, first_page=page_num, last_page=page_num, poppler_path=poppler_bin_path)
            image = images[0]

            response = image_model.generate_content(
                ["Analyze this page based on the system instructions.", image])
            
            if response.candidates[0].finish_reason == 4:
                print(f"Warning: Page {page_num} was blocked due to copyright (Recitation).")
                failed_pages.append(page_num)
                continue


            page_content = response.text

            extracted_data.append({
                "page": page_num,
                "content": page_content
            })
            processed_pages.add(page_num)

            # Save after each extraction to avoid data loss
            with open(llm_responses_path, "w") as f:
                json.dump(extracted_data, f, indent=4)


        except ResourceExhausted:
            print(f"Quota exceeded! Waiting 60 seconds... (Page {page_num})")
            time.sleep(60)
            failed_pages.append(page_num)
        except Exception as e:
            print(f"Error processing page {page_num}: {e}")
    print(f"Extraction complete. Successfully processed {len(processed_pages)} pages.")
    print(f"Failed pages: {failed_pages}")
    print(f"failed pages number: {len(failed_pages)}")
    return extracted_data

In [12]:
data_from_images = extract_data_from_images(image_page_list, clipped_pdf_path, image_model, llm_responses_path)

Processing Pages: 100%|██████████| 112/112 [00:10<00:00, 10.63it/s]

Extraction complete. Successfully processed 71 pages.
Failed pages: [103]
failed pages number: 1





# 2. Chunking (Splitting)

In [13]:
import tiktoken
from langchain_core.documents import Document

def create_chunks_per_page(data_from_images, source_name):
    documents = []
    token_count_list = []
    encoder = tiktoken.get_encoding("gpt2")
    manual_split_points = {
        82: "The following table presents Field Limit Weight (1000 KG) and Climb Limit Weight (1000 KG) for 1000 FT Pressure Altitude.",
        83: "3000 FT Pressure Altitude", # Bu başlık sayfanın ortasında geçiyor
        85: "### Table: 1000 FT Pressure Altitude",
        86: "**Table 2: 3000 FT Pressure Altitude**"
    }

    for item in data_from_images:
        page_num = item['page']
        content = item['content']
        
        if not content or len(content.strip()) < 10:
            print(f"Skipping page {page_num} due to insufficient content.")
            continue

        token_count = len(encoder.encode(content))
        token_count_list.append(token_count)

        if page_num in manual_split_points and token_count > 2000:
            separator = manual_split_points[page_num]
            
            parts = content.split(separator, 1) 
            
            if len(parts) == 2:
                print(f"Page {page_num} manually split into 2 parts (Tokens: {token_count}).")
                
                meta1 = {
                    "source": source_name, "page": page_num, 
                    "chunk_type": "image_extracted_part", "part": 1, "total_parts": 2
                }
                documents.append(Document(page_content=parts[0].strip(), metadata=meta1))
                
                part2_content = separator + "\n" + parts[1].strip()
                meta2 = {
                    "source": source_name, "page": page_num, 
                    "chunk_type": "image_extracted_part", "part": 2, "total_parts": 2
                }
                documents.append(Document(page_content=part2_content, metadata=meta2))
                continue
            else:
                print(f"WARNING: Could not find separator for Page {page_num}. Processing as single chunk.")

        if token_count > 2000 and page_num not in manual_split_points:
             print(f"Warning: Page {page_num} has {token_count} tokens but no manual split defined. Adding as is.")

        metadata = {
            "source": source_name,
            "page": page_num,
            "chunk_type": "image_extracted"
        }
        doc = Document(page_content=content, metadata=metadata)
        documents.append(doc)

    return documents, token_count_list

In [14]:
chunked_response, token_count_list = create_chunks_per_page(data_from_images, "llm_responses.json")

Page 82 manually split into 2 parts (Tokens: 2410).
Page 83 manually split into 2 parts (Tokens: 3331).
Page 85 manually split into 2 parts (Tokens: 4126).
Page 86 manually split into 2 parts (Tokens: 2592).


In [15]:
len(chunked_response)

75

In [16]:
image_page_list = list(set(image_page_list))

In [17]:
import numpy as np
shoerted_token_count = np.sort(token_count_list)[::-1]

shoerted_token_count[:20]

array([4126, 3331, 2592, 2410, 1903, 1707, 1670, 1605, 1603, 1530, 1399,
       1304, 1150, 1033,  942,  902,  796,  685,  653,  588])

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,      
    chunk_overlap=150, 
    separators=["\n\n", "\n", " ", ""], 
    length_function=len
)

In [19]:
def create_chunks_for_text_only(image_page_list,source_name,clipped_pdf_path, text_splitter):
    doc  = fitz.open(clipped_pdf_path)
    text_chunks = []
    for item in doc:
        if item.number + 1 not in image_page_list:
            text_content = item.get_text()
            page = item.number + 1

            metadata = {
                "source": source_name,
                "page": page,
                "chunk_type": "text_only"
            }
            if len(text_content.strip()) < 20:
                continue
            
            chunks = text_splitter.create_documents([text_content], metadatas=[metadata])
            text_chunks.extend(chunks)

    return text_chunks
    

In [20]:
chunked_text_only = create_chunks_for_text_only(image_page_list, "Boeing_B737_Clipped.pdf", clipped_pdf_path, text_splitter)

In [21]:
all_chunks = chunked_response + chunked_text_only

# 3. Embedding

In [22]:
import hashlib
#from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

#embedding_model = GoogleGenerativeAIEmbeddings( #i dont have that much money the free tier not enough
 #   model="gemini-embedding-001"
#)

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs={'device': 'cpu'}
)

client = QdrantClient(path=qdrant_path)


print(f"the dataset is loading... to qdrant database {len(all_chunks)}")
chunk_ids = []
for chunk in all_chunks:
    content = chunk.page_content
    id_hash = hashlib.md5(content.encode('utf-8')).hexdigest()
    chunk_ids.append(id_hash)


try:
    dummy_vec = embedding_model.embed_query("test")
    vec_size = len(dummy_vec)

    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vec_size, distance=Distance.COSINE),
    )

    vector_store = QdrantVectorStore(
        client=client,
        collection_name=collection_name,
        embedding=embedding_model,
    )
    vector_store.add_documents(
        documents=all_chunks,
        ids=chunk_ids,
        batch_size=50
    )
    
    print("OPERATION SUCCESSFUL! The database was set up appropriately for the size of the model.")
except Exception as e:
    print(f"ERROR: {e}")

  embedding_model = HuggingFaceEmbeddings(


the dataset is loading... to qdrant database 224


  client.recreate_collection(


OPERATION SUCCESSFUL! The database was set up appropriately for the size of the model.
