In [1]:
import fitz  # PyMuPDF

PDF_PATH = r"RAG\All New Amaze - 3rd Gen (2024~).pdf"

def extract_pdf_toc(pdf_path):
    doc = fitz.open(pdf_path)
    toc = doc.get_toc(simple=True)

    chapters = []

    for entry in toc:
        level, title, page = entry

        # Only take level-1 entries = major chapters
        if level == 1:
            chapters.append({
                "title": title.strip(),
                "page": page
            })

    return chapters

if __name__ == "__main__":
    chapters = extract_pdf_toc(PDF_PATH)

    print("\nClean Major Chapters:\n")
    for idx, ch in enumerate(chapters, start=1):
        print(f"{idx}. {ch['title']}  (starts at page {ch['page']})")

    print("\nTotal clean chapters detected:", len(chapters))



Clean Major Chapters:

1. General Information  (starts at page 2)
2. Contents  (starts at page 6)
3. Visual Index  (starts at page 7)
4. Before Driving  (starts at page 13)
5. Driving Operation  (starts at page 90)
6. Controls  (starts at page 130)
7. Safety Driving Assist System  (starts at page 148)
8. About Your Instrument Panel  (starts at page 188)
9. Maintenance  (starts at page 210)
10. Handling the Unexpected  (starts at page 240)
11. Vehicle Information  (starts at page 275)
12. Index  (starts at page 283)

Total clean chapters detected: 12


In [3]:
import fitz  # PyMuPDF

PDF_PATH = r"RAG\All New Amaze - 3rd Gen (2024~).pdf"

# The 8 real chapters we want
TARGET_CHAPTERS = [
    "Before Driving",
    "Driving Operation",
    "Controls",
    "Safety Driving Assist System",
    "About Your Instrument Panel",
    "Maintenance",
    "Handling the Unexpected",
    "Vehicle Information"
]

def extract_filtered_chapters(pdf_path):
    doc = fitz.open(pdf_path)

    # Extract full TOC
    toc = doc.get_toc(simple=True)

    chapters = []

    for level, title, page in toc:
        # Only keep Level-1 headings that match our desired chapter names
        if level == 1 and title.strip() in TARGET_CHAPTERS:
            chapters.append({
                "title": title.strip(),
                "page_start": page
            })

    # Sort by actual page numbers (very important!)
    chapters.sort(key=lambda c: c["page_start"])

    # Compute page_end
    for i in range(len(chapters)):
        if i < len(chapters) - 1:
            chapters[i]["page_end"] = chapters[i+1]["page_start"] - 1
        else:
            # Last chapter goes until end of PDF
            chapters[i]["page_end"] = len(doc)

    return chapters


if __name__ == "__main__":
    final = extract_filtered_chapters(PDF_PATH)

    print("\nFiltered Chapters with Page Ranges:\n")
    for ch in final:
        print(f"{ch['title']}: pages {ch['page_start']} → {ch['page_end']}")

    print("\nTotal:", len(final))



Filtered Chapters with Page Ranges:

Before Driving: pages 13 → 89
Driving Operation: pages 90 → 129
Controls: pages 130 → 147
Safety Driving Assist System: pages 148 → 187
About Your Instrument Panel: pages 188 → 209
Maintenance: pages 210 → 239
Handling the Unexpected: pages 240 → 274
Vehicle Information: pages 275 → 290

Total: 8


In [5]:
import fitz  # PyMuPDF

PDF_PATH = r"RAG\All New Amaze - 3rd Gen (2024~).pdf"

# Your final chapter ranges
CHAPTERS = [
    {"title": "Before Driving", "page_start": 13, "page_end": 89},
    {"title": "Driving Operation", "page_start": 90, "page_end": 129},
    {"title": "Controls", "page_start": 130, "page_end": 147},
    {"title": "Safety Driving Assist System", "page_start": 148, "page_end": 187},
    {"title": "About Your Instrument Panel", "page_start": 188, "page_end": 209},
    {"title": "Maintenance", "page_start": 210, "page_end": 239},
    {"title": "Handling the Unexpected", "page_start": 240, "page_end": 274},
    {"title": "Vehicle Information", "page_start": 275, "page_end": 290},
]


def extract_chapter_text(pdf_path, chapters):
    doc = fitz.open(pdf_path)

    extracted = []

    for ch in chapters:
        start = ch["page_start"] - 1     # PyMuPDF pages are 0-indexed
        end = ch["page_end"] - 1

        full_text = []

        for p in range(start, end + 1):
            page = doc.load_page(p)
            text = page.get_text("text") or ""
            clean = " ".join(text.split())  # normalize whitespace
            full_text.append(clean)

        extracted.append({
            "chapter": ch["title"],
            "page_start": ch["page_start"],
            "page_end": ch["page_end"],
            "text": "\n".join(full_text)
        })

    return extracted


if __name__ == "__main__":
    result = extract_chapter_text(PDF_PATH, CHAPTERS)

    for ch in result:
        print(f"\n=== {ch['chapter']} ({ch['page_start']} → {ch['page_end']}) ===")
        print(ch['text'][:500], "...")   # show first 500 chars for preview



=== Before Driving (13 → 89) ===
Check Before Driving For Safe Driving･････････････････････････････････13 Your Vehicle’s Safety Features･･････････････････15 Driving Preparation ･････････････････････････････16 Precautions While Driving･･･････････････････････18 Child Safety･････････････････････････････････････19 Safety of Infants and Small Children････････････21 Installing a Child Restraint System･･････････････31 Safety of Larger Children････････････････････････37 Exhaust Gas Hazard･････････････････････････････39 Modifications an ...

=== Driving Operation (90 → 129) ===
Starting or Stopping the Engine Ignition Switch*･････････････････････････････････90 ENGINE START/STOP Button*･･････････････････91 Ignition Switch and Power Mode Comparison･･････ ････････････････････････････････････････････････92 Starting the Engine･･････････････････････････････93 Stopping the Engine･････････････････････････････94 Remote Engine Start*･･･････････････････････････95 What to Do If････････････････････････････

In [7]:
from chapter_extractor import extract_chapter_text
print("Success")


Success


In [9]:
from chapter_extractor import extract_chapter_text

data = extract_chapter_text()
print("Chapters:", len(data))
print("First chapter:", data[0]["chapter"])
print("Preview text:", data[0]["text"][:300])


Chapters: 8
First chapter: Before Driving
Preview text: Check Before Driving For Safe Driving･････････････････････････････････13 Your Vehicle’s Safety Features･･････････････････15 Driving Preparation ･････････････････････････････16 Precautions While Driving･･･････････････････････18 Child Safety･････････････････････････････････････19 Safety of Infants and


In [11]:
from chapter_extractor import extract_chapter_text
from chunk_and_embed import create_chunks_with_metadata





In [13]:
data = extract_chapter_text()
print(len(data))


8


In [15]:
chunks = create_chunks_with_metadata(data)
print("Total chunks:", len(chunks))
print(chunks[0]["metadata"])
print(chunks[0]["text"][:200])


Total chunks: 208
{'id': '8d4da161-27f8-4188-ae6a-87aa0e5a8c96', 'chapter': 'Before Driving', 'chunk_index': 0, 'page_start': 13, 'page_end': 89}
Check Before Driving For Safe Driving･････････････････････････････････13 Your Vehicle’s Safety Features･･････････････････15 Driving Preparation ･････････････････････････････16 Precautions While Drivin


In [17]:
from chapter_extractor import extract_chapter_text
from chunk_and_embed import create_chunks_with_metadata
from save_faiss_index import save_faiss_index

data = extract_chapter_text()
chunks = create_chunks_with_metadata(data)

save_faiss_index(chunks)


FAISS index saved: amaze_index.faiss
Full metadata saved: amaze_meta.pkl


In [19]:
import pickle
with open("amaze_meta.pkl", "rb") as f:
    meta = pickle.load(f)

meta[0].keys()


dict_keys(['chapter', 'chunk_index', 'page_start', 'page_end', 'embedding', 'text'])

In [21]:
from rag_query import answer_query
answer_query("How do I adjust the seats?", chapter="Before Driving")


{'answer': 'To adjust the seats:\n\n* Lever Height Adjustment (Driver’s seat only): Pull up or push down the lever to raise or lower the seat.\n* Lever Seat-back Angle Adjustment: Pull up the lever to change the angle.\n* Bar Horizontal Position Adjustment: Pull up on the bar to move the seat, then release the bar. Once adjusted, rock it back and forth to make sure it is locked in position.\n\nFor rear seats:\n* Using the rear seat armrest: Pull down the armrest in the centre seat-back.',
 'chunks_used': [{'chapter': 'Before Driving',
   'chunk_index': 54,
   'page_start': 13,
   'page_end': 89,
   'embedding': [0.028744768351316452,
    0.03588581457734108,
    0.004529866855591536,
    -0.012306524440646172,
    -0.06767629832029343,
    0.12445268779993057,
    -0.009152277372777462,
    0.03984733298420906,
    -0.020889880135655403,
    0.07178691774606705,
    0.11931268870830536,
    0.07175207883119583,
    0.05789520591497421,
    -0.02421320602297783,
    -0.02517574653029442

In [23]:
answer_query("How do I bake a cake?", chapter="Maintenance")


{'answer': "I DON'T KNOW",
 'chunks_used': [],
 'retrieval_latency': 0.10679006576538086,
 'generation_latency': 0}

In [27]:
answer_query("How do I check engine coolant level?", chapter="Maintenance")

{'answer': "According to manual p. 216, to check engine coolant level, pull the bonnet release handle (a) under the driver's side lower outside corner of the dashboard, ≫The bonnet will pop up slightly. Then push the bonnet latch lever (a) (located under the front edge of the bonnet to the centre) to the side and raise the bonnet. Once you have raised the bonnet slightly, pull the dipstick (orange) out of the reservoir and wipe it clean with a lint-free cloth or paper towel. Then, insert the dipstick back into the reservoir and pull it out again. The engine coolant level can be read on the dipstick.",
 'chunks_used': [{'chapter': 'Maintenance',
   'chunk_index': 5,
   'page_start': 210,
   'page_end': 239,
   'embedding': [-0.07113233208656311,
    0.0595739409327507,
    -0.03505069017410278,
    -0.018885618075728416,
    0.0006373390788212419,
    0.005188759882003069,
    0.003750395495444536,
    0.1376073956489563,
    -0.04594554752111435,
    -0.026922956109046936,
    0.060394