In [2]:
# streamlit run streamlit_site/streamlit_app.py
# Imports
import os
import re
import pandas as pd
from sqlalchemy import (
    select, create_engine, text, Table, Column, Integer, String, MetaData, ForeignKey, LargeBinary)
from sqlalchemy.orm import sessionmaker
import pdfplumber
import fitz  # PyMuPDF
# For Image Display within the df
from IPython.display import Image
from PIL import Image, ImageOps
from io import BytesIO
from datetime import datetime
from collections import defaultdict

In [None]:
# Initializing database connection
# Connection settings are read from environment variables so that credentials
# are never stored in the notebook (or in its git history).
# SECURITY: the passwords that used to be hardcoded in this cell have been
# exposed and should be rotated on the database side.
#
#   MPAMS_DB_HOST      database host          (optional, defaults below)
#   MPAMS_DB_PORT      database port          (optional)
#   MPAMS_DB_NAME      database name          (optional)
#   MPAMS_DB_USER      database user          (optional)
#   MPAMS_DB_PASSWORD  database password      (required)
host = os.environ.get("MPAMS_DB_HOST", "aws-0-ap-southeast-1.pooler.supabase.com")
port = os.environ.get("MPAMS_DB_PORT", "5432")
database = os.environ.get("MPAMS_DB_NAME", "postgres")
username = os.environ.get("MPAMS_DB_USER", "postgres.thqqtxvmzisznglpukwh")
password = os.environ.get("MPAMS_DB_PASSWORD")

if not password:
    # Fail early with an actionable message instead of a confusing auth error later.
    raise RuntimeError(
        "MPAMS_DB_PASSWORD is not set; export it before running this notebook."
    )

# SQLAlchemy connection URL
DATABASE_URL = f"postgresql://{username}:{password}@{host}:{port}/{database}"

# Create engine
engine = create_engine(DATABASE_URL)

In [8]:
# Schema definition: declares four tables (pdf_info, master_parts_list,
# pdf_section, pdf_log) on one MetaData object and creates them on `engine`.
metadata = MetaData()

# One row per PDF; pdf_id is the primary key the other tables reference.
pdf_info = Table(
    "pdf_info", metadata,
    Column("pdf_id", String, primary_key=True),
    Column("year", Integer, nullable=False),
    Column("brand", String, nullable=False),
    Column("model", String, nullable=False),
    Column("batch_id", String, nullable=False),
    Column("bike_image", LargeBinary),
)

# One row per part entry; linked to its section and its PDF.
# The foreign key to pdf_section is given by name, so it may be declared
# before the pdf_section Table object below.
master_parts_list = Table(
    "master_parts_list", metadata,
    Column("mpl_id", Integer, primary_key=True, autoincrement=True),
    Column("part_no", String, nullable=False),
    Column("description", String, nullable=False),
    Column("ref_no", Integer, nullable=False),
    Column("add_info", String),
    Column("section_id", String, ForeignKey("pdf_section.section_id", ondelete="CASCADE")),
    Column("pdf_id", String, ForeignKey("pdf_info.pdf_id", ondelete="CASCADE")),
)

# One row per section of a PDF, including the section image bytes.
pdf_section = Table(
    "pdf_section", metadata,
    Column("section_id", String, primary_key=True),
    Column("section_no", String, nullable=False),
    Column("section_name", String, nullable=False),
    Column("cc", String, nullable=False),
    Column("section_image", LargeBinary, nullable=False),
    Column("pdf_id", String, ForeignKey("pdf_info.pdf_id", ondelete="CASCADE")),
)

# Log rows per PDF. timestamp is stored as a String and the two flags as Integers.
pdf_log = Table(
    "pdf_log", metadata,
    Column("log_id", Integer, primary_key=True, autoincrement=True),
    Column("pdf_id", String, ForeignKey("pdf_info.pdf_id", ondelete="CASCADE")),
    Column("account_id", String, nullable=False), # add FK next time
    Column("timestamp", String, nullable=False),
    Column("is_active", Integer, nullable=False),
    Column("is_current", Integer, nullable=False),
)

# Create the tables
metadata.create_all(engine)

In [None]:
# Old code (For Reference)
# ---------- HELPERS ----------
def extract_pdf_id(pdf_path):
    """Derive a PDF id from the leading part of the file name.

    Takes the file name up to its first dot, keeps the leading run of
    letters, digits and spaces, and returns that run with the spaces removed.
    Returns None when the name does not start with such a character.
    """
    # To-Do: modify pdf_id extraction to suit both pdf format
    stem = os.path.basename(pdf_path).partition('.')[0]
    leading = re.match(r"([A-Za-z0-9 ]+)", stem)
    if leading is None:
        return None
    return leading.group(1).replace(" ", "")

def extract_year(pdf_path):
    """Return the four-digit year encoded as 'YY in the PDF file name.

    Only the file name is inspected, so an apostrophe followed by digits in a
    parent directory cannot be mistaken for the model year.

    Args:
        pdf_path: Path to the PDF, e.g. "Manuals/AEROX 155 '19 (B65P).pdf".

    Returns:
        The year as a string prefixed with "20" (e.g. "2019"), or None when
        the file name contains no 'YY marker.
    """
    year_match = re.search(r"'(\d{2})", os.path.basename(pdf_path))
    return f"20{year_match.group(1)}" if year_match else None

def extract_model(pdf_path):
    """Return the text inside the first pair of parentheses in the file name.

    Example result: "B65P, B65R, B65S". Returns None when the file name has
    no parenthesised group.
    """
    inside_parens = re.search(r"\((.*?)\)", os.path.basename(pdf_path))
    return inside_parens.group(1) if inside_parens else None

# ---------- IMAGE EXTRACTION ----------
def normalize_image_background(image_bytes):
    """Return the image re-encoded as an RGB PNG, inverted if it is mostly dark.

    The image is converted to grayscale; when its mean pixel value is below
    128 the grayscale image is inverted. The result is converted to RGB and
    serialised as PNG bytes.
    """
    grey = Image.open(BytesIO(image_bytes)).convert("L")  # Grayscale
    pixel_total = sum(grey.getdata())

    # Mean brightness below the midpoint means a dark background -> invert
    if pixel_total / (grey.width * grey.height) < 128:
        grey = ImageOps.invert(grey)

    buffer = BytesIO()
    grey.convert("RGB").save(buffer, format="PNG")
    return buffer.getvalue()

def get_existing_fig_combos(engine, pdf_id):
    """Return the set of section values stored in parts_images for one pdf_id.

    Values are converted to str so they compare consistently with section
    labels parsed from PDF text.
    """
    query = text("SELECT section FROM parts_images WHERE pdf_id = :pdf_id")
    with engine.connect() as conn:
        fetched = conn.execute(query, {"pdf_id": pdf_id}).fetchall()
        return {str(record[0]) for record in fetched}

def extract_images_with_fig_labels(pdf_path, pdf_id, engine):
    """Extract one image per new "FIG." section of a PDF.

    For every page whose text contains a "FIG. <label>" marker, the first
    label on the page is taken as the section. Sections that were already
    seen in this PDF, or that already exist in the database for `pdf_id`,
    are skipped. The first image on the page is normalised and recorded.

    Args:
        pdf_path: Path of the PDF to read.
        pdf_id: Identifier used for the rows and as prefix of image_id.
        engine: SQLAlchemy engine used to look up already stored sections.

    Returns:
        A DataFrame with columns image_id, pdf_id, section, image (one row
        per extracted image); an empty DataFrame when nothing was extracted.
    """
    data = []

    # Step 1: Get existing (pdf_id, fig_no) combos from DB
    existing_figs = get_existing_fig_combos(engine, pdf_id)

    seen_figs = set()  # Track unique figs within the PDF

    # The context manager closes the document even if a page fails to parse;
    # previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Named page_text so the sqlalchemy `text` import is not shadowed.
            page_text = page.get_text()

            matches = re.findall(r"FIG\.\s*([\w-]+)", page_text)
            if not matches:
                continue

            section = matches[0]

            if section in seen_figs or section in existing_figs:
                continue  # Skip if already handled or exists in DB

            image_list = page.get_images(full=True)
            if not image_list:
                continue

            # Only the first image listed for the page is used.
            xref = image_list[0][0]
            base_image = doc.extract_image(xref)
            image = normalize_image_background(base_image["image"])

            data.append({
                "image_id": "_".join([pdf_id, section]),
                "pdf_id": pdf_id,
                "section": section,
                "image": image
            })
            seen_figs.add(section)

    return pd.DataFrame(data)

# ---------- TEXT EXTRACTION ----------
def extract_text_from_pdf(pdf_path):
    """Concatenate the text of all pages that contain a "FIG." marker.

    Pages without "FIG." on any line are skipped. Reading stops at the first
    page that contains "FIG." and whose first line includes "NUMERICAL INDEX".
    Each kept page is prefixed with a "--- Page N ---" separator (1-based).
    """
    chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text() or ""
            page_lines = page_text.split('\n')

            has_fig_marker = any("FIG." in entry for entry in page_lines)
            if not has_fig_marker:
                continue

            # str.split always yields at least one element, so [0] is safe.
            if "NUMERICAL INDEX" in page_lines[0].strip():
                break

            chunks.append(f"\n--- Page {page_no} ---\n{page_text}\n")
    return "".join(chunks)

def yamaha_process_data(text, pdf_id, year, model, num_model):
    """Turn extracted Yamaha catalogue text into a parts DataFrame.

    Lines starting with "FIG." (with at least three tokens) open a section:
    the second token is the section label and the remainder the component
    name. Lines before the first such header are ignored. A data line has at
    least two tokens and starts either with an all-digit reference number or
    with a token matching ``\\w+[-–]\\w+`` (treated as the part number, reusing
    the most recent reference number).

    In the remaining tokens, non-numeric tokens before the first all-digit
    token form the description, non-numeric tokens after it form the remarks.
    When more all-digit tokens than `num_model` are present, the first one is
    appended to the description.

    Returns:
        DataFrame with columns pdf_id, year, brand, model, section,
        component_name, ref_no, part_no, description, remarks, image_id.
    """
    column_names = [
        'pdf_id', 'year', 'brand', 'model', 'section', 'component_name',
        'ref_no', 'part_no', 'description', 'remarks', 'image_id'
    ]
    part_no_like = re.compile(r'\w+[-–]\w+')

    records = []
    current_section = current_name = last_ref_no = ""
    inside_figure = False

    for raw_line in text.strip().split('\n'):
        entry = raw_line.strip()
        if not entry:
            continue

        if entry.startswith('FIG.'):
            header = entry.split()
            if len(header) >= 3:
                current_section = header[1]
                current_name = " ".join(header[2:])
                inside_figure = True
            continue

        if not inside_figure:
            continue

        fields = entry.split()
        if len(fields) < 2:
            continue
        leads_with_ref = fields[0].isdigit()
        if not (part_no_like.match(fields[0]) or leads_with_ref):
            continue

        if leads_with_ref:
            last_ref_no = fields[0]
            part_no, tail = fields[1], fields[2:]
        else:
            part_no, tail = fields[0], fields[1:]

        # Position of the first all-digit token splits description from remarks.
        first_number_at = next(
            (pos for pos, token in enumerate(tail) if token.isdigit()),
            len(tail),
        )
        after_first_number = tail[first_number_at:]
        description = "".join(token + " " for token in tail[:first_number_at])
        numbers = [token for token in after_first_number if token.isdigit()]
        remarks = "".join(token + " " for token in after_first_number if not token.isdigit())

        if len(numbers) > num_model:
            description += numbers[0]

        records.append([
            pdf_id, year, "Yamaha", model, current_section, current_name,
            last_ref_no, part_no, description.strip(), remarks.strip(),
            "_".join([pdf_id, current_section]),
        ])

    return pd.DataFrame(records, columns=column_names)

# ---------- MAIN PROCESS ----------
def yamaha_data_extraction(pdf_path):
    """Run the (old) Yamaha pipeline for one PDF: log it, store images, store parts.

    Steps, in order:
      1. Derive pdf_id, year and model from the file name.
      2. Insert a pdf_log row if none exists for pdf_id.
      3. Extract section images and append them to the "parts_images" table.
      4. If master_parts_list already has rows for pdf_id, stop.
      5. Otherwise extract the text, parse it and append the rows to the
         "master_parts_list" table.

    Progress is reported with print(); nothing is returned.
    """

    # NOTE(review): a later cell redefines extract_year/extract_model with
    # different signatures and meanings, so this function depends on which
    # definition is live in the kernel — confirm before running.
    pdf_id = extract_pdf_id(pdf_path)
    year = extract_year(pdf_path)
    model = extract_model(pdf_path)

    SessionLocal = sessionmaker(bind=engine)
    session = SessionLocal()
    try:
        # -------- pdf_log table --------
        existing_log = session.execute(
            select(1).select_from(pdf_log).where(pdf_log.c.pdf_id == pdf_id)
        ).first()

        if not existing_log:
            # NOTE(review): the pdf_log table declared in this notebook marks
            # account_id, is_active and is_current as nullable=False, but only
            # pdf_id and timestamp are supplied here — this insert looks like
            # it would be rejected by that schema; confirm.
            session.execute(
                pdf_log.insert().values(
                    pdf_id=pdf_id,
                    timestamp=datetime.now().isoformat()
                )
            )
            session.commit()
            print(f"[INFO] Logged PDF '{pdf_id}' in pdf_log.")
        else:
            print(f"[INFO] PDF '{pdf_id}' already logged.")

        # -------- parts_images table --------
        df_images = extract_images_with_fig_labels(pdf_path, pdf_id, engine)
        image_message = f"[INFO] Inserted {len(df_images)} new images for '{pdf_id}'."
        if not df_images.empty:
            # Written through the engine directly, not through `session`.
            df_images.to_sql("parts_images", engine, if_exists="append", index=False, method="multi")
            print(image_message)
        else:
            print(image_message + f" All images for '{pdf_id}' already exist.")

        existing = session.execute(
            select(1).select_from(master_parts_list).where(master_parts_list.c.pdf_id == pdf_id)
        ).first()

        if existing:
            print(f"[INFO] Master Parts data for '{pdf_id}' already exists.")
            return  # the finally clause below still closes the session

    finally:
        session.close()
            
    # -------- master_parts_list table --------
    all_text = extract_text_from_pdf(pdf_path)
    df_parts = yamaha_process_data(all_text, pdf_id, year, model, num_model=3)

    if not df_parts.empty:
        #print(df_parts.to_string(index=False))
        # NOTE(review): df_parts carries columns such as year/brand/section/
        # remarks/image_id that are not in the master_parts_list Table declared
        # in this notebook — verify the target table accepts them.
        df_parts.to_sql("master_parts_list", engine, if_exists="append", index=False, method="multi")
        print(f"[INFO] Inserted parts data for '{pdf_id}'.")
    else:
        print(f"[INFO] Error, no parts data extracted for '{pdf_id}'.")

In [None]:
# Old code (For Reference)
# Driver for the old pipeline: picks one sample PDF and dispatches on brand.
# Format 1: Yamaha, Important images from page 6-60
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
# Format 2: Honda
pdf_3 = "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf"
pdf_4 = "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf"

# PDF processed by this cell
pdf_path = pdf_1

brand = "Yamaha"
supported_brands = ['Yamaha', 'Honda']

if brand in supported_brands:
    if brand == "Yamaha":
        yamaha_data_extraction(pdf_path)
    elif brand == "Honda":
        # Placeholder: no Honda pipeline is wired in here yet.
        print("Hi")
else:
    print (f'"{brand}" not supported \nAvailable Brands: {supported_brands}')

In [4]:
# Testing
# Sample file names for the two naming formats exercised by the helpers below.
# Format 1: Yamaha
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
# Format 2: Honda
pdf_3 = "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf"
pdf_4 = "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf"

def model_extraction(pdf_path):
    """Return the model name taken from the start of the file name.

    The model is the leading run of letters, digits and spaces (it ends at
    the first other character such as ' or _), with all spaces removed.
    Returns None when the file name does not start with such a character.
    """
    stem = os.path.basename(pdf_path).partition('.')[0]
    leading = re.match(r"([A-Za-z0-9 ]+)", stem)
    return leading.group(1).replace(" ", "") if leading else None

def batch_id_extraction(pdf_path, brand):
    """Extract the batch identifier from a catalogue file name.

    The extension is removed with os.path.splitext, so dots elsewhere in the
    name (for example inside the parentheses) no longer truncate the name
    before it is searched.

    Args:
        pdf_path: Path to the PDF file.
        brand: "Yamaha" or "Honda"; any other value yields None.

    Returns:
        Yamaha: the comma-separated codes inside the first parentheses,
        stripped and joined with "_" (e.g. "B65P_B65R_B65S").
        Honda: the first run of 6-10 uppercase letters/digits enclosed by
        underscores (e.g. "13MJPG02").
        None when nothing matches.
    """
    # splitext removes only the final extension, unlike split('.')[0].
    stem = os.path.splitext(os.path.basename(pdf_path))[0]

    if brand == "Yamaha":
        # Extract model codes inside parentheses
        match = re.search(r"\((.*?)\)", stem)
        if match:
            return "_".join(code.strip() for code in match.group(1).split(","))

    elif brand == "Honda":
        # Look for uppercase/digit code between underscores (6–10 characters)
        match = re.search(r"_([A-Z0-9]{6,10})_", stem)
        if match:
            return match.group(1)

    return None

def year_extraction(pdf_path, brand):
    """Return the model year for a Yamaha file name, otherwise None.

    Yamaha names carry the year as 'YY, which is returned as "20YY".
    Honda is not handled yet; the drafted idea was to match a full year
    range such as 2022_2023 with r"(20\\d{2}_20\\d{2})".
    """
    file_name = os.path.basename(pdf_path)

    if brand != "Yamaha":
        return None

    found = re.search(r"'(\d{2})", file_name)
    if found is None:
        return None
    return f"20{found.group(1)}"

# Manual check: print the batch id parsed from each sample file name.
print(batch_id_extraction (pdf_1, "Yamaha"))
print(batch_id_extraction (pdf_2, "Yamaha"))
print(batch_id_extraction (pdf_3, "Honda"))
print(batch_id_extraction (pdf_4, "Honda"))

B65P_B65R_B65S
1MCH_1MCG
13MJPG02
13MKWM02


In [6]:
# Yamaha + Honda
def extract_model(pdf_path):
    """Return the model name found at the start of the file name.

    The leading run of letters, digits and spaces is taken (it stops at the
    first special character such as ' or _) and returned without spaces.
    Returns None if the file name does not begin with such a character.
    """
    name_before_dot = os.path.basename(pdf_path).partition('.')[0]
    prefix = re.match(r"([A-Za-z0-9 ]+)", name_before_dot)
    if prefix is None:
        return None
    return prefix.group(1).replace(" ", "")
# Yamaha + Honda
def extract_batch_id(pdf_path, brand):
    """Extract the batch identifier from a catalogue file name.

    Only the final extension is stripped (os.path.splitext), so a dot that
    appears earlier in the name no longer cuts the name short before the
    pattern search.

    Args:
        pdf_path: Path to the PDF file.
        brand: "Yamaha" or "Honda"; any other value yields None.

    Returns:
        Yamaha: codes inside the first parentheses, split on commas,
        stripped and joined with "_" (e.g. "1MCH_1MCG").
        Honda: the first run of 6-10 uppercase letters/digits enclosed by
        underscores (e.g. "13MKWM02").
        None when the expected pattern is absent.
    """
    # split('.')[0] would stop at the first dot anywhere in the name.
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))

    if brand == "Yamaha":
        # Extract model codes inside parentheses
        match = re.search(r"\((.*?)\)", stem)
        if match:
            return "_".join(code.strip() for code in match.group(1).split(","))

    elif brand == "Honda":
        # Look for uppercase/digit code between underscores (6–10 characters)
        match = re.search(r"_([A-Z0-9]{6,10})_", stem)
        if match:
            return match.group(1)

    return None
# Yamaha
def extract_year(pdf_path, brand):
    """Return the model year parsed from the file name (Yamaha only).

    For brand "Yamaha" the 'YY marker in the file name becomes "20YY".
    Every other brand returns None; a Honda rule (matching a year range
    like 2022_2023 via r"(20\\d{2}_20\\d{2})") was drafted but is not active.
    """
    file_name = os.path.basename(pdf_path)

    if brand != "Yamaha":
        return None

    marker = re.search(r"'(\d{2})", file_name)
    if not marker:
        return None
    return f"20{marker.group(1)}"

def reconstruct_lines_from_chars(chars, y_tolerance=2.5):
    """Group character dicts into text lines by vertical position.

    Each character needs the keys "top", "height", "x0", "x1" and "text".
    Characters are bucketed by round(vertical midpoint / y_tolerance); within
    a bucket they are ordered by x0. A horizontal gap wider than 1.5 between
    one character's x1 and the next character's x0 inserts int(gap / 2.5)
    spaces.

    Returns:
        A list of (bucket, line_text) tuples ordered by bucket, with trailing
        whitespace removed from each line.
    """
    buckets = defaultdict(list)
    for glyph in chars:
        # Use the vertical midpoint rather than the top edge
        midpoint = glyph["top"] + (glyph["height"] / 2)
        buckets[round(midpoint / y_tolerance)].append(glyph)

    reconstructed = []
    for bucket_key in sorted(buckets):
        pieces = []
        right_edge = None
        for glyph in sorted(buckets[bucket_key], key=lambda item: item["x0"]):
            if right_edge is not None:
                gap = glyph["x0"] - right_edge
                if gap > 1.5:
                    pieces.append(" " * int(gap / 2.5))
            pieces.append(glyph["text"])
            right_edge = glyph["x1"]
        reconstructed.append((bucket_key, "".join(pieces).rstrip()))

    return reconstructed

def extract_raw_text(pdf_path):
    """Collect the text lines of every parts page in the PDF.

    A page counts as a parts page when its first reconstructed line starts
    with "FIG." (lines are rebuilt from characters with y_tolerance=5.5).
    Blank lines and lines that consist only of digits (likely page numbers)
    are dropped; the remaining lines are returned stripped, in page order.
    """
    collected = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_lines = reconstruct_lines_from_chars(page.chars, y_tolerance=5.5)

            # Skip non-parts pages
            is_parts_page = bool(page_lines) and page_lines[0][1].strip().startswith("FIG.")
            if not is_parts_page:
                continue

            for _, content in page_lines:
                cleaned = content.strip()
                # Drop empty lines and bare numbers
                if not cleaned or re.fullmatch(r"\d+", cleaned):
                    continue
                collected.append(cleaned)

    return collected

def structure_raw_text(raw_lines):
    """Split raw text lines into lists of cells using runs of 2+ spaces.

    Args:
        raw_lines: iterable-indexable sequence of strings (one per text line).

    Returns:
        A list of rows, each a list of cell strings. "FIG." headers are
        normalised to ['FIG.', number, ...]; single-cell rows made only of
        uppercase letters, digits, spaces, commas and hyphens are dropped, as
        are rows whose cells are all digits.
    """
    structured_output = []
    # Indices of lines that were merged into another row and must be skipped.
    skip_indices = set()

    for i in range(len(raw_lines)):
        if i in skip_indices:
            continue

        line = raw_lines[i].strip()
        # Columns are assumed to be separated by at least two spaces.
        parts = re.split(r"\s{2,}", line)

        # --- Normalize FIG. rows to always be ['FIG.', 'number', 'description']
        if parts and isinstance(parts[0], str) and parts[0].startswith("FIG."):
            if re.match(r"^FIG\.\s*\d+$", parts[0]):
                match = re.match(r"^(FIG\.)\s*(\d+)$", parts[0])
                if match:
                    parts = [match.group(1), match.group(2)] + parts[1:]

            # NOTE(review): the pattern above already accepts zero spaces
            # (\s*), so this branch looks unreachable — confirm.
            elif re.match(r"^FIG\.\d+$", parts[0]):
                match = re.match(r"^(FIG\.)(\d+)$", parts[0])
                if match:
                    parts = [match.group(1), match.group(2)] + parts[1:]

        # --- Skip rows that are just floating descriptions
        if len(parts) == 1 and re.match(r"^[A-Z ,\-0-9]+$", parts[0]):
            continue

        # --- Heuristic: Missing description, try to find it nearby
        # Triggered when the second cell contains no letters.
        # NOTE(review): a normalised FIG. row has its number in parts[1], so
        # it also satisfies this condition — confirm that is intended.
        if len(parts) >= 2 and not re.search(r"[A-Za-z]", parts[1]):
            # Try backward merge
            if i > 0:
                prev_line = raw_lines[i - 1].strip()
                if len(re.split(r"\s{2,}", prev_line)) == 1:
                    parts.insert(1, prev_line)
                    # Index i - 1 has already been visited by this loop, so
                    # recording it here does not remove a row emitted earlier.
                    skip_indices.add(i - 1)

            # Try forward merge
            # NOTE(review): because this is an `elif` of `if i > 0`, the
            # forward merge only runs for the very first line (i == 0), not
            # when the backward merge finds nothing — verify intent.
            elif i + 1 < len(raw_lines):
                next_line = raw_lines[i + 1].strip()
                if len(re.split(r"\s{2,}", next_line)) == 1:
                    parts.insert(1, next_line)
                    skip_indices.add(i + 1)

        # --- Extra fix: Split index + part number if mashed into one string
        if parts and re.match(r"^\d+\s+[A-Z0-9–\-]+$", parts[0]):
            split_part = re.split(r"\s+", parts[0], maxsplit=1)
            parts = split_part + parts[1:]

        structured_output.append(parts)

    # --- Final cleanup
    # Second pass: rows may have changed shape after the merges above.
    structured_output = [
        row for row in structured_output
        if not (
            (len(row) == 1 and re.match(r"^[A-Z ,\-0-9]+$", row[0])) or
            all(cell.isdigit() for cell in row)  # <-- remove purely numeric rows
        )
    ]

    return structured_output

def convert_to_table(pdf_id, year, brand, model, batch_id, structured_output):
    """Build the parts DataFrame from structured rows.

    Rows of the form ['FIG.', section, name...] (three or more cells) start
    a new section; trailing digits are removed from the joined name. A data
    row has at least two cells and starts either with an all-digit reference
    number (followed by the part number) or with a cell matching
    ``\\w+[-–]\\w+`` (the part number, reusing the latest reference number).
    Other rows are ignored.

    In the remaining cells, non-numeric cells before the first all-digit
    cell form the description and those after it form add_info.

    Returns:
        DataFrame with columns pdf_id, year, brand, model, batch_id, section,
        component_name, ref_no, part_no, description, add_info, image_id.
    """
    column_names = [
        'pdf_id', 'year', 'brand', 'model', 'batch_id',
        'section', 'component_name', 'ref_no', 'part_no',
        'description', 'add_info', 'image_id'
    ]
    part_no_like = re.compile(r'\w+[-–]\w+')

    records = []
    current_section = current_name = last_ref_no = ""

    for cells in structured_output:
        if not cells or not cells[0]:
            continue
        head = cells[0]

        # FIG. section headers
        if head == "FIG." and len(cells) >= 3:
            current_section = cells[1]
            # Remove trailing digits from component name
            current_name = re.sub(r"\s*\d+$", "", " ".join(cells[2:])).strip()
            continue

        # Anything shorter than two cells cannot be a data row
        if len(cells) < 2:
            continue

        looks_like_part_no = part_no_like.match(head)
        if head.isdigit():
            last_ref_no = head
            part_no, tail = cells[1], cells[2:]
        elif looks_like_part_no:
            part_no, tail = head, cells[1:]
        else:
            continue

        # The first all-digit cell separates description from additional info
        first_number_at = next(
            (pos for pos, cell in enumerate(tail) if cell.isdigit()),
            len(tail),
        )
        description = "".join(cell + " " for cell in tail[:first_number_at])
        add_info = "".join(
            cell + " " for cell in tail[first_number_at:] if not cell.isdigit()
        )

        records.append([
            pdf_id, year, brand, model, batch_id, current_section, current_name,
            last_ref_no, part_no, description.strip(), add_info.strip(),
            f"{pdf_id}_{current_section}",
        ])

    return pd.DataFrame(records, columns=column_names)

In [None]:
# Driver: parse one Yamaha catalogue into a parts table.
# Format 1: Yamaha
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
pdf_filepath = pdf_2

brand = "Yamaha"
year = extract_year(pdf_filepath, brand)
batch_id = extract_batch_id(pdf_filepath, brand)
model = extract_model(pdf_filepath)
# extract_model / extract_batch_id return None when nothing matches, in which
# case this concatenation raises TypeError.
pdf_id = model + '_' + batch_id

raw_lines = extract_raw_text(pdf_filepath)
structured_data = structure_raw_text(raw_lines)
# NOTE(review): this rebinds `master_parts_list`, which the schema cell defines
# as the SQLAlchemy Table; code that later uses master_parts_list.c.* would
# then see a DataFrame — confirm and consider a distinct name.
master_parts_list = convert_to_table(pdf_id, year, brand, model, batch_id, structured_data)

In [46]:
def normalize_image_background(image_bytes):
    """Re-encode an image as RGB PNG bytes, inverting dark-background images.

    The decision is made on the grayscale version: a mean pixel value below
    128 triggers inversion before the image is converted back to RGB.
    """
    grayscale = Image.open(BytesIO(image_bytes)).convert("L")
    pixel_count = grayscale.width * grayscale.height
    average = sum(grayscale.getdata()) / pixel_count

    # Dark background -> flip to light
    normalized = ImageOps.invert(grayscale) if average < 128 else grayscale

    png_buffer = BytesIO()
    normalized.convert("RGB").save(png_buffer, format="PNG")
    return png_buffer.getvalue()

def get_existing_fig_combos(engine, pdf_id):
    """Fetch the section labels already stored in parts_images for pdf_id.

    Returns a set of strings; each value is passed through str() so the set
    can be compared directly with labels parsed from PDF text.
    """
    statement = text("SELECT section FROM parts_images WHERE pdf_id = :pdf_id")
    with engine.connect() as conn:
        stored_rows = conn.execute(statement, {"pdf_id": pdf_id}).fetchall()
        return {str(stored[0]) for stored in stored_rows}

def extract_images_with_fig_labels(pdf_path, pdf_id, engine, brand):
    """Extract the first image of every single-section parts page.

    A page is used when it is not a "Parts Catalogue News" page, its text
    contains exactly one section label for the brand and, for Honda, all
    parts-table column headers. Pages with several labels are treated as
    index/info pages and skipped. Sections are not de-duplicated: a label
    that appears on several qualifying pages yields several rows.

    Args:
        pdf_path: Path of the PDF to read.
        pdf_id: Identifier stored on each row and used as image_id prefix.
        engine: Unused; kept so existing callers keep working.
        brand: "Yamaha" (labels like "FIG. 12") or "Honda" (labels like
            "E-1", "F-40-1", "EOP-2").

    Returns:
        DataFrame with columns image_id, pdf_id, section, image. For an
        unsupported brand an empty DataFrame with those columns is returned
        as soon as the first non-news page is reached.
    """
    # Column headers that must all appear on a Honda parts-table page.
    # Built once instead of on every page.
    honda_table_labels = (
        r"ref\s*\.?\s*no\.?",
        r"part\s*no\.?",
        r"description",
        r"reqd\.?\s*qty",
        r"serial\s*no\.?",
    )
    data = []

    # The context manager releases the document on every exit path, including
    # the early return below; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Named page_text so the sqlalchemy `text` import is not shadowed.
            page_text = page.get_text()

            # Skip "Parts Catalogue News" pages
            if "parts catalogue news" in page_text.lower():
                print(f"[Page {page_num+1}] Skipping Parts Catalogue News page")
                continue

            if brand == "Yamaha":
                matches = re.findall(r"FIG\.\s*([\w-]+)", page_text)
            elif brand == "Honda":
                matches = re.findall(r"\b((?:E|F|EOP)-\d{1,3}(?:-\d+)?)\b", page_text)
                print(f"[Page {page_num+1}] Honda matches: {matches}")
            else:
                print("Brand not supported")
                return pd.DataFrame(columns=["image_id", "pdf_id", "section", "image"])

            # Skip if no section match
            if not matches:
                continue

            # Skip TOC-like pages with multiple sections
            if len(matches) > 1:
                print(f"[Page {page_num+1}] Skipping info page with multiple sections number: {matches}")
                continue

            section = matches[0]

            # Check for required table labels (Honda)
            if brand == "Honda":
                text_lower = page_text.lower()
                if not all(re.search(pattern, text_lower, re.IGNORECASE) for pattern in honda_table_labels):
                    print(f"[Page {page_num+1}] Skipping — missing table labels")
                    continue

            image_list = page.get_images(full=True)
            if not image_list:
                continue

            # Only the first image listed for the page is used.
            xref = image_list[0][0]
            base_image = doc.extract_image(xref)
            image = normalize_image_background(base_image["image"])

            data.append({
                "image_id": "_".join([pdf_id, section]),
                "pdf_id": pdf_id,
                "section": section,
                "image": image
            })

    return pd.DataFrame(data)


In [47]:
# Driver: run the brand-aware image extraction on a Honda catalogue.
# Format 1: Yamaha
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
# Format 2: Honda
pdf_3 = "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf"
pdf_4 = "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf"

pdf_filepath = pdf_3
brand = "Honda"

batch_id = extract_batch_id(pdf_filepath, brand)
model = extract_model(pdf_filepath)
# Raises TypeError if either helper returned None.
pdf_id = model + '_' + batch_id

image_df = extract_images_with_fig_labels(pdf_filepath, pdf_id, engine, brand)
#print(image_df.head().to_string(index=False))

[Page 1] Honda matches: []
[Page 2] Honda matches: []
[Page 3] Honda matches: []
[Page 4] Skipping Parts Catalogue News page
[Page 5] Honda matches: []
[Page 6] Honda matches: []
[Page 7] Honda matches: []
[Page 8] Honda matches: []
[Page 9] Honda matches: ['E-1', 'E-18', 'F-1']
[Page 9] Skipping info page with multiple sections number: ['E-1', 'E-18', 'F-1']
[Page 10] Honda matches: []
[Page 11] Honda matches: []
[Page 12] Honda matches: []
[Page 13] Honda matches: []
[Page 14] Honda matches: []
[Page 15] Honda matches: ['F-34', 'F-24', 'F-24', 'F-19', 'F-19', 'F-19', 'F-37', 'F-37', 'F-28', 'F-9', 'F-24', 'F-24', 'F-40-1', 'F-40-1', 'F-34', 'F-34', 'F-34', 'F-34', 'F-18']
[Page 15] Skipping info page with multiple sections number: ['F-34', 'F-24', 'F-24', 'F-19', 'F-19', 'F-19', 'F-37', 'F-37', 'F-28', 'F-9', 'F-24', 'F-24', 'F-40-1', 'F-40-1', 'F-34', 'F-34', 'F-34', 'F-34', 'F-18']
[Page 16] Honda matches: ['F-18', 'F-40-1', 'F-40-1', 'F-40-1', 'F-40-1', 'F-40-1', 'F-40-1', 'F-40-1

In [None]:
def extract_images_with_fig_labels(pdf_path, pdf_id, engine, brand):
    """Extract the first image of every single-section parts page.

    Pages whose text contains "group" (parts-content overview pages) or
    "parts catalogue news" are skipped. For Honda the section label is only
    searched in the first 15 text lines of the page; for Yamaha the whole
    page text is searched. A page is used when exactly one label is found
    and, for Honda, all parts-table column headers are present. Sections are
    not de-duplicated across pages.

    Args:
        pdf_path: Path of the PDF to read.
        pdf_id: Identifier stored on each row and used as image_id prefix.
        engine: Unused; kept so existing callers keep working.
        brand: "Yamaha" or "Honda".

    Returns:
        DataFrame with columns image_id, pdf_id, section, image. For an
        unsupported brand an empty DataFrame with those columns is returned
        as soon as the first non-skipped page is reached.
    """
    # Column headers that must all appear on a Honda parts-table page.
    honda_table_labels = (
        r"ref\s*\.?\s*no\.?",
        r"part\s*no\.?",
        r"description",
        r"reqd\.?\s*qty",
        r"serial\s*no\.?",
    )
    data = []

    # The context manager releases the document on every exit path, including
    # the early return below; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Named page_text so the sqlalchemy `text` import is not shadowed.
            page_text = page.get_text()
            text_lower = page_text.lower()

            # Skip Parts Content pages (multiple images for main group like ENGINE, FRAME GROUP, )
            if "group" in text_lower:
                print(f"[Page {page_num+1}] Skiping Parts Content Page")
                continue

            # Skip "Parts Catalogue News" pages
            if "parts catalogue news" in text_lower:
                print(f"[Page {page_num+1}] Skipping Parts Catalogue News page")
                continue

            # first 15 lines for section matching
            top_lines = "\n".join(page_text.splitlines()[:15])

            # Match section codes
            if brand == "Yamaha":
                matches = re.findall(r"FIG\.\s*([\w-]+)", page_text)
            elif brand == "Honda":
                matches = re.findall(r"\b((?:E|F|EOP)-\d{1,3}(?:-\d+)?)\b", top_lines)
                print(f"[Page {page_num+1}] Honda matches: {matches}")
            else:
                print("Brand not supported")
                return pd.DataFrame(columns=["image_id", "pdf_id", "section", "image"])

            # Skip if no section match
            if not matches:
                continue

            # Skip TOC-like pages with multiple sections
            if len(matches) > 1:
                print(f"[Page {page_num+1}] Skipping info page with multiple sections number: {matches}")
                continue

            section = matches[0]

            # Check for required table labels (Honda)
            if brand == "Honda":
                if not all(re.search(pattern, text_lower, re.IGNORECASE) for pattern in honda_table_labels):
                    print(f"[Page {page_num+1}] Skipping — missing table labels")
                    continue

            # Extract image
            image_list = page.get_images(full=True)
            if not image_list:
                continue

            # Only the first image listed for the page is used.
            xref = image_list[0][0]
            base_image = doc.extract_image(xref)
            image = normalize_image_background(base_image["image"])

            data.append({
                "image_id": "_".join([pdf_id, section]),
                "pdf_id": pdf_id,
                "section": section,
                "image": image
            })

    return pd.DataFrame(data)

# Driver: run the extraction defined above on a Honda catalogue.
# Format 1: Yamaha
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
# Format 2: Honda
pdf_3 = "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf"
pdf_4 = "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf"

pdf_filepath = pdf_3
brand = "Honda"

batch_id = extract_batch_id(pdf_filepath, brand)
model = extract_model(pdf_filepath)
# Raises TypeError if either helper returned None.
pdf_id = model + '_' + batch_id
image_df = extract_images_with_fig_labels(pdf_filepath, pdf_id, engine, brand)

[Page 1] Honda matches: []
[Page 2] Honda matches: []
[Page 3] Honda matches: []
[Page 4] Skipping Parts Catalogue News page
[Page 5] Honda matches: []
[Page 6] Honda matches: []
[Page 7] Honda matches: []
[Page 8] Honda matches: []
[Page 9] Honda matches: ['E-1']

=== PAGE 9 ===
6
1
2016.09.20     E
Part block and serial number check
1
2
3
Required 
serial 
number
Engine parts
E-1~
[IMAGE FOUND]
Engine serial 
number
Throttle body 
parts
E-18
[IMAGE FOUND]
Throttle body 
serial number
Frame parts
F-1~
Frame serial 
number
Part block
Check 
point
1
2
3
Color label attachment position check
4
Colored parts
Color label
4
[Page 10] Honda matches: []
[Page 11] Honda matches: []
[Page 12] Honda matches: []
[Page 13] Honda matches: []
[Page 14] Honda matches: []
[Page 15] Honda matches: []
[Page 16] Honda matches: []
[Page 17] Honda matches: []
[Page 18] Honda matches: []
[Page 19] Honda matches: []
[Page 20] Honda matches: []
[Page 21] Honda matches: []
[Page 22] Honda matches: []
[Page 23]

In [None]:
def honda_extract_images_with_fig_labels(pdf_path, pdf_id, engine):
    """Extract section images from the Honda main-group overview pages.

    A page is processed when its text, with all whitespace removed and
    lower-cased, contains "enginegroup" or "framegroup" and the page has at
    least one image. Section labels (E-n, F-n, EOP-n, optionally with a
    "-m" suffix) are collected one per text line and paired with the page's
    images by position.

    Args:
        pdf_path: Path of the PDF to read.
        pdf_id: Identifier stored on each row and used as image_id prefix.
        engine: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns image_id, pdf_id, section, image; an empty
        DataFrame when no page qualified.
    """
    MAIN_GROUPS = ["ENGINEGROUP", "FRAMEGROUP"]

    # Compiled once; at most one label is taken per text line.
    section_pattern = re.compile(r"\b((?:E|F|EOP)-\d{1,3}(?:-\d+)?)\b")

    data = []

    # The context manager closes the document even if extraction fails
    # part-way; previously the handle was never released.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            # Named page_text so the sqlalchemy `text` import is not shadowed.
            page_text = page.get_text()

            # --- Check if page is a MAIN GROUP page ---
            # Whitespace is removed so the group title matches however it is split.
            text_no_spaces = re.sub(r"\s+", "", page_text).lower()
            if not any(group.lower() in text_no_spaces for group in MAIN_GROUPS):
                continue  # skip page

            # --- Check if page has images ---
            image_list = page.get_images()
            if not image_list:
                continue  # skip if no images

            # --- Extract section labels from page ---
            sections_found = []
            for line in page_text.splitlines():
                match = section_pattern.search(line)
                if match:
                    sections_found.append(match.group(1))

            if not sections_found:
                print(f"\n=== PAGE {page_num+1} ===")
                print("[SKIP] No sections found")
                continue

            print(f"\n=== PAGE {page_num+1} ===")
            print(f"[MAIN GROUP PAGE] → {len(image_list)} image(s) found")
            print(f"Sections found: {sections_found}")

            # --- Map each section to corresponding image ---
            # NOTE: assumes order of section labels = order of images
            for idx, section in enumerate(sections_found):
                if idx >= len(image_list):
                    print(f"⚠️ Not enough images for sections — stopping at {idx}")
                    break

                xref = image_list[idx][0]
                base_image = doc.extract_image(xref)
                image = normalize_image_background(base_image["image"])

                data.append({
                    "image_id": f"{pdf_id}_{section}",
                    "pdf_id": pdf_id,
                    "section": section,
                    "image": image
                })

    return pd.DataFrame(data)

# Driver: extract Honda section images from the main-group overview pages.
# Format 1: Yamaha
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
# Format 2: Honda
pdf_3 = "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf"
pdf_4 = "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf"

pdf_filepath = pdf_3
brand = "Honda"

batch_id = extract_batch_id(pdf_filepath, brand)
model = extract_model(pdf_filepath)
# Raises TypeError if either helper returned None.
pdf_id = model + '_' + batch_id
image_df = honda_extract_images_with_fig_labels(pdf_filepath, pdf_id, engine)
# Prints the whole frame, including the raw image bytes column.
print (image_df)


=== PAGE 138 ===
[MAIN GROUP PAGE] → 9 image(s) found
Sections found: ['E-1', 'E-2', 'E-3', 'E-4', 'E-5', 'E-5-1', 'E-6', 'E-7', 'E-7-1']

=== PAGE 139 ===
[MAIN GROUP PAGE] → 9 image(s) found
Sections found: ['E-8', 'E-9', 'E-10', 'E-11', 'E-12', 'E-13', 'E-14', 'E-15', 'E-15-1']

=== PAGE 140 ===
[MAIN GROUP PAGE] → 7 image(s) found
Sections found: ['E-15-2', 'E-16', 'E-16-1', 'E-17', 'E-18', 'EOP-1', 'EOP-2']

=== PAGE 141 ===
[MAIN GROUP PAGE] → 9 image(s) found
Sections found: ['F-1', 'F-2', 'F-3', 'F-3-1', 'F-4', 'F-4-1', 'F-5', 'F-6', 'F-7']

=== PAGE 142 ===
[MAIN GROUP PAGE] → 9 image(s) found
Sections found: ['F-8', 'F-9', 'F-10', 'F-11', 'F-12', 'F-13', 'F-13-1', 'F-14', 'F-14-10']

=== PAGE 143 ===
[MAIN GROUP PAGE] → 9 image(s) found
Sections found: ['F-15', 'F-16', 'F-17', 'F-18', 'F-19', 'F-20', 'F-20-10', 'F-21', 'F-22']

=== PAGE 144 ===
[MAIN GROUP PAGE] → 9 image(s) found
Sections found: ['F-23', 'F-24', 'F-25', 'F-26', 'F-27', 'F-28', 'F-29', 'F-29-1', 'F-30']

===

### Object Creation Testing

In [13]:
class PDFProcessor:
    """Base class holding the metadata of one parts-catalogue PDF.

    Brand-specific subclasses are expected to override ``extract_text`` and
    ``extract_images``; the base versions only raise ``NotImplementedError``.
    """

    def __init__(self, pdf_path, pdf_id, brand, model, batch_id, year):
        """Store the PDF location and the identifying metadata as attributes."""
        self.pdf_path = pdf_path
        self.pdf_id = pdf_id
        self.brand = brand
        self.model = model
        self.batch_id = batch_id
        self.year = year

    @staticmethod
    def normalize_image_background(image_bytes):
        """Return the image as RGB PNG bytes, inverted when it is dark on average.

        Args:
            image_bytes: Encoded image data readable by ``PIL.Image.open``.

        Returns:
            bytes: PNG-encoded RGB image. The image is converted to grayscale
            first and inverted if its mean pixel value is below 128.
        """
        img = Image.open(BytesIO(image_bytes)).convert("L")  # Grayscale
        mean_brightness = sum(img.getdata()) / (img.width * img.height)
        if mean_brightness < 128:
            img = ImageOps.invert(img)
        img = img.convert("RGB")
        output = BytesIO()
        img.save(output, format="PNG")
        return output.getvalue()

    def extract_text(self):
        """Extract the parts table; must be overridden by each brand subclass."""
        raise NotImplementedError("Each brand must implement its own text extraction")

    def extract_images(self, engine=None):
        """Extract the section images; must be overridden by each brand subclass.

        ``engine`` is optional so that ``processor.extract_images()`` works
        uniformly across subclasses, some of which take no engine at all.
        """
        raise NotImplementedError("Each brand must implement its own image extraction")

class YamahaProcessor(PDFProcessor):

    @staticmethod
    def reconstruct_lines_from_chars(chars, y_tolerance=2.5):
        lines = defaultdict(list)
        for c in chars:
            y_center = c["top"] + (c["height"] / 2)
            y_bucket = round(y_center / y_tolerance)
            lines[y_bucket].append(c)
        line_texts = []
        for y in sorted(lines.keys()):
            chars_in_line = sorted(lines[y], key=lambda c: c["x0"])
            line = ""
            prev_x = None
            for char in chars_in_line:
                x = char["x0"]
                text = char["text"]
                if prev_x is not None:
                    gap = x - prev_x
                    if gap > 1.5:
                        line += " " * int(gap / 2.5)
                line += text
                prev_x = char["x1"]
            line_texts.append((y, line.rstrip()))
        return line_texts

    def extract_raw_text(self, pdf_path):
        """Collect the stripped text lines of every page whose first line starts with "FIG.".

        Pages are read with pdfplumber and rebuilt from their characters with
        ``reconstruct_lines_from_chars`` (``y_tolerance=5.5``). Empty lines and
        lines consisting only of digits are dropped.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            list[str]: Kept lines in page order.
        """
        kept = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_lines = self.reconstruct_lines_from_chars(page.chars, y_tolerance=5.5)
                starts_with_fig = bool(page_lines) and page_lines[0][1].strip().startswith("FIG.")
                if not starts_with_fig:
                    continue
                for _, text_line in page_lines:
                    candidate = text_line.strip()
                    if candidate and not re.fullmatch(r"\d+", candidate):
                        kept.append(candidate)
        return kept

    @staticmethod
    def structure_raw_text(raw_lines):
        structured_output = []
        skip_indices = set()

        for i in range(len(raw_lines)):
            if i in skip_indices:
                continue

            line = raw_lines[i].strip()
            parts = re.split(r"\s{2,}", line)

            # --- Normalize FIG. rows to always be ['FIG.', 'number', 'description']
            if parts and isinstance(parts[0], str) and parts[0].startswith("FIG."):
                if re.match(r"^FIG\.\s*\d+$", parts[0]):
                    match = re.match(r"^(FIG\.)\s*(\d+)$", parts[0])
                    if match:
                        parts = [match.group(1), match.group(2)] + parts[1:]

                elif re.match(r"^FIG\.\d+$", parts[0]):
                    match = re.match(r"^(FIG\.)(\d+)$", parts[0])
                    if match:
                        parts = [match.group(1), match.group(2)] + parts[1:]

            # --- Skip rows that are just floating descriptions
            if len(parts) == 1 and re.match(r"^[A-Z ,\-0-9]+$", parts[0]):
                continue

            # --- Heuristic: Missing description, try to find it nearby
            if len(parts) >= 2 and not re.search(r"[A-Za-z]", parts[1]):
                # Try backward merge
                if i > 0:
                    prev_line = raw_lines[i - 1].strip()
                    if len(re.split(r"\s{2,}", prev_line)) == 1:
                        parts.insert(1, prev_line)
                        skip_indices.add(i - 1)

                # Try forward merge
                elif i + 1 < len(raw_lines):
                    next_line = raw_lines[i + 1].strip()
                    if len(re.split(r"\s{2,}", next_line)) == 1:
                        parts.insert(1, next_line)
                        skip_indices.add(i + 1)

            # --- Extra fix: Split index + part number if mashed into one string
            if parts and re.match(r"^\d+\s+[A-Z0-9–\-]+$", parts[0]):
                split_part = re.split(r"\s+", parts[0], maxsplit=1)
                parts = split_part + parts[1:]

            structured_output.append(parts)

        # --- Final cleanup
        structured_output = [
            row for row in structured_output
            if not (
                (len(row) == 1 and re.match(r"^[A-Z ,\-0-9]+$", row[0])) or
                all(cell.isdigit() for cell in row)  # <-- remove purely numeric rows
            )
        ]

        return structured_output

    @staticmethod
    def convert_to_table(pdf_id, year, brand, model, batch_id, structured_output):
        rows = []
        section = s_name = prev_section = prev_c_name = prev_ref_no = ""

        for line in structured_output:
            if not line or not line[0]:
                continue

            # FIG. section headers
            if line[0] == "FIG." and len(line) >= 3:
                section = line[1]

                raw_name = " ".join(line[2:])  # Full raw name with possible number
                # Remove trailing digits from component name
                s_name = re.sub(r"\s*\d+$", "", raw_name).strip()

                prev_section, prev_c_name = section, s_name
                continue

            # Fallback to previous if not set
            if not section:
                section, s_name = prev_section, prev_c_name

            # Determine if it's a valid data line
            if len(line) >= 2 and (re.match(r'\w+[-–]\w+', line[0]) or line[0].isdigit()):
                if line[0].isdigit():
                    ref_no = line[0]
                    part_no = line[1]
                    rest = line[2:]
                    prev_ref_no = ref_no
                else:
                    ref_no = prev_ref_no
                    part_no = line[0]
                    rest = line[1:]
            else:
                continue

            # Extract description and additional info
            description = ""
            remarks = ""
            numbers = []
            found_numbers = False
            for item in rest:
                if item.isdigit():
                    numbers.append(item)
                    found_numbers = True
                    continue
                if not found_numbers:
                    description += item + " "
                else:
                    remarks += item + " "

            image_id = f"{pdf_id}_{section}"

            rows.append([
                pdf_id, year, brand, model, batch_id, section, s_name,
                ref_no, part_no, description.strip(), remarks.strip(), image_id
            ])

        return pd.DataFrame(rows, columns=[
            'pdf_id', 'year', 'brand', 'model', 'batch_id',
            'section', 'section_name', 'ref_no', 'part_no',
            'description', 'remarks', 'image_id'
        ])

    def yamaha_extract_images_with_fig_labels(self, pdf_stream, pdf_id):
        """Extract one image per "FIG." section from an in-memory PDF.

        For each page, the first ``FIG. <label>`` found in the page text is the
        section. The first image on the page is taken for a section that has
        not produced an image yet, and is passed through
        ``normalize_image_background``.

        Args:
            pdf_stream: PDF file content as bytes.
            pdf_id: Identifier used to build ``image_id`` as ``"<pdf_id>_<section>"``.

        Returns:
            pandas.DataFrame with columns image_id, pdf_id, section, image
            (the columns are present even when no image was found).
        """
        data = []
        seen_figs = set()
        # Context manager so the document is closed even if extraction fails.
        with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                page_text = page.get_text()
                matches = re.findall(r"FIG\.\s*([\w-]+)", page_text)
                if not matches:
                    continue
                section = matches[0]
                if section in seen_figs:
                    continue
                image_list = page.get_images(full=True)
                if not image_list:
                    # Section stays unseen, so a later page may still supply its image.
                    continue
                xref = image_list[0][0]
                base_image = doc.extract_image(xref)
                image = self.normalize_image_background(base_image["image"])
                data.append({
                    "image_id": f"{pdf_id}_{section}",
                    "pdf_id": pdf_id,
                    "section": section,
                    "image": image
                })
                seen_figs.add(section)
        return pd.DataFrame(data, columns=["image_id", "pdf_id", "section", "image"])

    def extract_text(self):
        raw_lines = self.extract_raw_text(self.pdf_path)
        structured_data = self.structure_raw_text(raw_lines)
        df = self.convert_to_table(
            pdf_id=self.pdf_id,
            year=self.year,
            brand=self.brand,
            model=self.model,
            batch_id=self.batch_id,
            structured_output=structured_data
        )
        return df

    def extract_images(self):
        with open(self.pdf_path, "rb") as f:
            pdf_stream = f.read()
        df = self.yamaha_extract_images_with_fig_labels(
            pdf_stream=pdf_stream,
            pdf_id=self.pdf_id
        )
        return df

class HondaProcessor(PDFProcessor):
    @staticmethod
    def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str) -> pd.DataFrame:
        """
        Extract the parts table of one catalogue section from a PDF.

        Finds the first plain-text line that starts with ``section_code`` and
        contains ``section_title`` (both compared upper-cased), requires a
        'Reqd. QTY' table header after it, re-reads the page range with
        ``extract_text(layout=True)``, then parses each part line and its
        continuation lines into ref_no, part_no, description, remarks.
        Stops collecting at the first line matching 'PART NO. INDEX'.

        Args:
            pdf_path: Path of the PDF, opened with pdfplumber.
            section_code: Section code, matched as a line prefix.
            section_title: Text that must appear on the same line as the code.

        Returns:
            DataFrame with columns ref_no, part_no, description, remarks.

        Raises:
            ValueError: If the section start line or the table header is not found.
        """
        code = section_code.upper()
        title = section_title.upper()

        # A line starting with letters, a dash and digits is treated as a section header.
        next_sec_re     = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
        # Table header ("Reqd. QTY", the dot is optional).
        table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
        # Part number: five or more digits followed by one or more dash-separated groups.
        part_no_re      = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
        # Any line containing "PART NO. INDEX" ends collection.
        end_re          = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)

        # Phase 1: locate page range
        # start_page: index of the page holding the section's header line.
        # header_hit: set once a table header is seen after the start line.
        # end_page:   index of the page where a different section header appears
        #             (used as an exclusive bound in Phase 2).
        start_page = header_hit = None
        end_page = None
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                for ln in (page.extract_text() or "").splitlines():
                    u = ln.strip().upper()
                    if start_page is None:
                        # NOTE(review): the first alternative is a stricter form of the
                        # second, so the "FRAMEGROUP" test does not change the outcome.
                        if (("FRAMEGROUP" in u and u.startswith(code) and title in u)
                            or (u.startswith(code) and title in u)):
                            start_page = i
                            # Leaves only this page's line loop; scanning resumes on the next page.
                            break
                    elif not header_hit:
                        if table_header_re.search(u):
                            header_hit = True
                    else:
                        # NOTE(review): startswith is a prefix test, so a header such as
                        # "E-10" also counts as belonging to code "E-1" - confirm intended.
                        if next_sec_re.match(u) and not u.startswith(code):
                            end_page = i
                            break
                if end_page is not None:
                    break
            if start_page is None or not header_hit:
                raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
            if end_page is None:
                # No following section header found: read through the last page.
                end_page = len(pdf.pages)

            # Phase 2: collect layout-preserved lines
            collected = []
            in_table = False   # becomes True after the first table header line
            stop_all = False   # set when the part-number index is reached
            for pi in range(start_page, end_page):
                for ln in (pdf.pages[pi].extract_text(layout=True) or "").splitlines():
                    u = ln.strip().upper()
                    if end_re.match(u):
                        stop_all = True
                        break
                    if not in_table:
                        if table_header_re.search(u):
                            in_table = True
                        continue
                    # Leaves only the current page's loop; later pages in the range are
                    # still read, and in_table stays True for them.
                    if next_sec_re.match(u) and not u.startswith(code):
                        break
                    collected.append(ln)
                if stop_all:
                    break

        # Phase 3: group into per-part buffers
        # Each line containing a part number opens a record; other lines are
        # appended to the latest record as continuation text.
        records = []
        last_ref = ""
        for ln in collected:
            m_pno = part_no_re.search(ln)
            if m_pno:
                # Leading reference number, written either as "(12)" or "12".
                m_ref = re.match(r'^\s*(?:\((\d+)\)|(\d+))\s+', ln)
                if m_ref:
                    last_ref = m_ref.group(1) or m_ref.group(2)
                # Lines without a leading reference reuse the previous one.
                records.append({
                    "ref":      last_ref,
                    "part_no":  m_pno.group(0),
                    "buf":      [ln[m_pno.end():].strip()]
                })
            else:
                if not records:
                    # Text before the first part line is ignored.
                    continue
                txt = ln.strip()
                # Skip lines that are only digits or only a dddd.dd.dd stamp.
                if re.fullmatch(r'\d+', txt) or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', txt):
                    continue
                records[-1]["buf"].append(txt)

        # Phase 4: parse each buffer directly into column-lists
        ref_nos      = []
        part_nos     = []
        descriptions = []
        remarks_list = []

        for rec in records:
            raw = " ".join(rec["buf"])
            # Remove bullet glyphs and the U+F020 private-use character.
            raw = raw.replace('∙','').replace('•','').replace('\uf020','')
            raw = re.sub(r'\s+', ' ', raw).strip()

            # Text before the first run of eight dashes is the description part,
            # text after it is the catalogue-code part.
            idx       = raw.find("--------")
            desc_part = raw[:idx].strip() if idx != -1 else raw
            cat_part  = raw[idx+8:].strip() if idx != -1 else ""

            # clean up description
            # The substitutions below run in a fixed order; each strips one kind
            # of trailing residue (dot leaders, "GK..." token, dashes/numbers,
            # a number followed by a dddd.dd.dd stamp, single capital letters).
            desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
            desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
            desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
            desc      = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
            desc      = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$',     "", desc).strip()
            desc      = re.sub(r'\.{2,}$',                         "", desc).strip()
            desc      = re.sub(r'(?:\s+[A-Z])+$',                  "", desc).strip()
            # A description without any letter is treated as empty.
            desc      = "" if not re.search(r'[A-Za-z]', desc) else desc

            # clean up catalogue codes → remarks
            if cat_part.upper().startswith("GK") and len(cat_part) > 8:
                # Skip the first eight characters and keep the next whitespace-delimited token.
                cat_clean = cat_part[8:].split()[0]
            else:
                m_codes   = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
                raw_codes = m_codes.group(1) if m_codes else ""
                cat_clean = raw_codes.replace(" ", "")
                # Insert commas to separate codes that were run together.
                cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
                cat_clean = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', cat_clean)
            # Drop a trailing four-digit number.
            cat_clean    = re.sub(r'\d{4}$', '', cat_clean)
            tokens       = [t for t in cat_clean.split(',') if t]
            # Order-preserving de-duplication (seen.add returns None).
            seen         = set()
            final_codes  = [c for c in tokens if c not in seen and not seen.add(c)]
            remarks      = ",".join(final_codes)

            # adjust part_no suffix logic
            # A part number ending in three or more capital letters keeps only the
            # first two; the remaining letters are moved to the front of the description.
            m3 = re.match(r'^(.+?)([A-Z]{3,})$', rec["part_no"])
            if m3:
                core, suf = m3.group(1), m3.group(2)
                part_no    = core + suf[:2]
                desc       = f"{suf[2:]} {desc}".strip()
            else:
                part_no = rec["part_no"]

            ref_nos.append(rec["ref"])
            part_nos.append(part_no)
            descriptions.append(desc)
            remarks_list.append(remarks)

        df = pd.DataFrame({
            'ref_no':      ref_nos,
            'part_no':     part_nos,
            'description': descriptions,
            'remarks':     remarks_list
        })
        return df
    
    @staticmethod
    def extract_all_sections_one_pass(pdf_id, year, brand, model, batch_id, pdf_path: str) -> pd.DataFrame:
        """
        Parse every section of the PDF in one pass and return the parts table.

        Each page is read twice: plain ``extract_text()`` lines are scanned for
        section headers (lines matching ``next_sec_re``), and ``layout=True``
        lines are collected as table content once a 'Reqd. QTY' header has been
        seen for the current section. Collected lines are parsed by the nested
        ``_flush`` helper when a section ends. Processing stops at the first
        line matching 'PART NO. INDEX'. Words ending in "GROUP" are removed
        from section titles.

        Args:
            pdf_id, year, brand, model, batch_id: Values copied into every row.
            pdf_path: Path of the PDF, opened with pdfplumber.

        Returns:
            DataFrame with columns pdf_id, year, brand, model, batch_id,
            section, section_name, ref_no, part_no, description, remarks,
            image_id (``pdf_id + "_" + section``). Nothing is written to disk.
        """
        # A line starting with letters, a dash and digits is treated as a section header.
        next_sec_re     = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
        # Table header ("Reqd. QTY", the dot is optional).
        table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
        # Part number: five or more digits followed by one or more dash-separated groups.
        part_no_re      = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
        # Any line containing "PART NO. INDEX" ends processing.
        end_re          = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)

        # Parallel output columns, appended to by _flush.
        section_nos   = []
        section_names = []
        ref_nos       = []
        part_nos      = []
        descriptions  = []
        remarks_list  = []

        current = None   # dict describing the section being collected, or None
        done    = False  # set once the part-number index has been reached

        def _flush(cur):
            """Parse cur['collected'] into records and append them to the output lists."""
            # Group lines: a line with a part number opens a record, other lines
            # are continuation text for the latest record.
            records = []; last_ref = ""
            for ln in cur['collected']:
                m_pno = part_no_re.search(ln)
                if m_pno:
                    # Leading reference number, written either as "(12)" or "12".
                    m_ref = re.match(r'^\s*(?:\((\d+)\)|(\d+))\s+', ln)
                    if m_ref:
                        last_ref = m_ref.group(1) or m_ref.group(2)
                    records.append({
                        'ref': last_ref,
                        'part_no': m_pno.group(0),
                        'buf': [ln[m_pno.end():].strip()]
                    })
                else:
                    if not records: continue
                    txt = ln.strip()
                    # Skip lines that are only digits or only a dddd.dd.dd stamp.
                    if re.fullmatch(r'\d+', txt) or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', txt):
                        continue
                    records[-1]['buf'].append(txt)

            for rec in records:
                # Remove bullet glyphs and the U+F020 private-use character.
                raw = " ".join(rec['buf']).replace('∙','').replace('•','').replace('\uf020','')
                raw = re.sub(r'\s+', ' ', raw).strip()
                # Text before the first run of eight dashes is the description part,
                # text after it is the catalogue-code part.
                idx = raw.find("--------")
                desc_part = raw[:idx].strip() if idx != -1 else raw
                cat_part  = raw[idx+8:].strip() if idx != -1 else ""

                # description cleanup
                # The substitutions run in a fixed order; each strips one kind of
                # trailing residue from the description text.
                desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
                desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
                desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
                desc = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
                desc = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$', "", desc).strip()
                desc = re.sub(r'\.{2,}$', "", desc).strip()
                desc = re.sub(r'(?:\s+[A-Z])+$', "", desc).strip()
                # A description without any letter is treated as empty.
                desc = "" if not re.search(r'[A-Za-z]', desc) else desc

                # remarks cleanup
                if cat_part.upper().startswith("GK") and len(cat_part) > 8:
                    # Skip the first eight characters and keep the next whitespace-delimited token.
                    cat_clean = cat_part[8:].split()[0]
                else:
                    m_codes   = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
                    raw_codes = m_codes.group(1) if m_codes else ""
                    cat_clean = raw_codes.replace(" ", "")
                    # Insert commas to separate codes that were run together.
                    cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
                    cat_clean = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', cat_clean)
                # Drop a trailing four-digit number.
                cat_clean   = re.sub(r'\d{4}$', '', cat_clean)
                tokens      = [t for t in cat_clean.split(',') if t]
                # Order-preserving de-duplication (seen.add returns None).
                seen        = set()
                final_codes = [c for c in tokens if c not in seen and not seen.add(c)]
                remarks     = ",".join(final_codes)

                # A part number ending in three or more capital letters keeps only the
                # first two; the remaining letters are moved to the front of the description.
                m3 = re.match(r'^(.+?)([A-Z]{3,})$', rec['part_no'])
                if m3:
                    core, suf = m3.group(1), m3.group(2)
                    pno        = core + suf[:2]
                    desc       = f"{suf[2:]} {desc}".strip()
                else:
                    pno = rec['part_no']

                section_nos.append(cur['code'])
                section_names.append(cur['title'])
                ref_nos.append(rec['ref'])
                part_nos.append(pno)
                descriptions.append(desc)
                remarks_list.append(remarks)

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                if done:
                    break

                plain  = (page.extract_text() or "").splitlines()
                layout = (page.extract_text(layout=True) or "").splitlines()

                # detect new section headers
                # All plain lines are scanned before any layout line is collected, so
                # `current` is the last header found on this page when collection starts.
                for ln in plain:
                    if done:
                        break
                    u = ln.strip().upper()
                    if next_sec_re.match(u):
                        if current:
                            _flush(current)
                        parts = ln.strip().split(None, 1)
                        raw_title = parts[1].strip() if len(parts) > 1 else ""
                        # Remove words ending in "GROUP"; the pattern is not anchored,
                        # so it applies anywhere in the title.
                        title = re.sub(r'\b[A-Z]+GROUP\b\s*', '', raw_title, flags=re.IGNORECASE)
                        current = {
                            'code':       parts[0].upper(),
                            'title':      title,
                            'header_hit': False,
                            'collected':  []
                        }

                # collect layout lines
                if current:
                    for ln in layout:
                        u = ln.strip().upper()
                        if end_re.match(u):
                            _flush(current)
                            done = True
                            break
                        # Ignore everything until this section's table header is seen.
                        if not current['header_hit']:
                            if table_header_re.search(u):
                                current['header_hit'] = True
                            continue
                        # NOTE(review): startswith is a prefix test, so a header such as
                        # "E-10" also counts as belonging to code "E-1" - confirm intended.
                        if next_sec_re.match(u) and not u.startswith(current['code']):
                            _flush(current)
                            current = None
                            break
                        current['collected'].append(ln)

        # Flush the last open section when the index page was never reached.
        if current and not done:
            _flush(current)

        final_df = pd.DataFrame({
            'pdf_id': pdf_id,       # single value, repeated on every row
            'year': year,           # single value, repeated on every row
            'brand': brand,         # single value, repeated on every row
            'model': model,         # single value, repeated on every row
            'batch_id': batch_id,   # single value, repeated on every row
            'section':   section_nos,
            'section_name': section_names,
            'ref_no':       ref_nos,
            'part_no':      part_nos,
            'description':  descriptions,
            'remarks':      remarks_list,
        })
        final_df["image_id"] = final_df["pdf_id"] + "_" + final_df["section"]
        return final_df
    
    @staticmethod
    def honda_extract_images_with_fig_labels(pdf_stream, pdf_id):
        """Extract section images from the group overview pages of an in-memory PDF.

        A page is used when its text (whitespace removed, case-insensitive)
        contains "ENGINEGROUP" or "FRAMEGROUP", it has at least one image, and
        at least one section label (``E-n``, ``F-n``, ``EOP-n``, optionally with
        a ``-m`` suffix) is found in its lines. Labels and images are paired by
        position.

        Declared as a static method because it takes no ``self``; this keeps
        ``HondaProcessor.honda_extract_images_with_fig_labels(stream, pdf_id)``
        working and also makes calls through an instance behave correctly.

        Args:
            pdf_stream: PDF file content as bytes.
            pdf_id: Identifier used to build ``image_id`` as ``"<pdf_id>_<section>"``.

        Returns:
            pandas.DataFrame with columns image_id, pdf_id, section, image
            (the columns are present even when no image was found).
        """
        MAIN_GROUPS = ["ENGINEGROUP", "FRAMEGROUP"]
        section_pattern = r"\b((?:E|F|EOP)-\d{1,3}(?:-\d+)?)\b"

        data = []

        # Context manager so the document is closed even if extraction fails.
        with fitz.open(stream=pdf_stream, filetype="pdf") as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                page_text = page.get_text()

                # --- Only MAIN GROUP pages are of interest ---
                text_no_spaces = re.sub(r"\s+", "", page_text).lower()
                if not any(group.lower() in text_no_spaces for group in MAIN_GROUPS):
                    continue

                # --- Skip pages without images ---
                image_list = page.get_images()
                if not image_list:
                    continue

                # --- Collect at most one section label per text line ---
                sections_found = []
                for line in page_text.splitlines():
                    match = re.search(section_pattern, line)
                    if match:
                        sections_found.append(match.group(1))

                # --- Map each section to the image at the same position ---
                # NOTE: assumes order of section labels = order of images.
                # zip() stops at the shorter list, so surplus labels or images are ignored.
                for section, image_info in zip(sections_found, image_list):
                    xref = image_info[0]
                    base_image = doc.extract_image(xref)
                    # Use the shared helper on the base class rather than a
                    # notebook-level function that may not be defined.
                    image = PDFProcessor.normalize_image_background(base_image["image"])

                    data.append({
                        "image_id": f"{pdf_id}_{section}",
                        "pdf_id": pdf_id,
                        "section": section,
                        "image": image
                    })

        return pd.DataFrame(data, columns=["image_id", "pdf_id", "section", "image"])

    def extract_text(self):
        df = self.extract_all_sections_one_pass(
            pdf_id=self.pdf_id,
            year=self.year,
            brand=self.brand,
            model=self.model,
            batch_id=self.batch_id,
            pdf_path=self.pdf_path
        )
        return df

    def extract_images(self, engine):
        with open(self.pdf_path, "rb") as f:
            pdf_stream = f.read()
        df = self.yamaha_extract_images_with_fig_labels(
            pdf_stream=pdf_stream,
            pdf_id=self.pdf_id
        )
        return df

def extract_model(pdf_name):
    """Return the model name encoded at the start of a catalogue file name.

    The model is the leading run of letters, digits and spaces (everything up
    to the first other character such as ``'`` or ``_``), with spaces removed.
    Any directory part is dropped first, so both ``"CRF1000 A_PC_....pdf"`` and
    ``"Manuals/CRF1000 A_PC_....pdf"`` yield ``"CRF1000A"``.

    Args:
        pdf_name: File name or path of the PDF.

    Returns:
        str | None: The model name, or None when the file name does not start
        with a letter, digit or space.
    """
    # Keep only the last path component; split on both separators so the result
    # does not depend on the operating system.
    filename = re.split(r"[\\/]", pdf_name)[-1]
    match = re.match(r"([A-Za-z0-9 ]+)", filename)
    if match:
        return match.group(1).replace(" ", "")  # Removes any spaces
    return None
def extract_batch_id(pdf_name, brand):
    """Return the batch identifier encoded in a catalogue file name.

    Yamaha: the comma-separated codes inside the first pair of parentheses,
    stripped and joined with underscores.
    Honda: the first run of 6-10 capitals/digits enclosed by underscores.

    Returns:
        str | None: The batch id, or None for other brands or when the file
        name does not contain the expected pattern.
    """
    if brand == "Yamaha":
        found = re.search(r"\((.*?)\)", pdf_name)
        if found is None:
            return None
        return "_".join(code.strip() for code in found.group(1).split(","))

    if brand == "Honda":
        found = re.search(r"_([A-Z0-9]{6,10})_", pdf_name)
        return found.group(1) if found else None

    return None
def extract_year(pdf_name, brand):
    """Return the model year encoded in a Yamaha catalogue file name.

    The two digits following the first apostrophe are prefixed with "20"
    (``'19`` -> ``"2019"``).

    Returns:
        str | None: The four-digit year as a string, or None when the brand is
        not "Yamaha" or no apostrophe-prefixed digits are present.
    """
    # Honda file names are not parsed; a pattern for a "20xx_20yy" range was
    # sketched but left disabled:
    #     match = re.search(r"(20\d{2}_20\d{2})", pdf_name)
    if brand != "Yamaha":
        return None

    found = re.search(r"'(\d{2})", pdf_name)
    if found is None:
        return None
    return "20" + found.group(1)


In [17]:
# Format 1: Yamaha
pdf_1 = "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf"
pdf_2= "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF"
# Format 2: Honda
pdf_3 = "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf"
pdf_4 = "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf"

pdf_file = pdf_3
brand = "Honda"
filename = os.path.basename(pdf_file)

model = extract_model(filename)
batch_id = extract_batch_id(filename, brand)
year = extract_year(filename, brand)
# Both helpers return None when the file name does not match; fail with a
# clear message instead of a TypeError from `str + None` below.
if model is None or batch_id is None:
    raise ValueError(f"Could not derive model/batch id from file name: {filename!r}")
pdf_id = model + '_' + batch_id

# Dispatch on brand. An unsupported brand must raise rather than leave
# `processor` undefined or silently reuse one left over from an earlier run.
processor_classes = {"Yamaha": YamahaProcessor, "Honda": HondaProcessor}
if brand not in processor_classes:
    raise ValueError(f"Unsupported brand: {brand!r}")
processor = processor_classes[brand](pdf_file, pdf_id, brand, model, batch_id, year)

df_text = processor.extract_text()
#df_images = processor.extract_images()
print(df_text.to_string(index=False))

           pdf_id year brand    model batch_id section                                  section_name ref_no         part_no                                                        description                                                                remarks                  image_id
CRF1000A_13MJPG02 None Honda CRF1000A 13MJPG02     E-1                           CYLINDER HEAD COVER      1   12229-HL4-000                                                    SEAL, PLUG TUBE                                                                            CRF1000A_13MJPG02_E-1
CRF1000A_13MJPG02 None Honda CRF1000A 13MJPG02     E-1                           CYLINDER HEAD COVER      2   12300-MJP-G50                                         COVER ASSY., CYLINDER HEAD                                                                            CRF1000A_13MJPG02_E-1
CRF1000A_13MJPG02 None Honda CRF1000A 13MJPG02     E-1                           CYLINDER HEAD COVER      3   12341-MAT-750             