In [None]:
# streamlit: Web app framework (No html/css required)
# openai: OpenAI API wrapper
# faiss-cpu: Vector Database
# langchain: LLM wrapper
# opencv-python: Image processing
# pandas: Data manipulation
# sqlalchemy: Database connection

import openai
import faiss
import langchain
import pandas as pd
from sqlalchemy import (
    select, create_engine, Table, Column, Integer, String, MetaData,
    ForeignKey, ForeignKeyConstraint, UniqueConstraint, LargeBinary, text
)
from sqlalchemy.orm import Session, sessionmaker

# PyMuPDF, pdfplumber, or OCR
from pdf2image import convert_from_path
import os
import re
import pdfplumber
import fitz  # PyMuPDF

# For Image Display within the df
from IPython.display import display, Image
from PIL import Image, ImageOps
from io import BytesIO
from IPython.display import display, Image as IPImage

In [None]:
#josiacdwojaoicjdoiaociwdjcoaijcadoijaoij

In [None]:
# Database connection settings.
# Each value can be overridden through an environment variable so real
# credentials do not have to be written into the notebook; the fallbacks are
# the local development values this notebook has always used.
from urllib.parse import quote_plus

username = os.environ.get("MPDB_USER", "postgres")
password = os.environ.get("MPDB_PASSWORD", "MPAMS")
host = os.environ.get("MPDB_HOST", "localhost")
port = os.environ.get("MPDB_PORT", "5432")
database = os.environ.get("MPDB_NAME", "MPDB")

# SQLAlchemy connection URL. User and password are URL-escaped so characters
# such as '@', ':' or '/' in a credential cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)

# Create engine
engine = create_engine(DATABASE_URL)

In [None]:
# Schema for the two tables used by the extraction cells below.
# NOTE(review): "year", "section" and "ref_no" are declared Integer, while the
# extraction functions in this notebook build those values as strings
# (e.g. extract_year returns "2019") — presumably the driver/database coerces
# them on insert; confirm, and confirm that non-numeric figure labels cannot occur.
metadata = MetaData()

# One row per part line parsed out of a parts-catalogue PDF.
master_parts_list = Table(
    "master_parts_list", metadata,
    Column("mpl_id", Integer, primary_key=True),
    Column("pdf_id", String, nullable=False),
    Column("year", Integer, nullable=False),
    Column("brand", String, nullable=False),
    Column("model", String, nullable=False),
    Column("section", Integer, nullable=False), # AKA "fig_no" for pdf
    Column("component_name", String, nullable=False),
    Column("ref_no", Integer, nullable=False),
    Column("part_no", String, nullable=False),
    Column("description", String, nullable=False),
    Column("remarks", String),
    # Optional link to the figure image; ON DELETE SET NULL is requested for the foreign key.
    Column("image_id", String, ForeignKey("parts_images.image_id", ondelete="SET NULL"))
)

# One row per figure image; the extraction code builds image_id as "<pdf_id>_<fig_no>".
parts_images = Table(
    "parts_images", metadata,
    Column("image_id", String, primary_key=True),
    Column("pdf_id", String, nullable=False),
    Column("fig_no", Integer, nullable=False),
    Column("image", LargeBinary, nullable=False),
)

# Create the tables in the database the engine is connected to
metadata.create_all(engine)

### Week 3: Inventory Data Extraction & Indexing (2a–2c)

In [None]:
# Parse PDFs (Use PyMuPDF, pdfplumber, or OCR)
# Store structured data in a relevant data warehouse or database (e.g., SQLite/PostgreSQL).
# Use vector embeddings and FAISS to enable semantic search and RAG.

In [15]:
# Parts-catalogue manuals, grouped by page layout.

# Format 1: Yamaha
# Important images from page 6-60
pdf_1, pdf_2 = (
    "Manuals/AEROX 155 '19 (B65P, B65R, B65S).pdf",
    "Manuals/FJR1300A '15 (1MCH, 1MCG).PDF",
)

# Format 2: Honda
pdf_3, pdf_4 = (
    "Manuals/CRF1000 A_PC_13MJPG02_(G.H).pdf",
    "Manuals/NC750XAP_13MKWM02_PC_2022_2023.pdf",
)

# Manual that the cells below operate on.
pdf_path = pdf_1

In [None]:
# Preview the raw text pdfplumber extracts from a page range.
# start_page / end_page are 1-indexed and inclusive.
# (pdfplumber is already imported in the imports cell at the top.)
start_page = 23
end_page = 23

page_chunks = []

with pdfplumber.open(pdf_path) as pdf:
    # Clamp the range to the document so an end_page past the last page
    # cannot raise IndexError.
    first_index = max(start_page - 1, 0)
    last_index = min(end_page, len(pdf.pages))

    for page_num in range(first_index, last_index):  # page_num is 0-indexed
        # Deliberately NOT named `text`: a module-level `text` would overwrite
        # the `text` function imported from sqlalchemy, which
        # get_existing_fig_combos calls later in the notebook.
        page_text = pdf.pages[page_num].extract_text()

        if page_text:  # If the page contains text
            page_chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")  # 1-indexed in output

# All extracted text, in page order
all_text = "".join(page_chunks)

# Optionally, print the extracted text or save to a file
print(all_text)

# If you want to save the text to a text file:
with open("extracted_text.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

In [16]:
# Test
# ---------- HELPERS ----------
def extract_pdf_id(pdf_path):
    """Derive a compact identifier for a PDF from its file name.

    The file name is cut at its first '.', the leading run of ASCII letters,
    digits and spaces is kept, and every space is removed
    (e.g. "AEROX 155 '19 (...).pdf" -> "AEROX155").

    Returns None when the name does not start with such a character.
    """
    stem = os.path.basename(pdf_path).partition('.')[0]
    leading = re.match(r"([A-Za-z0-9 ]+)", stem)
    return leading.group(1).replace(" ", "") if leading else None

def extract_year(pdf_path):
    """Return the year encoded in the path as an apostrophe plus two digits.

    "'19" yields the string "2019"; None is returned when the path holds no
    such marker.
    """
    found = re.search(r"'(\d{2})", pdf_path)
    if found is None:
        return None
    return "20" + found.group(1)

def extract_model(pdf_path):
    """Return the text inside the first pair of parentheses in the file name.

    For "AEROX 155 '19 (B65P, B65R, B65S).pdf" this is "B65P, B65R, B65S".
    Returns None when the file name has no parenthesised part.
    """
    filename = os.path.basename(pdf_path)
    parenthesised = re.search(r"\((.*?)\)", filename)
    return parenthesised.group(1) if parenthesised else None

# ---------- IMAGE EXTRACTION ----------
def normalize_image_background(image_bytes):
    """Return the image as PNG bytes, inverted when it is predominantly dark.

    The image is decoded and converted to grayscale; if the mean pixel value
    is below 128 it is inverted. The result is converted to RGB and encoded
    as PNG.
    """
    gray = Image.open(BytesIO(image_bytes)).convert("L")
    width, height = gray.size
    mean_brightness = sum(gray.getdata()) / (width * height)

    # Dark background: flip it so the drawing ends up on a light background.
    normalized = ImageOps.invert(gray) if mean_brightness < 128 else gray

    buffer = BytesIO()
    normalized.convert("RGB").save(buffer, format="PNG")
    return buffer.getvalue()

def extract_images_with_fig_labels(pdf_path, pdf_id, engine):
    """Extract one normalized diagram image per not-yet-stored figure in a PDF.

    A page contributes an image when its text contains ``FIG. <label>``: the
    first label on the page is used as ``fig_no`` and the first entry of the
    page's image list as the diagram. Figures already seen earlier in the
    document, or already present in ``parts_images`` for ``pdf_id``, are
    skipped.

    Args:
        pdf_path: Path of the PDF to scan.
        pdf_id: Identifier stored with every image and used to build
            ``image_id`` as ``"<pdf_id>_<fig_no>"``.
        engine: SQLAlchemy engine used to look up figures already stored.

    Returns:
        DataFrame with the columns ``image_id``, ``pdf_id``, ``fig_no`` and
        ``image`` (PNG bytes). The columns are present even when no image
        was found.
    """
    columns = ["image_id", "pdf_id", "fig_no", "image"]
    data = []

    # Figures already stored for this PDF, as strings
    existing_figs = get_existing_fig_combos(engine, pdf_id)
    seen_figs = set()  # Figures already collected from this document

    # Open the document as a context manager so it is closed even when a page
    # fails to parse; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            matches = re.findall(r"FIG\.\s*([\w-]+)", page.get_text())
            if not matches:
                continue

            fig_no = matches[0]
            if fig_no in seen_figs or fig_no in existing_figs:
                continue  # Already handled in this run or already in the DB

            image_list = page.get_images(full=True)
            if not image_list:
                continue

            # Only the first image listed for the page is used.
            xref = image_list[0][0]
            base_image = doc.extract_image(xref)

            data.append({
                "image_id": "_".join([pdf_id, fig_no]),
                "pdf_id": pdf_id,
                "fig_no": fig_no,
                "image": normalize_image_background(base_image["image"]),
            })
            seen_figs.add(fig_no)

    return pd.DataFrame(data, columns=columns)

def get_existing_fig_combos(engine, pdf_id):
    """Return the fig_no values stored in parts_images for pdf_id.

    Values are converted to str so they can be compared with the figure
    labels parsed from PDF text.
    """
    query = text("SELECT fig_no FROM parts_images WHERE pdf_id = :pdf_id")
    with engine.connect() as conn:
        stored_rows = conn.execute(query, {"pdf_id": pdf_id}).fetchall()
    return {str(stored[0]) for stored in stored_rows}

# ---------- TEXT EXTRACTION ----------
def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page that contains "FIG.".

    Pages without "FIG." are skipped. Scanning stops at the first page that
    contains "FIG." and whose first line contains "NUMERICAL INDEX". Each
    kept page is prefixed with a "--- Page N ---" marker (N is 1-indexed).
    """
    kept_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            page_lines = text.split('\n')

            if all("FIG." not in page_line for page_line in page_lines):
                continue

            # str.split always yields at least one element, so [0] is safe.
            if "NUMERICAL INDEX" in page_lines[0].strip():
                break

            kept_pages.append(f"\n--- Page {page_num + 1} ---\n{text}\n")
    return "".join(kept_pages)

def yamaha_process_data(text, pdf_id, year, model, num_model):
    """Turn extracted catalogue text into one row per part line.

    A line starting with "FIG." and holding at least three tokens sets the
    current section (second token) and component name (remaining tokens).
    After the first such header, a line is a part line when it has at least
    two tokens and its first token is all digits (a reference number) or
    looks like "<word>-<word>" (a part number). Part lines without a
    reference number reuse the most recent one.

    Within the remaining tokens, words before the first all-digit token form
    the description and words after it form the remarks. When a line holds
    more all-digit tokens than ``num_model``, the first of them is appended
    to the description.

    Returns a DataFrame with the columns pdf_id, year, brand, model, section,
    component_name, ref_no, part_no, description, remarks, image_id.
    """
    column_names = [
        'pdf_id', 'year', 'brand', 'model', 'section', 'component_name',
        'ref_no', 'part_no', 'description', 'remarks', 'image_id'
    ]
    records = []
    section = c_name = prev_fig_no = prev_c_name = prev_ref_no = ""
    collecting = False

    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('FIG.'):
            header = line.split()
            if len(header) >= 3:
                section = header[1]
                c_name = " ".join(header[2:])
                prev_fig_no, prev_c_name = section, c_name
                collecting = True
            continue

        if not collecting:
            continue
        if not section:
            section, c_name = prev_fig_no, prev_c_name

        tokens = line.split()
        if len(tokens) < 2:
            continue
        lead = tokens[0]
        if not (re.match(r'\w+[-–]\w+', lead) or lead.isdigit()):
            continue

        if lead.isdigit():
            ref_no, part_no, tail = lead, tokens[1], tokens[2:]
            prev_ref_no = ref_no
        else:
            ref_no, part_no, tail = prev_ref_no, lead, tokens[1:]

        # Split the tail around its first all-digit token.
        numbers = [tok for tok in tail if tok.isdigit()]
        first_number_at = next(
            (pos for pos, tok in enumerate(tail) if tok.isdigit()), len(tail)
        )
        desc_words = tail[:first_number_at]
        if len(numbers) > num_model:
            desc_words = desc_words + [numbers[0]]
        description = " ".join(desc_words)
        remarks = " ".join(tok for tok in tail[first_number_at:] if not tok.isdigit())

        image_id = "_".join([pdf_id, section])
        records.append([
            pdf_id, year, "Yamaha", model, section, c_name,
            ref_no, part_no, description, remarks, image_id,
        ])

    return pd.DataFrame(records, columns=column_names)

# ---------- MAIN PROCESS ----------
def yamaha_data_extraction(pdf_path):
    """Load one Yamaha manual into the database: images first, then parts.

    New figure images are appended to ``parts_images``. If
    ``master_parts_list`` already holds any row for this PDF's id the function
    stops there; otherwise the PDF text is parsed and the resulting part rows
    are appended to ``master_parts_list``. Progress is reported with
    ``[INFO]`` prints. Uses the module-level ``engine``.
    """
    pdf_id = extract_pdf_id(pdf_path)
    year = extract_year(pdf_path)
    model = extract_model(pdf_path)

    session = sessionmaker(bind=engine)()
    try:
        # Step 1: store figure images that are not in the database yet
        df_images = extract_images_with_fig_labels(pdf_path, pdf_id, engine)
        image_message = f"[INFO] Inserted {len(df_images)} new images for '{pdf_id}'."
        if df_images.empty:
            print(image_message + f" All images for '{pdf_id}' already exist.")
        else:
            df_images.to_sql("parts_images", engine, if_exists="append", index=False, method="multi")
            print(image_message)

        # Step 2: stop if parts data for this PDF was loaded before
        parts_exist_query = (
            select(1)
            .select_from(master_parts_list)
            .where(master_parts_list.c.pdf_id == pdf_id)
        )
        if session.execute(parts_exist_query).first():
            print(f"[INFO] Master Parts data for '{pdf_id}' already exists.")
            return
    finally:
        session.close()

    # Step 3: parse and store the parts data (the session is closed by now)
    df_parts = yamaha_process_data(
        extract_text_from_pdf(pdf_path), pdf_id, year, model, num_model=3
    )

    if df_parts.empty:
        print(f"[INFO] Error, no parts data extracted for '{pdf_id}'.")
        return

    df_parts.to_sql("master_parts_list", engine, if_exists="append", index=False, method="multi")
    print(f"[INFO] Inserted parts data for '{pdf_id}'.")

# Route the selected manual to the extraction routine for its brand.
brand = "Yamaha"
supported_brands = ['Yamaha', 'Honda']

if brand not in supported_brands:
    print (f'"{brand}" not supported \nAvailable Brands: {supported_brands}')
elif brand == "Yamaha":
    yamaha_data_extraction(pdf_path)
elif brand == "Honda":
    # Placeholder: only a greeting is printed for Honda.
    print("Hi")

[INFO] Inserted 38 new images for 'AEROX155'.
[INFO] Inserted parts data for 'AEROX155'.


In [None]:
# (Reference) Original Image Extraction Code

from PIL import ImageOps

def normalize_image_background(image_bytes):
    """Return the image as PNG bytes, inverted when it is predominantly dark.

    The image is decoded and converted to grayscale; a mean pixel value below
    128 (likely light lines on a dark background) triggers an inversion. The
    result is converted to RGB and encoded as PNG.
    """
    gray = Image.open(BytesIO(image_bytes)).convert("L")
    width, height = gray.size
    mean_brightness = sum(gray.getdata()) / (width * height)

    normalized = ImageOps.invert(gray) if mean_brightness < 128 else gray

    # Encode to PNG in memory
    buffer = BytesIO()
    normalized.convert("RGB").save(buffer, format="PNG")
    return buffer.getvalue()

def extract_images_with_fig_labels(pdf_path, pdf_id):
    """Extract one normalized diagram image per figure label found in a PDF.

    A page contributes an image when its text contains ``FIG. <label>``: the
    first label on the page is used as ``fig_no`` and the first entry of the
    page's image list as the diagram. Only the first page of each figure
    label is used.

    Args:
        pdf_path: Path of the PDF to scan.
        pdf_id: Identifier stored with every image.

    Returns:
        DataFrame with the columns ``pdf_id``, ``fig_no`` and ``image`` (PNG
        bytes). The columns are present even when no image was found, so
        callers can index ``df["fig_no"]`` safely.
    """
    columns = ["pdf_id", "fig_no", "image"]
    data = []
    seen_figs = set()

    # Open the document as a context manager so it is closed even when a page
    # fails to parse; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            matches = re.findall(r"FIG\.\s*([\w-]+)", page.get_text())
            if not matches:
                continue

            fig_no = matches[0]
            if fig_no in seen_figs:
                continue

            image_list = page.get_images(full=True)
            if not image_list:
                continue

            # Only the first image listed for the page is used.
            xref = image_list[0][0]
            base_image = doc.extract_image(xref)

            data.append({
                "pdf_id": pdf_id,
                "fig_no": fig_no,
                # Dark-background drawings are inverted here
                "image": normalize_image_background(base_image["image"]),
            })
            seen_figs.add(fig_no)

    return pd.DataFrame(data, columns=columns)

# Function to resize and display images from DataFrame
def display_resized_image(image, size=(100, 100)):
    """Show image bytes inline as a PNG thumbnail.

    Args:
        image: Encoded image bytes.
        size: Bounding box passed to ``thumbnail``; defaults to (100, 100).
    """
    thumb = Image.open(BytesIO(image))
    thumb.thumbnail(size)  # Shrinks in place, keeping the aspect ratio

    # Re-encode the thumbnail as PNG in memory and hand the bytes to IPython
    buffer = BytesIO()
    thumb.save(buffer, format='PNG')
    display(IPImage(data=buffer.getvalue(), format='png'))

# Updated part to display images with a small size
def show_images_in_df(df):
    """Print a caption and show a 100x100 thumbnail for every row of df.

    Each row is expected to provide ``fig_no`` and ``image`` entries.
    """
    for _, record in df.iterrows():
        print(f"Displaying image for FIG. {record['fig_no']}")
        display_resized_image(record['image'], size=(100, 100))

# 1. Function to get existing (pdf_id, fig_no) pairs from DB
def get_existing_fig_combos(engine, pdf_id):
    """Return the fig_no values stored in parts_images for pdf_id.

    Values are converted to str so they can be compared with the figure
    labels parsed from PDF text.
    """
    query = text("SELECT fig_no FROM parts_images WHERE pdf_id = :pdf_id")
    with engine.connect() as conn:
        stored_rows = conn.execute(query, {"pdf_id": pdf_id}).fetchall()
    return {str(stored[0]) for stored in stored_rows}

# 2. Main processing function
def main(pdf_path):
    """Dry run of the image pipeline: extract, preview and report new images.

    Extracts the figure images of ``pdf_path``, shows thumbnails of the first
    five, and reports how many are not yet stored in ``parts_images``.
    Nothing is written to the database: the insert is commented out.
    """
    pdf_id = extract_pdf_id(pdf_path)
    df = extract_images_with_fig_labels(pdf_path, pdf_id)
    show_images_in_df(df.head(5))

    print(f"Found {len(df)} images in PDF: {pdf_path}")

    # A PDF without figure images yields a frame that may have no "fig_no"
    # column at all; stop here instead of failing with KeyError below.
    if df.empty:
        print("0 new images to insert.")
        print("No new images to insert.")
        return

    # Ensure fig_no is string for comparison with the stored values
    df = df.assign(fig_no=df["fig_no"].astype(str))

    # Keep only the figures that are not in the DB yet
    existing_figs = get_existing_fig_combos(engine, pdf_id)
    df = df[~df["fig_no"].isin(existing_figs)]

    print(f"{len(df)} new images to insert.")

    if not df.empty:
        print(df.head(5).to_string(index=False))
        # Insert into database (disabled in this reference cell).
        # NOTE(review): this frame has no "image_id" column, which is the
        # primary key of parts_images as declared in this notebook — confirm
        # before re-enabling.
        #df.to_sql("parts_images", engine, if_exists="append", index=False, method="multi")
        print("Insertion skipped (dry run): the to_sql call is commented out.")
    else:
        print("No new images to insert.")

main(pdf_path)

In [None]:
# (Reference) Original Function to extract text from the PDF
def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page that contains "FIG.".

    Pages without "FIG." are skipped. Scanning stops, with a printed notice,
    at the first page that contains "FIG." and whose first line contains
    "NUMERICAL INDEX". Each kept page is prefixed with a "--- Page N ---"
    marker (N is 1-indexed).
    """
    kept_pages = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            page_lines = text.split('\n')

            # No spare-parts data on this page
            if all("FIG." not in page_line for page_line in page_lines):
                continue

            # str.split always yields at least one element, so [0] is safe.
            if "NUMERICAL INDEX" in page_lines[0].strip():
                print(f"Found 'NUMERICAL INDEX' on page {page_num + 1}. Stopping further processing.")
                break

            kept_pages.append(f"\n--- Page {page_num + 1} ---\n{text}\n")

    return "".join(kept_pages)

# Process function now accepts a pdf_id as argument
def process_data(text, pdf_id, year, num_model=3):
    """Turn extracted catalogue text into one row per part line.

    A line starting with "FIG." and holding at least three tokens sets the
    current figure number (second token) and component name (remaining
    tokens). After the first such header, a line is a part line when it has
    at least two tokens and its first token either looks like "<word>-<word>"
    or starts with a digit. If that first token is all digits it is the
    reference number and the next token the part number; otherwise the first
    token is the part number and the most recent reference number is reused.

    Within the remaining tokens, words before the first all-digit token form
    the description and words after it form the remarks. When a line holds
    more all-digit tokens than ``num_model``, the first of them is appended
    to the description.

    Returns a DataFrame with the columns pdf_id, year, brand, fig_no,
    component_name, ref_no, part_no, description, remarks.
    """
    records = []
    fig_no = c_name = prev_fig_no = prev_c_name = prev_ref_no = ""
    collecting = False

    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Figure header: remember figure number and component name
        if line.startswith('FIG.'):
            header = line.split()
            if len(header) >= 3:
                fig_no = header[1]
                c_name = " ".join(header[2:])
                prev_fig_no, prev_c_name = fig_no, c_name
                collecting = True
            continue

        # Nothing is collected before the first figure header
        if not collecting:
            continue

        # Fall back to the previous figure if none is set
        if not fig_no:
            fig_no, c_name = prev_fig_no, prev_c_name

        tokens = line.split()
        if len(tokens) < 2:
            continue
        lead = tokens[0]
        if not (re.match(r'\w+[-–]\w+', lead) or re.match(r'\d+', lead)):
            continue

        if lead.isdigit():
            ref_no, part_no, tail = lead, tokens[1], tokens[2:]
            prev_ref_no = ref_no
        else:
            ref_no, part_no, tail = prev_ref_no, lead, tokens[1:]

        # Split the tail (description + quantities + remarks) around its
        # first all-digit token.
        numbers = [tok for tok in tail if tok.isdigit()]
        first_number_at = next(
            (pos for pos, tok in enumerate(tail) if tok.isdigit()), len(tail)
        )
        desc_words = tail[:first_number_at]
        if len(numbers) > num_model:
            desc_words = desc_words + [numbers[0]]
        description = " ".join(desc_words)
        remarks = " ".join(tok for tok in tail[first_number_at:] if not tok.isdigit())

        records.append([pdf_id, year, "Yamaha", fig_no, c_name, ref_no, part_no, description, remarks])

    return pd.DataFrame(
        records,
        columns=['pdf_id', 'year', 'brand', 'fig_no', 'component_name', 'ref_no', 'part_no', 'description', 'remarks']
    )

# Extract the first part of the filename (before any year or bracketed sections)
def extract_pdf_id(pdf_path):
    """Derive an identifier for a PDF from its file name.

    The file name is cut at its first '.', and the leading run of ASCII
    letters, digits and spaces is returned with surrounding whitespace
    stripped (e.g. "AEROX 155 '19 (...).pdf" -> "AEROX 155").

    Returns None when the name does not start with such a character.
    """
    stem = os.path.basename(pdf_path).partition('.')[0]
    head = re.match(r"([A-Za-z0-9 ]+)", stem)
    if head is None:
        return None
    return head.group(1).strip()

# Extract year from the filename (if available)
def extract_year(pdf_path):
    """Return the year encoded in the path as an apostrophe plus two digits.

    "'15" yields the string "2015"; None is returned when no such marker is
    present.
    """
    hit = re.search(r"'(\d{2})", pdf_path)
    return ("20" + hit.group(1)) if hit else None

# Main function to process PDF
def main(pdf_path):
    """Parse one manual, print the parsed table and append it to the database.

    NOTE(review): process_data emits a 'fig_no' column and no 'model' column,
    while master_parts_list as declared in this notebook has 'section' and a
    NOT NULL 'model' — this reference version presumably predates that
    schema; confirm before running it against the current tables.
    """
    # Identifier and year both come from the file name
    pdf_id = extract_pdf_id(pdf_path)
    year = extract_year(pdf_path)

    # Raw text of the parts pages -> structured rows
    df = process_data(extract_text_from_pdf(pdf_path), pdf_id, year)

    # Show the full structured table
    print(df.to_string(index=False))

    # Append the rows to the database
    df.to_sql("master_parts_list", engine, if_exists="append", index=False, method="multi")

# Run the pipeline for the selected manual
main(pdf_path)

In [None]:
#Final Working VER

# Test
# ---------- HELPERS ----------
def extract_pdf_id(pdf_path):
    """Derive a compact identifier for a PDF from its file name.

    The file name is cut at its first '.', the leading run of ASCII letters,
    digits and spaces is kept, and every space is removed
    (e.g. "FJR1300A '15 (...).PDF" -> "FJR1300A").

    Returns None when the name does not start with such a character.
    """
    stem = os.path.basename(pdf_path).partition('.')[0]
    leading = re.match(r"([A-Za-z0-9 ]+)", stem)
    if not leading:
        return None
    return leading.group(1).replace(" ", "")

def extract_year(pdf_path):
    """Return the year encoded in the path as an apostrophe plus two digits.

    "'19" yields the string "2019"; None is returned when the path holds no
    such marker.
    """
    found = re.search(r"'(\d{2})", pdf_path)
    if found is None:
        return None
    return "20" + found.group(1)

def extract_model(pdf_path):
    """Return the text inside the first pair of parentheses in the file name.

    For "FJR1300A '15 (1MCH, 1MCG).PDF" this is "1MCH, 1MCG". Returns None
    when the file name has no parenthesised part.
    """
    filename = os.path.basename(pdf_path)
    parenthesised = re.search(r"\((.*?)\)", filename)
    return parenthesised.group(1) if parenthesised else None

# ---------- IMAGE EXTRACTION ----------
def normalize_image_background(image_bytes):
    """Return the image as PNG bytes, inverted when it is predominantly dark.

    The image is decoded and converted to grayscale; if the mean pixel value
    is below 128 it is inverted. The result is converted to RGB and encoded
    as PNG.
    """
    gray = Image.open(BytesIO(image_bytes)).convert("L")
    width, height = gray.size
    mean_brightness = sum(gray.getdata()) / (width * height)

    # Dark background: flip it so the drawing ends up on a light background.
    normalized = ImageOps.invert(gray) if mean_brightness < 128 else gray

    buffer = BytesIO()
    normalized.convert("RGB").save(buffer, format="PNG")
    return buffer.getvalue()

def extract_images_with_fig_labels(pdf_path, pdf_id, engine):
    """Extract one normalized diagram image per not-yet-stored figure in a PDF.

    A page contributes an image when its text contains ``FIG. <label>``: the
    first label on the page is used as ``fig_no`` and the first entry of the
    page's image list as the diagram. Figures already seen earlier in the
    document, or already present in ``parts_images`` for ``pdf_id``, are
    skipped.

    Args:
        pdf_path: Path of the PDF to scan.
        pdf_id: Identifier stored with every image and used to build
            ``image_id`` as ``"<pdf_id>_<fig_no>"``.
        engine: SQLAlchemy engine used to look up figures already stored.

    Returns:
        DataFrame with the columns ``image_id``, ``pdf_id``, ``fig_no`` and
        ``image`` (PNG bytes). The columns are present even when no image
        was found.
    """
    columns = ["image_id", "pdf_id", "fig_no", "image"]
    data = []

    # Figures already stored for this PDF, as strings
    existing_figs = get_existing_fig_combos(engine, pdf_id)
    seen_figs = set()  # Figures already collected from this document

    # Open the document as a context manager so it is closed even when a page
    # fails to parse; previously the document was never closed.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            matches = re.findall(r"FIG\.\s*([\w-]+)", page.get_text())
            if not matches:
                continue

            fig_no = matches[0]
            if fig_no in seen_figs or fig_no in existing_figs:
                continue  # Already handled in this run or already in the DB

            image_list = page.get_images(full=True)
            if not image_list:
                continue

            # Only the first image listed for the page is used.
            xref = image_list[0][0]
            base_image = doc.extract_image(xref)

            data.append({
                "image_id": "_".join([pdf_id, fig_no]),
                "pdf_id": pdf_id,
                "fig_no": fig_no,
                "image": normalize_image_background(base_image["image"]),
            })
            seen_figs.add(fig_no)

    return pd.DataFrame(data, columns=columns)

def get_existing_fig_combos(engine, pdf_id):
    """Return the fig_no values stored in parts_images for pdf_id.

    Values are converted to str so they can be compared with the figure
    labels parsed from PDF text.
    """
    query = text("SELECT fig_no FROM parts_images WHERE pdf_id = :pdf_id")
    with engine.connect() as conn:
        stored_rows = conn.execute(query, {"pdf_id": pdf_id}).fetchall()
    return {str(stored[0]) for stored in stored_rows}

# ---------- TEXT EXTRACTION ----------
def extract_text_from_pdf(pdf_path):
    """Concatenate the text of every page that contains "FIG.".

    Pages without "FIG." are skipped. Scanning stops at the first page that
    contains "FIG." and whose first line contains "NUMERICAL INDEX". Each
    kept page is prefixed with a "--- Page N ---" marker (N is 1-indexed).
    """
    kept_pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            page_lines = text.split('\n')

            if all("FIG." not in page_line for page_line in page_lines):
                continue

            # str.split always yields at least one element, so [0] is safe.
            if "NUMERICAL INDEX" in page_lines[0].strip():
                break

            kept_pages.append(f"\n--- Page {page_num + 1} ---\n{text}\n")
    return "".join(kept_pages)

def yamaha_process_data(text, pdf_id, year, model, num_model):
    """Turn extracted catalogue text into one row per part line.

    A line starting with "FIG." and holding at least three tokens sets the
    current section (second token) and component name (remaining tokens).
    After the first such header, a line is a part line when it has at least
    two tokens and its first token is all digits (a reference number) or
    looks like "<word>-<word>" (a part number). Part lines without a
    reference number reuse the most recent one.

    Within the remaining tokens, words before the first all-digit token form
    the description and words after it form the remarks. When a line holds
    more all-digit tokens than ``num_model``, the first of them is appended
    to the description.

    Returns a DataFrame with the columns pdf_id, year, brand, model, section,
    component_name, ref_no, part_no, description, remarks, image_id.
    """
    column_names = [
        'pdf_id', 'year', 'brand', 'model', 'section', 'component_name',
        'ref_no', 'part_no', 'description', 'remarks', 'image_id'
    ]
    records = []
    section = c_name = prev_fig_no = prev_c_name = prev_ref_no = ""
    collecting = False

    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith('FIG.'):
            header = line.split()
            if len(header) >= 3:
                section = header[1]
                c_name = " ".join(header[2:])
                prev_fig_no, prev_c_name = section, c_name
                collecting = True
            continue

        if not collecting:
            continue
        if not section:
            section, c_name = prev_fig_no, prev_c_name

        tokens = line.split()
        if len(tokens) < 2:
            continue
        lead = tokens[0]
        if not (re.match(r'\w+[-–]\w+', lead) or lead.isdigit()):
            continue

        if lead.isdigit():
            ref_no, part_no, tail = lead, tokens[1], tokens[2:]
            prev_ref_no = ref_no
        else:
            ref_no, part_no, tail = prev_ref_no, lead, tokens[1:]

        # Split the tail around its first all-digit token.
        numbers = [tok for tok in tail if tok.isdigit()]
        first_number_at = next(
            (pos for pos, tok in enumerate(tail) if tok.isdigit()), len(tail)
        )
        desc_words = tail[:first_number_at]
        if len(numbers) > num_model:
            desc_words = desc_words + [numbers[0]]
        description = " ".join(desc_words)
        remarks = " ".join(tok for tok in tail[first_number_at:] if not tok.isdigit())

        image_id = "_".join([pdf_id, section])
        records.append([
            pdf_id, year, "Yamaha", model, section, c_name,
            ref_no, part_no, description, remarks, image_id,
        ])

    return pd.DataFrame(records, columns=column_names)

# ---------- MAIN PROCESS ----------
def yamaha_data_extraction(pdf_path):
    """Load one Yamaha manual into the database: images first, then parts.

    New figure images are appended to ``parts_images``. If
    ``master_parts_list`` already holds any row for this PDF's id the function
    stops there; otherwise the PDF text is parsed and the resulting part rows
    are appended to ``master_parts_list``. Progress is reported with
    ``[INFO]`` prints. Uses the module-level ``engine``.
    """
    pdf_id = extract_pdf_id(pdf_path)
    year = extract_year(pdf_path)
    model = extract_model(pdf_path)

    session = sessionmaker(bind=engine)()
    try:
        # Step 1: store figure images that are not in the database yet
        df_images = extract_images_with_fig_labels(pdf_path, pdf_id, engine)
        image_message = f"[INFO] Inserted {len(df_images)} new images for '{pdf_id}'."
        if df_images.empty:
            print(image_message + f" All images for '{pdf_id}' already exist.")
        else:
            df_images.to_sql("parts_images", engine, if_exists="append", index=False, method="multi")
            print(image_message)

        # Step 2: stop if parts data for this PDF was loaded before
        parts_exist_query = (
            select(1)
            .select_from(master_parts_list)
            .where(master_parts_list.c.pdf_id == pdf_id)
        )
        if session.execute(parts_exist_query).first():
            print(f"[INFO] Master Parts data for '{pdf_id}' already exists.")
            return
    finally:
        session.close()

    # Step 3: parse and store the parts data (the session is closed by now)
    df_parts = yamaha_process_data(
        extract_text_from_pdf(pdf_path), pdf_id, year, model, num_model=3
    )

    if df_parts.empty:
        print(f"[INFO] Error, no parts data extracted for '{pdf_id}'.")
        return

    df_parts.to_sql("master_parts_list", engine, if_exists="append", index=False, method="multi")
    print(f"[INFO] Inserted parts data for '{pdf_id}'.")

# Route the selected manual to the extraction routine for its brand.
brand = "Yamaha"
supported_brands = ['Yamaha', 'Honda']

if brand not in supported_brands:
    print (f'"{brand}" not supported \nAvailable Brands: {supported_brands}')
elif brand == "Yamaha":
    yamaha_data_extraction(pdf_path)
elif brand == "Honda":
    # Placeholder: only a greeting is printed for Honda.
    print("Hi")

[INFO] Inserted 0 new images for 'FJR1300A'. All images for 'FJR1300A' already exist.
[INFO] Master Parts data for 'FJR1300A' already exists.


### Week 4: