In [31]:
import re
import pdfplumber

def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str) -> str:
    """
    Return the layout-preserved text of one section's main table.

    A plain-text scan (no layout=True) finds the page holding the section
    header, then a 'Reqd. QTY' table header on a later page, then the next
    section header, which marks the end page. The page range is then
    re-extracted in layout mode to preserve spacing, keeping only the lines
    that follow the first 'Reqd. QTY' line.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        section_code: Code the section header line must start with
            (compared upper-cased), e.g. "E-12".
        section_title: Text that must also occur in the header line
            (compared upper-cased).

    Returns:
        The collected table lines joined with newlines, or a
        "Section ... not found or missing table header." message when the
        section header or the table header is not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    # Patterns
    next_sec_re     = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
    table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)

    # Phase 1: plain-text scan to find start_page, header_hit, end_page
    start_page = None
    header_hit = False
    end_page   = None

    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            for ln in text.splitlines():
                up = ln.strip().upper()
                if start_page is None:
                    # Section header: starts with the code and contains the
                    # title. (A separate "FRAMEGROUP" test would be implied
                    # by this one, so it is not spelled out.)
                    if up.startswith(code) and title in up:
                        start_page = i
                        # NOTE(review): the rest of the start page is not
                        # scanned, so the table header is only looked for
                        # on later pages in this phase.
                        break
                elif not header_hit:
                    # Detect the table header
                    if table_header_re.search(up):
                        header_hit = True
                    # keep scanning this page; the end is only looked for
                    # once the header has been seen
                else:
                    # after header, detect next section
                    if next_sec_re.match(up) and not up.startswith(code):
                        end_page = i
                        break
            if end_page is not None:
                break

        # Validation
        if start_page is None or not header_hit:
            return f"Section '{section_code} {section_title}' not found or missing table header."

        if end_page is None:
            end_page = len(pdf.pages)

        # Phase 2: layout-mode extraction on full range [start_page..end_page)
        collected = []
        in_table  = False
        section_done = False

        for pi in range(start_page, end_page):
            layout = pdf.pages[pi].extract_text(layout=True) or ""
            for ln in layout.splitlines():
                up = ln.strip().upper()
                if not in_table:
                    # Everything up to and including the header line is skipped.
                    if table_header_re.search(up):
                        in_table = True
                    continue
                # Stop at next section if it sneaks in layout text. The flag
                # is needed because a bare break would only leave this page's
                # line loop and collection would resume on the next page.
                if next_sec_re.match(up) and not up.startswith(code):
                    section_done = True
                    break
                collected.append(ln)
            if section_done:
                break

        return "\n".join(collected).rstrip()


if __name__ == "__main__":
    # Demo run: print the layout-preserved table text of section E-12.
    pdf_file = "CRF1000 A_PC_13MJPG02_(G.H).pdf"
    section_text = extract_section_with_layout(
        pdf_file, section_code="E-12", section_title="OIL PAN/OIL PUMP"
    )
    print(section_text)

             Ref.                                                                                                   
             No. Part No.      Description   CRF1000 CRF1000A CRF1000D Serial No. Parts catalogue code              
                                             G H  G  H G H                                                          
              1 11210-MJP-G50 PAN COMP., OIL •••••••••••••• 1 - 1 - - - -------- --------2ED,2GS,2RU,2U,3ED,3GS,3RU,3TH,3U,ED,GS,RU,TH,U
               11210-MJP-G51 •••••••••••••••••••••••••••• - 1 1 - - - -------- --------2CH,4CH,U                    
                                             - -  - 1  - -                                                          
               11210-MJP-G81 •••••••••••••••••••••••••••• - - - - 1 - -------- --------2CH,2KO,3KO,4CH              
               11210-MJP-G80 •••••••••••••••••••••••••••• - - - - 1 - -------- --------2ED,2GS,2RU,2U,3ED,3GS,3RU,3U,ED,GS,RU,TH
               11210-MJP-G81 •••

In [None]:
import re
import pdfplumber

def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Extract one section's parts table and parse it into rows.

    The section is located with a plain-text scan, its pages are re-read in
    layout mode starting after the 'Reqd. QTY' table header, and the
    collected lines are grouped per part number and cleaned up. Collection
    stops at a 'PART NO INDEX' line or at the next section header.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        section_code: Code the section header line must start with
            (compared upper-cased), e.g. "E-7".
        section_title: Text that must also occur in the header line
            (compared upper-cased).

    Returns:
        A list of ``[ref_no, part_no, description, catalogue_code]`` lists
        of strings, one per table line that contains a part number.

    Raises:
        ValueError: If the section header, or a 'Reqd. QTY' line on a page
            after it, is not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    next_sec_re     = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
    table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
    part_no_re      = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
    end_re          = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)

    # Phase 1: locate page range
    start_page = None
    header_hit = False
    end_page = None
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            for ln in (page.extract_text() or "").splitlines():
                u = ln.strip().upper()
                if start_page is None:
                    # Header line: starts with the code and contains the
                    # title (an extra "FRAMEGROUP" test would be implied).
                    if u.startswith(code) and title in u:
                        start_page = i
                        break
                elif not header_hit:
                    if table_header_re.search(u):
                        header_hit = True
                else:
                    if next_sec_re.match(u) and not u.startswith(code):
                        end_page = i
                        break
            if end_page is not None:
                break
        if start_page is None or not header_hit:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        if end_page is None:
            end_page = len(pdf.pages)

        # Phase 2: collect layout-preserved lines
        collected = []
        in_table = False
        stop_all = False
        for pi in range(start_page, end_page):
            for ln in (pdf.pages[pi].extract_text(layout=True) or "").splitlines():
                u = ln.strip().upper()
                # if we see "PART ... NO ... INDEX" in any spacing, stop entirely
                if end_re.match(u):
                    stop_all = True
                    break
                if not in_table:
                    if table_header_re.search(u):
                        in_table = True
                    continue
                # The next section header also ends collection for good; a
                # bare break would only skip the rest of this page.
                if next_sec_re.match(u) and not u.startswith(code):
                    stop_all = True
                    break
                collected.append(ln)
            if stop_all:
                break

    # Phase 3: group into per-part buffers
    records = []
    last_ref = ""
    for ln in collected:
        m_pno = part_no_re.search(ln)
        if m_pno:
            # A line with a part number opens a new record. Its reference
            # number is a leading "n" or "(n)"; lines without one reuse the
            # last reference seen.
            m_ref = re.match(r'^\s*(?:\((\d+)\)|(\d+))\s+', ln)
            if m_ref:
                last_ref = m_ref.group(1) or m_ref.group(2)
            ref = last_ref
            pno = m_pno.group(0)
            buf = [ln[m_pno.end():].strip()]
            records.append({"ref": ref, "part_no": pno, "buf": buf})
        else:
            # Continuation line: dropped before the first record, and when
            # it is only digits or a dddd.dd.dd stamp.
            if not records:
                continue
            txt = ln.strip()
            if re.fullmatch(r'\d+', txt) or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', txt):
                continue
            records[-1]["buf"].append(txt)

    # Phase 4: parse each buffer
    out = []
    for rec in records:
        raw = " ".join(rec["buf"])
        raw = raw.replace('∙','').replace('•','').replace('\uf020','')
        raw = re.sub(r'\s+', ' ', raw).strip()

        # The first run of eight dashes separates the description side from
        # the catalogue-code side.
        idx = raw.find("--------")
        desc_part = raw[:idx].strip() if idx != -1 else raw
        cat_part  = raw[idx+8:].strip() if idx != -1 else ""

        # Description cleanup; the order of these substitutions matters.
        desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
        desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
        desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
        desc = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
        desc = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$', "", desc).strip()
        desc = re.sub(r'\.{2,}$', "", desc).strip()
        desc = re.sub(r'(?:\s+[A-Z])+$', '', desc).strip()
        # A description without any letter is treated as empty.
        desc = "" if not re.search(r'[A-Za-z]', desc) else desc

        if cat_part.upper().startswith("GK") and len(cat_part) > 8:
            # Skip the first eight characters and keep the next token.
            cat_clean = cat_part[8:].split()[0]
        else:
            # Keep the leading run of digits/capitals/commas, drop spaces,
            # then insert commas where codes were run together.
            m_codes = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
            raw_codes = m_codes.group(1) if m_codes else ""
            cat_clean = raw_codes.replace(" ", "")
            cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
            cat_clean = re.sub(
                r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))',
                ',',
                cat_clean
            )

        cat_clean = re.sub(r'\d{4}$', '', cat_clean)
        tokens = [t for t in cat_clean.split(',') if t]
        # De-duplicate while keeping first-seen order.
        final_codes = list(dict.fromkeys(tokens))
        cat = ",".join(final_codes)

        # When the matched part number ends in three or more capitals, only
        # the first two stay in the part number; the rest is put in front of
        # the description.
        m3 = re.match(r'^(.+?)([A-Z]{3,})$', rec["part_no"])
        if m3:
            core, suf = m3.group(1), m3.group(2)
            pno = core + suf[:2]
            desc = f"{suf[2:]} {desc}".strip()
        else:
            pno = rec["part_no"]

        out.append([rec["ref"], pno, desc, cat])

    return out

if __name__ == "__main__":
    # Demo run: print every parsed row of section E-7, one per line.
    pdf_file = "CRF1000 A_PC_13MJPG02_(G.H).pdf"
    parsed_rows = extract_section_with_layout(pdf_file, "E-7", "CLUTCH(CRF1000/CRF1000A)")
    for row in parsed_rows:
        print(row)


['1', '22100-MJP-G50', 'OUTER COMP., CLUTCH', '']
['2', '22116-MJP-G50', 'GUIDE, CLUTCH OUTER', '']
['3', '22121-MJP-G51', 'CENTER, CLUTCH', '']
['4', '22125-HP6-A00', 'SEAT, JUDDER SPRING', '']
['5', '22210-MJP-305', 'DISK SET, CLUTCH', '']
['6', '22325-MJP-G51', 'SPRING, JUDDER', '']
['7', '22350-MJP-G51', 'PLATE COMP., CLUTCH PRESSURE', '']
['8', '22361-MJP-G51', 'PLATE, SETTING', '']
['9', '22401-MJP-G51', 'SPRING, CLUTCH', '']
['10', '22425-MJP-G51', 'SEAT, CLUTCH SPRING', '']
['11', '22847-MJP-G50', 'PIN, CLUTCH LIFTER', '']
['12', '23103-MJP-G50', 'GEAR, PRIMARY DRIVE(45T)', '']
['13', '23104-MJP-G50', 'SUB GEAR, PRIMARY(45T)', '']
['14', '23115-MW4-000', 'SPRING, PRIMARY DAMPER', '']
['15', '90013-MJP-G50', 'BOLT, SPECIAL, 10X25', '']
['16', '90050-MJP-G51', 'BOLT, FLANGE, 6X35', '']
['17', '90231-MS2-610', 'NUT, LOCK, 25MM', '']
['18', '90401-MEA-670', 'WASHER, 35X50X3.2', '']
['19', '90401-MS2-610', 'WASHER, THRUST, 28.2X56X2', '']
['20', '90414-MGE-D00', 'WASHER, 10.5X36X4',

In [37]:
import re
import pandas as pd
import pdfplumber

def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Extract one section's parts table into a DataFrame.

    The section is located with a plain-text scan, its pages are re-read in
    layout mode starting after the 'Reqd. QTY' table header, and the
    collected lines are grouped per part number and cleaned up. Collection
    stops at a 'PART NO INDEX' line or at the next section header.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        section_code: Code the section header line must start with
            (compared upper-cased), e.g. "E-5".
        section_title: Text that must also occur in the header line
            (compared upper-cased).

    Returns:
        A DataFrame with string columns ref_no, part_no, description and
        remarks, one row per table line that contains a part number.

    Raises:
        ValueError: If the section header, or a 'Reqd. QTY' line on a page
            after it, is not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    next_sec_re     = re.compile(r'^[A-Z]+-\d+(?:-\d+)*', re.IGNORECASE)
    table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
    part_no_re      = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
    end_re          = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)

    # Phase 1: locate page range
    start_page = None
    header_hit = False
    end_page = None
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            for ln in (page.extract_text() or "").splitlines():
                u = ln.strip().upper()
                if start_page is None:
                    # Header line: starts with the code and contains the
                    # title (an extra "FRAMEGROUP" test would be implied).
                    if u.startswith(code) and title in u:
                        start_page = i
                        break
                elif not header_hit:
                    if table_header_re.search(u):
                        header_hit = True
                else:
                    # skip blank lines to avoid u.split()[0] errors
                    if not u:
                        continue
                    # Compare the whole first token so that, e.g., "E-10" is
                    # not mistaken for a continuation of "E-1".
                    first_token = u.split()[0]
                    if next_sec_re.match(u) and first_token != code:
                        end_page = i
                        break
            if end_page is not None:
                break
        if start_page is None or not header_hit:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        if end_page is None:
            end_page = len(pdf.pages)

        # Phase 2: collect layout-preserved lines
        collected = []
        in_table = False
        stop_all = False
        for pi in range(start_page, end_page):
            for ln in (pdf.pages[pi].extract_text(layout=True) or "").splitlines():
                u = ln.strip().upper()
                if end_re.match(u):
                    stop_all = True
                    break
                if not in_table:
                    if table_header_re.search(u):
                        in_table = True
                    continue
                # again guard against blank
                if not u:
                    collected.append(ln)
                    continue
                first_token = u.split()[0]
                # The next section header also ends collection for good; a
                # bare break would only skip the rest of this page.
                if next_sec_re.match(u) and first_token != code:
                    stop_all = True
                    break
                collected.append(ln)
            if stop_all:
                break

    # Phase 3: group into per-part buffers
    records = []
    last_ref = ""
    for ln in collected:
        m_pno = part_no_re.search(ln)
        if m_pno:
            # A line with a part number opens a new record. Its reference
            # number is a leading "n" or "(n)"; lines without one reuse the
            # last reference seen.
            m_ref = re.match(r'^\s*(?:\((\d+)\)|(\d+))\s+', ln)
            if m_ref:
                last_ref = m_ref.group(1) or m_ref.group(2)
            records.append({
                "ref":      last_ref,
                "part_no":  m_pno.group(0),
                "buf":      [ln[m_pno.end():].strip()]
            })
        else:
            # Continuation line: dropped before the first record, and when
            # it is only digits or a dddd.dd.dd stamp.
            if not records:
                continue
            txt = ln.strip()
            if re.fullmatch(r'\d+', txt) or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', txt):
                continue
            records[-1]["buf"].append(txt)

    # Phase 4: parse each buffer directly into column-lists
    ref_nos      = []
    part_nos     = []
    descriptions = []
    remarks_list = []

    for rec in records:
        raw = " ".join(rec["buf"])
        raw = raw.replace('∙','').replace('•','').replace('\uf020','')
        raw = re.sub(r'\s+', ' ', raw).strip()

        # The first run of eight dashes separates the description side from
        # the catalogue-code side.
        idx       = raw.find("--------")
        desc_part = raw[:idx].strip() if idx != -1 else raw
        cat_part  = raw[idx+8:].strip() if idx != -1 else ""
        # drop leading digits from the catalogue side
        cat_part  = re.sub(r'^[0-9]+\s*', '', cat_part)
        # cut the description at the first run of two or more
        # whitespace-separated numbers
        desc_part = re.sub(r'\s\d+(?:\s+\d+)+.*$', '', desc_part).strip()

        # clean up description; the order of these substitutions matters
        desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
        desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
        desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
        desc      = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
        desc      = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$',     "", desc).strip()
        desc      = re.sub(r'\.{2,}$',                         "", desc).strip()
        desc      = re.sub(r'(?:\s+[A-Z])+$',                  "", desc).strip()
        desc      = re.sub(r'\s+[-\d ]+$',                     "", desc).strip()
        # A description without any letter is treated as empty.
        desc      = "" if not re.search(r'[A-Za-z]', desc) else desc

        # clean up catalogue codes → remarks
        if cat_part.upper().startswith("GK") and len(cat_part) > 8:
            # Skip the first eight characters and keep the next token.
            cat_clean = cat_part[8:].split()[0]
        else:
            # Keep the leading run of digits/capitals/commas, drop spaces,
            # then insert commas where codes were run together.
            m_codes   = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
            raw_codes = m_codes.group(1) if m_codes else ""
            cat_clean = raw_codes.replace(" ", "")
            cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
            cat_clean = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', cat_clean)
        cat_clean    = re.sub(r'\d{4}$', '', cat_clean)
        tokens       = [t for t in cat_clean.split(',') if t]
        # If the first token is letters only and the second starts with
        # digits, copy those digits in front of the first token.
        if len(tokens) > 1 and re.fullmatch(r'[A-Z]+', tokens[0]):
            m = re.match(r'^(\d+)', tokens[1])
            if m:
                tokens[0] = m.group(1) + tokens[0]
        # De-duplicate while keeping first-seen order.
        final_codes  = list(dict.fromkeys(tokens))
        remarks      = ",".join(final_codes)

        # When the matched part number ends in three or more capitals, only
        # the first two stay in the part number; the rest is put in front of
        # the description.
        m3 = re.match(r'^(.+?)([A-Z]{3,})$', rec["part_no"])
        if m3:
            core, suf = m3.group(1), m3.group(2)
            part_no    = core + suf[:2]
            desc       = f"{suf[2:]} {desc}".strip()
        else:
            part_no = rec["part_no"]

        # append to column lists
        ref_nos.append(rec["ref"])
        part_nos.append(part_no)
        descriptions.append(desc)
        remarks_list.append(remarks)

    # build and return DataFrame
    df = pd.DataFrame({
        'ref_no':      ref_nos,
        'part_no':     part_nos,
        'description': descriptions,
        'remarks':     remarks_list
    })
    return df


if __name__ == "__main__":
    # Demo run: show the last 20 parsed rows of section E-5.
    pdf_file = "CRF1000 A_PC_13MJPG02_(G.H).pdf"
    df = extract_section_with_layout(
        pdf_file,
        "E-5",
        "RIGHT CRANKCASE COVER(CRF1000/CRF1000A)",
    )
    last_rows = df.tail(20)
    print(last_rows)


   ref_no         part_no                 description remarks
0       1   11330-MJP-G50   COVER COMP., R. CRANKCASE        
1       2   11376-MJP-G50        COVER, R. RR. ENGINE        
2       3   11385-MJF-A00                      COLLAR        
3       4   11394-MJP-G51  GASKET, R. CRANKCASE COVER        
4       5   19200-MJP-G50           PUMP COMP., WATER        
5       6   19226-MJP-G50     GASKET, WATER PUMP BODY        
6       7   22810-MJP-G50         LEVER COMP., CLUTCH        
7       8   22815-413-000        SPRING, CLUTCH LEVER        
8       9   22821-MJP-G50      RECEIVER, CLUTCH CABLE        
9      10   90004-MJP-G50   BOLT, FLANGE SOCKET, 6X28        
10     11   90085-MGC-920         BOLT, SPECIAL, 6X17        
11     12   90488-425-000        WASHER, SEALING, 6MM        
12     13   91053-MFJ-D01   BEARING, NEEDLE, 12X16X10        
13     14   91205-KF0-003     OIL SEAL, 12X20X5(ARAI)        
14     15   91315-MJP-G51            SEAL, WATER PIPE        
15     1

In [40]:
import re
import pandas as pd
import pdfplumber

def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Extract one section's parts table into a DataFrame.

    The section is located with a plain-text scan, its pages are re-read in
    layout mode starting after the 'Reqd. QTY' table header, and the
    collected lines are grouped per part number and cleaned up. Collection
    stops at a 'PART NO INDEX' line or at the next section header.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        section_code: Code the section header line must start with
            (compared upper-cased).
        section_title: Text that must also occur in the header line
            (compared upper-cased).

    Returns:
        A DataFrame with string columns ref_no, part_no, description and
        remarks, one row per table line that contains a part number.

    Raises:
        ValueError: If the section header, or a 'Reqd. QTY' line on a page
            after it, is not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    next_sec_re     = re.compile(r'^[A-Z]+-\d+(?:-\d+)*', re.IGNORECASE)
    table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
    part_no_re      = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
    end_re          = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)

    # Phase 1: locate page range
    start_page = None
    header_hit = False
    end_page = None
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            for ln in (page.extract_text() or "").splitlines():
                u = ln.strip().upper()
                if start_page is None:
                    # Header line: starts with the code and contains the
                    # title (an extra "FRAMEGROUP" test would be implied).
                    if u.startswith(code) and title in u:
                        start_page = i
                        break
                elif not header_hit:
                    if table_header_re.search(u):
                        header_hit = True
                else:
                    # skip blank lines to avoid u.split()[0] errors
                    if not u:
                        continue
                    # Compare the whole first token so that, e.g., "E-10" is
                    # not mistaken for a continuation of "E-1".
                    first_token = u.split()[0]
                    if next_sec_re.match(u) and first_token != code:
                        end_page = i
                        break
            if end_page is not None:
                break
        if start_page is None or not header_hit:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        if end_page is None:
            end_page = len(pdf.pages)

        # Phase 2: collect layout-preserved lines
        collected = []
        in_table = False
        stop_all = False
        for pi in range(start_page, end_page):
            for ln in (pdf.pages[pi].extract_text(layout=True) or "").splitlines():
                u = ln.strip().upper()
                if end_re.match(u):
                    stop_all = True
                    break
                if not in_table:
                    if table_header_re.search(u):
                        in_table = True
                    continue
                # again guard against blank
                if not u:
                    collected.append(ln)
                    continue
                first_token = u.split()[0]
                # The next section header also ends collection for good; a
                # bare break would only skip the rest of this page.
                if next_sec_re.match(u) and first_token != code:
                    stop_all = True
                    break
                collected.append(ln)
            if stop_all:
                break

    # Phase 3: group into per-part buffers
    records = []
    last_ref = ""
    for ln in collected:
        m_pno = part_no_re.search(ln)
        if m_pno:
            # A line with a part number opens a new record. Its reference
            # number is a leading "n" or "(n)"; lines without one reuse the
            # last reference seen.
            m_ref = re.match(r'^\s*(?:\((\d+)\)|(\d+))\s+', ln)
            if m_ref:
                last_ref = m_ref.group(1) or m_ref.group(2)
            records.append({
                "ref":      last_ref,
                "part_no":  m_pno.group(0),
                "buf":      [ln[m_pno.end():].strip()]
            })
        else:
            # Continuation line: dropped before the first record, and when
            # it is only digits or a dddd.dd.dd stamp.
            if not records:
                continue
            txt = ln.strip()
            if re.fullmatch(r'\d+', txt) or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', txt):
                continue
            records[-1]["buf"].append(txt)

    # Phase 4: parse each buffer directly into column-lists
    ref_nos      = []
    part_nos     = []
    descriptions = []
    remarks_list = []

    for rec in records:
        raw = " ".join(rec["buf"])
        raw = raw.replace('∙','').replace('•','').replace('\uf020','')
        raw = re.sub(r'\s+', ' ', raw).strip()

        # The first run of eight dashes separates the description side from
        # the catalogue-code side.
        idx       = raw.find("--------")
        desc_part = raw[:idx].strip() if idx != -1 else raw
        cat_part  = raw[idx+8:].strip() if idx != -1 else ""
        # drop leading digits from the catalogue side
        cat_part  = re.sub(r'^[0-9]+\s*', '', cat_part)
        # strip quantity columns from description only
        desc_part = re.sub(r'\s\d+(?:\s+\d+)+.*$', '', desc_part).strip()

        # clean up description; the order of these substitutions matters
        desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
        desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
        desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
        desc      = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
        desc      = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$',     "", desc).strip()
        desc      = re.sub(r'\.{2,}$',                         "", desc).strip()
        desc      = re.sub(r'(?:\s+[A-Z])+$',                  "", desc).strip()
        desc      = re.sub(r'\s+[-\d ]+$',                     "", desc).strip()
        # A description without any letter is treated as empty.
        desc      = "" if not re.search(r'[A-Za-z]', desc) else desc

        # clean up catalogue codes → remarks
        if cat_part.upper().startswith("GK") and len(cat_part) > 8:
            # Skip the first eight characters and keep the next token.
            cat_clean = cat_part[8:].split()[0]
        else:
            # Keep the leading run of digits/capitals/commas, drop spaces,
            # then insert commas where codes were run together.
            m_codes   = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
            raw_codes = m_codes.group(1) if m_codes else ""
            cat_clean = raw_codes.replace(" ", "")
            cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
            cat_clean = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', cat_clean)
        cat_clean    = re.sub(r'\d{4}$', '', cat_clean)
        tokens       = [t for t in cat_clean.split(',') if t]
        # If the first token is letters only and the second starts with
        # digits, copy those digits in front of the first token.
        if len(tokens) > 1 and re.fullmatch(r'[A-Z]+', tokens[0]):
            m = re.match(r'^(\d+)', tokens[1])
            if m:
                tokens[0] = m.group(1) + tokens[0]
        # De-duplicate while keeping first-seen order.
        final_codes  = list(dict.fromkeys(tokens))
        remarks      = ",".join(final_codes)

        # When the matched part number ends in three or more capitals, only
        # the first two stay in the part number; the rest is put in front of
        # the description.
        m3 = re.match(r'^(.+?)([A-Z]{3,})$', rec["part_no"])
        if m3:
            core, suf = m3.group(1), m3.group(2)
            part_no    = core + suf[:2]
            desc       = f"{suf[2:]} {desc}".strip()
        else:
            part_no = rec["part_no"]

        ref_nos.append(rec["ref"])
        part_nos.append(part_no)
        descriptions.append(desc)
        remarks_list.append(remarks)

    # build and return DataFrame
    df = pd.DataFrame({
        'ref_no':      ref_nos,
        'part_no':     part_nos,
        'description': descriptions,
        'remarks':     remarks_list
    })
    return df

def extract_all_sections_one_pass(pdf_path: str, output_csv: str) -> pd.DataFrame:
    """
    Extract the parts tables of every section in one pass and write a CSV.

    The PDF is opened once and walked page by page. Plain-text lines that
    match the section-code pattern start a new section; layout-mode lines
    after that section's 'Reqd. QTY' header are collected for it. Each
    section is parsed when the next one starts, and the whole extraction
    stops at the first 'PART NO INDEX' line.

    Args:
        pdf_path: Path of the PDF to open with pdfplumber.
        output_csv: Path the resulting table is written to (no index column).

    Returns:
        A DataFrame with columns section_no, section_name, ref_no, part_no,
        description and remarks; the same data that is written to the CSV.
    """
    next_sec_re     = re.compile(r'^[A-Z]+-\d+(?:-\d+)*', re.IGNORECASE)
    table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
    part_no_re      = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
    end_re          = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)
    # A line that begins with a reference number ("12" or "(12)") directly
    # followed by a part number. Compiled once instead of on every flush.
    prefix_re       = re.compile(r'^\s*\(?(\d+)\)?\s+(' + part_no_re.pattern + r')', re.IGNORECASE)

    section_nos   = []
    section_names = []
    ref_nos       = []
    part_nos      = []
    descriptions  = []
    remarks_list  = []

    current = None
    done    = False

    def _flush(cur):
        """Parse the lines collected for section *cur* and append its rows to the column lists."""
        records = []
        last_ref = ""

        # Phase 3: grouping
        for ln in cur['collected']:
            m0 = prefix_re.match(ln)
            if m0:
                # Reference number and part number at the start of the line.
                last_ref, pno = m0.group(1), m0.group(2)
                rest = ln[m0.end():].strip()
                records.append({'ref': last_ref, 'part_no': pno, 'buf': [rest]})
            else:
                m_pno = part_no_re.search(ln)
                if m_pno:
                    # Part number without a leading reference: reuse the
                    # last reference seen.
                    pno  = m_pno.group(0)
                    rest = ln[m_pno.end():].strip()
                    records.append({'ref': last_ref, 'part_no': pno, 'buf': [rest]})
                else:
                    # Continuation line: dropped before the first record,
                    # and when it is only digits or a dddd.dd.dd stamp.
                    if not records:
                        continue
                    txt = ln.strip()
                    if re.fullmatch(r'\d+', txt) or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', txt):
                        continue
                    records[-1]['buf'].append(txt)

        # Phase 4: parsing & cleanup
        for rec in records:
            raw = " ".join(rec['buf']).replace('∙','').replace('•','').replace('\uf020','')
            raw = re.sub(r'\s+', ' ', raw).strip()

            # The first run of eight dashes separates the description side
            # from the catalogue-code side.
            idx = raw.find("--------")
            desc_part = raw[:idx].strip() if idx != -1 else raw
            cat_part  = raw[idx+8:].strip() if idx != -1 else ""

            # strip any stray leading serials from cat_part
            cat_part = re.sub(r'^[0-9]+\s*', '', cat_part)

            # strip quantity columns from desc_part
            desc_part = re.sub(r'\s\d+(?:\s+\d+)+.*$', '', desc_part).strip()

            # description cleanup; the order of these substitutions matters
            desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
            desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
            desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
            desc = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
            desc = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$', "", desc).strip()
            desc = re.sub(r'\.{2,}$', "", desc).strip()
            desc = re.sub(r'(?:\s+[A-Z])+$', "", desc).strip()
            # Same trailing dash/digit cleanup that extract_section_with_layout
            # applies after removing trailing single letters.
            desc = re.sub(r'\s+[-\d ]+$', "", desc).strip()
            # A description without any letter is treated as empty.
            desc = "" if not re.search(r'[A-Za-z]', desc) else desc

            # remarks cleanup
            if cat_part.upper().startswith("GK") and len(cat_part) > 8:
                # Skip the first eight characters and keep the next token.
                cat_clean = cat_part[8:].split()[0]
            else:
                # Keep the leading run of digits/capitals/commas, drop
                # spaces, then insert commas where codes were run together.
                m_codes   = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
                raw_codes = m_codes.group(1) if m_codes else ""
                cat_clean = raw_codes.replace(" ", "")
                cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
                cat_clean = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', cat_clean)
            cat_clean = re.sub(r'\d{4}$', '', cat_clean)

            # if first token is pure letters but second starts with a digit, prefix it
            tokens = [t for t in cat_clean.split(',') if t]
            if len(tokens) > 1 and re.fullmatch(r'[A-Z]+', tokens[0]):
                m = re.match(r'^(\d+)', tokens[1])
                if m:
                    tokens[0] = m.group(1) + tokens[0]

            # dedupe, keeping first-seen order
            codes = list(dict.fromkeys(tokens))
            remarks = ",".join(codes)

            # When the matched part number ends in three or more capitals,
            # only the first two stay in the part number; the rest is put in
            # front of the description.
            m3 = re.match(r'^(.+?)([A-Z]{3,})$', rec['part_no'])
            if m3:
                core, suf = m3.group(1), m3.group(2)
                pno        = core + suf[:2]
                desc       = f"{suf[2:]} {desc}".strip()
            else:
                pno = rec['part_no']

            section_nos.append(cur['code'])
            section_names.append(cur['title'])
            ref_nos.append(rec['ref'])
            part_nos.append(pno)
            descriptions.append(desc)
            remarks_list.append(remarks)

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            if done:
                break

            plain  = (page.extract_text() or "").splitlines()
            layout = (page.extract_text(layout=True) or "").splitlines()

            # Section detection runs on the plain text of the page.
            for ln in plain:
                u = ln.strip().upper()
                if next_sec_re.match(u):
                    if current:
                        _flush(current)
                    parts = ln.strip().split(None, 1)
                    raw_title = parts[1].strip() if len(parts) > 1 else ""
                    # flags must be passed by keyword: the fourth positional
                    # argument of re.sub is count, not flags.
                    title     = re.sub(r'\b[A-Z]+GROUP\b\s*', '', raw_title, flags=re.IGNORECASE)
                    current = {
                        'code':       parts[0].upper(),
                        'title':      title,
                        'header_hit': False,
                        'collected':  []
                    }

            # Table lines are collected from the layout text of the page.
            if current:
                for ln in layout:
                    u = ln.strip().upper()
                    if end_re.match(u):
                        # Index reached: flush and end the whole extraction.
                        _flush(current)
                        done = True
                        current = None
                        break
                    if not current['header_hit']:
                        if table_header_re.search(u):
                            current['header_hit'] = True
                        continue
                    first_token = u.split()[0] if u else ""
                    if next_sec_re.match(u) and first_token != current['code']:
                        # A different section header inside the layout text
                        # ends this section; the rest of the page is skipped.
                        _flush(current)
                        current = None
                        break
                    collected = current['collected']
                    collected.append(ln)

    if current and not done:
        _flush(current)

    final_df = pd.DataFrame({
        'section_no':   section_nos,
        'section_name': section_names,
        'ref_no':       ref_nos,
        'part_no':      part_nos,
        'description':  descriptions,
        'remarks':      remarks_list
    })
    final_df.to_csv(output_csv, index=False)
    return final_df


# Run the one-pass extraction and report where the rows were written.
pdf_file = "NC750XAP_13MKWM02_PC_2022_2023.pdf"
output_csv = "all_sections.csv"
df = extract_all_sections_one_pass(pdf_file, output_csv)
# Report the path that was actually written rather than a hard-coded name.
print(f"Written {len(df)} rows to {output_csv}")

Written 1560 rows to all_sections2.csv
