In [242]:
import re
import pdfplumber

def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str) -> str:
    """
    Return the layout-preserved text of one section's parts table.

    The PDF is read twice. A plain-text pass (no layout=True) finds the page
    holding the section header, checks that a 'Reqd. QTY' table header is
    seen afterwards, and looks for the next section header to bound the page
    range. That range is then re-extracted with layout=True so the column
    spacing is preserved.

    Args:
        pdf_path: Path of the PDF, handed to pdfplumber.open().
        section_code: Code the section header line starts with (e.g. "F-50");
            compared case-insensitively.
        section_title: Text that must also occur in the header line;
            compared case-insensitively.

    Returns:
        Every layout line after the first 'Reqd. QTY' header in the range,
        joined with newlines and right-stripped. If the section or a table
        header is not found, a human-readable message string is returned
        instead; no exception is raised.
    """
    code = section_code.upper()
    title = section_title.upper()

    # Patterns are applied to stripped, upper-cased lines.
    next_sec_re = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
    table_header_re = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)

    def starts_other_section(text: str) -> bool:
        # A section-style code at line start that is not the requested one.
        return bool(next_sec_re.match(text)) and not text.startswith(code)

    # Phase 1: plain-text scan to find start_page, header_hit, end_page
    start_page = None
    header_hit = False
    end_page = None

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        for page_no, page in enumerate(pages):
            for raw_line in (page.extract_text() or "").splitlines():
                text = raw_line.strip().upper()
                if start_page is None:
                    # A header is any line that starts with the code and
                    # mentions the title; the former extra "FRAMEGROUP" test
                    # was implied by this one and has been dropped.
                    if text.startswith(code) and title in text:
                        start_page = page_no
                        # NOTE(review): this abandons the rest of the start
                        # page, so phase 1 only counts a 'Reqd. QTY' header
                        # found on a later page. Confirm this is intended.
                        break
                elif not header_hit:
                    if table_header_re.search(text):
                        header_hit = True
                elif starts_other_section(text):
                    end_page = page_no
                    break
            if end_page is not None:
                break

        # Validation
        if start_page is None or not header_hit:
            return f"Section '{section_code} {section_title}' not found or missing table header."

        if end_page is None:
            # No following section seen: read through to the last page.
            end_page = len(pages)

        # Phase 2: layout-mode extraction on full range [start_page..end_page)
        collected = []
        in_table = False

        for page in pages[start_page:end_page]:
            for raw_line in (page.extract_text(layout=True) or "").splitlines():
                text = raw_line.strip().upper()
                if not in_table:
                    # Skip everything up to and including the table header.
                    if table_header_re.search(text):
                        in_table = True
                    continue
                if starts_other_section(text):
                    # NOTE(review): only the remainder of this page is
                    # skipped; later pages in the range are still collected.
                    break
                collected.append(raw_line)

        return "\n".join(collected).rstrip()


if __name__ == "__main__":
    # Demo run: print the layout-preserved table of section F-50 "MARK".
    pdf_file = "NC750XAP_13MKWM02_PC_2022_2023.pdf"
    section_text = extract_section_with_layout(
        pdf_path=pdf_file,
        section_code="F-50",
        section_title="MARK",
    )
    print(section_text)

             Ref.                                                                                                   
      3      No. Part No.     Description     NC750XA NC750XD Serial No.         Parts catalogue code               
                                             M N  P M  N P                                                          
              1 11378-MKS-E50 PLATE, EMBLEM∙∙∙∙∙∙∙∙∙∙∙∙∙∙∙∙ - - - 1 1 1                                             
              2 86201-MKA-A30ZB MARK, R. WING(105MM)                                                                
                         *TYPE2*...................... 1 1 1 1 1 1                                                  
               86201-MKW-D00ZC *TYPE1*...................... 1 1 - 1 1 - -------- --------2ED,CH,ED,FO,GS           
               86201-MKW-D00ZB *TYPE2*...................... 1 1 - 1 1 -                                            
               86201-MKW-D00ZA *TYPE3*...................... 1 -

In [228]:
import re
import pdfplumber

# Patterns are compiled once at import time instead of per call / per line.
_NEXT_SECTION_RE = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
_TABLE_HEADER_RE = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
_PART_NO_RE = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
_REF_NO_RE = re.compile(r'^\s*(?:\((\d+)\)|(\d+))\s+')
_BARE_NUMBER_RE = re.compile(r'\d+')
_BARE_DATE_RE = re.compile(r'\d{4}\.\d{2}\.\d{2}')
_WHITESPACE_RE = re.compile(r'\s+')
_HAS_LETTER_RE = re.compile(r'[A-Za-z]')
_CODE_RUN_RE = re.compile(r'[-\s]*([0-9A-Z,\s]+)')
_LETTER_BEFORE_DIGIT_RE = re.compile(r'([A-Z])(?=\d)')
_GLUED_CODE_RE = re.compile(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))')
_TRAILING_YEAR_RE = re.compile(r'\d{4}$')
_GLUED_SUFFIX_RE = re.compile(r'^(.+?)([A-Z]{3,})$')
# Leader/bullet glyphs that are deleted from the row text before parsing.
_FILLER_CHARS = str.maketrans('', '', '∙•\uf020')
# First run of eight dashes separates the description side from the codes.
_DASH_RULE = "--------"
# Trailers removed from the description, applied in this order.
_DESC_TRAILER_RES = tuple(re.compile(p) for p in (
    r'\.{2,}\s+\d.*$',                   # dot leader, then digit, to the end
    r'\s+GK[A-Za-z0-9]+\s*$',            # trailing GK... token
    # Same language as the former r'\s+(?:-+|\d+)+\s*$', but without the
    # nested quantifier that backtracks exponentially on long digit runs.
    r'\s+[-\d]+\s*$',
    r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$',  # number followed by a dotted date
    r'(?:\s+(?:\(\d+\)|-+|\d+))+$',      # trailing "(n)", dash or number tokens
    r'\.{2,}$',                          # leftover dot leader
))


def _starts_other_section(text, code):
    """Tell whether an upper-cased line opens with a section code other than `code`."""
    return bool(_NEXT_SECTION_RE.match(text)) and not text.startswith(code)


def _find_page_range(pages, code, title):
    """Plain-text scan: return (start_page, end_page) or None if not found."""
    start_page = None
    header_hit = False
    end_page = None
    for page_no, page in enumerate(pages):
        for raw_line in (page.extract_text() or "").splitlines():
            text = raw_line.strip().upper()
            if start_page is None:
                # The former extra "FRAMEGROUP" test was implied by this one.
                if text.startswith(code) and title in text:
                    start_page = page_no
                    # NOTE(review): the rest of the start page is not scanned,
                    # so only a 'Reqd. QTY' header on a later page counts
                    # here. Confirm this is intended.
                    break
            elif not header_hit:
                if _TABLE_HEADER_RE.search(text):
                    header_hit = True
            elif _starts_other_section(text, code):
                end_page = page_no
                break
        if end_page is not None:
            break
    if start_page is None or not header_hit:
        return None
    # Without a following section the range runs to the last page.
    return start_page, (len(pages) if end_page is None else end_page)


def _collect_table_lines(pages, code):
    """Layout-mode scan: return the raw lines that follow the first table header."""
    collected = []
    in_table = False
    for page in pages:
        for raw_line in (page.extract_text(layout=True) or "").splitlines():
            text = raw_line.strip().upper()
            if not in_table:
                # Skip everything up to and including the header line.
                if _TABLE_HEADER_RE.search(text):
                    in_table = True
                continue
            if _starts_other_section(text, code):
                # NOTE(review): only the remainder of this page is skipped;
                # later pages in the range are still collected.
                break
            collected.append(raw_line)
    return collected


def _group_part_lines(lines):
    """Start a record at every line holding a part number; attach other lines to the latest record."""
    records = []
    last_ref = ""
    for line in lines:
        part_match = _PART_NO_RE.search(line)
        if part_match:
            ref_match = _REF_NO_RE.match(line)
            if ref_match:
                last_ref = ref_match.group(1) or ref_match.group(2)
            # Lines without a leading reference number reuse the previous one.
            records.append({
                "ref": last_ref,
                "part_no": part_match.group(0),
                "buf": [line[part_match.end():].strip()],
            })
            continue
        if not records:
            continue
        text = line.strip()
        # Lines that are only a number or only a yyyy.mm.dd date are dropped.
        if _BARE_NUMBER_RE.fullmatch(text) or _BARE_DATE_RE.fullmatch(text):
            continue
        records[-1]["buf"].append(text)
    return records


def _clean_description(text):
    """Strip the known trailers; return "" when no ASCII letter is left."""
    for trailer_re in _DESC_TRAILER_RES:
        text = trailer_re.sub('', text).strip()
    return text if _HAS_LETTER_RE.search(text) else ""


def _extract_catalogue_codes(cat_part):
    """Normalise the text after the dash rule into a comma-joined, de-duplicated code list."""
    if cat_part.upper().startswith("GK") and len(cat_part) > 8:
        # Drop the first 8 characters, then keep the first whitespace-free token.
        codes = cat_part[8:].split()[0]
    else:
        run = _CODE_RUN_RE.match(cat_part)
        codes = run.group(1).replace(" ", "") if run else ""
        # Comma between a letter and a following digit, e.g. "TH3U" -> "TH,3U".
        codes = _LETTER_BEFORE_DIGIT_RE.sub(r'\1,', codes)
        # Comma before a two-letter code glued onto the preceding characters.
        codes = _GLUED_CODE_RE.sub(',', codes)
    # Drop four trailing digits, if present.
    codes = _TRAILING_YEAR_RE.sub('', codes)
    # dict.fromkeys keeps first-seen order while removing repeats.
    return ",".join(dict.fromkeys(tok for tok in codes.split(',') if tok))


def _parse_record(rec):
    """Turn one grouped record into [ref, part_no, description, catalogue_codes]."""
    raw = " ".join(rec["buf"]).translate(_FILLER_CHARS)
    raw = _WHITESPACE_RE.sub(' ', raw).strip()

    # Without a dash rule everything is description and the codes are empty.
    before, _, after = raw.partition(_DASH_RULE)
    desc = _clean_description(before.strip())
    cat = _extract_catalogue_codes(after.strip())

    part_no = rec["part_no"]
    glued = _GLUED_SUFFIX_RE.match(part_no)
    if glued:
        # Three or more capitals at the end: keep two on the part number and
        # prepend the rest to the description (presumably description text
        # that got glued onto the number during extraction - TODO confirm).
        core, suffix = glued.groups()
        part_no = core + suffix[:2]
        desc = f"{suffix[2:]} {desc}".strip()
    return [rec["ref"], part_no, desc, cat]


def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Parse one section's parts table into rows.

    Finds the section by a plain-text scan, re-extracts its pages in layout
    mode starting after the 'Reqd. QTY' header, groups the lines per part
    number and parses each group.

    Args:
        pdf_path: Path of the PDF, handed to pdfplumber.open().
        section_code: Code the section header line starts with
            (e.g. "F-19-40"); compared case-insensitively.
        section_title: Text that must also occur in the header line;
            compared case-insensitively.

    Returns:
        A list of [REF_NO, PART_NO, DESCRIPTION, CATALOGUE_CODE] lists of
        strings, one per part-number line, in document order.

    Raises:
        ValueError: If the section header or a following table header is
            not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        page_range = _find_page_range(pages, code, title)
        if page_range is None:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        start_page, end_page = page_range
        collected = _collect_table_lines(pages[start_page:end_page], code)

    return [_parse_record(rec) for rec in _group_part_lines(collected)]

if __name__ == "__main__":
    # Demo run: print every parsed row of section F-19-40 "REAR WHEEL".
    pdf_file = "NC750XAP_13MKWM02_PC_2022_2023.pdf"
    parsed_rows = extract_section_with_layout(
        pdf_path=pdf_file,
        section_code="F-19-40",
        section_title="REAR WHEEL",
    )
    for row in parsed_rows:
        print(row)


['1', '06410-MGS-D30', 'DAMPER SET, WHEEL', '']
['2', '41201-MGS-D30', 'SPROCKET, FINAL DRIVEN(43T)', '']
['2', '41201-MKW-D10', 'SPROCKET, FINAL DRIVEN(41T)', '']
['3', '42301-MGS-D31', 'AXLE, RR. WHEEL', '']
['4', '42311-MGS-D10', 'COLLAR, L. RR. WHEEL SIDE', '']
['5', '42312-MGS-D10', 'COLLAR, RR. BRAKE SIDE', '']
['6', '42515-MGS-D80', 'RING, RR. PULSER', '']
['7', '42615-MGS-D30', 'FLANGE SUB ASSY., RR. DRIVEN', '']
['8', '42620-MGS-D10', 'COLLAR, RR. AXLE DISTANCE', '']
['9', '42625-MGS-D30', 'COLLAR B, RR. WHEEL DISTANCE', '']
['10', '42650-MKA-D81ZB', 'WHEEL SUB ASSY., RR. *TYPE1*', '']
['11', '42704-MER-D00', 'WEIGHT, BALANCE(10G) N N N N N N', '']
['11', '42705-MER-D00', 'WEIGHT, BALANCE(20G) N N N N N N', '']
['11', '42706-MER-D00', 'WEIGHT, BALANCE(30G) N N N N N N', '']
['12', '42711-MJL-D34', 'TIRE, RR.(DUNLOP) (160/60ZR17 M/C 69W)', '']
['12', '42711-MKW-D01', 'TIRE, RR.(METZELER) (160/60ZR17 M/C 69W)', '']
['13', '42755-MKA-D81', 'VALVE ASSY., CLAMP IN', '']
['14', '427

In [239]:
import re
import pdfplumber

# Patterns are compiled once at import time instead of per call / per line.
_NEXT_SECTION_RE = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
_TABLE_HEADER_RE = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
_PART_NO_RE = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
_REF_NO_RE = re.compile(r'^\s*(?:\((\d+)\)|(\d+))\s+')
_BARE_NUMBER_RE = re.compile(r'\d+')
_BARE_DATE_RE = re.compile(r'\d{4}\.\d{2}\.\d{2}')
_WHITESPACE_RE = re.compile(r'\s+')
_HAS_LETTER_RE = re.compile(r'[A-Za-z]')
_CODE_RUN_RE = re.compile(r'[-\s]*([0-9A-Z,\s]+)')
_LETTER_BEFORE_DIGIT_RE = re.compile(r'([A-Z])(?=\d)')
_GLUED_CODE_RE = re.compile(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))')
_TRAILING_YEAR_RE = re.compile(r'\d{4}$')
_GLUED_SUFFIX_RE = re.compile(r'^(.+?)([A-Z]{3,})$')
# Leader/bullet glyphs that are deleted from the row text before parsing.
_FILLER_CHARS = str.maketrans('', '', '∙•\uf020')
# First run of eight dashes separates the description side from the codes.
_DASH_RULE = "--------"
# Trailers removed from the description, applied in this order.
_DESC_TRAILER_RES = tuple(re.compile(p) for p in (
    r'\.{2,}\s+\d.*$',                   # dot leader, then digit, to the end
    r'\s+GK[A-Za-z0-9]+\s*$',            # trailing GK... token
    # Same language as the former r'\s+(?:-+|\d+)+\s*$', but without the
    # nested quantifier that backtracks exponentially on long digit runs.
    r'\s+[-\d]+\s*$',
    r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$',  # number followed by a dotted date
    r'(?:\s+(?:\(\d+\)|-+|\d+))+$',      # trailing "(n)", dash or number tokens
    r'\.{2,}$',                          # leftover dot leader
    # Trailing single-letter columns like " N N N".
    # NOTE(review): this also removes a genuine trailing single letter
    # (a description ending in " B" loses it) - confirm that is acceptable.
    r'(?:\s+[A-Z])+$',
))


def _starts_other_section(text, code):
    """Tell whether an upper-cased line opens with a section code other than `code`."""
    return bool(_NEXT_SECTION_RE.match(text)) and not text.startswith(code)


def _find_page_range(pages, code, title):
    """Plain-text scan: return (start_page, end_page) or None if not found."""
    start_page = None
    header_hit = False
    end_page = None
    for page_no, page in enumerate(pages):
        for raw_line in (page.extract_text() or "").splitlines():
            text = raw_line.strip().upper()
            if start_page is None:
                # The former extra "FRAMEGROUP" test was implied by this one.
                if text.startswith(code) and title in text:
                    start_page = page_no
                    # NOTE(review): the rest of the start page is not scanned,
                    # so only a 'Reqd. QTY' header on a later page counts
                    # here. Confirm this is intended.
                    break
            elif not header_hit:
                if _TABLE_HEADER_RE.search(text):
                    header_hit = True
            elif _starts_other_section(text, code):
                end_page = page_no
                break
        if end_page is not None:
            break
    if start_page is None or not header_hit:
        return None
    # Without a following section the range runs to the last page.
    return start_page, (len(pages) if end_page is None else end_page)


def _collect_table_lines(pages, code):
    """Layout-mode scan: return the raw lines that follow the first table header."""
    collected = []
    in_table = False
    for page in pages:
        for raw_line in (page.extract_text(layout=True) or "").splitlines():
            text = raw_line.strip().upper()
            if not in_table:
                # Skip everything up to and including the header line.
                if _TABLE_HEADER_RE.search(text):
                    in_table = True
                continue
            if _starts_other_section(text, code):
                # NOTE(review): only the remainder of this page is skipped;
                # later pages in the range are still collected.
                break
            collected.append(raw_line)
    return collected


def _group_part_lines(lines):
    """Start a record at every line holding a part number; attach other lines to the latest record."""
    records = []
    last_ref = ""
    for line in lines:
        part_match = _PART_NO_RE.search(line)
        if part_match:
            ref_match = _REF_NO_RE.match(line)
            if ref_match:
                last_ref = ref_match.group(1) or ref_match.group(2)
            # Lines without a leading reference number reuse the previous one.
            records.append({
                "ref": last_ref,
                "part_no": part_match.group(0),
                "buf": [line[part_match.end():].strip()],
            })
            continue
        if not records:
            continue
        text = line.strip()
        # Lines that are only a number or only a yyyy.mm.dd date are dropped.
        if _BARE_NUMBER_RE.fullmatch(text) or _BARE_DATE_RE.fullmatch(text):
            continue
        records[-1]["buf"].append(text)
    return records


def _clean_description(text):
    """Strip the known trailers; return "" when no ASCII letter is left."""
    for trailer_re in _DESC_TRAILER_RES:
        text = trailer_re.sub('', text).strip()
    return text if _HAS_LETTER_RE.search(text) else ""


def _extract_catalogue_codes(cat_part):
    """Normalise the text after the dash rule into a comma-joined, de-duplicated code list."""
    if cat_part.upper().startswith("GK") and len(cat_part) > 8:
        # Drop the first 8 characters, then keep the first whitespace-free token.
        codes = cat_part[8:].split()[0]
    else:
        run = _CODE_RUN_RE.match(cat_part)
        codes = run.group(1).replace(" ", "") if run else ""
        # Comma between a letter and a following digit, e.g. "TH3U" -> "TH,3U".
        codes = _LETTER_BEFORE_DIGIT_RE.sub(r'\1,', codes)
        # Comma before a two-letter code glued onto the preceding characters.
        codes = _GLUED_CODE_RE.sub(',', codes)
    # Drop four trailing digits, if present.
    codes = _TRAILING_YEAR_RE.sub('', codes)
    # dict.fromkeys keeps first-seen order while removing repeats.
    return ",".join(dict.fromkeys(tok for tok in codes.split(',') if tok))


def _parse_record(rec):
    """Turn one grouped record into [ref, part_no, description, catalogue_codes]."""
    raw = " ".join(rec["buf"]).translate(_FILLER_CHARS)
    raw = _WHITESPACE_RE.sub(' ', raw).strip()

    # Without a dash rule everything is description and the codes are empty.
    before, _, after = raw.partition(_DASH_RULE)
    desc = _clean_description(before.strip())
    cat = _extract_catalogue_codes(after.strip())

    part_no = rec["part_no"]
    glued = _GLUED_SUFFIX_RE.match(part_no)
    if glued:
        # Three or more capitals at the end: keep two on the part number and
        # prepend the rest to the description (presumably description text
        # that got glued onto the number during extraction - TODO confirm).
        core, suffix = glued.groups()
        part_no = core + suffix[:2]
        desc = f"{suffix[2:]} {desc}".strip()
    return [rec["ref"], part_no, desc, cat]


def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Parse one section's parts table into rows.

    Finds the section by a plain-text scan, re-extracts its pages in layout
    mode starting after the 'Reqd. QTY' header, groups the lines per part
    number and parses each group. Trailing single-letter columns are
    removed from the description.

    Args:
        pdf_path: Path of the PDF, handed to pdfplumber.open().
        section_code: Code the section header line starts with
            (e.g. "F-40-2"); compared case-insensitively.
        section_title: Text that must also occur in the header line;
            compared case-insensitively.

    Returns:
        A list of [REF_NO, PART_NO, DESCRIPTION, CATALOGUE_CODE] lists of
        strings, one per part-number line, in document order.

    Raises:
        ValueError: If the section header or a following table header is
            not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        page_range = _find_page_range(pages, code, title)
        if page_range is None:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        start_page, end_page = page_range
        collected = _collect_table_lines(pages[start_page:end_page], code)

    return [_parse_record(rec) for rec in _group_part_lines(collected)]

if __name__ == "__main__":
    # Demo run: print every parsed row of section F-40-2.
    pdf_file = "CRF1000 A_PC_13MJPG02_(G.H).pdf"
    parsed_rows = extract_section_with_layout(
        pdf_path=pdf_file,
        section_code="F-40-2",
        section_title="MARK/EMBLEM(CRF COLOR)",
    )
    for row in parsed_rows:
        print(row)


['1', '86101-MJP-G50ZA', 'MARK , HONDA(65MM) *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U']
['2', '86102-MJP-G50ZA', 'MARK , HONDA(55MM) *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['3', '86171-MJP-F50ZA', 'STRIPE A, R. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['4', '86172-MJP-F50ZA', 'STRIPE A, L. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['5', '86173-MJP-F50ZA', 'STRIPE B, R. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['6', '86174-MJP-F50ZA', 'STRIPE B, L. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['7', '86211-MJP-G50', 'BADGE, R. PRODUCT', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['8', '86212-MJP-G50', 'BADGE, L. PRODUCT', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['9', '86611-MJP-F50ZA', 'STRIPE , R. FR. FENDER *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['10', '86612-MJP-F50ZA', 'STRIPE , L. FR. FENDER *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['11', '86641-MJP-F50ZA', 'STRIPE A, R. MIDDLE COWL *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['12', '86642-M

In [256]:
import re
import pdfplumber

# Patterns are compiled once at import time instead of per call / per line.
_NEXT_SECTION_RE = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
_TABLE_HEADER_RE = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
_PART_NO_RE = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
# Used with search(); equivalent on a single line to matching the former
# r'.*PART\s*NO\.?\s*INDEX.*' but without the backtracking leading '.*'.
_INDEX_MARKER_RE = re.compile(r'PART\s*NO\.?\s*INDEX', re.IGNORECASE)
_REF_NO_RE = re.compile(r'^\s*(?:\((\d+)\)|(\d+))\s+')
_BARE_NUMBER_RE = re.compile(r'\d+')
_BARE_DATE_RE = re.compile(r'\d{4}\.\d{2}\.\d{2}')
_WHITESPACE_RE = re.compile(r'\s+')
_HAS_LETTER_RE = re.compile(r'[A-Za-z]')
_CODE_RUN_RE = re.compile(r'[-\s]*([0-9A-Z,\s]+)')
_LETTER_BEFORE_DIGIT_RE = re.compile(r'([A-Z])(?=\d)')
_GLUED_CODE_RE = re.compile(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))')
_TRAILING_YEAR_RE = re.compile(r'\d{4}$')
_GLUED_SUFFIX_RE = re.compile(r'^(.+?)([A-Z]{3,})$')
# Leader/bullet glyphs that are deleted from the row text before parsing.
_FILLER_CHARS = str.maketrans('', '', '∙•\uf020')
# First run of eight dashes separates the description side from the codes.
_DASH_RULE = "--------"
# Trailers removed from the description, applied in this order.
_DESC_TRAILER_RES = tuple(re.compile(p) for p in (
    r'\.{2,}\s+\d.*$',                   # dot leader, then digit, to the end
    r'\s+GK[A-Za-z0-9]+\s*$',            # trailing GK... token
    # Same language as the former r'\s+(?:-+|\d+)+\s*$', but without the
    # nested quantifier that backtracks exponentially on long digit runs.
    r'\s+[-\d]+\s*$',
    r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$',  # number followed by a dotted date
    r'(?:\s+(?:\(\d+\)|-+|\d+))+$',      # trailing "(n)", dash or number tokens
    r'\.{2,}$',                          # leftover dot leader
    # Trailing single-letter columns like " N N N".
    # NOTE(review): this also removes a genuine trailing single letter
    # (a description ending in " B" loses it) - confirm that is acceptable.
    r'(?:\s+[A-Z])+$',
))


def _starts_other_section(text, code):
    """Tell whether an upper-cased line opens with a section code other than `code`."""
    return bool(_NEXT_SECTION_RE.match(text)) and not text.startswith(code)


def _find_page_range(pages, code, title):
    """Plain-text scan: return (start_page, end_page) or None if not found."""
    start_page = None
    header_hit = False
    end_page = None
    for page_no, page in enumerate(pages):
        for raw_line in (page.extract_text() or "").splitlines():
            text = raw_line.strip().upper()
            if start_page is None:
                # The former extra "FRAMEGROUP" test was implied by this one.
                if text.startswith(code) and title in text:
                    start_page = page_no
                    # NOTE(review): the rest of the start page is not scanned,
                    # so only a 'Reqd. QTY' header on a later page counts
                    # here. Confirm this is intended.
                    break
            elif not header_hit:
                if _TABLE_HEADER_RE.search(text):
                    header_hit = True
            elif _starts_other_section(text, code):
                end_page = page_no
                break
        if end_page is not None:
            break
    if start_page is None or not header_hit:
        return None
    # Without a following section the range runs to the last page.
    return start_page, (len(pages) if end_page is None else end_page)


def _collect_table_lines(pages, code):
    """Layout-mode scan: return the raw lines after the first table header, up to any index marker."""
    collected = []
    in_table = False
    for page in pages:
        for raw_line in (page.extract_text(layout=True) or "").splitlines():
            text = raw_line.strip().upper()
            # A "PART NO INDEX" line ends collection for good, even when it
            # shows up before the table header.
            if _INDEX_MARKER_RE.search(text):
                return collected
            if not in_table:
                # Skip everything up to and including the header line.
                if _TABLE_HEADER_RE.search(text):
                    in_table = True
                continue
            if _starts_other_section(text, code):
                # NOTE(review): only the remainder of this page is skipped;
                # later pages in the range are still collected.
                break
            collected.append(raw_line)
    return collected


def _group_part_lines(lines):
    """Start a record at every line holding a part number; attach other lines to the latest record."""
    records = []
    last_ref = ""
    for line in lines:
        part_match = _PART_NO_RE.search(line)
        if part_match:
            ref_match = _REF_NO_RE.match(line)
            if ref_match:
                last_ref = ref_match.group(1) or ref_match.group(2)
            # Lines without a leading reference number reuse the previous one.
            records.append({
                "ref": last_ref,
                "part_no": part_match.group(0),
                "buf": [line[part_match.end():].strip()],
            })
            continue
        if not records:
            continue
        text = line.strip()
        # Lines that are only a number or only a yyyy.mm.dd date are dropped.
        if _BARE_NUMBER_RE.fullmatch(text) or _BARE_DATE_RE.fullmatch(text):
            continue
        records[-1]["buf"].append(text)
    return records


def _clean_description(text):
    """Strip the known trailers; return "" when no ASCII letter is left."""
    for trailer_re in _DESC_TRAILER_RES:
        text = trailer_re.sub('', text).strip()
    return text if _HAS_LETTER_RE.search(text) else ""


def _extract_catalogue_codes(cat_part):
    """Normalise the text after the dash rule into a comma-joined, de-duplicated code list."""
    if cat_part.upper().startswith("GK") and len(cat_part) > 8:
        # Drop the first 8 characters, then keep the first whitespace-free token.
        codes = cat_part[8:].split()[0]
    else:
        run = _CODE_RUN_RE.match(cat_part)
        codes = run.group(1).replace(" ", "") if run else ""
        # Comma between a letter and a following digit, e.g. "TH3U" -> "TH,3U".
        codes = _LETTER_BEFORE_DIGIT_RE.sub(r'\1,', codes)
        # Comma before a two-letter code glued onto the preceding characters.
        codes = _GLUED_CODE_RE.sub(',', codes)
    # Drop four trailing digits, if present.
    codes = _TRAILING_YEAR_RE.sub('', codes)
    # dict.fromkeys keeps first-seen order while removing repeats.
    return ",".join(dict.fromkeys(tok for tok in codes.split(',') if tok))


def _parse_record(rec):
    """Turn one grouped record into [ref, part_no, description, catalogue_codes]."""
    raw = " ".join(rec["buf"]).translate(_FILLER_CHARS)
    raw = _WHITESPACE_RE.sub(' ', raw).strip()

    # Without a dash rule everything is description and the codes are empty.
    before, _, after = raw.partition(_DASH_RULE)
    desc = _clean_description(before.strip())
    cat = _extract_catalogue_codes(after.strip())

    part_no = rec["part_no"]
    glued = _GLUED_SUFFIX_RE.match(part_no)
    if glued:
        # Three or more capitals at the end: keep two on the part number and
        # prepend the rest to the description (presumably description text
        # that got glued onto the number during extraction - TODO confirm).
        core, suffix = glued.groups()
        part_no = core + suffix[:2]
        desc = f"{suffix[2:]} {desc}".strip()
    return [rec["ref"], part_no, desc, cat]


def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Parse one section's parts table into rows.

    Finds the section by a plain-text scan, re-extracts its pages in layout
    mode starting after the 'Reqd. QTY' header, groups the lines per part
    number and parses each group. Collection stops for good at the first
    layout line containing "PART NO INDEX" (case-insensitive; optional
    whitespace between the words and an optional period after "NO").

    Args:
        pdf_path: Path of the PDF, handed to pdfplumber.open().
        section_code: Code the section header line starts with
            (e.g. "F-40-2"); compared case-insensitively.
        section_title: Text that must also occur in the header line;
            compared case-insensitively.

    Returns:
        A list of [REF_NO, PART_NO, DESCRIPTION, CATALOGUE_CODE] lists of
        strings, one per part-number line, in document order.

    Raises:
        ValueError: If the section header or a following table header is
            not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        page_range = _find_page_range(pages, code, title)
        if page_range is None:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        start_page, end_page = page_range
        collected = _collect_table_lines(pages[start_page:end_page], code)

    return [_parse_record(rec) for rec in _group_part_lines(collected)]

if __name__ == "__main__":
    # Demo run: print every parsed row of section F-40-2.
    pdf_file = "CRF1000 A_PC_13MJPG02_(G.H).pdf"
    parsed_rows = extract_section_with_layout(
        pdf_path=pdf_file,
        section_code="F-40-2",
        section_title="MARK/EMBLEM(CRF COLOR)",
    )
    for row in parsed_rows:
        print(row)


['1', '86101-MJP-G50ZA', 'MARK , HONDA(65MM) *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U']
['2', '86102-MJP-G50ZA', 'MARK , HONDA(55MM) *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['3', '86171-MJP-F50ZA', 'STRIPE A, R. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['4', '86172-MJP-F50ZA', 'STRIPE A, L. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['5', '86173-MJP-F50ZA', 'STRIPE B, R. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['6', '86174-MJP-F50ZA', 'STRIPE B, L. FUEL TANK *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['7', '86211-MJP-G50', 'BADGE, R. PRODUCT', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['8', '86212-MJP-G50', 'BADGE, L. PRODUCT', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['9', '86611-MJP-F50ZA', 'STRIPE , R. FR. FENDER *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['10', '86612-MJP-F50ZA', 'STRIPE , L. FR. FENDER *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['11', '86641-MJP-F50ZA', 'STRIPE A, R. MIDDLE COWL *TYPE1*', '3ED,3GS,3IN,3KO,3RU,3TH,3U,4CH']
['12', '86642-M

In [1]:
import re
import pandas as pd
import pdfplumber

# Patterns are compiled once at import time instead of per call / per line.
_NEXT_SECTION_RE = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
_TABLE_HEADER_RE = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
_PART_NO_RE = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
# Used with search(); equivalent on a single line to matching the former
# r'.*PART\s*NO\.?\s*INDEX.*' but without the backtracking leading '.*'.
_INDEX_MARKER_RE = re.compile(r'PART\s*NO\.?\s*INDEX', re.IGNORECASE)
_REF_NO_RE = re.compile(r'^\s*(?:\((\d+)\)|(\d+))\s+')
_BARE_NUMBER_RE = re.compile(r'\d+')
_BARE_DATE_RE = re.compile(r'\d{4}\.\d{2}\.\d{2}')
_WHITESPACE_RE = re.compile(r'\s+')
_HAS_LETTER_RE = re.compile(r'[A-Za-z]')
_CODE_RUN_RE = re.compile(r'[-\s]*([0-9A-Z,\s]+)')
_LETTER_BEFORE_DIGIT_RE = re.compile(r'([A-Z])(?=\d)')
_GLUED_CODE_RE = re.compile(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))')
_TRAILING_YEAR_RE = re.compile(r'\d{4}$')
_GLUED_SUFFIX_RE = re.compile(r'^(.+?)([A-Z]{3,})$')
# Leader/bullet glyphs that are deleted from the row text before parsing.
_FILLER_CHARS = str.maketrans('', '', '∙•\uf020')
# First run of eight dashes separates the description side from the codes.
_DASH_RULE = "--------"
# Trailers removed from the description, applied in this order.
_DESC_TRAILER_RES = tuple(re.compile(p) for p in (
    r'\.{2,}\s+\d.*$',                   # dot leader, then digit, to the end
    r'\s+GK[A-Za-z0-9]+\s*$',            # trailing GK... token
    # Same language as the former r'\s+(?:-+|\d+)+\s*$', but without the
    # nested quantifier that backtracks exponentially on long digit runs.
    r'\s+[-\d]+\s*$',
    r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$',  # number followed by a dotted date
    r'(?:\s+(?:\(\d+\)|-+|\d+))+$',      # trailing "(n)", dash or number tokens
    r'\.{2,}$',                          # leftover dot leader
    # Trailing single-letter columns like " N N N".
    # NOTE(review): this also removes a genuine trailing single letter
    # (a description ending in " B" loses it) - confirm that is acceptable.
    r'(?:\s+[A-Z])+$',
))


def _starts_other_section(text, code):
    """Tell whether an upper-cased line opens with a section code other than `code`."""
    return bool(_NEXT_SECTION_RE.match(text)) and not text.startswith(code)


def _find_page_range(pages, code, title):
    """Plain-text scan: return (start_page, end_page) or None if not found."""
    start_page = None
    header_hit = False
    end_page = None
    for page_no, page in enumerate(pages):
        for raw_line in (page.extract_text() or "").splitlines():
            text = raw_line.strip().upper()
            if start_page is None:
                # The former extra "FRAMEGROUP" test was implied by this one.
                if text.startswith(code) and title in text:
                    start_page = page_no
                    # NOTE(review): the rest of the start page is not scanned,
                    # so only a 'Reqd. QTY' header on a later page counts
                    # here. Confirm this is intended.
                    break
            elif not header_hit:
                if _TABLE_HEADER_RE.search(text):
                    header_hit = True
            elif _starts_other_section(text, code):
                end_page = page_no
                break
        if end_page is not None:
            break
    if start_page is None or not header_hit:
        return None
    # Without a following section the range runs to the last page.
    return start_page, (len(pages) if end_page is None else end_page)


def _collect_table_lines(pages, code):
    """Layout-mode scan: return the raw lines after the first table header, up to any index marker."""
    collected = []
    in_table = False
    for page in pages:
        for raw_line in (page.extract_text(layout=True) or "").splitlines():
            text = raw_line.strip().upper()
            # A "PART NO INDEX" line ends collection for good, even when it
            # shows up before the table header.
            if _INDEX_MARKER_RE.search(text):
                return collected
            if not in_table:
                # Skip everything up to and including the header line.
                if _TABLE_HEADER_RE.search(text):
                    in_table = True
                continue
            if _starts_other_section(text, code):
                # NOTE(review): only the remainder of this page is skipped;
                # later pages in the range are still collected.
                break
            collected.append(raw_line)
    return collected


def _group_part_lines(lines):
    """Start a record at every line holding a part number; attach other lines to the latest record."""
    records = []
    last_ref = ""
    for line in lines:
        part_match = _PART_NO_RE.search(line)
        if part_match:
            ref_match = _REF_NO_RE.match(line)
            if ref_match:
                last_ref = ref_match.group(1) or ref_match.group(2)
            # Lines without a leading reference number reuse the previous one.
            records.append({
                "ref": last_ref,
                "part_no": part_match.group(0),
                "buf": [line[part_match.end():].strip()],
            })
            continue
        if not records:
            continue
        text = line.strip()
        # Lines that are only a number or only a yyyy.mm.dd date are dropped.
        if _BARE_NUMBER_RE.fullmatch(text) or _BARE_DATE_RE.fullmatch(text):
            continue
        records[-1]["buf"].append(text)
    return records


def _clean_description(text):
    """Strip the known trailers; return "" when no ASCII letter is left."""
    for trailer_re in _DESC_TRAILER_RES:
        text = trailer_re.sub('', text).strip()
    return text if _HAS_LETTER_RE.search(text) else ""


def _extract_remarks(cat_part):
    """Normalise the text after the dash rule into a comma-joined, de-duplicated code list."""
    if cat_part.upper().startswith("GK") and len(cat_part) > 8:
        # Drop the first 8 characters, then keep the first whitespace-free token.
        codes = cat_part[8:].split()[0]
    else:
        run = _CODE_RUN_RE.match(cat_part)
        codes = run.group(1).replace(" ", "") if run else ""
        # Comma between a letter and a following digit, e.g. "TH3U" -> "TH,3U".
        codes = _LETTER_BEFORE_DIGIT_RE.sub(r'\1,', codes)
        # Comma before a two-letter code glued onto the preceding characters.
        codes = _GLUED_CODE_RE.sub(',', codes)
    # Drop four trailing digits, if present.
    codes = _TRAILING_YEAR_RE.sub('', codes)
    # dict.fromkeys keeps first-seen order while removing repeats.
    return ",".join(dict.fromkeys(tok for tok in codes.split(',') if tok))


def _parse_record(rec):
    """Turn one grouped record into a (ref_no, part_no, description, remarks) tuple."""
    raw = " ".join(rec["buf"]).translate(_FILLER_CHARS)
    raw = _WHITESPACE_RE.sub(' ', raw).strip()

    # Without a dash rule everything is description and the remarks are empty.
    before, _, after = raw.partition(_DASH_RULE)
    desc = _clean_description(before.strip())
    remarks = _extract_remarks(after.strip())

    part_no = rec["part_no"]
    glued = _GLUED_SUFFIX_RE.match(part_no)
    if glued:
        # Three or more capitals at the end: keep two on the part number and
        # prepend the rest to the description (presumably description text
        # that got glued onto the number during extraction - TODO confirm).
        core, suffix = glued.groups()
        part_no = core + suffix[:2]
        desc = f"{suffix[2:]} {desc}".strip()
    return rec["ref"], part_no, desc, remarks


def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str):
    """
    Parse one section's parts table into a DataFrame.

    Finds the section by a plain-text scan, re-extracts its pages in layout
    mode starting after the 'Reqd. QTY' header, groups the lines per part
    number and parses each group. Collection stops for good at the first
    layout line containing "PART NO INDEX" (case-insensitive; optional
    whitespace between the words and an optional period after "NO").

    Args:
        pdf_path: Path of the PDF, handed to pdfplumber.open().
        section_code: Code the section header line starts with
            (e.g. "F-50"); compared case-insensitively.
        section_title: Text that must also occur in the header line;
            compared case-insensitively.

    Returns:
        A pandas DataFrame with the string columns ref_no, part_no,
        description and remarks, one row per part-number line, in document
        order.

    Raises:
        ValueError: If the section header or a following table header is
            not found.
    """
    code = section_code.upper()
    title = section_title.upper()

    with pdfplumber.open(pdf_path) as pdf:
        pages = pdf.pages
        page_range = _find_page_range(pages, code, title)
        if page_range is None:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")
        start_page, end_page = page_range
        collected = _collect_table_lines(pages[start_page:end_page], code)

    parsed = [_parse_record(rec) for rec in _group_part_lines(collected)]

    # Built from a dict of column lists so an empty result still carries
    # all four columns.
    return pd.DataFrame({
        'ref_no':      [row[0] for row in parsed],
        'part_no':     [row[1] for row in parsed],
        'description': [row[2] for row in parsed],
        'remarks':     [row[3] for row in parsed],
    })


if __name__ == "__main__":
    # Demo run: show the first 20 parsed rows of section F-50 "MARK".
    pdf_file = "NC750XAP_13MKWM02_PC_2022_2023.pdf"
    df = extract_section_with_layout(
        pdf_path=pdf_file,
        section_code="F-50",
        section_title="MARK",
    )
    print(df.head(n=20))


   ref_no          part_no                   description          remarks
0       1    11378-MKS-E50                 PLATE, EMBLEM                 
1       2  86201-MKA-A30ZB  MARK, R. WING(105MM) *TYPE2*                 
2       2  86201-MKW-D00ZC                       *TYPE1*  2ED,CH,ED,FO,GS
3       2  86201-MKW-D00ZB                       *TYPE2*                 
4       2  86201-MKW-D00ZA                       *TYPE3*           2ED,ED
5       2  86201-MKW-D20ZB                       *TYPE2*           2ED,ED
6       2  86201-MKW-D20ZC                       *TYPE1*                 
7       2  86201-MKW-D20ZA                       *TYPE3*        2ED,ED,GS
8       3  86202-MKA-A30ZB  MARK, L. WING(105MM) *TYPE2*                 
9       3  86202-MKW-D00ZC                       *TYPE1*  2ED,CH,ED,FO,GS
10      3  86202-MKW-D00ZB                       *TYPE2*                 
11      3  86202-MKW-D00ZA                       *TYPE3*           2ED,ED
12      3  86202-MKW-D20ZB            

In [None]:
import re
import pandas as pd
import pdfplumber

def extract_section_with_layout(pdf_path: str, section_code: str, section_title: str) -> pd.DataFrame:
    """
    Extract the parts table of one catalogue section into a DataFrame.

    Phase 1 scans the plain text of every page to find the page range:
    the section starts at the first line that begins with ``section_code``
    and contains ``section_title`` (both compared upper-cased), must be
    followed by a 'Reqd. QTY' table header, and ends at the first later line
    that looks like a section header for a different code.
    Phase 2 re-extracts those pages with ``layout=True`` and keeps every line
    after the first table header. Collection stops for good at a line matching
    'PART NO INDEX' or at the header of a different section.
    Phase 3/4 group the lines per part number and clean them up.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
        section_code: Section code such as "F-50" (case-insensitive).
        section_title: Text that must appear in the section header line
            (case-insensitive).

    Returns:
        DataFrame with the string columns ``ref_no``, ``part_no``,
        ``description`` and ``remarks``; one row per detected part number.

    Raises:
        ValueError: If the section header or a table header after it cannot
            be found.
    """
    code = section_code.upper()
    title = section_title.upper()

    start_page = None
    header_hit = False
    end_page = None

    with pdfplumber.open(pdf_path) as pdf:
        # Phase 1: locate the page range on plain text.
        for i, page in enumerate(pdf.pages):
            for ln in (page.extract_text() or "").splitlines():
                u = ln.strip().upper()
                if start_page is None:
                    # Deliberately no `break` here: the table header usually
                    # sits on the same page as the section header, so the
                    # rest of this page must still be scanned. Breaking made
                    # a single-page section depend on the *next* section's
                    # table header (or fail if there was none).
                    if u.startswith(code) and title in u:
                        start_page = i
                elif not header_hit:
                    if _TABLE_HEADER_RE.search(u):
                        header_hit = True
                elif _NEXT_SECTION_RE.match(u) and not u.startswith(code):
                    end_page = i
                    break
            if end_page is not None:
                break

        if start_page is None or not header_hit:
            raise ValueError(f"Section '{section_code} {section_title}' not found or missing table header.")

        if end_page is None:
            stop_page = len(pdf.pages)
        else:
            # The page holding the next section's header is excluded, unless
            # that header is on the start page itself; then the start page is
            # still scanned and Phase 2 stops at the header line.
            stop_page = max(end_page, start_page + 1)

        # Phase 2: collect layout-preserved lines of the table.
        collected = []
        in_table = False
        finished = False
        for pi in range(start_page, stop_page):
            for ln in (pdf.pages[pi].extract_text(layout=True) or "").splitlines():
                u = ln.strip().upper()
                if _CATALOG_END_RE.match(u):
                    finished = True
                    break
                if not in_table:
                    if _TABLE_HEADER_RE.search(u):
                        in_table = True
                    continue
                if _NEXT_SECTION_RE.match(u) and not u.startswith(code):
                    # A different section begins: nothing after this line
                    # belongs to the requested section, so stop entirely
                    # instead of resuming on the following page.
                    finished = True
                    break
                collected.append(ln)
            if finished:
                break

    # Phase 3 + 4: group lines per part number, then clean each group.
    ref_nos = []
    part_nos = []
    descriptions = []
    remarks_list = []
    for ref, raw_part_no, fragments in _group_part_lines(collected):
        part_no, desc, remarks = _parse_part_entry(raw_part_no, fragments)
        ref_nos.append(ref)
        part_nos.append(part_no)
        descriptions.append(desc)
        remarks_list.append(remarks)

    return pd.DataFrame({
        'ref_no':      ref_nos,
        'part_no':     part_nos,
        'description': descriptions,
        'remarks':     remarks_list,
    })


# --- private helpers of extract_section_with_layout -------------------------

# Line that starts like a section code, e.g. "F-50".
_NEXT_SECTION_RE = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
# Column header that marks the beginning of the parts table.
_TABLE_HEADER_RE = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
# Part number: at least five digits followed by one or more "-XXX" groups.
_PART_NO_RE = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
# Any line mentioning "PART NO INDEX" ends the collection.
_CATALOG_END_RE = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)
# Leading reference number, either "12" or "(12)".
_REF_NO_RE = re.compile(r'^\s*(?:\((\d+)\)|(\d+))\s+')
# Lines consisting only of a number or only of a YYYY.MM.DD date are dropped.
_DIGITS_ONLY_RE = re.compile(r'\d+')
_DATE_ONLY_RE = re.compile(r'\d{4}\.\d{2}\.\d{2}')


def _group_part_lines(lines):
    """Group table lines into ``(ref_no, raw_part_no, text_fragments)`` entries.

    A line containing a part number opens a new entry; the text after the part
    number is its first fragment. A line without a leading reference number
    inherits the most recent one. Lines without a part number are appended to
    the current entry, except pure-number and pure-date lines. Lines before
    the first part number are ignored.
    """
    entries = []
    last_ref = ""
    for ln in lines:
        m_pno = _PART_NO_RE.search(ln)
        if m_pno:
            m_ref = _REF_NO_RE.match(ln)
            if m_ref:
                last_ref = m_ref.group(1) or m_ref.group(2)
            entries.append((last_ref, m_pno.group(0), [ln[m_pno.end():].strip()]))
            continue
        if not entries:
            continue
        txt = ln.strip()
        if _DIGITS_ONLY_RE.fullmatch(txt) or _DATE_ONLY_RE.fullmatch(txt):
            continue
        entries[-1][2].append(txt)
    return entries


def _parse_part_entry(raw_part_no, fragments):
    """Turn one grouped entry into ``(part_no, description, remarks)``.

    The joined text is split at the first run of eight dashes: the left side
    is cleaned into the description, the right side into a comma-separated,
    de-duplicated list of codes (the remarks).
    """
    raw = " ".join(fragments)
    raw = raw.replace('∙', '').replace('•', '').replace('\uf020', '')
    raw = re.sub(r'\s+', ' ', raw).strip()

    # Without the separator everything is description and there are no codes.
    desc_part, _, cat_part = raw.partition("--------")
    desc_part = desc_part.strip()
    cat_part = cat_part.strip()

    # Description: strip trailing dot leaders, GK codes, counts, dates and
    # stray single capital letters. The order of these steps matters.
    desc_part = re.sub(r'\.{2,}\s+\d.*$', '', desc_part).strip()
    desc_part = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', desc_part)
    desc_part = re.sub(r'\s+(?:-+|\d+)+\s*$', '', desc_part)
    desc = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", desc_part).strip()
    desc = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$', "", desc).strip()
    desc = re.sub(r'\.{2,}$', "", desc).strip()
    desc = re.sub(r'(?:\s+[A-Z])+$', "", desc).strip()
    if not re.search(r'[A-Za-z]', desc):
        # Nothing alphabetic left: treat the description as empty.
        desc = ""

    # Remarks: normalise the code list found after the separator.
    if cat_part.upper().startswith("GK") and len(cat_part) > 8:
        # Skip the 8-character GK token, keep the next whitespace-free chunk.
        cat_clean = cat_part[8:].split()[0]
    else:
        m_codes = re.match(r'[-\s]*([0-9A-Z,\s]+)', cat_part)
        raw_codes = m_codes.group(1) if m_codes else ""
        cat_clean = raw_codes.replace(" ", "")
        # Re-insert commas lost between adjacent codes.
        cat_clean = re.sub(r'([A-Z])(?=\d)', r'\1,', cat_clean)
        cat_clean = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', cat_clean)
    cat_clean = re.sub(r'\d{4}$', '', cat_clean)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    codes = dict.fromkeys(t for t in cat_clean.split(',') if t)
    remarks = ",".join(codes)

    # A part number ending in 3+ capital letters has swallowed the start of
    # the description: keep two letters, hand the rest back to the text.
    m3 = re.match(r'^(.+?)([A-Z]{3,})$', raw_part_no)
    if m3:
        core, suf = m3.group(1), m3.group(2)
        part_no = core + suf[:2]
        desc = f"{suf[2:]} {desc}".strip()
    else:
        part_no = raw_part_no

    return part_no, desc, remarks


def extract_all_sections_one_pass(pdf_path: str, output_csv: str) -> pd.DataFrame:
    """
    Parse every section of a parts-catalogue PDF in one sweep and save a CSV.

    Each page is read twice: as plain text, to spot section header lines
    (a line starting like "F-50"), and in layout mode, to collect the table
    lines that follow the 'Reqd. QTY' column header. A new header on a page
    closes the section collected so far. Processing ends at the first layout
    line mentioning 'PART NO INDEX'. Any "...GROUP" word is removed from the
    section title.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.
        output_csv: Destination path of the CSV (written without the index).

    Returns:
        DataFrame with the columns section_no, section_name, ref_no, part_no,
        description and remarks -- the same data that is written to the CSV.
    """
    section_start = re.compile(r'^[A-Z]+-\d+', re.IGNORECASE)
    qty_header = re.compile(r'\bReqd\.?\s*QTY\b', re.IGNORECASE)
    part_number = re.compile(r'\b[0-9]{5,}(?:-[A-Z0-9-]+)+\b')
    index_marker = re.compile(r'.*PART\s*NO\.?\s*INDEX.*', re.IGNORECASE)
    leading_ref = re.compile(r'^\s*(?:\((\d+)\)|(\d+))\s+')
    group_word = re.compile(r'\b[A-Z]+GROUP\b\s*', re.IGNORECASE)

    column_names = ('section_no', 'section_name', 'ref_no',
                    'part_no', 'description', 'remarks')
    rows = []  # one tuple per part, in column_names order

    def clean_description(text):
        """Strip trailing leaders, GK codes, counts, dates and lone capitals."""
        text = re.sub(r'\.{2,}\s+\d.*$', '', text).strip()
        text = re.sub(r'\s+GK[A-Za-z0-9]+\s*$', '', text)
        text = re.sub(r'\s+(?:-+|\d+)+\s*$', '', text)
        text = re.sub(r'\s+\d+\s+\d{4}\.\d{2}\.\d{2}.*$', "", text).strip()
        text = re.sub(r'(?:\s+(?:\(\d+\)|-+|\d+))+$', "", text).strip()
        text = re.sub(r'\.{2,}$', "", text).strip()
        text = re.sub(r'(?:\s+[A-Z])+$', "", text).strip()
        # A description without any letter is treated as empty.
        return text if re.search(r'[A-Za-z]', text) else ""

    def clean_remarks(text):
        """Normalise the code list into unique, comma-separated tokens."""
        if text.upper().startswith("GK") and len(text) > 8:
            codes = text[8:].split()[0]
        else:
            found = re.match(r'[-\s]*([0-9A-Z,\s]+)', text)
            codes = (found.group(1) if found else "").replace(" ", "")
            codes = re.sub(r'([A-Z])(?=\d)', r'\1,', codes)
            codes = re.sub(r'(?<=[0-9A-Z]{2})(?=[A-Z]{2}(?:,|$))', ',', codes)
        codes = re.sub(r'\d{4}$', '', codes)
        # dict.fromkeys keeps the first occurrence of each code, in order.
        return ",".join(dict.fromkeys(tok for tok in codes.split(',') if tok))

    def emit(sec_code, sec_title, lines):
        """Parse the collected lines of one section and append them to rows."""
        entries = []  # (ref_no, raw_part_no, text_pieces)
        ref = ""
        for line in lines:
            hit = part_number.search(line)
            if hit is None:
                # Continuation text: attach it to the current part, skipping
                # bare numbers and bare dates.
                stripped = line.strip()
                noise = (re.fullmatch(r'\d+', stripped)
                         or re.fullmatch(r'\d{4}\.\d{2}\.\d{2}', stripped))
                if entries and not noise:
                    entries[-1][2].append(stripped)
                continue
            lead = leading_ref.match(line)
            if lead is not None:
                ref = lead.group(1) or lead.group(2)
            entries.append((ref, hit.group(0), [line[hit.end():].strip()]))

        for entry_ref, raw_pno, pieces in entries:
            text = " ".join(pieces)
            for junk in ('∙', '•', '\uf020'):
                text = text.replace(junk, '')
            text = re.sub(r'\s+', ' ', text).strip()

            # Left of the dash run: description; right of it: codes.
            before, _, after = text.partition("--------")
            desc = clean_description(before.strip())
            remarks = clean_remarks(after.strip())

            # Part numbers ending in 3+ capitals carry the start of the
            # description; two letters stay, the rest moves to the text.
            glued = re.match(r'^(.+?)([A-Z]{3,})$', raw_pno)
            if glued is None:
                pno = raw_pno
            else:
                core, suffix = glued.groups()
                pno = core + suffix[:2]
                desc = f"{suffix[2:]} {desc}".strip()

            rows.append((sec_code, sec_title, entry_ref, pno, desc, remarks))

    # State of the section currently being collected.
    active = False
    sec_code = ""
    sec_title = ""
    header_seen = False
    pending = []
    finished = False

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            plain_lines = (page.extract_text() or "").splitlines()
            layout_lines = (page.extract_text(layout=True) or "").splitlines()

            # Section headers are recognised on the plain text.
            for line in plain_lines:
                if not section_start.match(line.strip().upper()):
                    continue
                if active:
                    emit(sec_code, sec_title, pending)
                pieces = line.strip().split(None, 1)
                raw_title = pieces[1].strip() if len(pieces) > 1 else ""
                sec_code = pieces[0].upper()
                sec_title = group_word.sub('', raw_title)
                header_seen = False
                pending = []
                active = True

            if not active:
                continue

            # Table lines are taken from the layout-preserving text.
            for line in layout_lines:
                upper = line.strip().upper()
                if index_marker.match(upper):
                    emit(sec_code, sec_title, pending)
                    finished = True
                    break
                if not header_seen:
                    if qty_header.search(upper):
                        header_seen = True
                    continue
                if section_start.match(upper) and not upper.startswith(sec_code):
                    emit(sec_code, sec_title, pending)
                    active = False
                    break
                pending.append(line)

            if finished:
                break

    if active and not finished:
        emit(sec_code, sec_title, pending)

    final_df = pd.DataFrame({
        name: [row[pos] for row in rows]
        for pos, name in enumerate(column_names)
    })
    final_df.to_csv(output_csv, index=False)
    return final_df


# Script entry point: runs only when this file/cell is executed directly.
if __name__ == "__main__":
    # Catalogue PDF to parse; a relative path, so it is resolved against the
    # current working directory.
    pdf_file = "CRF1000 A_PC_13MJPG02_(G.H).pdf"
    # Parse all sections; the rows are written to "all_sections2.csv" and
    # also kept in `df` for further inspection.
    df = extract_all_sections_one_pass(pdf_file, "all_sections2.csv")


Written 1962 rows to all_sections.csv
