In [21]:
from pathlib import Path
import re, json, pdfplumber

def find_repo_root(start=None):
    """Locate the repository root by walking upward until a ``.git`` entry is found.

    Args:
        start: Directory to begin the search from (``str`` or ``Path``).
            Defaults to the working directory at *call* time.

    Returns:
        The first directory among ``start`` and its ancestors that contains
        a ``.git`` entry, or ``Path.cwd()`` when none of them does.
    """
    # Resolve the default here rather than in the signature: a ``Path.cwd()``
    # default is evaluated once, when the ``def`` runs, and goes stale if the
    # working directory changes afterwards.
    # ``resolve()`` makes relative starts absolute so ``.parents`` is complete.
    origin = Path.cwd() if start is None else Path(start).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / ".git").exists():
            return candidate
    return Path.cwd()

# Project directories, all anchored at the detected repository root.
ROOT = find_repo_root()
IN_DIR = ROOT.joinpath("src", "docs")   # adjust if you moved PDFs
OUT_DIR = ROOT.joinpath("output")
# Make sure the output directory exists before any parser writes into it.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# AFSC code shapes, compiled once; the trailing [A-Z]? admits an optional suffix letter.
AFSC_PATTERNS = list(map(re.compile, (
    r"\b\d{1}[A-Z]\d[A-Z]\d[A-Z]?\b",    # enlisted e.g., 1N4X1, 1N4X1A
    r"\b\d{2}[A-Z]\d[A-Z]?\b",           # officer e.g., 11F3, 11F3A
)))

def guess_section(lines):
    """Pick a section label from the first 12 entries of ``lines``.

    A line matching any ``AFSC_PATTERNS`` entry takes precedence; failing
    that, the first all-uppercase line longer than four characters (after
    stripping) is used. Returns the stripped line, or ``None`` if neither
    kind is found.
    """
    head = lines[:12]

    # Pass 1: any line that contains an AFSC-looking code.
    afsc_line = next(
        (ln for ln in head if any(p.search(ln) for p in AFSC_PATTERNS)),
        None,
    )
    if afsc_line is not None:
        return afsc_line.strip()

    # Pass 2: fall back to an ALL-CAPS heading.
    for candidate in (ln.strip() for ln in head):
        if len(candidate) > 4 and candidate.isupper():
            return candidate
    return None

def clean_text(t: str) -> str:
    """Normalise extracted page text.

    Removes soft hyphens, re-joins words hyphenated across a line break,
    collapses runs of spaces/tabs to one space, squeezes two or more
    newlines down to exactly two, and strips the ends.
    """
    cleaned = t.replace("\u00ad", "")             # soft hyphen
    # Regex passes applied in order: (pattern, replacement).
    passes = (
        (r"(\w)-\n(\w)", r"\1\2"),   # join hyphenated linebreaks
        (r"[ \t]+", " "),            # collapse horizontal whitespace
        (r"\n{2,}", "\n\n"),         # cap blank-line runs
    )
    for pattern, replacement in passes:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def parse_pdf_to_jsonl(pdf_path, doc_id, doc_title, source_type, source_url="", version_date="", output_path=None):
    """Write one JSON record per PDF page to a JSONL file, plus a small manifest.

    Every record carries the provenance fields passed in, the 1-based page
    number, a guessed section heading and the cleaned page text
    (``span_start``/``span_end`` cover the whole cleaned text).

    Text extraction is best effort: if a page raises, a warning is printed,
    the page number is recorded, and an empty-text record is still written so
    page numbering stays contiguous.

    Args:
        pdf_path: PDF to read (handed to ``pdfplumber.open``).
        doc_id: Identifier stored in each record; also names the default
            output file and the manifest.
        doc_title: Copied into each record.
        source_type: Copied into each record.
        source_url: Copied into each record.
        version_date: Copied into each record.
        output_path: Destination JSONL; defaults to ``OUT_DIR / f"{doc_id}.jsonl"``.

    Returns:
        ``Path`` of the JSONL file that was written.
    """
    out_path = Path(output_path) if output_path else (OUT_DIR / f"{doc_id}.jsonl")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    wrote, empty = 0, 0
    failed_pages = []  # page numbers whose extraction raised

    with pdfplumber.open(pdf_path) as pdf, open(out_path, "w", encoding="utf-8") as out:
        for i, page in enumerate(pdf.pages, start=1):
            try:
                txt = page.extract_text() or ""
            except Exception as exc:
                # Keep going, but make the failure visible instead of silently
                # turning it into an empty page.
                failed_pages.append(i)
                print(f"[{doc_id}] WARNING: text extraction failed on page {i}: {exc!r}")
                txt = ""
            txt = clean_text(txt)
            if not txt:
                empty += 1
            lines = [ln for ln in txt.splitlines() if ln.strip()]
            section = guess_section(lines)
            rec = {
                "doc_id": doc_id, "doc_title": doc_title,
                "source_type": source_type, "source_url": source_url,
                "version_date": version_date, "page": i,
                "section": section, "span_start": 0, "span_end": len(txt),
                "text": txt,
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            wrote += 1

    # tiny manifest for sanity (always written to OUT_DIR, even for a custom output_path)
    manifest = {
        "doc_id": doc_id,
        "pages": wrote,
        "empty_pages": empty,
        "output": str(out_path),
        "failed_pages": failed_pages,
    }
    (OUT_DIR / f"{doc_id}__manifest.json").write_text(
        json.dumps(manifest, indent=2),
        encoding="utf-8"
    )
    print(f"[{doc_id}] pages={wrote}, empty_pages={empty} → {out_path}")
    return out_path


In [22]:
# PDF → JSONL with page text + provenance
import json
import re
from pathlib import Path
import pdfplumber

# AFSC code shapes used to spot section headers. The optional trailing letter
# admits suffixed codes (e.g. 1N4X1A); without it such codes never match,
# because \b cannot sit between the final digit and the suffix letter.
AFSC_PATTERNS = [
    re.compile(r"\b\d[A-Z]\d[A-Z]\d[A-Z]?\b"),   # e.g., 1N4X1, 1N4X1A
    re.compile(r"\b\d{2}[A-Z]\d[A-Z]?\b"),       # e.g., 11F3, 11F3A
]

def guess_section(lines, max_lines=12):
    """Guess a page's section heading from its leading lines.

    Only the first ``max_lines`` entries of ``lines`` are examined. A line
    matching any ``AFSC_PATTERNS`` entry wins, even if an ALL-CAPS line
    appears before it; otherwise the first ALL-CAPS line longer than four
    characters (after stripping) is used.

    Args:
        lines: Sequence of text lines for one page.
        max_lines: How many leading lines to scan (default 12). Values
            below zero are treated as zero.

    Returns:
        The stripped heading line, or ``None`` when nothing qualifies.
    """
    # Clamp so a negative value cannot turn into a "drop the last N" slice.
    head = lines[:max(max_lines, 0)]
    for ln in head:
        if any(p.search(ln) for p in AFSC_PATTERNS):
            return ln.strip()
    for ln in head:
        t = ln.strip()
        if len(t) > 4 and t.isupper():
            return t
    return None

def clean_text(t: str) -> str:
    """Normalise text extracted from a PDF page.

    Removes soft hyphens, re-joins words that were hyphenated across a line
    break, collapses runs of spaces/tabs to a single space, reduces two or
    more consecutive newlines to exactly two, and strips the ends.

    Args:
        t: Raw page text.

    Returns:
        The cleaned text.
    """
    t = t.replace("\u00ad", "")                  # soft hyphen
    # Join "classi-\nfication" -> "classification". Requires a word character
    # directly on both sides, so " -\n" (spaced dash) is left alone.
    t = re.sub(r"(\w)-\n(\w)", r"\1\2", t)
    t = re.sub(r"[ \t]+", " ", t)
    t = re.sub(r"\n{2,}", "\n\n", t)
    return t.strip()

def parse_pdf_to_jsonl(pdf_path, doc_id, doc_title, source_type, source_url="", version_date="", output_path=None):
    """
    Parse a PDF into a JSONL file: one record per page with text and provenance.

    Each record holds the provenance fields passed in, the 1-based page
    number, a guessed section heading and the cleaned page text
    (``span_start``/``span_end`` cover the whole cleaned text).

    Text extraction is best effort: if a page raises, a warning is printed and
    an empty-text record is written for it, so one bad page neither aborts the
    run nor shifts page numbering.

    Args:
        pdf_path: PDF to read (handed to ``pdfplumber.open``).
        doc_id: Identifier stored in each record; also names the default output file.
        doc_title: Copied into each record.
        source_type: Copied into each record.
        source_url: Copied into each record.
        version_date: Copied into each record.
        output_path: Destination JSONL; defaults to ``../../output/{doc_id}.jsonl``.

    Returns:
        ``Path`` of the JSONL file that was written.
    """
    if output_path is None:
        # NOTE: a relative path, so it is resolved against the current working directory.
        output_path = f"../../output/{doc_id}.jsonl"

    out_path = Path(output_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    wrote = 0
    with pdfplumber.open(pdf_path) as pdf, open(out_path, "w", encoding="utf-8") as out:
        for i, page in enumerate(pdf.pages, start=1):
            try:
                txt = page.extract_text() or ""
            except Exception as exc:
                # Report and continue: the page still gets a (blank) record.
                print(f"[{doc_id}] WARNING: text extraction failed on page {i}: {exc!r}")
                txt = ""
            txt = clean_text(txt)
            lines = [ln for ln in txt.splitlines() if ln.strip()]
            section = guess_section(lines)

            rec = {
                "doc_id": doc_id,
                "doc_title": doc_title,
                "source_type": source_type,
                "source_url": source_url,
                "version_date": version_date,
                "page": i,
                "section": section,
                "span_start": 0,
                "span_end": len(txt),
                "text": txt,
            }
            out.write(json.dumps(rec, ensure_ascii=False) + "\n")
            wrote += 1

    print(f"Wrote {wrote} page records → {out_path}")
    return out_path

# Confirm this cell ran to completion (the parser itself is not exercised here)
print("Parser functions loaded successfully!")

Parser functions loaded successfully!


In [23]:
# Officer directory (AFOCD): path, doc_id, title and source type are passed
# positionally in the order parse_pdf_to_jsonl declares them.
afocd_output = parse_pdf_to_jsonl(
    "../../src/docs/AFOCD_2024.pdf",
    "AFOCD_2024",
    "Air Force Officer Classification Directory 2024",
    "AFOCD",
    version_date="2024-04-01",
)

# Enlisted directory (AFECD), same argument layout.
afecd_output = parse_pdf_to_jsonl(
    "../../src/docs/AFECD_2024.pdf",
    "AFECD_2024",
    "Air Force Enlisted Classification Directory 2024",
    "AFECD",
    version_date="2024-04-01",
)

Wrote 305 page records → ..\..\output\AFOCD_2024.jsonl


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray non-stroke color because /'P5' is an invalid float value
Cannot set gray non-stroke color because /'P6' is an invalid float value
Cannot set gray non-stroke color because /'P7' is an invalid float value


Wrote 410 page records → ..\..\output\AFECD_2024.jsonl


In [24]:
# Sanity-check the AFOCD output: count its lines and show the first record
import json

# Pull every JSONL line into memory (one line per page record)
with open("../../output/AFOCD_2024.jsonl", "r", encoding="utf-8") as f:
    afocd_lines = list(f)

print(f"AFOCD total lines: {len(afocd_lines)}")
print("\nFirst record sample:")
first_record = json.loads(afocd_lines[0])
for key, value in first_record.items():
    # Only the page text is truncated, and only when it exceeds 100 characters
    if key == "text" and len(str(value)) > 100:
        print(f"{key}: {value[:100]}...")
    else:
        print(f"{key}: {value}")

AFOCD total lines: 305

First record sample:
doc_id: AFOCD_2024
doc_title: Air Force Officer Classification Directory 2024
source_type: AFOCD
source_url: 
version_date: 2024-04-01
page: 1
section: DEPARTMENT OF THE AIR FORCE
span_start: 0
span_end: 265
text: DAFOCD, 31 Oct 24
31 October 2024
DEPARTMENT OF THE AIR FORCE
OFFICER CLASSIFICATION DIRECTORY
(DAFO...


In [25]:
# Show which section heading was detected on each of the first 10 pages
# (slicing naturally handles files with fewer than 10 records)
sections_found = []
for raw_line in afocd_lines[:10]:
    record = json.loads(raw_line)
    sections_found.append({field: record[field] for field in ('page', 'section')})

print("Section detection sample:")
for item in sections_found:
    print(f"Page {item['page']}: {item['section']}")

Section detection sample:
Page 1: DEPARTMENT OF THE AIR FORCE
Page 2: SUMMARY OF REVISIONS
Page 3: DAFOCD
Page 4: SECTION I
Page 5: SECTION I-A
Page 6: SECTION I-B
Page 7: PREFIX B
Page 8: PREFIX C
Page 9: PREFIX D
Page 10: PREFIX E


In [26]:
# Report the size, in MB, of every regular file in the output directory
import os
print("Files in output directory:")
for entry_name in os.listdir("../../output/"):
    entry_path = os.path.join("../../output/", entry_name)
    if not os.path.isfile(entry_path):
        continue  # skip anything that is not a regular file
    size_mb = os.path.getsize(entry_path) / (1024*1024)
    print(f"{entry_name}: {size_mb:.2f} MB")

Files in output directory:
AFECD_2024.jsonl: 1.48 MB
AFOCD_2024.jsonl: 0.92 MB
