In [3]:
import re, pandas as pd, pathlib

# 🔧―― 1.  Point to your two source CSVs
base   = pathlib.Path(r"C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\Phase 3")
files  = [base / "DAFOCD-31-Oct-24.csv",   # officer
          base / "DAFECD-31-Oct-24.csv"]   # enlisted

def parse_afsc_csv(path: pathlib.Path) -> pd.DataFrame:
    """
    Read a PDF‑to‑CSV dump (one line per row) and return a tidy dataframe
    with columns [AFSC_Code, description].
    """
    # Read as single text column (no header)
    raw = pd.read_csv(path, header=None, dtype=str, encoding="utf‑8", na_filter=False)[0].str.strip()

    # Pattern that marks the *start* of a new block
    start_pat = re.compile(r"^(?:AFSC\s|CEM\sCode\s)", re.I)

    blocks, cur_code, buf = {}, None, []
    for line in raw:
        if start_pat.match(line):
            if cur_code and buf:
                blocks[cur_code] = "\n".join(buf).strip()
            cur_code, buf = line, []           # new block
        else:
            buf.append(line)
    # flush last block
    if cur_code and buf:
        blocks[cur_code] = "\n".join(buf).strip()

    df = pd.DataFrame({
        "AFSC_Code": list(blocks.keys()),
        "description": list(blocks.values())
    })
    return df

# 🔧―― 2.  Parse & merge
all_df = pd.concat([parse_afsc_csv(p) for p in files], ignore_index=True)
all_df = all_df[all_df["description"].str.len() > 0]      # drop empties

print(f"✅ Parsed {len(all_df):,} AFSC entries")
display(all_df.head(3))

# 🔧―― 3.  Save cleaned dataset next to originals
out_path = base / "afsc_clean.csv"
all_df.to_csv(out_path, index=False, encoding="utf‑8")
print("📄 Saved cleaned file →", out_path)


✅ Parsed 305 AFSC entries


Unnamed: 0,AFSC_Code,description
0,AFSC 10C0,OPERATIONS COMMANDER\n(Changed 31 Oct 08)\n1. ...
1,"AFSC 10C0, completion of a current T5 Investig...","32\nDAFOCD, 31 Oct 24\nPILOT UTILIZATION FIELD..."
2,"AFSC 11B1*, Entry/Student",BOMBER PILOT\n(Changed 30 Apr 23)\n1. Specialt...


📄 Saved cleaned file → C:\Users\Kyle\Desktop\Grad School\IS Demo\Phase 2 Rebuild\Phase 3\afsc_clean.csv
