In [None]:
from pathlib import Path

p = Path(r"C:/Users/sokade/Downloads/sloppy_ml/labels_template.csv")

# Read the file (a leading BOM is tolerated), trim every line and discard the blank ones.
raw_text = p.read_text(encoding="utf-8-sig")
lines = [trimmed for trimmed in (row.strip() for row in raw_text.splitlines()) if trimmed]

# The first non-blank line (the old header) is dropped and replaced by a fixed header.
out = ["report_id,label_codes"]
for s in lines[1:]:
    # A row that is wrapped in one pair of double quotes loses that outer pair.
    fully_quoted = s.startswith('"') and s.endswith('"')
    if fully_quoted:
        s = s[1:-1]
    out.append(s)

# Overwrite the same file in place (written without a BOM, newline-terminated).
p.write_text("\n".join(out) + "\n", encoding="utf-8")
print("Rewrote labels_template.csv with 2 columns.")

Rewrote labels_template.csv with 2 columns.


In [11]:
import pandas as pd
# Load the labels CSV and print its column names followed by the first five rows.
labels_csv = r"C:/Users/sokade/Downloads/sloppy_ml/labels_template.csv"
df = pd.read_csv(labels_csv, encoding="utf-8-sig")
print("COLUMNS:", df.columns.tolist())
preview = df.head(5)
print(preview.to_string(index=False))

COLUMNS: ['report_id', 'label_codes']
 report_id                             label_codes
   4083505 ATTRIBUTES_PARTIAL;MATERIALS_NOT_BOOKED
   4083506 ATTRIBUTES_PARTIAL;MATERIALS_NOT_BOOKED
   4083507 ATTRIBUTES_PARTIAL;MATERIALS_NOT_BOOKED
   4083508 ATTRIBUTES_PARTIAL;MATERIALS_NOT_BOOKED
   4083509 ATTRIBUTES_PARTIAL;MATERIALS_NOT_BOOKED


In [1]:
from pathlib import Path
import re, json
import pandas as pd


In [2]:

# Folder that holds the PDF reports; every kind of output gets its own subfolder inside it.
BASE_DIR = Path("C:/Users/sokade/Downloads/sloppy_reports")
OUT_FINDINGS = BASE_DIR.joinpath("findings")
OUT_QUOTES = BASE_DIR.joinpath("quote_stub")
OUT_STRUCT = BASE_DIR.joinpath("structured")

# Make sure the three output folders exist (no error when they are already there).
# BASE_DIR itself is not created here, so it has to exist beforehand.
for d in [OUT_FINDINGS, OUT_QUOTES, OUT_STRUCT]:
    d.mkdir(exist_ok=True)

# --- PDF text extractor ---
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract the text of a PDF, trying PyPDF2 first and pdfminer as a fallback.

    Args:
        pdf_path: Location of the PDF file.

    Returns:
        The page texts joined with newlines when PyPDF2 yields any
        non-whitespace text; otherwise whatever pdfminer's ``extract_text``
        returns for the file (an empty string if it returns nothing).

    Raises:
        ImportError: If the pdfminer fallback is needed but not installed.
        Exception: Any error raised by pdfminer itself is propagated.

    Warns:
        RuntimeWarning: When PyPDF2 is installed but fails on this file. The
            failure used to be swallowed silently; the fallback still runs.
    """
    import warnings

    text = ""
    try:
        from PyPDF2 import PdfReader
        # A page without a text layer may give None, hence the `or ""`.
        text = "\n".join((page.extract_text() or "") for page in PdfReader(str(pdf_path)).pages)
    except ImportError:
        # PyPDF2 is optional: quietly rely on the pdfminer fallback below.
        pass
    except Exception as exc:
        # Best effort is kept (no crash), but the failure is now visible.
        warnings.warn(
            f"PyPDF2 could not read {pdf_path}: {exc!r}; falling back to pdfminer",
            RuntimeWarning,
            stacklevel=2,
        )
        text = ""
    if not text.strip():
        # Nothing usable from PyPDF2 (error, missing package or whitespace only).
        from pdfminer.high_level import extract_text
        text = extract_text(str(pdf_path)) or ""
    return text

# --- Helpers
def find(p, text, flags=re.I|re.S):
    """Return the stripped text of capture group 1 for the first match of ``p``.

    Args:
        p: Regular expression containing at least one capture group.
        text: String to search.
        flags: ``re`` flags; case-insensitive and dot-matches-newline by default.

    Returns:
        Group 1 of the first match with surrounding whitespace removed, or
        ``None`` when the pattern does not match or when group 1 is optional
        and took no part in the match.
    """
    m = re.search(p, text, flags)
    if m is None:
        return None
    captured = m.group(1)
    # An optional group that did not participate yields None; calling
    # .strip() on it used to raise AttributeError.
    return captured.strip() if captured is not None else None

def has(p, text, flags=re.I | re.S):
    """Tell whether pattern ``p`` occurs anywhere in ``text``.

    The default flags make the search case-insensitive and let ``.`` match
    newlines. Returns ``True`` on a match, ``False`` otherwise.
    """
    if re.search(p, text, flags):
        return True
    return False



In [None]:

# EDIT THIS BLOCK
# --- Process each PDF ---
# Column layouts are pinned so each CSV keeps its header row even when a
# report produces no findings / no quote lines.
FINDING_COLUMNS = ["category", "issue", "action_request"]
QUOTE_COLUMNS = ["repair_advice", "material_code", "specification", "quantity", "hours_estimate"]

# sorted() makes the processing order (and the progress log) deterministic.
for pdf in sorted(BASE_DIR.glob("*.pdf")):
    print(f"Processing {pdf.name} ...")
    text = extract_text_from_pdf(pdf)

    # --- Light extraction: report number and HH:MM time fields (None when absent)
    report_id = find(r"(?:Service\s*report|Servicerapport)\s*#?\s*(\d+)", text)
    arrival   = find(r"(?:Time of Arrival|Arrival)[^\d]*([0-2]?\d:[0-5]\d)", text)
    departure = find(r"(?:Time of Departure|Departure)[^\d]*([0-2]?\d:[0-5]\d)", text)
    total     = find(r"(?:Total time spent working|Working hours)[^\d]*([0-2]?\d:[0-5]\d)", text)

    # The attributes section counts as "filled" when it contains a digit or
    # one of the unit tokens below.
    # NOTE(review): `\b°C\b` only matches when "°" directly follows a word
    # character (e.g. "25°C"); such text already matches `\d` - confirm intent.
    attributes_block = find(r"Attributes\s*:?\s*(.*?)(?:Executed maintenance|Comments|Signature|$)", text)
    attributes_filled = bool(
        attributes_block and re.search(r"\d|\bV\b|\bA\b|\bL\b|\bbar\b|\b°C\b", attributes_block)
    )

    comments = find(r"(?:Comments|Notes)\s*:?\s*(.*?)(?:Signature|Executed maintenance|Situation on arrival|$)", text) or ""

    # Problem cues. has() searches with re.S by default, so `.*` can span the
    # whole document: the two halves of a cue need not be on the same line.
    fuel_polisher_leak = has(r"fuel\s+polisher\s+pump.*leak", text)
    fuel_level_indicator_issue = has(r"fuel\s+level\s+indicator.*(not|fault|replace)", text)
    repair_advice_present = has(r"(repair|replacement)\s+(advice|advies)", text)
    run_log_line = find(r"(Record data on run log[^\n]*)", text)
    run_log_incomplete = bool(run_log_line and re.search(r"(not|n/?a|ordered|missing|later|resched)", run_log_line, re.I))

    # Findings: one record per detected shortcoming
    findings = []
    if not total or total in ("0:00","00:00"):
        findings.append(dict(category="Admin", issue="Working hours missing/zero",
                             action_request="Enter arrival, departure, and total working time."))

    if not attributes_filled:
        findings.append(dict(category="Attributes", issue="Attributes not filled",
                             action_request="Fill power/battery/capacity/spec fields."))

    # A leak is reported only when the comments mention no remedy keyword.
    if fuel_polisher_leak and not re.search(r"(action|remedy|repaired|replaced|vervangen)", comments, re.I):
        findings.append(dict(category="Fuel System", issue="Action missing for fuel polisher pump leak",
                             action_request="Add remedy (repair/replace), parts, and hours estimate."))

    if fuel_level_indicator_issue:
        findings.append(dict(category="Fuel System", issue="Fuel level indicator decision missing",
                             action_request="Record customer decision (do not use / do not replace / replace)."))

    if run_log_incomplete:
        findings.append(dict(category="Electrical/Logging", issue="Run log not completed",
                             action_request="Attach metering run log or reschedule with tooling."))

    # Quote stub: placeholder lines to be completed by hand
    quote_stub = []
    if fuel_polisher_leak or repair_advice_present:
        quote_stub.append(dict(repair_advice="Fuel polisher pump repair/replacement",
                               material_code="", specification="", quantity="", hours_estimate=""))
    if fuel_level_indicator_issue:
        # Fixed typo in the emitted text (was "Fuel l3evel indicator replacement").
        quote_stub.append(dict(repair_advice="Fuel level indicator replacement",
                               material_code="", specification="", quantity="", hours_estimate=""))

    # Save outputs. Fallback id: the third "_"-separated field of the file
    # name; when the name has fewer fields, use the whole stem instead of
    # failing with IndexError.
    rid = report_id
    if not rid:
        stem_parts = pdf.stem.split("_")
        rid = stem_parts[2] if len(stem_parts) > 2 else pdf.stem
    pd.DataFrame(findings, columns=FINDING_COLUMNS).to_csv(OUT_FINDINGS / f"findings_{rid}.csv", index=False)
    pd.DataFrame(quote_stub, columns=QUOTE_COLUMNS).to_csv(OUT_QUOTES / f"quote_stub_{rid}.csv", index=False)

    structured = dict(
        report_id=rid,
        arrival=arrival, departure=departure, total_time_spent=total,
        attributes_filled=attributes_filled,
        flags=dict(
            fuel_polisher_pump_leak=bool(fuel_polisher_leak),
            fuel_level_indicator_issue=bool(fuel_level_indicator_issue),
            repair_advice_present=bool(repair_advice_present),
            run_log_incomplete=bool(run_log_incomplete),
        ),
        # Excerpts are capped at 400 characters each.
        excerpts=dict(comments=comments[:400], attributes=(attributes_block or "")[:400])
    )
    with open(OUT_STRUCT / f"structured_{rid}.json", "w", encoding="utf-8") as f:
        json.dump(structured, f, indent=2, ensure_ascii=False)


Processing 10038876_20250728_4083512_Microsoft AMS08 COLO2-CELLD SET_72H_SM04_1426.pdf ...


In [4]:

# Completion banner: printed unconditionally; this cell does not itself check
# that every PDF was processed.
print("DONE → processed all PDFs in sloppy_reports/")


DONE → processed all PDFs in sloppy_reports/
