In [8]:
import os
import re
from PyPDF2 import PdfReader

RAW_DIR = "raw"
PARSED_DIR = "parsed"

# Case-insensitive, tolerant to letter-spacing and punctuation in PDFs
INTRO_RE = r"I\W*N\W*T\W*R\W*O\W*D\W*U\W*C\W*T\W*I\W*O\W*N"
SUMMARY_RE = r"S\W*U\W*M\W*M\W*A\W*R\W*Y"
HEADING_RE = re.compile(
    rf"\b(?:{SUMMARY_RE}|{INTRO_RE})\b\s*[:\-–—.]?\s*",
    re.IGNORECASE,
)

def read_pdf_text(path: str) -> str:
    reader = PdfReader(path)
    parts = []
    for page in reader.pages:
        t = page.extract_text() or ""
        parts.append(t)
    text = "\n".join(parts)
    # soften common PDF artifacts to improve matching (doesn't affect \W*-tolerant regex)
    text = text.replace("-\n", "")  # join hyphenated line breaks
    return text

def extract_after_first_heading(text: str) -> tuple[str | None, str | None]:
    """
    Returns (matched_heading, text_after) or (None, None) if nothing matched.
    matched_heading is 'summary' or 'introduction' (lowercase) based on which fired first.
    """
    m = HEADING_RE.search(text)
    if not m:
        return None, None
    matched = m.group(0).lower()
    which = "summary" if "s" in matched and "y" in matched else "introduction"
    return which, text[m.end():].strip()

def process_all_pdfs(raw_dir=RAW_DIR, parsed_dir=PARSED_DIR):
    os.makedirs(parsed_dir, exist_ok=True)
    pdfs = [f for f in os.listdir(raw_dir) if f.lower().endswith(".pdf")]
    if not pdfs:
        print(f"No PDFs in {raw_dir}")
        return

    for fname in pdfs:
        in_path = os.path.join(raw_dir, fname)
        out_path = os.path.join(parsed_dir, os.path.splitext(fname)[0] + ".txt")
        try:
            print(f"Processing: {fname}")
            text = read_pdf_text(in_path)
            if not text.strip():
                print("  Skipped: no extractable text (likely scanned).")
                continue

            which, after = extract_after_first_heading(text)
            if not after:
                print("  Skipped: no 'summary' or 'introduction' heading found.")
                continue

            with open(out_path, "w", encoding="utf-8") as f:
                f.write(after)
            print(f"  Saved -> {out_path} (started at: {which})")
        except Exception as e:
            print(f"  Error on {fname}: {e}")

if __name__ == "__main__":
    process_all_pdfs()





Processing: Gardner and another v. Geldenhuys.pdf
  Saved -> parsed/Gardner and another v. Geldenhuys.txt (started at: introduction)
Processing: Behm v. 6-4-1 Holdings and others.pdf
  Saved -> parsed/Behm v. 6-4-1 Holdings and others.txt (started at: introduction)
Processing: MacDonald v. Najafi and another (No. 2).pdf
  Saved -> parsed/MacDonald v. Najafi and another (No. 2).txt (started at: introduction)
Processing: Harrison v. Nixon Safety Consulting and others (No. 3).pdf
  Saved -> parsed/Harrison v. Nixon Safety Consulting and others (No. 3).txt (started at: introduction)
Processing: MacGarvie v. Friedmann (No. 4).pdf
  Saved -> parsed/MacGarvie v. Friedmann (No. 4).txt (started at: introduction)
Processing: Koblensky v Westwood.pdf
  Saved -> parsed/Koblensky v Westwood.txt (started at: introduction)
Processing: Tyler v. Robnik and Mobility World (No. 2).pdf
  Saved -> parsed/Tyler v. Robnik and Mobility World (No. 2).txt (started at: introduction)
Processing: Kwan v. Marzara a