# 00_config_and_checks

In [1]:
from pathlib import Path
import os, json, platform

# Absolute, symlink-resolved working directory; the repo-root search starts here.
HERE = Path(os.getcwd()).resolve()

def find_repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    A directory is treated as the root when it contains both an ``input``
    and a ``notebooks`` subdirectory. ``start`` itself is checked first,
    then each of its parents from nearest to farthest.

    Args:
        start: Directory to begin the search from.

    Returns:
        The nearest matching directory, or ``start`` unchanged when neither
        it nor any of its parents matches.
    """
    markers = ("input", "notebooks")
    for candidate in (start, *start.parents):
        # is_dir() rather than exists(): a stray regular file that happens to
        # be named "input" or "notebooks" must not mark a directory as the root.
        if all((candidate / marker).is_dir() for marker in markers):
            return candidate
    # No match anywhere up the tree: fall back to the starting directory.
    return start

# Every path below is derived from the detected root.
ROOT = find_repo_root(HERE)

# Switch the working directory to the root so relative paths behave.
os.chdir(ROOT)

# Directory layout.
INPUT_DIR = ROOT / "input"
DATA_DIR = ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
MODELS_DIR = ROOT / "models"
OUTPUTS_DIR = ROOT / "outputs"
FIG_DIR = OUTPUTS_DIR / "figures"

# Create the output-side directories up front (INPUT_DIR is not in this list).
for p in (
    DATA_DIR,
    PROCESSED_DIR,
    MODELS_DIR,
    OUTPUTS_DIR,
    FIG_DIR,
    FIG_DIR / "text_model",
    FIG_DIR / "vision_model",
):
    p.mkdir(parents=True, exist_ok=True)

# Expected input files.
ENRON_CSV = INPUT_DIR / "enron.csv"
INVOICE1_CSV = INPUT_DIR / "invoices.csv"
INVOICE2_CSV = INPUT_DIR / "invoices_2.csv"
ARXIV_JSONL = INPUT_DIR / "arxiv.jsonl"
ARXIV_JSON = INPUT_DIR / "arxiv.json"

# Use the .jsonl file when it is present, otherwise point at the .json file.
if ARXIV_JSONL.exists():
    ARXIV_PATH = ARXIV_JSONL
else:
    ARXIV_PATH = ARXIV_JSON

# Report what was resolved and which inputs are actually on disk.
print("CWD:", Path.cwd())
print("ROOT:", ROOT)
print("INPUT_DIR exists:", INPUT_DIR.exists(), INPUT_DIR)
print("INPUT contents:", [entry.name for entry in INPUT_DIR.glob("*")][:20])

for _label, _path in (
    ("ENRON_CSV:", ENRON_CSV),
    ("INVOICE1_CSV:", INVOICE1_CSV),
    ("INVOICE2_CSV:", INVOICE2_CSV),
    ("ARXIV:", ARXIV_PATH),
):
    print(_label, _path.exists(), _path)


CWD: C:\Users\viach\Downloads\document-classifier-portfolio-v2
ROOT: C:\Users\viach\Downloads\document-classifier-portfolio-v2
INPUT_DIR exists: True C:\Users\viach\Downloads\document-classifier-portfolio-v2\input
INPUT contents: ['arxiv.json', 'enron.csv', 'invoices.csv', 'invoices_2.csv', 'pdfs', 'README.md']
ENRON_CSV: True C:\Users\viach\Downloads\document-classifier-portfolio-v2\input\enron.csv
INVOICE1_CSV: True C:\Users\viach\Downloads\document-classifier-portfolio-v2\input\invoices.csv
INVOICE2_CSV: True C:\Users\viach\Downloads\document-classifier-portfolio-v2\input\invoices_2.csv
ARXIV: True C:\Users\viach\Downloads\document-classifier-portfolio-v2\input\arxiv.json


In [2]:
# Optional: PDF deps check (used in 04_pdf_inference).
# Each import is attempted on its own; a failure only records the package
# name to install and never aborts the cell.
missing = []

try:
    import fitz  # PyMuPDF
except Exception:
    missing += ["pymupdf"]

try:
    import pdfplumber
except Exception:
    missing += ["pdfplumber"]

if missing:
    print(f"Missing: {missing}")
else:
    print("PDF deps OK")

PDF deps OK


In [3]:
# Save config (used by other notebooks).
# Every path is stored as a plain string so the mapping is JSON-serialisable;
# the pair order below fixes the key order in the written file.
cfg = {
    key: str(value)
    for key, value in (
        ("ROOT", ROOT),
        ("INPUT_DIR", INPUT_DIR),
        ("PROCESSED_DIR", PROCESSED_DIR),
        ("MODELS_DIR", MODELS_DIR),
        ("FIG_DIR", FIG_DIR),
        ("ENRON_CSV", ENRON_CSV),
        ("INVOICE1_CSV", INVOICE1_CSV),
        ("INVOICE2_CSV", INVOICE2_CSV),
        ("ARXIV_PATH", ARXIV_PATH),
    )
}

CONFIG_PATH = ROOT / "project_config.json"
serialized = json.dumps(cfg, indent=2)
CONFIG_PATH.write_text(serialized, encoding="utf-8")
print("Wrote:", CONFIG_PATH)

Wrote: C:\Users\viach\Downloads\document-classifier-portfolio-v2\project_config.json


In [4]:
from pathlib import Path
# Sanity check: the config file should be reachable from the current working directory.
config_in_cwd = Path.cwd() / "project_config.json"
print(config_in_cwd.exists(), config_in_cwd)


True C:\Users\viach\Downloads\document-classifier-portfolio-v2\project_config.json
