# Prepare Datasets (Dev-only)\n\nThis notebook prepares training datasets into a local, offline-ready JSONL format: `text` + `entities` (gold spans).\n\nKey points:\n- Caches are redirected to `/mnt/c/ai_cache` (AI_WAREHOUSE 3.0).\n- Prepared datasets default to `/mnt/data/datasets/edge_deid/processed/...`.\n- Each prepared dataset also writes `manifest.json` and `quality.json` next to `dataset.jsonl`.\n- Network access is **disabled by default**; enable it explicitly when needed.\n

In [None]:
import os
from pathlib import Path

# AI_WAREHOUSE 3.0 cache layout: keep tool caches out of $HOME/.cache.
# Every entry is only a default; a value already present in the
# environment is left untouched.
_cache_dir_defaults = {
    'HF_HOME': '/mnt/c/ai_cache/huggingface',
    'TORCH_HOME': '/mnt/c/ai_cache/torch',
    'XDG_CACHE_HOME': '/mnt/c/ai_cache',
    'PIP_CACHE_DIR': '/mnt/c/ai_cache/pip',
}
for env_name, default_dir in _cache_dir_defaults.items():
    os.environ.setdefault(env_name, default_dir)

# TRANSFORMERS_CACHE follows whatever HF_HOME resolved to, unless it was
# already set explicitly.
os.environ.setdefault('TRANSFORMERS_CACHE', os.environ['HF_HOME'])

# Create the cache directories up front (a no-op when they already exist).
for env_name in _cache_dir_defaults:
    cache_dir = Path(os.environ[env_name]).expanduser()
    cache_dir.mkdir(parents=True, exist_ok=True)

# Echo the two cache roots so the effective locations show up in the cell output.
for env_name in ('HF_HOME', 'XDG_CACHE_HOME'):
    print(f'{env_name} =', os.environ[env_name])

In [None]:
# Prepare a synthetic dataset (offline, deterministic).\n!PYTHONPATH=src python scripts/prepare_dataset.py \\\n  --dataset synthetic \\\n  --language zh \\\n  --split train \\\n  --max-examples 500\n

In [None]:
# Use the prepared dataset for training (dev-only).
# NOTE: This requires a local tokenizer/model directory.
# The command at the bottom of this cell is commented out; adjust the paths to
# your machine and uncomment it to run. `--input-jsonl` should point at the
# `dataset.jsonl` produced by the dataset preparation step.
# Example paths for AI_WAREHOUSE 3.0:
#   --model-dir /mnt/c/ai_models/detection/edge_deid/bert-ner-zh
#   --output-dir /mnt/data/training/runs/edge_deid/ner-demo
#   --input-jsonl /mnt/data/datasets/edge_deid/processed/synthetic/train/dataset.jsonl

# !PYTHONPATH=src python scripts/train_token_classifier.py \
#   --model-dir /mnt/c/ai_models/detection/edge_deid/bert-ner-zh \
#   --output-dir /mnt/data/training/runs/edge_deid/ner-demo \
#   --language zh \
#   --input-jsonl /mnt/data/datasets/edge_deid/processed/synthetic/train/dataset.jsonl \
#   --epochs 1 --batch-size 8 --max-length 256