# BallHolder RL Pipeline

Run order: analyze dataset -> create splits (+ optional HF upload) -> analyze splits -> train -> benchmark.

In [None]:
# Environment setup (run once)\n
!pip install -q datasets pillow numpy scipy wandb python-dotenv\n
\n
# Ensure your keys exist in .env or pass --env-file to each command\n
!cat .env

## 1) Analyze original dataset

In [None]:
# Default\n
!python analyze_ballholder_dataset.py \
  --dataset-name maxs-m87/Ball-Holder \
  --split train \
  --streaming \
  --out-json outputs/ballholder_dataset_stats_raw.json\n
\n
# Recommended (larger sample + shuffle)\n
!python analyze_ballholder_dataset.py \
  --dataset-name maxs-m87/Ball-Holder \
  --split train \
  --streaming --shuffle --buffer-size 5000 --seed 42 \
  --max-samples 100000 \
  --out-json outputs/ballholder_dataset_stats_raw.json

## 2) Create splits (and optionally upload to HF)

In [None]:
# Default local split build\n
!python create_ballholder_splits.py \
  --dataset maxs-m87/Ball-Holder \
  --split train \
  --val-fraction 0.10 \
  --holdout-count 1000 \
  --empty-fraction 0.25\n
\n
# Recommended: also push to hub (edit repo name)\n
!python create_ballholder_splits.py \
  --dataset maxs-m87/Ball-Holder \
  --split train \
  --val-fraction 0.10 \
  --holdout-count 1000 \
  --empty-fraction 0.25 \
  --push-to-hub maxs-m87/Ball-Holder-splits-v1 \
  --hub-val-split validation \
  --hub-post-val-split test

## 3) Analyze splits to validate positive/negative balance

In [None]:
# Analyze locally saved train/val/post_val splits\n
!python analyze_ballholder_dataset.py \
  --dataset-path outputs/maxs-m87_Ball-Holder_splits \
  --all-splits \
  --out-json outputs/ballholder_dataset_stats_splits.json

## 4) Train (with W&B metrics)

In [None]:
# Default train command\n
!python train_ballholder.py \
  --dataset-path outputs/maxs-m87_Ball-Holder_splits \
  --split train \
  --val-split val \
  --num-steps 1000 \
  --batch-size 8 \
  --group-size 4 \
  --max-objects 1 \
  --best-metric eval_miou \
  --eval-every 25 \
  --save-every 25 \
  --wandb-project moondream-ballholder-rl\n
\n
# Recommended stronger run\n
!python train_ballholder.py \
  --dataset-path outputs/maxs-m87_Ball-Holder_splits \
  --split train \
  --val-split val \
  --num-steps 1500 \
  --batch-size 8 \
  --group-size 8 \
  --lr 0.002 \
  --max-objects 1 \
  --augment-prob 0.9 \
  --empty-keep-prob 0.5 \
  --off-policy \
  --best-metric eval_miou \
  --eval-every 25 \
  --save-every 25 \
  --eval-max-samples 2000 \
  --wandb-project moondream-ballholder-rl

## 5) Benchmark on unseen split (`post_val`)

In [None]:
# Replace MODEL with your finetuned checkpoint, e.g. moondream3-preview/<finetune_id>@<step>\n
MODEL = "moondream3-preview"\n
\n
# Option A: /detect inference (baseline model or deployed checkpoint)\n
!python benchmark_ballholder.py \\\n
  --model {MODEL} \\\n
  --dataset-path outputs/maxs-m87_Ball-Holder_splits \\\n
  --split post_val \\\n
  --max-objects 1 \\\n
  --iou-threshold 0.5 \\\n
  --save-viz \\\n
  --viz-limit 100 \\\n
  --out-json outputs/benchmark_metrics_post_val_detect_api.json\n
\n
# Option B: RL finetune inference via tuning interface (latest finetune state)\n
FINETUNE_ID = ""\n
!python benchmark_ballholder.py \\\n
  --finetune-id {FINETUNE_ID} \\\n
  --dataset-path outputs/maxs-m87_Ball-Holder_splits \\\n
  --split post_val \\\n
  --batch-size 16 --max-workers 16 \\\n
  --max-objects 1 \\\n
  --iou-threshold 0.5 \\\n
  --save-viz \\\n
  --viz-limit 100 \\\n
  --out-json outputs/benchmark_metrics_post_val_tuning_interface.json