In [1]:

import torch, sys
# Environment report: print the interpreter and PyTorch build, then fail fast
# when no CUDA device is usable, since the training cell below targets GPU 0.
print("Python:", sys.version.splitlines()[0])
print("Torch:", getattr(torch, "__version__", "torch not installed"))

# Query CUDA once and reuse the answer for both the report and the guard.
cuda_ok = torch.cuda.is_available()
print("CUDA available:", cuda_ok)
if not cuda_ok:
    # RuntimeError, not SystemError: SystemError is reserved for internal
    # interpreter faults, whereas this is an environment/configuration problem.
    raise RuntimeError("CUDA not available. Make sure you installed CUDA-enabled torch in this kernel.")
print("GPU name:", torch.cuda.get_device_name(0))


Python: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Torch: 2.11.0.dev20251219+cu128
CUDA available: True
GPU name: NVIDIA GeForce RTX 5070


In [2]:

from ultralytics import YOLO
from pathlib import Path
import shutil, time, os

# --- Paths -------------------------------------------------------------------
DATA_ROOT = Path("pennfudan_yolo")            # dataset root directory
DATA_YAML = DATA_ROOT.joinpath("data.yaml")   # dataset description passed to train()
RUNS_DIR = Path("runs") / "detect"            # where training runs are looked up
ONNX_DIR = Path("onnx_models")                # destination for exported ONNX files
ONNX_DIR.mkdir(parents=True, exist_ok=True)

# --- Models to train: base checkpoint name + per-model batch size --------------
MODELS = [
    dict(name="yolov8n.pt", batch=16),
    dict(name="yolov8s.pt", batch=12),
    dict(name="yolov8m.pt", batch=8),
]

# --- Schedule ------------------------------------------------------------------
IMG_SIZE = 640
TOTAL_EPOCHS = 200
STAGE1_EPOCHS = 10                              # epochs run with freeze=10 (stage 1)
STAGE2_EPOCHS = TOTAL_EPOCHS - STAGE1_EPOCHS    # remaining epochs for stage 2

# --- Runtime switches ----------------------------------------------------------
DEVICE = 0            # device index handed to train()
SINGLE_CLASS = True   # forwarded as single_cls
USE_HALF = True       # forwarded as half

def find_weights(run_name, runs_dir=None):
    """Locate the checkpoint to use from a finished training run.

    Looks inside ``<runs_dir>/<run_name>/weights`` and returns, in order of
    preference: ``best.pt``, then ``last.pt``, then the most recently modified
    ``*.pt`` file.

    Args:
        run_name: Name of the run directory.
        runs_dir: Directory containing the runs. Defaults to the module-level
            ``RUNS_DIR`` when omitted, so existing one-argument calls are
            unaffected.

    Returns:
        A ``Path`` to the selected checkpoint, or ``None`` when the weights
        directory is missing or holds no ``.pt`` file.
    """
    root = RUNS_DIR if runs_dir is None else Path(runs_dir)
    wdir = root / run_name / "weights"
    if not wdir.is_dir():
        return None

    for preferred in ("best.pt", "last.pt"):
        candidate = wdir / preferred
        if candidate.is_file():
            return candidate

    # Last resort: glob order is arbitrary, so pick the newest checkpoint
    # (file name breaks mtime ties) to keep the choice deterministic.
    leftovers = [p for p in wdir.glob("*.pt") if p.is_file()]
    return max(leftovers, key=lambda p: (p.stat().st_mtime, p.name), default=None)

def _weights_from_trainer(model):
    """Return the checkpoint written by the run `model` has just trained, or None.

    The run directory is read from ``model.trainer.save_dir`` rather than being
    rebuilt from the requested run name. The trainer config logged for these
    runs shows ``exist_ok=False``; with that setting Ultralytics is expected to
    pick a new, suffixed directory when the requested one already exists, so a
    name-based lookup can return weights from an older run (or nothing).
    ``getattr`` is used so a missing attribute simply yields ``None``.
    """
    trainer = getattr(model, "trainer", None)
    save_dir = getattr(trainer, "save_dir", None)
    if save_dir is None:
        return None
    wdir = Path(save_dir) / "weights"
    for fname in ("best.pt", "last.pt"):
        candidate = wdir / fname
        if candidate.is_file():
            return candidate
    return None


def _train_stage(model, run_name, epochs, batch, retry_message, optional_kwargs, required_kwargs=None):
    """Run one training stage and return the checkpoint it produced (or None).

    Args:
        model: ``YOLO`` instance to train.
        run_name: Value passed as ``name=`` to ``model.train``.
        epochs: Number of epochs for this stage.
        batch: Batch size for this stage.
        retry_message: Text printed (followed by the exception) when the first
            attempt raises ``TypeError``.
        optional_kwargs: Extra ``train`` arguments that are dropped on the retry.
        required_kwargs: Extra ``train`` arguments that are kept on the retry.

    Returns:
        Path to the stage's checkpoint: taken from the trainer's own save
        directory when available, otherwise from ``find_weights(run_name)``.
    """
    common = dict(
        data=str(DATA_YAML),
        epochs=epochs,
        imgsz=IMG_SIZE,
        batch=batch,
        device=DEVICE,
        name=run_name,
        single_cls=SINGLE_CLASS,
        augment=True,
        workers=8,
        save=True,
    )
    common.update(required_kwargs or {})
    try:
        model.train(**common, **optional_kwargs)
    except TypeError as e:
        # Same best-effort fallback as before: retry once without the optional arguments.
        # NOTE(review): Ultralytics may report unknown argument names with a different
        # exception type (SyntaxError) - confirm which one this retry should catch.
        print(retry_message, e)
        model.train(**common)
    return _weights_from_trainer(model) or find_weights(run_name)


def _locate_onnx(exported):
    """Return the ONNX file produced by ``export()``, or None if it cannot be found.

    Prefers the path returned by ``export()`` itself. Only when that is not a
    usable file path does it fall back to the newest ``*.onnx`` in the working
    directory or under ``RUNS_DIR``, which may pick up an unrelated file.
    """
    if isinstance(exported, (str, os.PathLike)) and Path(exported).is_file():
        return Path(exported)
    candidates = list(Path('.').glob('*.onnx')) + list(Path('.').glob(str(RUNS_DIR/'**'/'*.onnx')))
    return max(candidates, key=lambda p: p.stat().st_mtime, default=None)


for m in MODELS:
    base = m["name"]
    batch = m["batch"]
    model_stem = Path(base).stem
    print("\n" + "="*80)
    print(f"STARTING training for {model_stem} @ {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print("Base checkpoint:", base)
    print("Batch size:", batch, "| img size:", IMG_SIZE, "| total epochs:", TOTAL_EPOCHS)
    print("="*80 + "\n")

    model = YOLO(base)

    # Stage 1: short run with freeze=10.
    # NOTE(review): the logged trainer config shows amp=True next to half=True; whether
    # `half` has any effect on training (vs. validation/export) should be confirmed.
    run_name_s1 = f"pennfudan_{model_stem}_stage1_freeze"
    print(f"Stage 1 -> freezing backbone, epochs={STAGE1_EPOCHS}, run name: {run_name_s1}")
    w_stage1 = _train_stage(
        model,
        run_name_s1,
        STAGE1_EPOCHS,
        batch,
        retry_message="Stage1: Received TypeError (maybe param compatibility). Retrying without 'freeze'/'half'.",
        optional_kwargs={"freeze": 10, "half": USE_HALF},
    )
    if w_stage1:
        print("Stage1 checkpoint:", w_stage1)
    else:
        print("Warning: no weights found for stage1. Continuing with base checkpoint.")

    # Stage 2: no freeze argument, long fine-tune starting from the stage-1
    # checkpoint (or from the base checkpoint when stage 1 left no weights).
    run_name_s2 = f"pennfudan_{model_stem}_stage2_unfreeze"
    print(f"\nStage 2 -> unfreeze and fine-tune for {STAGE2_EPOCHS} epochs, run name: {run_name_s2}")
    model = YOLO(str(w_stage1)) if w_stage1 else YOLO(base)
    w_stage2 = _train_stage(
        model,
        run_name_s2,
        STAGE2_EPOCHS,
        batch,
        retry_message="Stage2: TypeError on extra params, retrying without unknown params:",
        optional_kwargs={"half": USE_HALF},
        required_kwargs={"resume": False},
    )

    # Prefer the stage-2 result; fall back to stage 1 if stage 2 produced nothing.
    w_final = w_stage2 or w_stage1
    if not w_final:
        print("ERROR: No trained weights found for", model_stem)
        continue
    print("Final checkpoint selected for export:", w_final)

    # Export to ONNX, asking for the CUDA device first.
    print("Exporting to ONNX (this may take a while)...")
    trained = YOLO(str(w_final))
    try:
        exported = trained.export(format="onnx", imgsz=IMG_SIZE, simplify=True, device="cuda", opset=12)
    except TypeError:
        # Fallback kept from the original: retry without the device argument.
        exported = trained.export(format="onnx", imgsz=IMG_SIZE, simplify=True, opset=12)

    onnx_file = _locate_onnx(exported)
    if onnx_file is not None:
        target = ONNX_DIR / f"{model_stem}.onnx"
        shutil.move(str(onnx_file), str(target))
        print("Saved ONNX:", target)
    else:
        print("No ONNX file found after export. Check ultralytics export logs for errors.")

    print(f"Completed {model_stem}. Sleeping 2s before next model.")
    time.sleep(2)

print("\nAll done. Check the folders:")
print(" - runs:", RUNS_DIR)
print(" - onnx models:", ONNX_DIR)



STARTING training for yolov8n @ 2025-12-19 22:13:52
Base checkpoint: yolov8n.pt
Batch size: 16 | img size: 640 | total epochs: 200

Stage 1 -> freezing backbone, epochs=10, run name: pennfudan_yolov8n_stage1_freeze
Ultralytics 8.3.240  Python-3.11.9 torch-2.11.0.dev20251219+cu128 CUDA:0 (NVIDIA GeForce RTX 5070, 12227MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=True, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=pennfudan_yolo\data.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=10, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=10, half=True, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, 