# 🏪 ShelfWatch — YOLO11 Training (H100 Optimized)

**Runtime:** GPU H100 (80GB VRAM)

This notebook:
1. Downloads SKU-110K from Roboflow
2. Trains YOLO11l with H100-optimized settings
3. Evaluates and visualizes results
4. Logs to MLflow & exports ONNX
5. Downloads best weights

## 0 — Setup

In [None]:
!pip install -q ultralytics roboflow mlflow

import torch
print(f"PyTorch: {torch.__version__}")
print(f"CUDA:    {torch.version.cuda}")
print(f"GPU:     {torch.cuda.get_device_name(0)}")
print(f"VRAM:    {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")

## 1 — Download Dataset

In [None]:
!pip install roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="0tOITA1bMoPm91ApiWnt")
project = rf.workspace("boisheba").project("sku-110k-bnaw9")
version = project.version(1)
dataset = version.download("yolov11")

print(f"\n‚úÖ Dataset at: {dataset.location}")

In [None]:
# Locate the dataset's YAML config, echo it, and report the image count per split.
import os
import glob

dataset_root = dataset.location

# Prefer a file named exactly data.yaml anywhere under the dataset directory;
# only if none exists, accept any *.yaml file found there.
found_yamls = glob.glob(
    os.path.join(dataset_root, "**", "data.yaml"), recursive=True
) or glob.glob(os.path.join(dataset_root, "**", "*.yaml"), recursive=True)

if found_yamls:
    DATA_YAML = found_yamls[0]
else:
    # Nothing matched: fall back to the conventional location so the open()
    # below fails with a path that says what was expected.
    DATA_YAML = os.path.join(dataset_root, "data.yaml")
print(f"Data YAML: {DATA_YAML}\n")

with open(DATA_YAML) as yaml_file:
    yaml_text = yaml_file.read()
print(yaml_text)

# Every entry under <split>/images is counted, whatever its extension.
for split in ("train", "valid", "test"):
    image_pattern = os.path.join(dataset_root, split, "images", "*")
    image_count = len(glob.glob(image_pattern))
    print(f"{split}: {image_count} images")

## 2 — Train YOLO11 (H100 Optimized)

H100 optimizations applied:
- **`batch=64`** — 80GB VRAM handles this easily for YOLO11l @ 640px
- **`amp=True`** — automatic mixed precision (FP16/BF16), H100 excels at this
- **`workers=12`** — more data loader workers to keep GPU fed
- **`imgsz=640`** — standard; bump to 1280 if you want extra accuracy and have time
- **`cos_lr=True`** — cosine annealing LR for better convergence
- **`patience=15`** — early stopping to save time if plateaued

In [None]:
from ultralytics import YOLO

# --- Run configuration (sized for a single 80 GB H100) ---
MODEL = "yolo11l.pt"   # pretrained YOLO11-large checkpoint to fine-tune
EPOCHS = 50
IMG_SIZE = 640
BATCH_SIZE = 64
WORKERS = 12           # data-loader workers

# All training keyword arguments are collected in one place so the whole
# configuration can be read (or logged) at a glance.
train_args = dict(
    # Data and schedule
    data=DATA_YAML,
    epochs=EPOCHS,
    imgsz=IMG_SIZE,
    batch=BATCH_SIZE,
    workers=WORKERS,
    # Output location: <project>/<name>; exist_ok lets a re-run reuse that
    # directory rather than fail or create a suffixed one.
    project="/kaggle/working/runs",
    name="shelfwatch",
    exist_ok=True,
    # Throughput and convergence
    amp=True,          # automatic mixed precision
    cos_lr=True,       # cosine learning-rate schedule
    # Stopping, checkpointing and reporting
    patience=15,       # early-stopping patience, in epochs
    save=True,
    save_period=10,    # additionally checkpoint every 10 epochs
    plots=True,
    verbose=True,
)

model = YOLO(MODEL)
results = model.train(**train_args)

## 3 — Evaluate

In [None]:
# Reload the best checkpoint written by the training run and validate it
# against the dataset described by DATA_YAML.
best_model = YOLO("/kaggle/working/runs/shelfwatch/weights/best.pt")
metrics = best_model.val(data=DATA_YAML)

box_metrics = metrics.box
rule = "=" * 50

# Each label carries its own padding so the values line up in one column.
summary_rows = (
    ("  mAP50:     ", box_metrics.map50),
    ("  mAP50-95:  ", box_metrics.map),
    ("  Precision: ", box_metrics.mp),
    ("  Recall:    ", box_metrics.mr),
)

print("\n" + rule)
print("üìä EVALUATION RESULTS")
print(rule)
for label, value in summary_rows:
    print(f"{label}{value:.4f}")
print(rule)

In [None]:
# Display the diagnostic images found in the run directory; files that are
# not present are skipped silently.
from IPython.display import Image as IPImage, display

plots_dir = "/kaggle/working/runs/shelfwatch"
expected_plots = ("results.png", "confusion_matrix.png", "val_batch0_pred.png")

for plot_name in expected_plots:
    plot_path = os.path.join(plots_dir, plot_name)
    if not os.path.exists(plot_path):
        continue
    print(f"\nüìà {plot_name}")
    display(IPImage(filename=plot_path, width=800))

## 4 — Test Inference

In [None]:
import glob
import time

# Take up to five images from the test split, falling back to the validation
# split when the test split has no images.
test_images = []
for split_name in ("test", "valid"):
    test_images = glob.glob(os.path.join(dataset.location, split_name, "images", "*"))[:5]
    if test_images:
        break

# Annotated images are looked up under <project>/predict.
annotated_dir = "/kaggle/working/predictions/predict"

for img_path in test_images:
    # Time the full predict() call, which here also saves the annotated image
    # (save=True), so the figure is end-to-end rather than pure model latency.
    t0 = time.perf_counter()
    results = best_model.predict(
        img_path,
        imgsz=640,
        conf=0.25,
        save=True,
        project="/kaggle/working/predictions",
        exist_ok=True,
    )
    latency = 1000 * (time.perf_counter() - t0)

    image_name = os.path.basename(img_path)
    n = len(results[0].boxes)
    print(f"\nüîç {image_name} ‚Äî {n} products detected ({latency:.0f}ms)")

    # Show the annotated image only if one exists under the input's file name.
    pred_file = os.path.join(annotated_dir, image_name)
    if os.path.exists(pred_file):
        display(IPImage(filename=pred_file, width=800))

## 5 — Log to MLflow

In [None]:
import mlflow

# Parameters describing this run; the values mirror the training configuration.
run_params = {
    "model": MODEL,
    "epochs": EPOCHS,
    "img_size": IMG_SIZE,
    "batch_size": BATCH_SIZE,
    "dataset": "SKU-110K",
    "gpu": "H100",
    "amp": True,
    "cos_lr": True,
}

mlflow.set_experiment("shelfwatch-training")

with mlflow.start_run(run_name=f"yolo11l-ep{EPOCHS}-h100"):
    mlflow.log_params(run_params)

    # Validation metrics are cast to float before being logged.
    box_metrics = metrics.box
    mlflow.log_metrics({
        "mAP50": float(box_metrics.map50),
        "mAP50-95": float(box_metrics.map),
        "precision": float(box_metrics.mp),
        "recall": float(box_metrics.mr),
    })

    # Attach the best checkpoint as a run artifact when it exists on disk.
    best_pt = "/kaggle/working/runs/shelfwatch/weights/best.pt"
    if os.path.exists(best_pt):
        mlflow.log_artifact(best_pt, artifact_path="weights")

    print("‚úÖ Logged to MLflow")

## 6 — Export ONNX

In [None]:
import torch

# Export the best checkpoint to ONNX.
# NOTE(review): the device is passed explicitly because, as I understand the
# Ultralytics exporter, it defaults to CPU when none is given and then turns
# `half=True` off with a warning, producing an FP32 model. Confirm against the
# Ultralytics export documentation.
# FP16 is only requested when a GPU is available, and the message below
# reports the precision that was requested.
export_on_gpu = torch.cuda.is_available()
onnx_path = best_model.export(
    format="onnx",
    imgsz=640,
    simplify=True,
    half=export_on_gpu,
    device=0 if export_on_gpu else "cpu",
)
if export_on_gpu:
    print("‚úÖ ONNX model exported (FP16)")
else:
    print("‚úÖ ONNX model exported (FP32, no GPU available for FP16 export)")

## 7 — Download Weights

⚠️ **Download before the session ends!** Use the Output tab (right sidebar).

In [None]:
import shutil

# Gather the trained weights into one flat directory for downloading.
run_weights_dir = "/kaggle/working/runs/shelfwatch/weights"
download_dir = "/kaggle/working/weights"
os.makedirs(download_dir, exist_ok=True)

# PyTorch checkpoints: copy whichever of them exist and report their size.
for fname in ("best.pt", "last.pt"):
    src = os.path.join(run_weights_dir, fname)
    if not os.path.exists(src):
        continue
    shutil.copy2(src, os.path.join(download_dir, fname))
    size_mb = os.path.getsize(src) / 1e6
    print(f"‚úÖ {fname} ({size_mb:.1f} MB)")

# ONNX export: expected next to best.pt; skipped if it is not there.
onnx_src = os.path.join(run_weights_dir, "best.onnx")
if os.path.exists(onnx_src):
    shutil.copy2(onnx_src, os.path.join(download_dir, "best.onnx"))
    onnx_mb = os.path.getsize(onnx_src) / 1e6
    print(f"‚úÖ best.onnx ({onnx_mb:.1f} MB)")

print("\nüì¶ Download from Output tab ‚Üí")