In [None]:
# YOLO-Powered Hand Proximity Warning System with Virtual Object Zone

In [None]:
!pip install ultralytics opencv-python

In [1]:
import cv2
import numpy as np
from ultralytics import YOLO
import time

# -------------------------------
# 1. Initialize Webcam & ROI
# -------------------------------
cap = cv2.VideoCapture(0)
ret, frame = cap.read()
if not ret:
    print("Cannot open camera")
    exit()

# Let user select virtual object ROI
roi = cv2.selectROI("Select Virtual Object", frame, showCrosshair=True, fromCenter=False)
cv2.destroyWindow("Select Virtual Object")
x, y, w, h = roi
vx1, vy1, vx2, vy2 = x, y, x + w, y + h
obj_center = (x + w // 2, y + h // 2)

# -------------------------------
# 2. Load YOLO 3-class hand model
# -------------------------------
model = YOLO('https://huggingface.co/EtanHey/hand-detection-3class/resolve/main/model.pt')

# -------------------------------
# 3. Main Loop
# -------------------------------
while True:
    start_time = time.time()
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame, device='cpu', imgsz=640)

    hand_center = None

    for r in results:
        if r.boxes is not None:
            for box in r.boxes:
                class_id = int(box.cls[0])
                conf = float(box.conf[0])
                if class_id == 1:  # hand only
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    cx = (x1 + x2) // 2
                    cy = (y1 + y2) // 2
                    hand_center = (cx, cy)

                    # Draw bounding box and label
                    classes = ['arm', 'hand', 'not_hand']
                    label = f"{classes[class_id]} {conf:.0%}"
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 255), 2)
                    cv2.putText(frame, label, (x1, y1-10),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,255), 2)

    # -------------------------------
    # Distance-based state
    # -------------------------------
    state = "SAFE"
    color = (0, 255, 0)
    if hand_center is not None:
        dist = int(np.linalg.norm(np.array(hand_center) - np.array(obj_center)))
        if dist > 120:
            state = "SAFE"
            color = (0, 255, 0)
        elif dist <= 120 and dist > 60:
            state = "WARNING"
            color = (0, 255, 255)
        else:
            state = "DANGER"
            color = (0, 0, 255)
        cv2.putText(frame, f"Distance: {dist}px", (20, 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

    # Draw virtual object
    cv2.rectangle(frame, (vx1, vy1), (vx2, vy2), color, 3)
    cv2.circle(frame, obj_center, 6, color, -1)

    # Display state
    cv2.putText(frame, state, (20, 80),
                cv2.FONT_HERSHEY_SIMPLEX, 1.2, color, 3)
    if state == "DANGER":
        cv2.putText(frame, "DANGER DANGER", (80, 130),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.4, (0, 0, 255), 4)

    # Display FPS
    fps = 1 / (time.time() - start_time)
    cv2.putText(frame, f"FPS: {fps:.1f}", (20, 160),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 0), 2)

    cv2.imshow("Hand Detection + Virtual Object", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

Found https://huggingface.co/EtanHey/hand-detection-3class/resolve/main/model.pt locally at weights/model.pt

0: 640x640 not_hand 0.94, hand 0.05, arm 0.02, 56.7ms
Speed: 24.8ms preprocess, 56.7ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 not_hand 0.94, hand 0.05, arm 0.01, 38.5ms
Speed: 12.6ms preprocess, 38.5ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 not_hand 0.94, hand 0.04, arm 0.02, 37.0ms
Speed: 11.3ms preprocess, 37.0ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 not_hand 0.94, hand 0.04, arm 0.02, 33.6ms
Speed: 10.2ms preprocess, 33.6ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 not_hand 0.94, hand 0.04, arm 0.02, 34.4ms
Speed: 11.5ms preprocess, 34.4ms inference, 0.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 not_hand 0.94, hand 0.04, arm 0.02, 31.1ms
Speed: 10.0ms preprocess, 31.1ms inference, 0.0ms postprocess per image