In [1]:
import cv2
from ultralytics import YOLO
import numpy as np

# Load YOLOv8
model = YOLO("yolov8n.pt")

cap = cv2.VideoCapture(0)

# Initialize previous box
prev_box = None
alpha = 0.2  # smoothing factor (0.1 = very smooth, 0.5 = sharper)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame, stream=True)

    person_boxes = []
    for r in results:
        for box in r.boxes:
            cls = int(box.cls[0])
            if model.names[cls] == "person":
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                person_boxes.append((x1, y1, x2, y2))

    if person_boxes:
        # Pick largest person
        person_boxes.sort(key=lambda b: (b[2] - b[0]) * (b[3] - b[1]), reverse=True)
        x1, y1, x2, y2 = person_boxes[0]

        new_box = np.array([x1, y1, x2, y2], dtype=float)

        if prev_box is None:
            prev_box = new_box
        else:
            # Smooth transition
            prev_box = alpha * new_box + (1 - alpha) * prev_box

        x1, y1, x2, y2 = map(int, prev_box)

        # Draw smoothed box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Zoomed view
        h, w, _ = frame.shape
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(w, x2), min(h, y2)

        if x2 > x1 and y2 > y1:
            zoomed = frame[y1:y2, x1:x2]
            zoomed = cv2.resize(zoomed, (640, 480))
            cv2.imshow("Auto-Framed View", zoomed)

    cv2.imshow("Original Webcam Feed", frame)

    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()

  from .autonotebook import tqdm as notebook_tqdm

0: 480x640 1 person, 318.0ms
Speed: 3.5ms preprocess, 318.0ms inference, 192.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 24.3ms
Speed: 3.9ms preprocess, 24.3ms inference, 95.9ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 20.3ms
Speed: 3.5ms preprocess, 20.3ms inference, 3.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 30.6ms
Speed: 2.0ms preprocess, 30.6ms inference, 4.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 27.0ms
Speed: 3.1ms preprocess, 27.0ms inference, 8.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 181.2ms
Speed: 5.5ms preprocess, 181.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 18.2ms
Speed: 4.3ms preprocess, 18.2ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 17.4ms
Speed: 2.3ms preprocess, 17.4ms 