In [1]:
import cv2
import time
import numpy as np
from PIL import Image
from transformers import pipeline

# --- 1. CONFIGURATION ---
# VIDEO_SOURCE is handed to cv2.VideoCapture below; index 0 asks OpenCV for
# the default connected webcam.
VIDEO_SOURCE = 0
# Model identifier passed to the transformers pipeline.
MODEL_NAME = "facebook/detr-resnet-50"

# --- 2. LOAD MODEL ---
print(f"[INFO] Loading model: {MODEL_NAME}...")
# Build the object-detection pipeline a single time here; the same `detector`
# object is reused for every inference call later in the script.
detector = pipeline(task="object-detection", model=MODEL_NAME)
print("[INFO] Model loaded. Starting webcam...")

# --- 3. WEBCAM SETUP ---
cap = cv2.VideoCapture(VIDEO_SOURCE)

# Confirm the device actually opened before configuring it; there is nothing
# to configure on a capture that failed to open.
if not cap.isOpened():
    print("[ERROR] Could not access the webcam.")
    print("Ensure no other app (Zoom, Teams) is using it.")
    # Free the capture handle before bailing out.
    cap.release()
    # Raise SystemExit directly rather than calling exit(): exit() is a helper
    # injected by the `site` module for interactive use and is not guaranteed
    # to exist, and interactive shells may replace it with different
    # semantics. A non-zero status also signals the failure to the caller.
    raise SystemExit(1)

# Request a manageable resolution (640x480) to speed up processing.
# NOTE(review): this is only a request to the capture backend; the return
# value of cap.set() is not checked here, so the actual frame size may differ.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Variables for controlling inference speed
frame_count = 0    # Number of frames read so far
skip_frames = 5    # Only run AI every 5 frames to prevent lag
last_results = []  # Store results to draw them during skipped frames

print("[INFO] Press 'q' to quit.")

# Main capture / inference / display loop.
#
# Reads frames from `cap`, runs `detector` on every `skip_frames`-th frame,
# and draws the most recent detections on every frame until a frame read
# fails or the user presses 'q'. The capture device and all OpenCV windows
# are released in the `finally` clause no matter how the loop ends.

# Height in pixels of the filled strip drawn behind each label.
LABEL_HEIGHT = 20
# Gap in pixels between the bottom of the strip and the text baseline.
LABEL_BASELINE_PAD = 5

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("[ERROR] Failed to grab frame.")
            break

        frame_count += 1

        # --- AI INFERENCE (Throttled) ---
        # Only run the detector on every `skip_frames`-th frame; the frames
        # in between reuse `last_results`.
        if frame_count % skip_frames == 0:
            # OpenCV frames are BGR; convert to RGB before building the
            # PIL image handed to the detector.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # Run Inference
            last_results = detector(pil_image)

        # --- DRAWING (Every Frame) ---
        # Draw the *last known* detections so boxes persist between
        # inference runs.
        for result in last_results:
            box = result['box']
            label = result['label']
            score = result['score']

            # Skip low-confidence detections.
            if score > 0.5:
                xmin, ymin = int(box['xmin']), int(box['ymin'])
                xmax, ymax = int(box['xmax']), int(box['ymax'])

                # Draw Box (Green)
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

                label_text = f"{label}: {score:.2f}"
                # Only the text width is needed to size the background strip.
                (text_width, _), _ = cv2.getTextSize(
                    label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)

                # The label normally sits directly above the box. When the
                # box is within LABEL_HEIGHT pixels of the top of the frame
                # that position would have negative y coordinates and the
                # label would be cut off, so draw it just inside the box
                # instead.
                if ymin >= LABEL_HEIGHT:
                    label_bottom = ymin
                else:
                    label_bottom = ymin + LABEL_HEIGHT
                label_top = label_bottom - LABEL_HEIGHT

                # Filled background strip for readability, then black text.
                cv2.rectangle(frame, (xmin, label_top),
                              (xmin + text_width, label_bottom),
                              (0, 255, 0), -1)
                cv2.putText(frame, label_text,
                            (xmin, label_bottom - LABEL_BASELINE_PAD),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 2)

        # Show the live feed
        cv2.imshow('Real-Time DETR Detection', frame)

        # Exit on 'q' key; mask to the low byte before comparing the key code.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

finally:
    # Always release the camera and close the preview window, including when
    # the loop exits through an exception.
    cap.release()
    cv2.destroyAllWindows()
    print("[INFO] Webcam closed.")

  from .autonotebook import tqdm as notebook_tqdm


[INFO] Loading model: facebook/detr-resnet-50...


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using a slow image processor as `use_fast` is unset and a slow processor was saved w

[INFO] Model loaded. Starting webcam...
[INFO] Press 'q' to quit.
[INFO] Webcam closed.
