In [1]:
# %pip install -q transformers torch torchvision pillow opencv-python numpy timm

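### CLI Command for Running RT-DETR (Real-Time DEtection TRansformer) with Ultralytics
- Note: `rtdetr-l.pt` is the RT-DETR-L checkpoint (from Baidu) run through the Ultralytics `yolo` CLI, not Facebook's DETR. The Python cell below uses Facebook's DETR instead — specifically the `facebook/detr-resnet-50` variant, which has a ResNet-50 backbone.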

```
!yolo predict model=rtdetr-l.pt source='/content/drive/MyDrive/video.mp4' device=0
```

### Ran on Google Colab

In [None]:
import time

import cv2
import numpy as np
import torch
from PIL import Image
from transformers import pipeline

# Import the Colab-specific patch
from google.colab.patches import cv2_imshow
from IPython.display import clear_output

# --- 1. CONFIGURATION ---
# Provide the exact path to your video file. (An integer index such as 0 selects a
# webcam, which is only useful when this script runs on a machine with a camera.)
VIDEO_PATH = '/content/drive/MyDrive/AI_Object_Detection_Project_P14/driver-action-recognition.mp4'
OUTPUT_PATH = 'detr_output.avi'

# We use the standard DETR model from Facebook (Meta)
MODEL_NAME = "facebook/detr-resnet-50"

# Inference device for the pipeline: CUDA device 0 when a GPU is visible,
# otherwise -1, which is the pipeline's convention for "run on CPU".
# Selecting it explicitly avoids silently running on CPU on a GPU runtime.
DEVICE = 0 if torch.cuda.is_available() else -1

# --- 2. LOAD HUGGING FACE PIPELINE ---
print(f"[INFO] Downloading/Loading model: {MODEL_NAME}...")
# The 'object-detection' pipeline handles preprocessing, inference, and post-processing
detector = pipeline("object-detection", model=MODEL_NAME, device=DEVICE)
print("[INFO] Model loaded successfully.")
print(f"[INFO] Inference device: {'cuda:0' if DEVICE == 0 else 'cpu'}")

# --- 3. VIDEO SETUP ---
# Minimum confidence for a detection to be drawn. It is also passed to the
# pipeline call so the cutoff does not depend on the pipeline's own default.
CONF_THRESHOLD = 0.5
# Frame rate written to the output file when the source does not report one.
FALLBACK_FPS = 20

cap = cv2.VideoCapture(VIDEO_PATH)

if not cap.isOpened():
    print(f"[ERROR] Cannot open video: {VIDEO_PATH}")
    cap.release()
else:
    # Get video properties for the writer
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    # Keep the reported rate as a float: truncating e.g. 29.97 to 29 changes the
    # playback speed of the output, and rates below 1 would truncate to 0.
    source_fps = cap.get(cv2.CAP_PROP_FPS)
    fps = source_fps if source_fps > 0 else FALLBACK_FPS

    # Use MJPG/AVI for safety on your laptop/Colab
    out = cv2.VideoWriter(OUTPUT_PATH, cv2.VideoWriter_fourcc(*'MJPG'), fps, (frame_width, frame_height))
    # A writer that failed to open drops frames silently, so record its state
    # up front and report honestly instead of claiming the file was saved.
    writer_ok = out.isOpened()
    if not writer_ok:
        print(f"[WARN] Cannot open video writer for: {OUTPUT_PATH} (frames will be shown but not saved)")

    print(f"[INFO] Starting inference on {VIDEO_PATH}...")
    print("[INFO] Processing... (This may take time)")

    # --- 4. PROCESSING LOOP ---
    start_time = time.time()
    frame_count = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a read failure): stop processing.
                break

            frame_count += 1

            # CONVERSION STEP: OpenCV uses BGR, but Hugging Face needs RGB PIL Images
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # --- AI INFERENCE ---
            results = detector(pil_image, threshold=CONF_THRESHOLD)

            # --- DRAWING RESULTS ---
            for result in results:
                box = result['box']
                label = result['label']
                score = result['score']

                # Only draw if confident enough
                if score > CONF_THRESHOLD:
                    xmin, ymin = int(box['xmin']), int(box['ymin'])
                    xmax, ymax = int(box['xmax']), int(box['ymax'])

                    # Draw Box (Green)
                    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

                    # Draw Label: normally just above the box; when the box touches the
                    # top of the frame, place it just inside the box so it stays visible.
                    label_text = f"{label}: {score:.2f}"
                    label_x = max(xmin, 0)
                    label_y = ymin - 10 if ymin >= 20 else ymin + 15
                    cv2.putText(frame, label_text, (label_x, label_y),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            # Save the annotated frame to output file
            if writer_ok:
                out.write(frame)

            # --- COLAB DISPLAY LOGIC ---
            # To avoid spamming the notebook with images, we clear output and show the latest frame
            # Using wait=True makes the transition smoother
            clear_output(wait=True)
            cv2_imshow(frame)

            # Note: cv2.waitKey works with cv2.imshow (local), but not cv2_imshow (colab).
            # We keep it for compatibility if you export this script later.
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    except KeyboardInterrupt:
        print("[INFO] Interrupted by user.")

    finally:
        # --- CLEANUP ---
        # Release the capture and writer first so the output file is finalized
        # even if something in the reporting below fails.
        cap.release()
        out.release()

        end_time = time.time()
        duration = end_time - start_time
        # Guard against a zero-length run (e.g. immediate interrupt on a coarse clock).
        average_fps = frame_count / duration if duration > 0 else 0.0
        print(f"\n[INFO] Finished! Processed {frame_count} frames in {duration:.2f} seconds.")
        print(f"[INFO] Average FPS: {average_fps:.2f}")
        if writer_ok:
            print(f"[INFO] Output saved to: {OUTPUT_PATH}")
        else:
            print(f"[WARN] No output was saved to: {OUTPUT_PATH}")

        cv2.destroyAllWindows()