In [3]:
import os
import cv2
import supervision as sv
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from rfdetr import RFDETRSmall

In [8]:
# ---------- Run configuration ----------
MODE = "webcam"  # one of: "video", "webcam"
MODEL_PATH = "RFDETRSmall_Mono.pth"  # checkpoint (.pth) handed to the model constructor
CLASS_DICT = "monolabel"  # key into CLASS_DICTS: "monolabel" or "multilabel"

# File paths, read only by the "video" branch
INPUT_VIDEO = "salgadinos.mp4"
OUTPUT_VIDEO = "salgadinos_mono.mp4"

# Capture device index, read only by the "webcam" branch
WEBCAM_ID = 0

# Confidence threshold forwarded to model.predict
THRESHOLD = 0.5

# ---------- Class-id -> display-name tables ----------
# Multilabel names listed in class-id order; position i is the name of class id i.
_MULTILABEL_NAMES = (
    "Bolinha de queijo",
    "Canapé",
    "Canudo",
    "Coxinha",
    "Croquete",
    "Empadinha",
    "Enroladinho de salsicha",
    "Esfiha",
    "Folhado",
    "Pastelzinho",
    "Pão de queijo",
    "Quibe",
    "Risoles",
    "Sanduiche",
)

CLASS_DICTS = {
    "monolabel": {0: "Salgado"},
    "multilabel": dict(enumerate(_MULTILABEL_NAMES)),
}

# Table selected by CLASS_DICT. Despite the name, this is the project's own
# label set (one of the CLASS_DICTS entries), not the COCO label set.
COCO_CLASSES = CLASS_DICTS[CLASS_DICT]

# ---------- Model ----------
# Build the detector from the configured checkpoint, then call its
# inference-optimisation step once, before any frame is processed.
print(f"Loading model from: {MODEL_PATH}")
model = RFDETRSmall(pretrain_weights=MODEL_PATH)
model.optimize_for_inference()
print("Model loaded and optimized")

# ---------- Annotation colours ----------
# Hex colours shared by the box and label annotators.
_PALETTE_HEX = [
    "#ffff00",
    "#ff9b00",
    "#ff8080",
    "#ff66b2",
    "#ff66ff",
    "#b266ff",
    "#9999ff",
    "#3399ff",
    "#66ffff",
    "#33ff99",
    "#66ff66",
    "#99ff00",
]
color = sv.ColorPalette.from_hex(_PALETTE_HEX)

# Fonts already resolved for the count overlay, keyed by point size, so the
# font file is looked up once instead of on every frame.
_OVERLAY_FONT_CACHE = {}


def _get_overlay_font(size=40):
    """Return the count-overlay font for ``size``, loading it at most once."""
    if size not in _OVERLAY_FONT_CACHE:
        try:
            _OVERLAY_FONT_CACHE[size] = ImageFont.truetype("VeraMono.ttf", size)
        except OSError:
            # Pillow raises OSError when the font file cannot be found or read.
            # Catch only that, so KeyboardInterrupt and real bugs still surface.
            _OVERLAY_FONT_CACHE[size] = ImageFont.load_default()
    return _OVERLAY_FONT_CACHE[size]


def annotate_frame(frame, detections, counts_text):
    """Draw boxes, per-detection labels and a count overlay on one frame.

    Args:
        frame: Image array that is converted with ``cv2.COLOR_BGR2RGB``
            (i.e. treated as an OpenCV BGR frame).
        detections: Detection result exposing ``class_id`` and ``confidence``
            sequences; also passed to the supervision annotators.
        counts_text: Text (may contain newlines) drawn in red at the
            top-left corner of the frame.

    Returns:
        The annotated frame as an array converted back with
        ``cv2.COLOR_RGB2BGR``.
    """
    # The supervision annotators and ImageDraw work on a PIL (RGB) image here.
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    text_scale = sv.calculate_optimal_text_scale(resolution_wh=image.size)
    thickness = sv.calculate_optimal_line_thickness(resolution_wh=image.size)

    bbox_annotator = sv.BoxAnnotator(color=color, thickness=thickness)
    label_annotator = sv.LabelAnnotator(
        color=color,
        text_color=sv.Color.BLACK,
        text_scale=text_scale,
        smart_position=True
    )

    labels = []
    for class_id, confidence in zip(detections.class_id, detections.confidence):
        # Fall back to a generic name instead of raising KeyError when the
        # model emits a class id that the selected table does not contain.
        name = COCO_CLASSES.get(int(class_id), f"class {int(class_id)}")
        labels.append(f"{name} {confidence:.2f}")

    annotated_image = bbox_annotator.annotate(image, detections)
    annotated_image = label_annotator.annotate(annotated_image, detections, labels)

    # Add count overlay
    draw = ImageDraw.Draw(annotated_image)
    draw.multiline_text((10, 10), counts_text, fill="red", font=_get_overlay_font(40))

    # Convert back to BGR for OpenCV
    return cv2.cvtColor(np.array(annotated_image), cv2.COLOR_RGB2BGR)

def process_frame(frame):
    """Run detection on one frame and return the annotated frame.

    The frame is converted to an RGB PIL image, passed to ``model.predict``
    with ``THRESHOLD``, and the detections are tallied per class id. The
    tally is rendered as one ``"<name>: <count>"`` line per detected class
    (sorted by class id), or ``"No detections"`` when nothing was found.

    Args:
        frame: Image array that is converted with ``cv2.COLOR_BGR2RGB``
            (i.e. treated as an OpenCV BGR frame).

    Returns:
        Whatever ``annotate_frame`` returns for this frame, its detections
        and the count text.
    """
    # Convert to PIL Image
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # Run detection
    detections = model.predict(image, threshold=THRESHOLD)

    # Tally detections per class id. Ids are added on first sight, so an id
    # missing from COCO_CLASSES is counted instead of raising KeyError and
    # aborting the whole stream.
    counts = {}
    for cid in detections.class_id:
        cid = int(cid)
        counts[cid] = counts.get(cid, 0) + 1

    # Build count text; only detected classes appear, in class-id order.
    lines = []
    for cid in sorted(counts):
        name = COCO_CLASSES.get(cid, f"class {cid}")
        lines.append(f"{name}: {counts[cid]}")
    counts_text = "\n".join(lines) if lines else "No detections"

    # Annotate frame
    return annotate_frame(frame, detections, counts_text)

# ========== VIDEO MODE ==========
# Reads INPUT_VIDEO frame by frame, annotates each frame with process_frame
# and writes the result to OUTPUT_VIDEO.
if MODE == "video":
    print(f"Processing video: {INPUT_VIDEO}")

    cap = cv2.VideoCapture(INPUT_VIDEO)
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {INPUT_VIDEO}")

    out = None
    try:
        # Get video properties. The frame rate stays a float: truncating a
        # fractional rate (e.g. 29.97 -> 29) makes the output play slower
        # than the source.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # The frame rate was reported as 0 (or NaN); use a common default
            # rather than passing an unusable rate to the writer.
            fps = 30.0
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Setup video writer
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(OUTPUT_VIDEO, fourcc, fps, (width, height))
        if not out.isOpened():
            # Otherwise every write() would be dropped silently.
            raise ValueError(f"Cannot open output video for writing: {OUTPUT_VIDEO}")

        print(f"Video info: {width}x{height} @ {fps:.2f}fps, {total_frames} frames")

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1
            print(f"Processing frame {frame_count}/{total_frames}", end='\r')

            annotated_frame = process_frame(frame)
            out.write(annotated_frame)
    finally:
        # Release handles even when a frame fails or the kernel is
        # interrupted, so the output file is closed and the input is freed.
        cap.release()
        if out is not None:
            out.release()

    print(f"\nVideo saved to: {OUTPUT_VIDEO}")

# ========== WEBCAM MODE ==========
# Shows annotated frames from capture device WEBCAM_ID until 'q' is pressed
# or a frame cannot be read.
elif MODE == "webcam":
    print(f"Starting webcam (ID: {WEBCAM_ID}). Press 'q' to quit.")

    cap = cv2.VideoCapture(WEBCAM_ID)
    if not cap.isOpened():
        raise ValueError(f"Cannot open webcam: {WEBCAM_ID}")

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to grab frame")
                break

            annotated_frame = process_frame(frame)

            # Display the frame
            cv2.imshow('RFDETR Webcam Detection', annotated_frame)

            # Press 'q' to quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, including when the
        # loop is stopped by an exception or a kernel interrupt.
        cap.release()
        cv2.destroyAllWindows()

    print("Webcam stream closed")

else:
    raise ValueError(f"Invalid MODE: {MODE}. Use 'video' or 'webcam'")

print("Done!")

Loading model from: RFDETRSmall_Mono.pth
Using a different number of positional encodings than DINOv2, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.
Using patch size 16 instead of 14, which means we're not loading DINOv2 backbone weights. This is not a problem if finetuning a pretrained RF-DETR model.


num_classes mismatch: pretrain weights has 0 classes, but your model has 90 classes
reinitializing detection head with 0 classes


Loading pretrain weights
Model loaded and optimized
Starting webcam (ID: 0). Press 'q' to quit.
Webcam stream closed
Done!
