In [33]:
import cv2
import torch
from PIL import Image
import numpy as np
from transformers import OwlViTProcessor, OwlViTForObjectDetection
from IPython.display import display, clear_output
import time
import csv
import os

In [34]:
import os

# Set the Hugging Face Hub flag that (per its name) disables the symlinks warning.
# NOTE(review): `transformers` is already imported in the previous cell; if the
# hub library reads this variable at import time, setting it here may have no
# effect until the kernel is restarted -- confirm, and consider moving this
# assignment above the imports.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [36]:
# ========================== #
#     IMPORT DEPENDENCIES   #
# ========================== #
import cv2
import torch
from PIL import Image
import numpy as np
from transformers import OwlViTProcessor, OwlViTForObjectDetection
import csv
import time
import os

# ========================== #
#    LOAD OWL-ViT MODEL      #
# ========================== #
# The processor and the detector are both loaded from the same checkpoint name.
print("[INFO] Loading OWL-ViT model...")
checkpoint = "google/owlvit-base-patch32"
processor = OwlViTProcessor.from_pretrained(checkpoint)
model = OwlViTForObjectDetection.from_pretrained(checkpoint)

# ========================== #
#     INITIAL LABEL SETUP    #
# ========================== #
# Text prompts handed to the detector; these objects are not COCO classes.
custom_labels = [
    "a black pen",
    "a pair of headphones",
]

# ========================== #
#     CSV LOGGING SETUP      #
# ========================== #
log_file = "detection_log.csv"
log_fields = ["timestamp", "label", "confidence", "x1", "y1", "x2", "y2"]

# The header row is written only when the log does not exist yet; an existing
# file is left untouched so later rows can simply be appended to it.
header_needed = not os.path.exists(log_file)
if header_needed:
    with open(log_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerows([log_fields])

# ========================== #
#       CONFIG SETTINGS      #
# ========================== #
threshold = 0.2       # Minimum confidence passed to detection post-processing
frame_interval = 5    # Inference runs when frame_count is a multiple of this value
frame_count = 0       # Running count of frames processed by the main loop

# Most recent detections; starts empty so there is nothing to draw before the
# first inference has run.
prev_results = dict(boxes=torch.empty((0, 4)), labels=[], scores=[])

# ========================== #
#        START WEBCAM        #
# ========================== #
# Open capture device index 0.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    print("[ERROR] Could not open webcam.")
    # Release the handle before bailing out so a re-run starts from a clean state.
    cap.release()
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down (losing the loaded model), while an exception just stops
    # this cell. As a plain script this also yields a non-zero exit status.
    raise RuntimeError("Could not open webcam (device index 0).")

print("[INFO] Webcam started. Press 'q' to quit. Press 'e' to edit labels.")

# ========================== #
#        MAIN LOOP           #
# ========================== #
# Per frame: grab -> resize -> (optionally) detect -> draw -> log -> show -> keys.
#
# Label text is resolved when inference runs and cached in `prev_label_names`,
# so boxes re-drawn on skipped frames never index into a label list that was
# edited with the 'e' key in the meantime.
prev_label_names = [custom_labels[int(idx)] for idx in prev_results["labels"]]
force_detection = False  # Set after a label edit so the next frame re-runs inference

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print("[ERROR] Failed to grab frame.")
            break

        # Resize for faster inference
        resized_frame = cv2.resize(frame, (480, 360))
        image_rgb = cv2.cvtColor(resized_frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)

        # Run detection only every N frames (or immediately after a label edit)
        ran_detection = force_detection or frame_count % frame_interval == 0
        if ran_detection:
            force_detection = False
            inputs = processor(text=custom_labels, images=pil_image, return_tensors="pt")

            with torch.no_grad():
                outputs = model(**inputs)

            # Post-processing is given the image size as [H, W] (PIL reports (W, H))
            target_size = torch.tensor([pil_image.size[::-1]])
            result = processor.post_process_object_detection(
                outputs, target_sizes=target_size, threshold=threshold
            )[0]

            # Store for skipped frames, together with the label text in effect now
            prev_results = result
            prev_label_names = [custom_labels[int(idx)] for idx in result["labels"]]

        # Convert tensors to plain Python values once per frame
        detections = [
            (tuple(map(int, box.tolist())), name, float(score))
            for box, name, score in zip(
                prev_results["boxes"], prev_label_names, prev_results["scores"]
            )
        ]

        # ========================== #
        #    DRAW BOXES & LOGGING    #
        # ========================== #
        for (x1, y1, x2, y2), name, score in detections:
            text = f"{name}: {score:.2f}"
            cv2.rectangle(resized_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the caption inside the image when the box touches the top edge
            text_y = max(y1 - 10, 15)
            cv2.putText(resized_frame, text, (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.6, (36, 255, 12), 2)

        # Log only fresh detections: frames that reuse the previous result would
        # otherwise append the same stale boxes again on every skipped frame.
        # The file is opened once per batch rather than once per row.
        if ran_detection and detections:
            timestamp = time.time()
            with open(log_file, mode="a", newline="", encoding="utf-8") as file:
                writer = csv.writer(file)
                for (x1, y1, x2, y2), name, score in detections:
                    writer.writerow([timestamp, name, score, x1, y1, x2, y2])

        # ========================== #
        #        DISPLAY FRAME       #
        # ========================== #
        cv2.imshow("OWL-ViT Detection (press 'e' to edit labels)", resized_frame)

        # ========================== #
        #     KEYBOARD SHORTCUTS     #
        # ========================== #
        key = cv2.waitKey(1) & 0xFF
        if key == ord("q"):
            break
        elif key == ord("e"):
            # Live prompt editing (blocks the loop until the user answers)
            new_input = input("Enter comma-separated new labels (e.g., 'a screwdriver, a bottle'): ")
            new_labels = [label.strip() for label in new_input.split(",") if label.strip()]
            if new_labels:
                custom_labels = new_labels
                force_detection = True
                print("[INFO] Updated labels:", custom_labels)
            else:
                # Blank input would leave the detector with only empty prompts
                print("[WARN] No labels entered; keeping:", custom_labels)

        frame_count += 1
finally:
    # ========================== #
    #        CLEANUP             #
    # ========================== #
    # Runs on normal exit, on errors and on kernel interrupt, so the camera is
    # always handed back and the preview window is closed.
    cap.release()
    cv2.destroyAllWindows()
    print("[INFO] Webcam closed. Logging complete.")


[INFO] Loading OWL-ViT model...
[INFO] Webcam started. Press 'q' to quit. Press 'e' to edit labels.
[INFO] Webcam closed. Logging complete.


In [41]:
# COCO 80 class labels
# Used to check whether a label is already one of the COCO classes
COCO_CLASSES = {
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
    'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop',
    'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
}


def check_custom_labels(custom_labels):
    """Print which labels are new and which already exist in COCO.

    Each label is compared against ``COCO_CLASSES`` after trimming surrounding
    whitespace and lower-casing, so ``" Bottle "`` is recognised as the COCO
    class ``bottle``. Matching is exact on the whole label: a phrase such as
    ``"a bottle"`` is reported as not in COCO.

    Args:
        custom_labels: Iterable of label strings to check.

    Returns:
        None. Both groups are printed to stdout (labels shown trimmed, with
        their original casing).
    """
    in_coco = []
    not_in_coco = []

    for label in custom_labels:
        cleaned = label.strip()
        if cleaned.lower() in COCO_CLASSES:
            in_coco.append(cleaned)
        else:
            not_in_coco.append(cleaned)

    print("\n✅ Custom labels NOT in COCO:")
    print(not_in_coco)
    print("\n❌ Labels that ARE already in COCO:")
    print(in_coco)


# Example usage
custom_labels = ["pen", "headphones"]
check_custom_labels(custom_labels)



✅ Custom labels NOT in COCO:
['pen', 'headphones']

❌ Labels that ARE already in COCO:
[]
