### Required Libraries

In [None]:
#If you have CUDA then run this command otherwise ignore this This is valid for CUDA 11.8 if you have other version then change the last digits according to the version
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [None]:
!pip install torch torchvision torchaudio

In [None]:
!pip install opencv-python

In [1]:
import cv2
import torch
import torchvision
from torchvision import transforms
import numpy as np

**cv2:** OpenCV library for handling video capture and display.

**torch:** PyTorch library for tensor computations.

**torchvision:** Contains models, datasets, and image transformations for computer vision.

**transforms:** For preprocessing the images.

**numpy:** For numerical operations.


In [10]:
# Optional workaround: run this cell only if downloading the Faster R-CNN
# weights fails with an SSL certificate error.
# WARNING: this swaps the default HTTPS context factory for the unverified one,
# so HTTPS requests that rely on the default context skip certificate
# verification for the rest of the kernel session. Do not run it unless needed.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


# Model

In [2]:
def get_device():
    """Pick the compute device: CUDA when PyTorch reports it available, CPU otherwise.

    Returns:
        torch.device: ``torch.device('cuda')`` or ``torch.device('cpu')``.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')


Determines whether to use a GPU (if available) or CPU for computations.

In [6]:
def load_model(device=None):
    """Load the pre-trained Faster R-CNN (ResNet-50 + FPN) model in evaluation mode.

    Args:
        device: Optional ``torch.device`` (or device string) to move the model to.
            When ``None`` (the default) the model is left where torchvision
            created it, which keeps existing ``load_model()`` calls unchanged.

    Returns:
        The detection model, switched to evaluation mode.
    """
    detection = torchvision.models.detection

    # Newer torchvision releases deprecate `pretrained=True` in favour of the
    # `weights=` argument; older releases do not have the weights enum at all,
    # so fall back to the legacy flag when it is missing.
    weights_enum = getattr(detection, "FasterRCNN_ResNet50_FPN_Weights", None)
    if weights_enum is not None:
        model = detection.fasterrcnn_resnet50_fpn(weights=weights_enum.DEFAULT)
    else:
        model = detection.fasterrcnn_resnet50_fpn(pretrained=True)

    if device is not None:
        model.to(device)

    # Inference only: switch off training-specific behaviour.
    model.eval()
    return model



Loads a pre-trained Faster R-CNN model with a ResNet-50 backbone and Feature Pyramid Network (FPN).

The model is not moved to a GPU by default, so it runs on the CPU unless it is explicitly moved to another device.

Sets the model to evaluation mode to disable training-specific layers like dropout.

In [4]:
# Converts images from PIL format or NumPy arrays to PyTorch tensors.

def get_transform():
    """Build the preprocessing pipeline applied to each image before inference.

    Returns:
        A ``transforms.Compose`` whose only step is ``transforms.ToTensor()``.
    """
    steps = [transforms.ToTensor()]
    return transforms.Compose(steps)

In [None]:
# Optional: if you have CUDA, uncomment the two lines below to select and print the compute device; otherwise skip this cell.
# device = get_device()
# print(f"Using device: {device}")

In [12]:
# Load the pre-trained detector once, up front, so the capture loop further
# down can reuse the same `model` object for every frame.
model = load_model()
print("Faster R-CNN model loaded.")

Faster R-CNN model loaded.


In [13]:
# COCO class names used to label detections. The list is positional: the
# integer label predicted for a detection is used directly as an index here.
# 'N/A' entries are placeholders that keep later names at the right index
# (presumably ids with no corresponding COCO category - verify if it matters).
COCO_INSTANCE_CATEGORY_NAMES = [
    # 0-9
    '__background__', 'person', 'bicycle', 'car', 'motorcycle',
    'airplane', 'bus', 'train', 'truck', 'boat',
    # 10-19
    'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse',
    # 20-29
    'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
    # 30-39
    'N/A', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    # 40-49
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'N/A', 'wine glass', 'cup', 'fork', 'knife',
    # 50-59
    'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    # 60-69
    'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'N/A', 'dining table', 'N/A', 'N/A',
    # 70-79
    'toilet', 'N/A', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # 80-89
    'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    # 90
    'toothbrush',
]

In [14]:
# Settings for the capture loop, kept in one place so they are easy to tune.
CONFIDENCE_THRESHOLD = 0.5  # detections scoring below this are not drawn
WINDOW_TITLE = 'Faster R-CNN Real-Time Object Detection'
BOX_COLOR = (0, 255, 0)  # green (OpenCV colours are in BGR order)


def _draw_detections(frame, boxes, labels, scores, threshold=CONFIDENCE_THRESHOLD):
    """Annotate ``frame`` in place with every detection scoring at least ``threshold``.

    Args:
        frame: Image to draw on (the BGR frame read from the capture device).
        boxes: Array with one ``[x1, y1, x2, y2]`` row per detection.
        labels: Array of integer indices into ``COCO_INSTANCE_CATEGORY_NAMES``.
        scores: Array of confidence scores aligned with ``boxes`` and ``labels``.
        threshold: Minimum score a detection needs in order to be drawn.
    """
    for idx in np.where(scores >= threshold)[0]:
        x1, y1, x2, y2 = (int(coord) for coord in boxes[idx])
        label = COCO_INSTANCE_CATEGORY_NAMES[labels[idx]]

        # Bounding box
        cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)

        # Caption sits just above the box; clamp it so it stays inside the
        # image when the box touches the top edge.
        caption_y = max(y1 - 10, 15)
        cv2.putText(frame, f"{label}: {scores[idx]:.2f}",
                    (x1, caption_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, BOX_COLOR, 2)


# Initialize video capture (device index 0)
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    cap.release()
    # A bare `exit` is only a name lookup and stops nothing, so abort the cell
    # explicitly instead of falling through into the capture loop.
    raise RuntimeError("Error: Could not open video stream!!! :)")

print("Starting real-time object detection with Faster R-CNN. Press 'q' to quit.")

# Define the transformation
transform = get_transform()

# Feed inputs on whatever device the model's weights currently live on, so the
# loop works unchanged whether the model is on the CPU or a GPU.
device = next(model.parameters()).device

try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            print("Failed to grab frame.")
            break

        # OpenCV delivers BGR; convert to RGB before building the tensor
        img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # To tensor, onto the model's device, plus a leading batch dimension
        img_tensor = transform(img).to(device).unsqueeze(0)

        # Inference only, so skip gradient tracking
        with torch.no_grad():
            outputs = model(img_tensor)

        # One image in the batch, so one result entry
        detections = outputs[0]
        _draw_detections(
            frame,
            detections['boxes'].detach().cpu().numpy(),
            detections['labels'].detach().cpu().numpy(),
            detections['scores'].detach().cpu().numpy(),
        )

        # Display the resulting frame
        cv2.imshow(WINDOW_TITLE, frame)

        # Exit if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            print("Exiting...")
            break

except KeyboardInterrupt:
    print("\nInterrupted by user.")

finally:
    # Always release the camera and close the window, however the loop ended
    cap.release()
    cv2.destroyAllWindows()



Starting real-time object detection with Faster R-CNN. Press 'q' to quit.
Exiting...


Captures each frame from the webcam.

Converts the frame from BGR (OpenCV's channel order) to RGB (the channel order the torchvision model expects).

Applies transformations to convert the image to a tensor.

Performs object detection using the Faster R-CNN model.

Filters detections based on a confidence threshold (e.g., 0.5).

Draws bounding boxes and labels on detected objects.

Displays the annotated frame in a window.

Allows exiting the loop by pressing the 'q' key.