In [1]:
# Cell 1: Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import torch
import torchvision
from torchvision import models, transforms
from PIL import Image
import os

# Report which compute devices PyTorch can see before any model is loaded.
cuda_available = torch.cuda.is_available()

print(f"CUDA Available: {cuda_available}")

if not cuda_available:
    print("CUDA is not available. PyTorch will use the CPU.")
else:
    # One line per visible GPU: its index followed by the reported device name.
    gpu_total = torch.cuda.device_count()
    for i in range(gpu_total):
        gpu_name = torch.cuda.get_device_name(i)
        print(f"GPU {i}: {gpu_name}")

CUDA Available: True
GPU 0: NVIDIA GeForce RTX 3080


In [2]:
# Cell 2: Load the pre-trained RetinaNet model
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build RetinaNet (ResNet-50 + FPN backbone) with its pre-trained weights and
# move the parameters onto the selected device in the same step.
model = torchvision.models.detection.retinanet_resnet50_fpn(pretrained=True).to(device)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
model.eval()



RetinaNet(
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(inplace=True)
          (downsample): Sequential(
            (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(256, eps=0.0)


In [3]:
# Cell 3: Define the COCO labels
# Lookup table from the integer label id produced by the detector to a class
# name: detect_objects() indexes this list directly with the predicted label.
# 'N/A' entries are placeholders that keep the remaining names at their ids.
coco_labels = [
    # ids 0-9
    '__background__', 'person', 'bicycle', 'car', 'motorcycle',
    'airplane', 'bus', 'train', 'truck', 'boat',
    # ids 10-19
    'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter',
    'bench', 'bird', 'cat', 'dog', 'horse',
    # ids 20-29
    'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
    # ids 30-39
    'N/A', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    # ids 40-49
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'N/A', 'wine glass', 'cup', 'fork', 'knife',
    # ids 50-59
    'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    # ids 60-69
    'donut', 'cake', 'chair', 'couch', 'potted plant',
    'bed', 'N/A', 'dining table', 'N/A', 'N/A',
    # ids 70-79
    'toilet', 'N/A', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    # ids 80-89
    'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    # id 90
    'toothbrush',
]

# Subset of the class names above that count as road vehicles.
vehicle_labels = [
    'car',
    'motorcycle',
    'bus',
    'truck',
]

In [4]:
# Cell 4: Define the transformation
# Single-step pipeline: ToTensor is the only conversion applied to an image
# before it is handed to the model.
_to_tensor = transforms.ToTensor()
transform = transforms.Compose([_to_tensor])

In [5]:
# Cell 5: Detect objects in an image
def detect_objects(image, score_threshold=0.5):
    """Run the detector on a single image and keep the confident detections.

    Args:
        image: Any input accepted by the module-level ``transform`` (the
            notebook passes an RGB ``PIL.Image``).
        score_threshold: Minimum score a detection needs to be returned.
            Defaults to 0.5, the cut-off that used to be hard-coded here.

    Returns:
        A list of ``(box, label_name, score)`` tuples in the order the model
        produced them. ``box`` is a NumPy array that the drawing code unpacks
        as ``xmin, ymin, xmax, ymax``; ``label_name`` is looked up in
        ``coco_labels``; ``score`` is the NumPy scalar reported by the model.
    """
    # The model expects a batch, so add a leading batch dimension of size 1.
    image_tensor = transform(image).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(image_tensor)

    # One result dict per input image; the batch holds exactly one.
    prediction = outputs[0]
    boxes = prediction['boxes'].cpu().numpy()
    labels = prediction['labels'].cpu().numpy()
    scores = prediction['scores'].cpu().numpy()

    return [
        (box, coco_labels[label], score)
        for box, label, score in zip(boxes, labels, scores)
        if score >= score_threshold
    ]

In [6]:
def adjust_coordinates(box, original_size, preprocessed_size):
    """Rescale a box from preprocessed-image pixels to original-image pixels.

    Args:
        box: Four values ``(xmin, ymin, xmax, ymax)`` in preprocessed pixels.
        original_size: ``(width, height)`` of the original image.
        preprocessed_size: ``(width, height)`` of the preprocessed image.

    Returns:
        ``(xmin, ymin, xmax, ymax)`` as Python ints. Each axis is scaled by
        its own original/preprocessed ratio and truncated with ``int()``.
    """
    (full_w, full_h), (small_w, small_h) = original_size, preprocessed_size

    # Independent per-axis ratios.
    ratio_x = full_w / small_w
    ratio_y = full_h / small_h

    left, top, right, bottom = box
    return (
        int(left * ratio_x),
        int(top * ratio_y),
        int(right * ratio_x),
        int(bottom * ratio_y),
    )

def visualize_detections(image_path, detected_objects, preprocessed_image,
                         confidence_threshold=0.7, letterboxed=True):
    """Draw detections onto the original image stored at ``image_path``.

    Args:
        image_path: Path of the original image; it is re-read with
            ``cv2.imread`` so the drawing happens on a BGR copy of it.
        detected_objects: Iterable of ``(box, label, score)`` tuples whose box
            is ``(xmin, ymin, xmax, ymax)`` in preprocessed-image pixels.
        preprocessed_image: The array the detector saw; only its height and
            width (``shape[0]``, ``shape[1]``) are used.
        confidence_threshold: Detections scoring below this are not drawn.
        letterboxed: When True (default), assume ``preprocessed_image`` was
            produced the way ``preprocess_image`` does it: one uniform scale
            for both axes, content anchored at the top-left, the remainder
            zero-padded. When False, assume the image was stretched to the
            preprocessed size and scale each axis independently.

    Returns:
        A copy of the original BGR image with a green rectangle and a
        ``"label: score"`` caption for every detection that was kept.

    Raises:
        OSError: If ``cv2.imread`` cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path}")
    image_with_detections = image.copy()

    original_size = (image.shape[1], image.shape[0])  # (width, height)
    preprocessed_size = (preprocessed_image.shape[1], preprocessed_image.shape[0])  # (width, height)

    if letterboxed:
        # Letterboxing shrinks both axes by min(target_w / w, target_h / h), so
        # the way back is the single factor max(w / target_w, h / target_h).
        # Using the padded canvas size per axis would squash the boxes along
        # the padded axis. Passing the size of the un-padded content makes
        # adjust_coordinates apply that same factor to x and y.
        inverse_scale = max(original_size[0] / preprocessed_size[0],
                            original_size[1] / preprocessed_size[1])
        preprocessed_size = (original_size[0] / inverse_scale,
                             original_size[1] / inverse_scale)

    for box, label, score in detected_objects:
        if score >= confidence_threshold:
            xmin, ymin, xmax, ymax = adjust_coordinates(box, original_size, preprocessed_size)

            cv2.rectangle(image_with_detections, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
            cv2.putText(image_with_detections, f"{label}: {score:.2f}", (xmin, ymin - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    return image_with_detections


In [7]:
# Cell 7: Preprocess the image
def preprocess_image(image_path, target_size=(800, 800)):
    """Load an image as RGB and letterbox it onto a fixed-size black canvas.

    The image is resized with one scale factor for both axes so that it fits
    inside ``target_size``, then copied into the top-left corner of a
    zero-filled canvas; the rest of the canvas stays black.

    Args:
        image_path: Path of the image to read with ``cv2.imread``.
        target_size: ``(width, height)`` of the returned canvas.

    Returns:
        ``uint8`` array of shape ``(height, width, 3)`` in RGB channel order.

    Raises:
        OSError: If ``cv2.imread`` cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path}")
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Use one convention throughout: target_size is (width, height). The canvas
    # was previously allocated as (target_size[0], target_size[1]), i.e.
    # (width, height) rows/cols, which only worked for square targets.
    target_w, target_h = target_size

    h, w, _ = image.shape
    scale = min(target_w / w, target_h / h)
    resized_image = cv2.resize(image, None, fx=scale, fy=scale)

    padded_image = np.zeros((target_h, target_w, 3), dtype=np.uint8)
    # Clip defensively so a rounding artefact in the resize can never overflow
    # the canvas.
    h = min(resized_image.shape[0], target_h)
    w = min(resized_image.shape[1], target_w)
    padded_image[:h, :w, :] = resized_image[:h, :w, :]

    return padded_image

In [9]:
# Cell 8: Test object detection with preprocessing
import glob

dataset_path = "raw-images"
num_test_images = 10
random_seed = 42  # set to None to draw a different sample on every run

# Collect every .jpg/.png inside sub-folders whose name contains "rgb".
rgb_image_paths = []

for subfolder in os.listdir(dataset_path):
    if "rgb" in subfolder.lower():
        subfolder_path = os.path.join(dataset_path, subfolder)
        if os.path.isdir(subfolder_path):
            rgb_image_paths.extend(glob.glob(os.path.join(subfolder_path, "*.jpg")))
            rgb_image_paths.extend(glob.glob(os.path.join(subfolder_path, "*.png")))

# Directory listing order depends on the filesystem, so sort before shuffling
# with a seeded generator; the same images are then picked on every re-run.
rgb_image_paths.sort()
np.random.default_rng(random_seed).shuffle(rgb_image_paths)

test_image_paths = rgb_image_paths[:num_test_images]

for image_path in test_image_paths:
    original_image = cv2.imread(image_path)
    if original_image is None:
        # cv2.imread returns None for unreadable files; skip instead of crashing.
        print(f"Skipping unreadable image: {image_path}")
        continue

    preprocessed_image = preprocess_image(image_path)
    print("Preprocessed Image Shape:", preprocessed_image.shape)

    pil_image = Image.fromarray(preprocessed_image)

    detected_objects = detect_objects(pil_image)
    print("Detected Objects:")
    for obj in detected_objects:
        print(obj)

    # preprocess_image letterboxes: it applies ONE scale factor,
    # min(target_w / w, target_h / h), to both axes and pads the rest with
    # zeros from the top-left corner. Mapping boxes back therefore needs the
    # single inverse factor max(w / target_w, h / target_h) on both axes.
    # Per-axis ratios of the padded canvas (e.g. 720 / 800 for y) misplace
    # every box along the padded axis.
    orig_h, orig_w = original_image.shape[:2]
    prep_h, prep_w = preprocessed_image.shape[:2]
    scale_x = scale_y = max(orig_w / prep_w, orig_h / prep_h)

    image_with_detections = original_image.copy()

    for box, label, score in detected_objects:
        if score >= 0.5:
            xmin, ymin, xmax, ymax = box
            xmin_scaled = int(xmin * scale_x)
            ymin_scaled = int(ymin * scale_y)
            xmax_scaled = int(xmax * scale_x)
            ymax_scaled = int(ymax * scale_y)

            cv2.rectangle(image_with_detections, (xmin_scaled, ymin_scaled), (xmax_scaled, ymax_scaled), (0, 255, 0), 2)
            cv2.putText(image_with_detections, f"{label}: {score:.2f}", (xmin_scaled, ymin_scaled - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    print("Image with Detections Shape:", image_with_detections.shape)

    # Opens a native OpenCV window and blocks until a key is pressed.
    cv2.imshow("Image with Detections", image_with_detections)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

Preprocessed Image Shape: (800, 800, 3)
Detected Objects:
Image with Detections Shape: (720, 1280, 3)
Preprocessed Image Shape: (800, 800, 3)
Detected Objects:
(array([151.84396, 122.1436 , 294.6679 , 313.85718], dtype=float32), 'train', 0.76169896)
(array([576.03015, 169.4619 , 633.444  , 195.84962], dtype=float32), 'car', 0.67914516)
(array([381.1723 , 130.26294, 402.87695, 144.52979], dtype=float32), 'car', 0.572196)
(array([101.06355,  92.8962 , 133.27386, 131.2688 ], dtype=float32), 'car', 0.5034947)
Image with Detections Shape: (720, 1280, 3)
Preprocessed Image Shape: (800, 800, 3)
Detected Objects:
(array([140.24258, 154.71288, 165.87854, 179.33263], dtype=float32), 'car', 0.66215444)
(array([165.60815, 138.05038, 191.26547, 158.57187], dtype=float32), 'car', 0.53653294)
Image with Detections Shape: (720, 1280, 3)
Preprocessed Image Shape: (800, 800, 3)
Detected Objects:
(array([567.453  , 153.69333, 765.041  , 231.83661], dtype=float32), 'truck', 0.62256265)
(array([144.90263, 