In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import xml.etree.ElementTree as ET
import torch
import torchvision
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from PIL import Image
import os

In [2]:
class CustomDataset(Dataset):
    def __init__(self, dataset_path, split, transform=None):
        self.dataset_path = dataset_path
        self.split = split
        self.transform = transform
        
        self.images = []
        self.annotations = []
        
        images_dir = os.path.join(dataset_path, split, "images")
        annotations_dir = os.path.join(dataset_path, split, "annotations")
        
        for filename in os.listdir(images_dir):
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(images_dir, filename)
                self.images.append(image_path)
                
                annotation_path = os.path.join(annotations_dir, filename.split(".")[0] + ".xml")
                tree = ET.parse(annotation_path)
                root = tree.getroot()
                
                annotation = []
                for obj in root.findall("object"):
                    name = obj.find("name").text
                    bbox = obj.find("bndbox")
                    xmin = int(bbox.find("xmin").text)
                    ymin = int(bbox.find("ymin").text)
                    xmax = int(bbox.find("xmax").text)
                    ymax = int(bbox.find("ymax").text)
                    annotation.append((name, xmin, ymin, xmax, ymax))
                
                self.annotations.append(annotation)
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        image_path = self.images[index]
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        image = Image.fromarray(image)
        
        annotation = self.annotations[index]
        
        if self.transform:
            image = self.transform(image)
        
        return image, annotation

In [3]:
def preprocess_dataset(dataset):
    preprocessed_images = []
    preprocessed_annotations = []
    
    for image, annotation in dataset:
        if isinstance(image, torch.Tensor):
            image = transforms.ToPILImage()(image)
        else:
            image = Image.fromarray(image)
        
        image = transform(image)
        
        targets = []
        for obj in annotation:
            name, xmin, ymin, xmax, ymax = obj
            label = class_to_idx[name]
            xmin, ymin, xmax, ymax = xmin / image.shape[2], ymin / image.shape[1], xmax / image.shape[2], ymax / image.shape[1]
            targets.append([label, xmin, ymin, xmax, ymax])
        targets = torch.tensor(targets)
        
        preprocessed_images.append(image)
        preprocessed_annotations.append(targets)
    
    return preprocessed_images, preprocessed_annotations

In [4]:
# Dataset and DataLoader Creation
dataset_path = "dataset"
split = "train"

transform = transforms.Compose([
    transforms.Resize((800, 800)),  
    transforms.RandomHorizontalFlip(0.5),  
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

dataset = CustomDataset(dataset_path, split, transform)

class_labels = set()
for _, annotation in dataset:
    for obj in annotation:
        name = obj[0]
        class_labels.add(name)

class_to_idx = {label: idx for idx, label in enumerate(class_labels)}
print("Class labels:", class_to_idx)

preprocessed_images, preprocessed_annotations = preprocess_dataset(dataset)

def collate_fn(batch):
    return tuple(zip(*batch))

train_dataset = list(zip(preprocessed_images, preprocessed_annotations))
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

Class labels: {'largeCar': 0, 'heavyTruck': 1, 'lightTruck': 2, 'smallCar': 3}


In [5]:
# Model Definition and Training
num_classes = len(class_to_idx) + 1
model = fasterrcnn_resnet50_fpn(weights="DEFAULT")
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=0.005, momentum=0.9, weight_decay=0.0005)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    
    for images, targets in train_loader:
        images = list(image.to(device) for image in images)
        targets = [{'boxes': t[:, 1:], 'labels': t[:, 0].long()} for t in targets]
        
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        
        epoch_loss += losses.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}")

torch.save(model.state_dict(), "trained_model.pth")

Epoch [1/10], Loss: 1.4418
Epoch [2/10], Loss: 0.3149
Epoch [3/10], Loss: 0.2285
Epoch [4/10], Loss: 0.3032
Epoch [5/10], Loss: 0.2469
Epoch [6/10], Loss: 0.1760
Epoch [7/10], Loss: 0.2289
Epoch [8/10], Loss: 0.2086
Epoch [9/10], Loss: 0.1681
Epoch [10/10], Loss: 0.1353


In [7]:
# Evaluation and Testing
model.eval()

test_dataset = CustomDataset(dataset_path, split="test", transform=transform)
test_images, test_annotations = preprocess_dataset(test_dataset)
test_dataset = list(zip(test_images, test_annotations))
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

for image, target in test_loader:
    image = image[0].unsqueeze(0).to(device)  # Add batch dimension and move to device
    with torch.no_grad():
        predictions = model(image)
    
    boxes = predictions[0]['boxes'].cpu().numpy()
    labels = predictions[0]['labels'].cpu().numpy()
    scores = predictions[0]['scores'].cpu().numpy()
    
    confidence_threshold = 0.5
    mask = scores >= confidence_threshold
    boxes = boxes[mask]
    labels = labels[mask]
    scores = scores[mask]
    
    image = image.squeeze(0).cpu().numpy().transpose((1, 2, 0))  # Remove batch dimension and change shape to [H, W, C]
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    for box, label, score in zip(boxes, labels, scores):
        xmin, ymin, xmax, ymax = box.astype(int)
        label_name = list(class_to_idx.keys())[list(class_to_idx.values()).index(label)]
        
        cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
        cv2.putText(image, f"{label_name}: {score:.2f}", (xmin, ymin - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    
    cv2.imshow("Object Detection", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [1]:
# Video Object Detection
model.eval()

input_video_path = "bridge_1.mp4"
input_video = cv2.VideoCapture(input_video_path)

while True:
    ret, frame = input_video.read()
    if not ret:
        break

    pil_image = Image.fromarray(frame)
    image = transform(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        predictions = model(image)

    boxes = predictions[0]['boxes'].cpu().numpy()
    labels = predictions[0]['labels'].cpu().numpy()
    scores = predictions[0]['scores'].cpu().numpy()

    confidence_threshold = 0.5
    mask = scores >= confidence_threshold
    boxes = boxes[mask]
    labels = labels[mask]
    scores = scores[mask]

    for box, label, score in zip(boxes, labels, scores):
        xmin, ymin, xmax, ymax = box.astype(int)
        label_name = list(class_to_idx.keys())[list(class_to_idx.values()).index(label)]

        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
        cv2.putText(frame, f"{label_name}: {score:.2f}", (xmin, ymin - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

    cv2.imshow("Object Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

input_video.release()
cv2.destroyAllWindows()

NameError: name 'model' is not defined