In [29]:
import torch
import torchvision
from torchvision import transforms, models, datasets
from torch.utils.data import Dataset
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import xml.etree.ElementTree as ET
import torchvision.transforms as transforms

In [30]:

dataset_path = "dataset"
split = "train"  
dataset = CustomDataset(dataset_path, split)

class CustomDataset(Dataset):
    def __init__(self, dataset_path, split):
        self.images = []
        self.annotations = []
        images_dir = os.path.join(dataset_path, split, "images")
        annotations_dir = os.path.join(dataset_path, split, "annotations")

        for filename in os.listdir(images_dir):
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(images_dir, filename)
                image = cv2.imread(image_path)
                self.images.append(image)

                annotation_path = os.path.join(annotations_dir, filename.split(".")[0] + ".xml")
                tree = ET.parse(annotation_path)
                root = tree.getroot()
                annotation = []
                for obj in root.findall("object"):
                    name = obj.find("name").text
                    bbox = obj.find("bndbox")
                    xmin = int(bbox.find("xmin").text)
                    ymin = int(bbox.find("ymin").text)
                    xmax = int(bbox.find("xmax").text)
                    ymax = int(bbox.find("ymax").text)
                    annotation.append((name, xmin, ymin, xmax, ymax))
                self.annotations.append(annotation)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image = self.images[index]
        annotation = self.annotations[index]
        return image, annotation

In [31]:
import torchvision.transforms as transforms

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((800, 800)),  
    transforms.RandomHorizontalFlip(0.5),  
    transforms.ToTensor(),  
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  
])

class_labels = set()
for _, annotation in dataset:
    for obj in annotation:
        name = obj[0]
        class_labels.add(name)

class_to_idx = {label: idx for idx, label in enumerate(class_labels)}
print("Class labels:", class_to_idx)

def preprocess_dataset(dataset):
    preprocessed_images = []
    preprocessed_annotations = []
    
    for image, annotation in dataset:
        image = transform(image)
        
        targets = []
        for obj in annotation:
            name, xmin, ymin, xmax, ymax = obj
            label = class_to_idx[name]
            xmin, ymin, xmax, ymax = xmin / image.shape[2], ymin / image.shape[1], xmax / image.shape[2], ymax / image.shape[1]
            targets.append([label, xmin, ymin, xmax, ymax])
        targets = torch.tensor(targets)
        
        preprocessed_images.append(image)
        preprocessed_annotations.append(targets)
    
    return preprocessed_images, preprocessed_annotations

preprocessed_images, preprocessed_annotations = preprocess_dataset(dataset)

Class labels: {'heavyTruck': 0, 'lightTruck': 1, 'smallCar': 2, 'largeCar': 3}


In [32]:
import torchvision
from torchvision.models.detection import FasterRCNN
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.models import ResNet50_Weights

def get_fasterrcnn_model(num_classes):
    backbone = torchvision.models.resnet50(weights=ResNet50_Weights.DEFAULT)
    backbone = torch.nn.Sequential(*list(backbone.children())[:-2])
    backbone.out_channels = 2048
    anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                       aspect_ratios=((0.5, 1.0, 2.0),))
    roi_pooler = torchvision.ops.MultiScaleRoIAlign(featmap_names=['0'],
                                                    output_size=7,
                                                    sampling_ratio=2)
    model = FasterRCNN(backbone,
                       num_classes=num_classes,
                       rpn_anchor_generator=anchor_generator,
                       box_roi_pool=roi_pooler)
    return model

num_classes = len(class_to_idx) + 1
model = get_fasterrcnn_model(num_classes)

In [34]:
import os
import cv2
import torch
import xml.etree.ElementTree as ET
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, dataset_path, split, transform=None):
        self.dataset_path = dataset_path
        self.split = split
        self.transform = transform
        self.images = []
        self.annotations = []
        
        self.load_data()
    
    def load_data(self):
        images_dir = os.path.join(self.dataset_path, self.split, "images")
        annotations_dir = os.path.join(self.dataset_path, self.split, "annotations")
        
        for filename in os.listdir(images_dir):
            if filename.endswith(".jpg") or filename.endswith(".png"):
                image_path = os.path.join(images_dir, filename)
                annotation_path = os.path.join(annotations_dir, filename.split(".")[0] + ".xml")
                
                self.images.append(image_path)
                self.annotations.append(annotation_path)
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        image_path = self.images[index]
        annotation_path = self.annotations[index]
        
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform:
            image = self.transform(image)
        
        tree = ET.parse(annotation_path)
        root = tree.getroot()
        
        boxes = []
        labels = []
        for obj in root.findall("object"):
            name = obj.find("name").text
            bbox = obj.find("bndbox")
            xmin = int(bbox.find("xmin").text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)
            
            boxes.append([xmin, ymin, xmax, ymax])
            labels.append(class_to_idx[name])
        
        target = {
            "boxes": torch.as_tensor(boxes, dtype=torch.float32),
            "labels": torch.as_tensor(labels, dtype=torch.int64)
        }
        
        return image, target