<a href="https://colab.research.google.com/github/Jaseelkt007/ML/blob/master/Object%20detection%20using%20Yolov1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import torch
import torch.nn as nn

"""
about architecture config:
Tuple is structured by (kernel_size, filters, stride, padding)
"M" is maxpooling with stride 2x2 and kernel 2x2
List is structured by tuples and lastly int with number of repeats
"""

architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]


class CNNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(CNNBlock, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.leakyrelu(self.batchnorm(self.conv(x)))


class Yolov1(nn.Module):
    def __init__(self, in_channels=3, **kwargs):
        super(Yolov1, self).__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        x = self.darknet(x)
        x = self.fcs(torch.flatten(x, start_dim=1))
        return x

    def _create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == tuple:
                layers += [ CNNBlock(in_channels, x[1], kernel_size=x[0], stride=x[2], padding=x[3],)]
                in_channels = x[1]

            elif type(x) == str:
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]

            elif type(x) == list:
                conv1 = x[0]
                conv2 = x[1]
                num_repeats = x[2]

                for _ in range(num_repeats):
                    layers += [CNNBlock(in_channels, conv1[1], kernel_size=conv1[0], stride=conv1[2],padding=conv1[3], )]
                    layers += [CNNBlock(conv1[1], conv2[1], kernel_size=conv2[0], stride=conv2[2], padding=conv2[3], )]
                    in_channels = conv2[1]

        return nn.Sequential(*layers)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        S, B, C = split_size, num_boxes, num_classes
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(1024 * S * S, 496), # reduced to 496 for simplification
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(496, S * S * (C + B * 5)),
        )

def test(S = 7 , B =2, C = 20):
    model = Yolov1(split_size =S , num_boxes = B ,  num_classes =C)
    x = torch.randn((2, 3, 448,448))
    print(model(x).shape)
#test()

In [9]:
from collections import Counter


def intersection_over_union(boxes_preds , box_labels , box_format = "midpoint"): # box [x1,y1, x2,y2]

    if box_format == "midpoint": # box = [xcenter, ycenter , w, h]
        box1_x1 = boxes_preds[...,0:1] - boxes_preds[...,2:3]/2
        box1_y1 = boxes_preds[...,1:2] - boxes_preds[..., 3:4]/2
        box1_x2 = boxes_preds[...,0:1] + boxes_preds[...,2:3]/2
        box1_y2 = boxes_preds[...,1:2] + boxes_preds[..., 3:4]/2

        box2_x1 = box_labels[...,0:1] - box_labels[...,2:3]/2
        box2_y1 = box_labels[...,1:2] - box_labels[..., 3:4]/2
        box2_x2 = box_labels[...,0:1] + box_labels[...,2:3]/2
        box2_y2 = box_labels[...,1:2] + box_labels[..., 3:4]/2

    elif box_format == "corners": # box [x1,y1, x2,y2]
        box1_x1 = boxes_preds[..., 0:1] # sliced to retain shape
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = box_labels[..., 0:1]
        box2_y1 = box_labels[..., 1:2]
        box2_x2 = box_labels[..., 2:3]
        box2_y2 = box_labels[..., 3:4]

    x1 = torch.max(box1_x1 , box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y2))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    return intersection/ (box1_area + box2_area - intersection + 1e-6)


# Non Max Suppression
def non_max_suppresion(
    bboxes,
    prob_threshold,
    iou_threshold,
    box_format='corners',

):
# predictions -> [[1,0.9,x1,y1,x2,y2]]
    assert type(bboxes) == list

    bboxes = [box for box in bboxes if box[1] > prob_threshold] # select the box with high probability
    bboxes = sorted(bboxes , key= lambda x:x[1] , reverse=True) # sort in descending order based x[1] , ie, high probablity
    bboxes_after_nms =[]

    while(bboxes):
        chosen_box = bboxes.pop(0) # remove that element,return 1st element  which has the highest probability
        # check the IOU btw choosen and other are less than iou_threshold
        bboxes = [ box for box in bboxes if box[0] != chosen_box[0] or intersection_over_union(torch.tensor(chosen_box[2:]) , torch.tensor(box[2:], box_format = box_format)) < iou_threshold]
        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms


def mean_average_precision(
    pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20
):
    # pred_boxes (list): [[train_idx, class_idx, prob_score, x1, y1, x2,y2], [],[],...]
    # list storing all AP for respective classes
    average_precisions = []
    epsilon = 1e-6

    for c in range(num_classes):
        detections = []
        ground_truths = []

        for detection in pred_boxes: # check the no of prediction made on the class c
            if detection[1] == c:
                detections.append(detection)

        for true_box in true_boxes: # only keep the ground truth for class c
            if true_box[1] == c:
                ground_truths.append(true_box)

        # count the frequency of each g[0] then return a dictionary --> how many ground truth in each image
        amount_bboxes = Counter([gt[0] for gt in ground_truths])

        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)
            #0:torch.tensor([0,0,0]) , 1: torch.tensor([0,0,0,0,0])} , zeros will track whether a ground truth box has already benn matched to a predicted bbox

        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))
        total_true_bboxes = len(ground_truths)

        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            ground_truth_img = [
                bbox for bbox in ground_truths if bbox[0] == detection[0]
            ]
             # this list extracts all ground turth bbox for same image as currect prediction
            # eg: if img has 3 objects, then this will have 3 bboxes

            num_gts = len(ground_truth_img)
            best_iou = 0
            # now calculate the iou btw this predicted bbox with all bboxes in that specific image - ground_truth_img
            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx
            # check if iou is greater than iou threshold and also we have to check whether we have covered this bbox before, then its is TP
            if best_iou > iou_threshold:
                # only detect ground truth detection once
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    # true positive and add this bounding box to seen
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1

            # if IOU is lower then the detection is a false positive
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))
        # torch.trapz for numerical integration
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)

In [10]:
# YOLO loss

class YoloLoss(nn.Module):
    def __init__(self, S =7 , B =2 , C = 20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        self.lambda_coord = 5
        self.lambda_noobj = 0.5

    def forward(self, predictions, target):
        predictions = predictions.reshape(-1, self.S ,self.S, self.C + self.B *5)
        # Calculate IOU for both predictions
        iou_b1 = intersection_over_union(predictions[..., 21:25], target[..., 21:25])
        iou_b2 = intersection_over_union(predictions[...,26:30], target[..., 21:25])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim = 0)
        iou_max , best_box = torch.max(ious, dim= 0) # best box and its index
        exists_box = target[..., 20].unsqueeze(3) # Iobj_i

        # Box cordinates
        box_predictions = exists_box * (best_box * predictions[..., 26:30] + (1 - best_box)*predictions[..., 21:25]) # x,y,w,h

        box_targets = exists_box * target[..., 21:25]
        box_predictions[...,2:4] = torch.sign(box_predictions[...,2:4]) * torch.sqrt(torch.abs(box_predictions[..., 2:4] + 1e-6)) # w,h
        box_targets[...,2:4] = torch.sqrt(box_targets[...,2:4])

        # (N, S,S, 4) --> (N * S * S , 4)
        box_loss = self.mse( torch.flatten(box_predictions, end_dim=-2), torch.flatten(box_targets, end_dim=-2),)

        # Object loss
        pred_box = ( best_box * predictions[..., 25:26] + (1 - best_box) * predictions[..., 20:21])
        # (N * S *S)
        object_loss = self.mse( torch.flatten(exists_box * pred_box) , torch.flatten(exists_box * target[..., 20:21])  )

        # For No Object for both bounding box
        # (N , S,S,1) -> (N ,S*S)
        no_object_loss = self.mse( torch.flatten((1-exists_box)*predictions[...,20:21], start_dim=1),  torch.flatten((1-exists_box)*target[...,20:21], start_dim=1))
        no_object_loss += self.mse( torch.flatten((1-exists_box)*predictions[...,25:26], start_dim=1),  torch.flatten((1-exists_box)*target[...,20:21], start_dim=1))

        # For Class Loss  , (N, S, S, 20) --> (N*S*S, 20)
        class_loss = self.mse( torch.flatten(exists_box * predictions[..., :20], end_dim=-2), torch.flatten(exists_box * target[...,:20], end_dim=-2))

        # Total loss
        loss = self.lambda_coord * box_loss  + object_loss + self.lambda_noobj * no_object_loss + class_loss

        return loss


In [6]:
# Download Pascal VOC 2012
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar

# Extract the dataset
!tar -xf VOCtrainval_11-May-2012.tar

--2024-10-10 17:05:15--  http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1999639040 (1.9G) [application/x-tar]
Saving to: ‘VOCtrainval_11-May-2012.tar’


2024-10-10 17:06:07 (36.6 MB/s) - ‘VOCtrainval_11-May-2012.tar’ saved [1999639040/1999639040]



In [11]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

def parse_voc_annotations(annotations_dir, img_dir, csv_output):
    annotations = []
    for xml_file in os.listdir(annotations_dir):
        tree = ET.parse(os.path.join(annotations_dir, xml_file)) # read the xml file, and convert to in to tree structure
        root = tree.getroot() # the get root element -> <annotation>
        filename = root.find("filename").text # get the text in the tag <filename> eg: image1.jpg
        filepath = os.path.join(img_dir, filename) # concatinate the dir and file name --> .../filename

        for obj in root.findall("object"):
            class_name = obj.find("name").text
            bbox = obj.find("bndbox") # <bndbox>
            xmin = float(bbox.find("xmin").text)
            ymin = float(bbox.find("ymin").text)
            xmax = float(bbox.find("xmax").text)
            ymax = float(bbox.find("ymax").text)

            #append this annotation as a list to the annotations list
            annotations.append([filepath, class_name, xmin, ymin, xmax, ymax])
    # Convert the list to dataframe
    annotations_df = pd.DataFrame(annotations , columns=["image_path", "class", "xmin", "ymin", "xmax", "ymax"])
    # convert df to csv
    annotations_df.to_csv(csv_output, index = False)


# Usage:
annotations_dir = "VOCdevkit/VOC2012/Annotations/"
img_dir = "VOCdevkit/VOC2012/JPEGImages/"
csv_output = "voc_annotations.csv"
parse_voc_annotations(annotations_dir, img_dir, csv_output)

small_annotations = pd.read_csv(csv_output).head(8)
small_annotations.to_csv("small_voc_annotations.csv" , index = False)

In [12]:
import torch
import os
import pandas as pd
from PIL import Image

class VOCDataset(torch.utils.data.Dataset):
    def __init__(self, csv_file, img_dir, label_dir, S=7, B=2, C=20, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.label_dir = label_dir
        self.transform = transform
        self.S = S
        self.B = B
        self.C = C

    def __len__(self):
        # return total number of examples in the dataset
        return len(self.annotations)

    def __getitem__(self, index):
        # 1. Extract the image path and bounding box details from the CSV
        img_path = self.annotations.iloc[index, 0]  # get image file path
        boxes = []

        class_label = self.annotations.iloc[index, 1]  # object class (string)
        class_label = class_mapping[class_label]  # Map class name to class index (int)

        xmin = float(self.annotations.iloc[index, 2])
        ymin = float(self.annotations.iloc[index, 3])
        xmax = float(self.annotations.iloc[index, 4])
        ymax = float(self.annotations.iloc[index, 5])

        # 2. Convert the bbox to format required by Yolov1
        x_center = (xmin + xmax) / 2 / 448  # Normalize by image width (assuming 448x448 images)
        y_center = (ymin + ymax) / 2 / 448  # Normalize by image height
        width = (xmax - xmin) / 448         # Normalize by image width
        height = (ymax - ymin) / 448        # Normalize by image height
        boxes.append([class_label, x_center, y_center, width, height])
        boxes = torch.tensor(boxes)

        # 3. Load the image
        image = Image.open(img_path)

        # 4. Apply Transformation if any
        if self.transform:
            image, boxes = self.transform(image, boxes)

        # 5. Create the label matrix (S x S x (20 + 5xB))--> 7x7x30
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        # 6. Populate Label matrix for each bounding box
        for box in boxes:
            class_label, x, y, width, height = box.tolist()  # Unpack the bounding box
            class_label = int(class_label)

            i, j = int(self.S * y), int(self.S * x)  # find the grid cell where the center is
            x_cell, y_cell = self.S * x - j, self.S * y - i  # calculate the relative position of center of the bbox with respect to grid cell
            width_cell, height_cell = width * self.S, height * self.S  # Normalize the w, h with respect to grid cell.

            if label_matrix[i, j, 20] == 0:  # check if grid cell is already assigned
                label_matrix[i, j, 20] = 1  # assign confidence score 1, indicating that this grid cell is responsible for detecting an object.
                label_matrix[i, j, 21:25] = torch.tensor([x_cell, y_cell, width_cell, height_cell])
                label_matrix[i, j, class_label] = 1  # assign that specific class label to 1

        return image, label_matrix

class_mapping = {
    "aeroplane": 0,
    "bicycle": 1,
    "bird": 2,
    "boat": 3,
    "bottle": 4,
    "bus": 5,
    "car": 6,
    "cat": 7,
    "chair": 8,
    "cow": 9,
    "diningtable": 10,
    "dog": 11,
    "horse": 12,
    "motorbike": 13,
    "person": 14,
    "pottedplant": 15,
    "sheep": 16,
    "sofa": 17,
    "train": 18,
    "tvmonitor": 19
}

In [13]:
# Training
import torchvision.transforms as transforms
import torch.optim as optim
import torchvision.transforms.functional as FT
from tqdm import tqdm
from torch.utils.data import DataLoader

seed = 123
torch.manual_seed(seed)

# Hyperparameter
Learning_rate = 2e-5
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 1
weight_decay = 0
num_epochs = 100
num_workers = 2
pin_memory = True


class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, img, bboxes):
        for t in self.transforms:
            img, bboxes = t(img), bboxes

        return img, bboxes


transform = Compose([transforms.Resize((448, 448)), transforms.ToTensor(),])

def train_fn(train_loader, model, optimizer, loss_fn):
    loop = tqdm(train_loader, leave=True)
    mean_loss = []

    for batch_idx, (x, y) in enumerate(loop):
        x , y = x.to(device) , y.to(device)
        out = model(x)
        loss = loss_fn(out, y)
        mean_loss.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # update the progress bar
        loop.set_description(f"Epoch[{epoch}/{num_epochs}]")
        loop.set_postfix(loss = loss.item())
    print(f"Mean loss was {sum(mean_loss)/len(mean_loss)}")

def main():
    model = Yolov1(split_size = 7, num_boxes = 2 ,num_classes = 20 ).to(device)
    optimizer = optim.Adam(model.parameters(), lr = Learning_rate, weight_decay=weight_decay)
    loss_fn = YoloLoss()

    voc_dataset = VOCDataset(csv_file="small_voc_annotations.csv", img_dir=img_dir, label_dir=annotations_dir, transform=transform)
    train_loader = DataLoader(voc_dataset, batch_size=batch_size, num_workers=num_workers,
                              pin_memory=pin_memory, shuffle=True )
    for epoch in range(num_epochs):
        train_fn(train_loader, model, optimizer, loss_fn)

main()


100%|██████████| 8/8 [00:33<00:00,  4.22s/it, loss=16.7]


Mean loss was 28.70772409439087


100%|██████████| 8/8 [00:32<00:00,  4.12s/it, loss=34]


Mean loss was 19.498243510723114


100%|██████████| 8/8 [00:34<00:00,  4.27s/it, loss=12.9]


Mean loss was 11.936733722686768


100%|██████████| 8/8 [00:33<00:00,  4.13s/it, loss=5.35]


Mean loss was 14.49195647239685


100%|██████████| 8/8 [00:33<00:00,  4.17s/it, loss=15.4]


Mean loss was 7.253913015127182


100%|██████████| 8/8 [00:33<00:00,  4.15s/it, loss=10.6]


Mean loss was 7.212169796228409


 25%|██▌       | 2/8 [00:09<00:27,  4.52s/it, loss=2.34]


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import torch

def plot_image_with_boxes(image, boxes, title="Bounding Boxes"):
    """
    Plots an image and overlays bounding boxes
    Args:
        image: Tensor image (C, H, W)
        boxes: Tensor of bounding boxes, shape: (num_boxes, 5)
               where the 5 values are: (class, x_center, y_center, width, height)
    """
    # Convert tensor image to numpy and permute to (H, W, C)
    image = image.permute(1, 2, 0).cpu().numpy()

    # Create a figure and axis
    fig, ax = plt.subplots(1)
    ax.imshow(image)

    # Plot each bounding box
    for box in boxes:
        class_label, x_center, y_center, width, height = box.tolist()

        # Convert YOLO format to rectangle format
        x_min = (x_center - width / 2) * 448  # Convert normalized to image space
        y_min = (y_center - height / 2) * 448
        rect_width = width * 448
        rect_height = height * 448

        # Create a Rectangle patch
        rect = patches.Rectangle(
            (x_min, y_min), rect_width, rect_height, linewidth=2, edgecolor='r', facecolor='none'
        )
        # Add the rectangle to the plot
        ax.add_patch(rect)

    # Display the image with the bounding boxes
    plt.title(title)
    plt.show()


def evaluate_and_visualize(dataloader, model, num_images=1, device="cuda"):
    """
    Evaluates the model on the dataloader and visualizes the predictions and ground truth
    Args:
        dataloader: DataLoader for evaluation
        model: Trained YOLOv1 model
        num_images: Number of images to visualize
        device: 'cuda' or 'cpu'
    """
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for batch_idx, (images, targets) in enumerate(dataloader):
            images = images.to(device)

            # Forward pass through the model to get predictions
            predictions = model(images)

            # For now, visualize the first `num_images` images
            for i in range(min(num_images, len(images))):
                image = images[i]
                pred_boxes = predictions[i].detach().cpu()  # Predicted boxes for this image
                true_boxes = targets[i].detach().cpu()      # Ground truth boxes for this image

                # Convert to (x_center, y_center, width, height) format if needed
                pred_boxes = pred_boxes.view(-1, 5)  # Example format adjustment

                # Plot the ground truth and predicted bounding boxes on the image
                print(f"Image {i+1}: Ground truth vs Predicted boxes")
                plot_image_with_boxes(image, true_boxes, title="Ground Truth")
                plot_image_with_boxes(image, pred_boxes, title="Predicted")

            # Break after visualizing num_images
            if batch_idx >= num_images:
                break


# Assuming small_train_loader contains the 8 examples
evaluate_and_visualize(small_train_loader, model, num_images=8, device=device)