<a href="https://colab.research.google.com/github/JitindraFartiyal/Object-Detection/blob/object-detection-v1/Yolov1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Connect to Google Drive to load the dataset. This step is only required if you are running on Google Colab and the dataset is stored in Google Drive.


In [0]:
# Colab only: mount Google Drive at /content/gdrive so that files stored in
# Drive are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Importing all the libraries

In [0]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import skimage
import cv2
from google.colab.patches import cv2_imshow
from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
from torchvision import transforms, utils
from torchvision.models.detection import transform, transform


Defining a function to do categorical encoding manually


In [0]:
def convert_class(label):
    """Encode the class names in the first column of ``label`` as integers, in place.

    Every row whose first-column value is one of the known class-name strings
    is overwritten with its integer id (1-9). Any other value is left
    untouched. The frame is modified in place and nothing is returned.

    Args:
        label: pandas DataFrame whose column 0 holds class-name strings.
    """
    name_to_id = {
        'Car': 1,
        'Van': 2,
        'Truck': 3,
        'Pedestrian': 4,
        'Person_sitting': 5,
        'Cyclist': 6,
        'Tram': 7,
        'Misc': 8,
        'DontCare': 9,
    }

    for row in range(len(label)):
        current = label.iloc[row, 0]
        # Only exact string matches are encoded; everything else stays as is.
        if isinstance(current, str) and current in name_to_id:
            label.iloc[row, 0] = name_to_id[current]


Defining a function to convert corner-format bounding-box labels into the YOLO target format (normalized box center, height and width, plus a one-hot class vector)

In [0]:
def transform_target(target, image_width=1242, image_height=375):
    """Convert corner-format labels into normalized center-format YOLO targets.

    Each input row is ``[class_id, left, top, right, bottom, ...]`` in pixel
    coordinates. In the returned tensor, columns 1-4 are replaced by
    ``[center_x, center_y, height, width]``, where x-quantities are divided by
    ``image_width`` and y-quantities by ``image_height``. Four extra columns
    are appended holding a one-hot encoding of class ids 1-4; rows with any
    other class id get an all-zero one-hot vector. All remaining columns are
    copied through unchanged.

    Fixes relative to the previous version:
      * height was computed as ``top - bottom`` and was therefore negative for
        image coordinates (y grows downwards), which yields NaN as soon as a
        square root of the height is taken;
      * x-quantities were divided by 375 and y-quantities by 1242, i.e. the
        two image dimensions were swapped, so normalized values could exceed 1;
      * the caller's tensor is no longer modified in place.

    Args:
        target: 2-D float tensor of shape ``(num_objects, num_columns)`` with
            at least 5 columns.
        image_width: width in pixels used to normalize x-quantities.
        image_height: height in pixels used to normalize y-quantities.

    Returns:
        A new float tensor of shape ``(num_objects, num_columns + 4)``.
    """
    num_rows, num_cols = target.size()

    left, top = target[:, 1], target[:, 2]
    right, bottom = target[:, 3], target[:, 4]

    width = right - left
    height = bottom - top
    center_x = left + width / 2
    center_y = top + height / 2

    encoded = torch.zeros(num_rows, num_cols + 4)
    encoded[:, :num_cols] = target
    encoded[:, 1] = center_x / image_width
    encoded[:, 2] = center_y / image_height
    encoded[:, 3] = height / image_height
    encoded[:, 4] = width / image_width

    # One-hot columns are appended after the original ones: class id 1 maps to
    # the first appended column, class id 4 to the last.
    for offset, class_id in enumerate((1, 2, 3, 4)):
        encoded[:, num_cols + offset] = (target[:, 0] == class_id).float()

    return encoded


Defining custom training Dataset class | Kitti Dataset used

In [0]:
class KittiDataset(Dataset):
    """Training dataset that pairs label files with their PNG images.

    Every file in ``labels_dir`` is read eagerly at construction time as a
    space-separated table. Columns 0, 5, 6, 7, 8 and 14 are kept and named
    ``Class, LeftTopX, LeftTopY, RightBottomX, RightBottomY, Score``; the class
    names are replaced by integer ids via ``convert_class``. Images are read
    lazily in ``__getitem__``.

    Fixes relative to the previous version:
      * label files are visited in sorted order, so the index -> sample mapping
        is reproducible (``os.listdir`` order is arbitrary);
      * the column selection is copied before ``convert_class`` writes to it,
        so the write never targets a slice of another frame;
      * the label table is converted to ``float32`` explicitly. The class
        column starts out as strings, so a plain ``np.array(frame)`` can
        produce an object array, which ``torch.from_numpy`` rejects.

    Args:
        labels_dir: directory containing one label file per image.
        images_dir: directory containing the ``.png`` images.
        transform: stored on the instance but not applied anywhere in this
            class.
    """

    def __init__(self, labels_dir, images_dir, transform = None):
        print('Initializing training dataset')
        self.labels_dict = {}   # sample index -> label DataFrame
        self.labels_dir = labels_dir
        self.images_dir = images_dir
        self.transform = transform
        self.filename = []      # sample index -> image/label file stem

        print('Labels Directory : ' + labels_dir)
        print('Images Directory : ' + images_dir)

        for counter, file in enumerate(sorted(os.listdir(self.labels_dir))):
            print('Reading label file -- : ' + file)
            label_path = self.labels_dir + '/' + file
            label = pd.read_csv(filepath_or_buffer=label_path, sep=' ', header=None, index_col=False)
            label = label.iloc[:, [0, 5, 6, 7, 8, 14]].copy()

            convert_class(label)
            label.columns = ['Class', 'LeftTopX', 'LeftTopY', 'RightBottomX', 'RightBottomY', 'Score']
            self.labels_dict[counter] = label
            # The first six characters of the label file name are used as the
            # stem of the matching image file (e.g. '000123.txt' -> '000123').
            self.filename.append(file[0:6])

    def __len__(self):
        """Return the number of label files that were loaded."""
        return len(self.labels_dict)

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            dict with
              * ``'image'``: float tensor of shape ``(3, 225, 225)`` holding the
                resized image with its original value range preserved;
              * ``'label'``: 1-D float tensor, the flattened result of
                ``transform_target`` for every object in the label file.
        """
        image_path = self.images_dir + '/' + self.filename[index] + '.png'
        image = io.imread(image_path)
        image = skimage.transform.resize(image, (225, 225), preserve_range=True)
        image = image.transpose((2, 0, 1))   # HWC -> CHW
        image = torch.from_numpy(image).float()

        # np.array(..., dtype=float32) always copies, so the tensor below never
        # aliases the cached DataFrame. A class name that convert_class did not
        # encode surfaces here as a ValueError.
        label = np.array(self.labels_dict[index], dtype=np.float32)
        label = torch.from_numpy(label)
        label = transform_target(label)
        label = label.view(-1)

        sample = {'image' : image, 'label' : label}
        return sample


Defining custom Testing Dataset. Used only for preliminary testing. Not mandatory

In [0]:
class TestKittiDataset(Dataset):
    """Image-only dataset used for preliminary testing.

    The set of samples is defined by the entries of ``labels_dir``: the first
    six characters of each entry name are taken as the stem of a ``.png`` file
    inside ``images_dir``. No label data is read.

    Args:
        labels_dir: directory whose entry names determine which images to load.
        images_dir: directory containing the ``.png`` images.
        transform: stored on the instance but not applied anywhere in this
            class.
    """

    def __init__(self, labels_dir, images_dir, transform = None):
        print('Initializing testing dataset')
        self.labels_dir = labels_dir
        self.images_dir = images_dir
        self.transform = transform
        self.filename = []

        print('Test Labels Directory : ' + labels_dir)
        print('Test Images Directory : ' + images_dir)

        # Keep only the six-character stem of every directory entry.
        self.filename.extend(entry[0:6] for entry in os.listdir(self.labels_dir))

    def __len__(self):
        """Return the number of collected file stems."""
        return len(self.filename)

    def __getitem__(self, index):
        """Return ``{'image': tensor}`` with a ``(3, 225, 225)`` float image."""
        stem = self.filename[index]
        raw = io.imread(self.images_dir + '/' + stem + '.png')
        resized = skimage.transform.resize(raw, (225, 225), preserve_range=True)
        channels_first = resized.transpose((2, 0, 1))   # HWC -> CHW
        return {'image' : torch.from_numpy(channels_first).float()}


Defining Neural Network class

In [0]:
class Net(nn.Module):
    """Small convolutional network producing a flat per-grid-cell prediction.

    Five 1x1 convolutions (3 -> 64 -> 128 -> 256 -> 128 -> 64 channels) with
    ReLU activations and two 3x3/stride-3 max-pools reduce a ``225 x 225``
    input to ``25 x 25`` spatial positions. Two fully connected layers then map
    the ``25*25*64`` features to ``25*25*14`` outputs squashed to ``(0, 1)``
    by a sigmoid.

    Changes relative to the previous version:
      * ``torch.sigmoid`` replaces the deprecated ``F.sigmoid``;
      * features are flattened per sample (``x.view(x.size(0), -1)``), so an
        input of the wrong spatial size raises a shape error in ``fc1``
        instead of being silently regrouped across the batch dimension.
    """

    def __init__(self):
        super(Net, self).__init__()
        print('Initializing Convolutional Neural Network')
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=1, stride=1, padding=0, bias=True)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1, stride=1, padding=0, bias=True)
        self.conv3 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=1, stride=1, padding=0, bias=True)
        self.conv4 = nn.Conv2d(in_channels=256, out_channels=128, kernel_size=1, stride=1, padding=0, bias=True)
        self.conv5 = nn.Conv2d(in_channels=128, out_channels=64, kernel_size=1, stride=1, padding=0, bias=True)
        self.fc1 = nn.Linear(25*25*64, 25*25*32, bias=True)
        self.fc2 = nn.Linear(25*25*32, 25*25*14, bias=True)

    def forward(self, x):
        """Run the network.

        Args:
            x: float tensor of shape ``(batch, 3, 225, 225)``.

        Returns:
            Float tensor of shape ``(batch, 25*25*14)`` with values in (0, 1).
        """
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = F.max_pool2d(input=x, kernel_size=3, stride=3)   # 225 -> 75
        x = F.relu(self.conv5(x))
        x = F.max_pool2d(input=x, kernel_size=3, stride=3)   # 75 -> 25
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return torch.sigmoid(x)


Defining a function to calculate Intersection over Union (IoU)

In [0]:
def calculate_IOU(input_box,target_box):
    """Compute the Intersection over Union of two center-format boxes.

    Both boxes are given as ``[center_x, center_y, height, width]`` (the same
    column order the rest of this notebook uses for box fields).

    Fixes relative to the previous version:
      * the intersection rectangle is now ``max`` of the lower edges and
        ``min`` of the upper edges (previously ``max`` was used for all four);
      * a non-overlapping pair yields 0 instead of reading unassigned locals;
      * the ``+ 1`` pixel-count convention was removed, since it makes every
        area close to 1 when coordinates are normalized to [0, 1];
      * box extents are taken as magnitudes, so the result does not depend on
        the sign convention used for height/width.

    Args:
        input_box: 1-D tensor (or sequence) of 4 numbers.
        target_box: 1-D tensor (or sequence) of 4 numbers.

    Returns:
        0-dim float tensor in ``[0, 1]``; 0 when the union has no area.
    """
    input_box = torch.as_tensor(input_box, dtype=torch.float32)
    target_box = torch.as_tensor(target_box, dtype=torch.float32)

    def _corners(box):
        # (x_min, y_min, x_max, y_max) of a [cx, cy, h, w] box.
        half_width = torch.abs(box[3]) / 2
        half_height = torch.abs(box[2]) / 2
        return (box[0] - half_width, box[1] - half_height,
                box[0] + half_width, box[1] + half_height)

    a_x1, a_y1, a_x2, a_y2 = _corners(input_box)
    b_x1, b_y1, b_x2, b_y2 = _corners(target_box)

    # Overlap along each axis, clamped at zero for disjoint boxes.
    overlap_w = torch.clamp(torch.min(a_x2, b_x2) - torch.max(a_x1, b_x1), min=0)
    overlap_h = torch.clamp(torch.min(a_y2, b_y2) - torch.max(a_y1, b_y1), min=0)
    intersection_area = overlap_w * overlap_h

    input_area = (a_x2 - a_x1) * (a_y2 - a_y1)
    target_area = (b_x2 - b_x1) * (b_y2 - b_y1)
    union_area = input_area + target_area - intersection_area

    if union_area <= 0:
        # Two degenerate (zero-area) boxes: define the IoU as 0.
        return torch.zeros_like(intersection_area)

    return intersection_area / union_area


Defining a function to convert Neural Network output into YOLO predicted output format


In [0]:
def compute_predict_output(output,threshold):
    """Select the more confident of the two boxes predicted for every grid cell.

    The network output is interpreted as ``25*25`` consecutive groups of 14
    values: ``[conf, x, y, h, w]`` for box 1, the same five values for box 2,
    and four class probabilities. For each cell the box with the higher
    confidence is kept and concatenated with the class probabilities.

    Fixes relative to the previous version:
      * cell ``k`` now reads values ``14*k .. 14*k+13``; previously it read
        ``k .. k+13``, so neighbouring cells used overlapping windows;
      * when both boxes have equal confidence, box 1 is kept (previously the
        row was left all-zero);
      * the selection is done with tensor ops, so gradients flow back to
        ``output``.

    Args:
        output: network output for a single image, shape ``(25*25*14, 1)`` or
            ``(25*25*14,)``.
        threshold: accepted for interface compatibility; no confidence
            filtering is applied here.

    Returns:
        Tensor of shape ``(25*25 + 1, 9)``. Rows ``0 .. 624`` hold
        ``[conf, x, y, h, w, p1, p2, p3, p4]`` per grid cell. The final row is
        all zeros: it is kept only so the return shape matches what this
        function returned before (one more row than there are grid cells).

    Raises:
        ValueError: if ``output`` does not contain exactly ``25*25*14`` values
            (for example when the batch size is not 1).
    """
    grid_cells = 25 * 25
    values_per_cell = 14
    expected_values = grid_cells * values_per_cell

    flat = output.reshape(-1)
    if flat.numel() != expected_values:
        raise ValueError(
            'compute_predict_output expects {} values for a single image, got {}'.format(
                expected_values, flat.numel()))

    cells = flat.reshape(grid_cells, values_per_cell)
    first_box = cells[:, 0:5]
    second_box = cells[:, 5:10]
    class_probabilities = cells[:, 10:14]

    keep_first = (first_box[:, 0] >= second_box[:, 0]).unsqueeze(1)
    best_box = torch.where(keep_first, first_box, second_box)

    predicted_output = torch.cat((best_box, class_probabilities), dim=1)
    padding_row = predicted_output.new_zeros(1, predicted_output.size(1))
    return torch.cat((predicted_output, padding_row), dim=0)


Defining Loss function

In [0]:
def yolo_loss(output,target):
    """Compute the detection loss for one image.

    For every grid cell the predicted confidence decides how the cell is
    scored:

      * confidence <= 0.5: the cell contributes ``confidence ** 2``
        (no-object penalty);
      * confidence > 0.5: the cell is compared against the ground-truth
        objects. Among objects whose class equals the predicted class, the one
        with the smallest combined loss is used. If no object has the
        predicted class, all class probabilities are penalised and the object
        with the smallest box loss is used.

    NOTE(review): object presence is decided from the *predicted* confidence,
    not from the ground truth. This keeps the structure of the previous
    implementation; confirm it is the intended design.

    Fixes relative to the previous version:
      * ``target`` is reshaped to one row of 10 values per object. The data
        loader delivers it flattened, so previously only the first object was
        ever considered;
      * the argmax over the four class probabilities is an index in 0..3,
        while class ids are 1..4. The comparison and the probability lookup
        (columns 5..8) now account for that offset;
      * the confidence term is ``(confidence - IoU) ** 2``. The previous
        ``IoU * confidence`` is minimised by driving the confidence to zero;
      * the fallback for "no object of the predicted class" no longer relies
        on the loop variable left over from the previous loop;
      * box sizes enter the square root as magnitudes, so a negative extent
        cannot produce NaN;
      * only the ``25*25`` grid rows are scored; any extra row is ignored.

    Args:
        output: tensor with one row ``[conf, x, y, h, w, p1, p2, p3, p4]`` per
            grid cell.
        target: ground truth, either ``(num_objects, 10)`` or any tensor that
            reshapes to it; each row is
            ``[class_id, x, y, h, w, score, onehot1..onehot4]``.

    Returns:
        Tensor of shape ``(1,)``: summed cell losses divided by 625.
    """
    grid_cells = 25 * 25
    values_per_object = 10

    objects = target.reshape(-1, values_per_object)
    total_loss = output.new_zeros(1)

    def _size_term(predicted, actual):
        # Squared difference of square roots, on magnitudes.
        return (torch.sqrt(torch.abs(predicted)) - torch.sqrt(torch.abs(actual))) ** 2

    for grid_cell in range(min(output.size(0), grid_cells)):
        prediction = output[grid_cell]
        confidence = prediction[0:1]

        if not (confidence.item() > 0.5) or objects.size(0) == 0:
            total_loss = total_loss + confidence ** 2
            continue

        class_probabilities = prediction[5:9]
        predicted_index = int(torch.argmax(class_probabilities))
        predicted_class = predicted_index + 1   # class ids start at 1
        squared_probabilities = (class_probabilities ** 2).sum()

        best_matching_loss = None   # best loss among objects of the predicted class
        best_box_loss = None        # best box-only loss among all objects

        for obj in objects:
            localization_loss = ((prediction[1] - obj[1]) ** 2
                                 + (prediction[2] - obj[2]) ** 2
                                 + _size_term(prediction[3], obj[3])
                                 + _size_term(prediction[4], obj[4]))

            # The IoU acts as the regression target for the confidence, so it
            # is treated as a constant.
            iou = calculate_IOU(prediction[1:5], obj[1:5]).detach()
            confidence_loss = (confidence - iou) ** 2
            box_loss = localization_loss + confidence_loss

            if best_box_loss is None or box_loss.item() < best_box_loss.item():
                best_box_loss = box_loss

            if int(obj[0]) == predicted_class:
                # sum_k p_k^2 + 1 - 2 * p_c  ==  sum_{k != c} p_k^2 + (1 - p_c)^2
                classification_loss = (squared_probabilities + 1
                                       - 2 * class_probabilities[predicted_index])
                candidate = classification_loss + box_loss
                if best_matching_loss is None or candidate.item() < best_matching_loss.item():
                    best_matching_loss = candidate

        if best_matching_loss is not None:
            total_loss = total_loss + best_matching_loss
        else:
            total_loss = total_loss + squared_probabilities + best_box_loss

    return total_loss / grid_cells


Defining a function to train the model

In [0]:
def train(model, train_loader, optimizer, epoch,threshold, max_batches=3):
    """Run one training epoch.

    Fixes relative to the previous version:
      * the loss returned by ``yolo_loss`` is used directly instead of being
        re-wrapped in the legacy ``torch.FloatTensor`` constructor;
      * the running total is accumulated as a Python float, so the autograd
        graphs of earlier batches are not kept alive;
      * the reported epoch loss is the mean over the batches actually
        processed (it was divided by a fixed 500);
      * the hard-coded "stop after batch 2" is now the ``max_batches``
        parameter, with a default that keeps the same limit of 3 batches;
      * the no-op ``loss.requires_grad`` statement and the commented-out
        debugging code were removed.

    Args:
        model: network to optimise; called on ``data['image']``.
        train_loader: iterable of dicts with ``'image'`` and ``'label'``.
        optimizer: optimizer bound to ``model``'s parameters.
        epoch: epoch number, used for logging only.
        threshold: forwarded to ``compute_predict_output``.
        max_batches: maximum number of batches to process; ``None`` iterates
            the whole loader.
    """
    from itertools import islice

    print('Training the model with epoch : {}'.format(epoch))
    model.train()
    running_loss = 0.0
    batches_seen = 0

    # islice stops without fetching a batch beyond the limit.
    for batch_index, data in enumerate(islice(train_loader, max_batches)):
        images = data['image']
        target = data['label']

        optimizer.zero_grad()
        output = model(images)
        output = torch.transpose(output, 1, 0)
        predict_output = compute_predict_output(output, threshold)
        loss = yolo_loss(predict_output, target)

        print('Loss for batch {} is : {}'.format(batch_index, loss))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

    print('Loss:{} -- {} '.format(epoch, running_loss / max(batches_seen, 1)))


Defining a function to test model


In [0]:
def test(model, test_loader,threshold,
         results_dir=r'/content/gdrive/My Drive/kitti_single_small/testing/results'):
    """Run the model on the test images and save them with predicted boxes drawn.

    For every batch, the first image is converted back to an ``H x W x 3``
    ``uint8`` array, a rectangle is drawn for every predicted row whose
    confidence exceeds ``threshold``, and the result is written to
    ``<results_dir>/image<batch_index>.png`` and shown with matplotlib.

    Predicted ``x, y, h, w`` are interpreted as fractions of the displayed
    image's width/height (the network output is a sigmoid, i.e. in (0, 1)).

    Fixes relative to the previous version:
      * ``np.reshape(225,225,3)`` was not a valid conversion and raised. The
        image is now taken from the batch and permuted from CHW to HWC;
      * rectangle corners are plain integers (tensors were passed to OpenCV);
      * the image is converted to ``uint8`` before drawing/saving, and from
        RGB to BGR for ``cv2.imwrite``;
      * ``threshold`` is now used, and every sufficiently confident row is
        drawn instead of only row 0;
      * boxes are scaled to the size of the image that is actually drawn on;
      * the results directory is created if missing. ``cv2.imwrite`` reports a
        missing directory only through its return value;
      * the full pixel array is no longer printed.

    Args:
        model: trained network.
        test_loader: iterable of dicts with an ``'image'`` tensor of shape
            ``(batch, 3, H, W)``.
        threshold: minimum confidence for a box to be drawn.
        results_dir: directory the annotated images are written to.
    """
    model.eval()
    print('Testing the model')
    os.makedirs(results_dir, exist_ok=True)

    with torch.no_grad():
        for batch_index, data in enumerate(test_loader):
            batch = data['image']
            output = model(batch)
            output = torch.transpose(output, 1, 0)
            predicted_output = compute_predict_output(output, threshold)

            print('Image size : {}'.format(batch.size()))
            # CHW float tensor -> contiguous HWC uint8 array that OpenCV can draw on.
            image = batch[0].permute(1, 2, 0).numpy()
            image = np.ascontiguousarray(np.clip(image, 0, 255).astype(np.uint8))
            print('Image dim : {}'.format(image.ndim))
            image_height, image_width = image.shape[:2]

            for row in predicted_output:
                if not (row[0].item() > threshold):
                    continue
                center_x = row[1].item() * image_width
                center_y = row[2].item() * image_height
                height = row[3].item() * image_height
                width = row[4].item() * image_width

                left = int(round(center_x - width / 2))
                top = int(round(center_y - height / 2))
                right = int(round(center_x + width / 2))
                bottom = int(round(center_y + height / 2))
                image = cv2.rectangle(image, (left, top), (right, bottom), (255, 0, 0))

            save_path = os.path.join(results_dir, 'image' + str(batch_index) + '.png')
            # The array is RGB; OpenCV writes BGR.
            cv2.imwrite(save_path, cv2.cvtColor(image, cv2.COLOR_RGB2BGR))
            plt.imshow(image)

  

Main Function

In [0]:
def main():
    """Build the model and data loaders, train, run the test pass, save weights.

    Fixes relative to the previous version:
      * the number of epochs comes from ``epochs`` (the loop was hard-coded to
        a single iteration);
      * the model is always constructed. Previously it was only created inside
        the ``save_model == False`` branch, so starting with
        ``save_model = True`` would have left ``model`` undefined;
      * ``batch_size`` is passed to the data loaders instead of being unused;
      * the unused ``momentum`` setting was removed;
      * the test threshold is a named setting instead of an inline literal.
    """
    print('Main starts')
    # Training Settings
    lr = 0.0001
    epochs = 1
    # NOTE(review): labels are flattened per image and may differ in length,
    # and compute_predict_output reads a single image; keep this at 1 unless
    # those are changed.
    batch_size = 1
    threshold = 0.5
    test_threshold = 0.6
    save_model = True

    labels_dir = r'/content/gdrive/My Drive/kitti_single_small/training/label_2'
    images_dir = r'/content/gdrive/My Drive/kitti_single_small/training/image_2'
    test_images_dir = r'/content/gdrive/My Drive/kitti_single_small/testing/image_2'
    saving_model_path = r'/content/gdrive/My Drive/kitti_single_small/yolo.cnn.pt'

    model = Net()
    # Resume from a previous checkpoint when one exists.
    if os.path.isfile(saving_model_path):
        model.load_state_dict(torch.load(saving_model_path))

    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))

    training_dataset = KittiDataset(labels_dir=labels_dir, images_dir=images_dir, transform=None)
    # NOTE(review): the test dataset enumerates the *training* label directory
    # to decide which test images to load; confirm the file names match.
    testing_dataset = TestKittiDataset(labels_dir=labels_dir, images_dir=test_images_dir, transform=None)

    train_loader = DataLoader(dataset=training_dataset, batch_size=batch_size)
    test_loader = DataLoader(dataset=testing_dataset, batch_size=batch_size)

    print('Training starts')
    for epoch in range(1, epochs + 1):
        train(model, train_loader, optimizer, epoch, threshold)

    print('Testing starts')
    test(model, test_loader, test_threshold)

    if save_model:
        torch.save(model.state_dict(), saving_model_path)

if __name__ == '__main__':
  main()

