# YOLOv1 Implementation
This notebook aims to implement the YOLOv1 object detection algorithm and replicate the results as given in [this](https://arxiv.org/abs/1506.02640) paper.

Steps involved:
- pre-training weights on the ImageNet dataset.
- Implement the YOLOv1 model

## Step 1
Pre-training weights in ImageNet dataset
- prepare the modified network model
- prepare the dataset for training - done in an accompanying notebook
- Implement the diagnostic functions to track training
- train the model

In [1]:
import tensorflow as tf
import numpy as np
import math
import matplotlib as plt
from yolo_utils import *
import os

  from ._conv import register_converters as _register_converters


In [2]:
def create_placeholders_pretrain(n_H, n_W, n_C, n_y):
    '''
    Function to create placeholder for the input tensors
    
    Args:
    n_H = height of the image tensor
    n_W = width of the image tensor
    n_C = number of channels in the image tensor
    n_y = number of classes/output features
    
    returns:
    X, Y
    '''
    X = tf.placeholder(tf.float32, shape = (None, n_H, n_W, n_C))
    Y = tf.placeholder(tf.float32, shape = (None, n_y))
    return X, Y

In [None]:
# test script: DELETE
c = [448, 448, 3, 10]
X, Y = create_placeholders(*c)
print(X, Y)

In [None]:
# code that initializes weight parameters based on the configuration given in an xml file
path_to_xml = './YOLOv1_Pre_trained_Model.xml'
pre_train_parameters = initialize_weights(path_to_xml)

### 1.1 - Prepare the modified network model for pre-trianing on ImagenNet

In [11]:
def forward_propagation_pretrain(X, pre_train_parameters):
    '''
    Args:
    X - placeholder for the initial feature tensor
    pre_train_parameters - dictionary containing filters
    
    returns
    FC1 - output of the last fully connected layer
    
    NOT IMPLEMENTED: NORMALIZATION
    '''
    # Conv
    Z1 = tf.nn.conv2d(X, pre_train_parameters['W01'], [1,2,2,1], padding="SAME")
    Z1 = tf.nn.bias_add(Z1, pre_train_parameters['b01'])
    A1 = tf.nn.leaky_relu(Z1, alpha=0.1)
    
    # Pool
    P1 = tf.nn.max_pool(A1, [1,2,2,1], [1,2,2,1], padding="SAME")
    
    # Conv
    Z2 = tf.nn.conv2d(P1, pre_train_parameters['W02'], [1,1,1,1], padding="SAME")
    Z2 = tf.nn.bias_add(Z2, pre_train_parameters['b02'])
    A2 = tf.nn.leaky_relu(Z2, alpha=0.1)
    
    # Pool
    P2 = tf.nn.max_pool(A2, [1,2,2,1], [1,2,2,1], padding="SAME")
    
    # Conv
    Z3 = tf.nn.conv2d(P2, pre_train_parameters['W03'], [1,1,1,1], padding="SAME")
    Z3 = tf.nn.bias_add(Z3, pre_train_parameters['b03'])
    A3 = tf.nn.leaky_relu(Z3, alpha=0.1)
    Z4 = tf.nn.conv2d(A3, pre_train_parameters['W04'], [1,1,1,1], padding="SAME")
    Z4 = tf.nn.bias_add(Z4, pre_train_parameters['b04'])
    A4 = tf.nn.leaky_relu(Z4, alpha=0.1)
    Z5 = tf.nn.conv2d(A4, pre_train_parameters['W05'], [1,1,1,1], padding="SAME")
    Z5 = tf.nn.bias_add(Z5, pre_train_parameters['b05'])
    A5 = tf.nn.leaky_relu(Z5, alpha=0.1)
    Z6 = tf.nn.conv2d(A5, pre_train_parameters['W06'], [1,1,1,1], padding="SAME")
    Z6 = tf.nn.bias_add(Z6, pre_train_parameters['b06'])
    A6 = tf.nn.leaky_relu(Z6, alpha=0.1)
    
    # Pool
    P3 = tf.nn.max_pool(A6, [1,2,2,1], [1,2,2,1], padding="SAME")
    
    # Conv
    Z7 = tf.nn.conv2d(P3, pre_train_parameters['W07'], [1,1,1,1], padding="SAME")
    Z7 = tf.nn.bias_add(Z7, pre_train_parameters['b07'])
    A7 = tf.nn.leaky_relu(Z7, alpha=0.1)
    Z8 = tf.nn.conv2d(A7, pre_train_parameters['W08'], [1,1,1,1], padding="SAME")
    Z8 = tf.nn.bias_add(Z8, pre_train_parameters['b08'])
    A8 = tf.nn.leaky_relu(Z8, alpha=0.1)
    Z9 = tf.nn.conv2d(A8, pre_train_parameters['W09'], [1,1,1,1], padding="SAME")
    Z9 = tf.nn.bias_add(Z9, pre_train_parameters['b09'])
    A9 = tf.nn.leaky_relu(Z9, alpha=0.1)
    Z10 = tf.nn.conv2d(A9, pre_train_parameters['W10'], [1,1,1,1], padding="SAME")
    Z10 = tf.nn.bias_add(Z10, pre_train_parameters['b10'])
    A10 = tf.nn.leaky_relu(Z10, alpha=0.1)
    Z11 = tf.nn.conv2d(A10, pre_train_parameters['W11'], [1,1,1,1], padding="SAME")
    Z11 = tf.nn.bias_add(Z11, pre_train_parameters['b11'])
    A11 = tf.nn.leaky_relu(Z11, alpha=0.1)
    Z12 = tf.nn.conv2d(A11, pre_train_parameters['W12'], [1,1,1,1], padding="SAME")
    Z12 = tf.nn.bias_add(Z12, pre_train_parameters['b12'])
    A12 = tf.nn.leaky_relu(Z12, alpha=0.1)
    Z13 = tf.nn.conv2d(A12, pre_train_parameters['W13'], [1,1,1,1], padding="SAME")
    Z13 = tf.nn.bias_add(Z13, pre_train_parameters['b13'])
    A13 = tf.nn.leaky_relu(Z13, alpha=0.1)
    Z14 = tf.nn.conv2d(A13, pre_train_parameters['W14'], [1,1,1,1], padding="SAME")
    Z14 = tf.nn.bias_add(Z14, pre_train_parameters['b14'])
    A14 = tf.nn.leaky_relu(Z14, alpha=0.1)
    Z15 = tf.nn.conv2d(A14, pre_train_parameters['W15'], [1,1,1,1], padding="SAME")
    Z15 = tf.nn.bias_add(Z15, pre_train_parameters['b15'])
    A15 = tf.nn.leaky_relu(Z15, alpha=0.1)
    Z16 = tf.nn.conv2d(A15, pre_train_parameters['W16'], [1,1,1,1], padding="SAME")
    Z16 = tf.nn.bias_add(Z16, pre_train_parameters['b16'])
    A16 = tf.nn.leaky_relu(Z16, alpha=0.1)
    
    # Pool
    P4 = tf.nn.max_pool(A16, [1,2,2,1], [1,2,2,1], padding="SAME")
    
    
    # Conv
    Z17 = tf.nn.conv2d(P4, pre_train_parameters['W17'], [1,1,1,1], padding="SAME")
    Z17 = tf.nn.bias_add(Z17, pre_train_parameters['b17'])
    A17 = tf.nn.leaky_relu(Z17, alpha=0.1)
    Z18 = tf.nn.conv2d(A17, pre_train_parameters['W18'], [1,1,1,1], padding="SAME")
    Z18 = tf.nn.bias_add(Z18, pre_train_parameters['b18'])
    A18 = tf.nn.leaky_relu(Z18, alpha=0.1)
    Z19 = tf.nn.conv2d(A18, pre_train_parameters['W19'], [1,1,1,1], padding="SAME")
    Z19 = tf.nn.bias_add(Z19, pre_train_parameters['b19'])
    A19 = tf.nn.leaky_relu(Z19, alpha=0.1)
    Z20 = tf.nn.conv2d(A19, pre_train_parameters['W20'], [1,1,1,1], padding="SAME")
    Z20 = tf.nn.bias_add(Z20, pre_train_parameters['b20'])
    
    # Pool
    P5 = tf.nn.avg_pool(Z20, [1,2,2,1], [1,2,2,1], padding="SAME")
    
    # flatten
    P5 = tf.contrib.layers.flatten(P5)
    
    # Fully connected layer
    FC1 = tf.contrib.layers.fully_connected(P5, 1000, activation_fn=None)
    
    return FC1

In [None]:
# test script: DELETE
tf.reset_default_graph()
path_to_xml = './YOLOv1_Pre_trained_Model.xml'
with tf.Session() as sess:
    X, Y = create_placeholders_pretrain(448,448,3,1000)
    pre_train_parameters = initialize_weights(path_to_xml)
    FC1 = forward_propagation_pretrain(X, pre_train_parameters)
    init = tf.global_variables_initializer()
    sess.run(init)
    a = sess.run(FC1, {X: np.random.randn(2,448,448,3)})
    print("Z20 = " + str(a), "Z20 shape = " + str(a.shape))

In [4]:
def compute_cost_pretrain(FC1, Y):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=FC1, labels=Y))
    return cost

The ImageNet dataset has been created in batches in an accompanying notbook. Each batch contains 2000 images and their corresponding labels. This is the main reason for the additional for loop in the ensuing cell

In [None]:
path_to_train_folder = './ImageNet_dataset/training_folder'
h5_files_list = os.listdir(path_to_train_folder)
print(h5_files_list)

In the cell below I implement the model function, which aggregates all the functions above, to train the model. I explicitly do not pass the train tensors as we are going to be reading them from another folder.

In [9]:
def model(xml_path, X_test=None, Y_test=None, learning_rate = 0.09, num_epochs = 100, 
          minibatch_size = 64, print_cost = True):
    
    # restting the default graph
    tf.reset_default_graph()
    
    # retrieve image shapes
    #(m, n_Htr, n_Wtr, n_Ctr) = X_train.shape
    #n_Y = Y_train.shape
    
    # global variables
    costs = []
    
    # randomizer
    seed = int(np.random.randint(1,100,1))
    
    # creating placeholders 
    X, Y = create_placeholders_pretrain(448,448,3,1000)
    
    # initializing parameters
    pretrain_parameters = initialize_weights(xml_path)
    
    # forward prop
    FC1 = forward_propagation_pretrain(X, pretrain_parameters)
    
    # compute cost
    cost = compute_cost_pretrain(FC1, Y)
    
    # select the appropriate the optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
    
    # initialize global variables
    init = tf.global_variables_initializer()
    
    # train the session
    with tf.Session() as sess:
        
        # run the initialization for the session
        sess.run(init)
        
        # for loop for epoch/iterations
        for epoch in range(num_epochs):
            
            # maintain the cost through an epoch
            epoch_cost = 0
            
            # randomizer
            seed += 1
            
            # path to training folder
            PATH = './ImageNet_dataset/training_folder/'
            
            # set up the data
            h5_files = os.listdir(PATH)
            
            # for loop to iterate through the h5 files
            for file in h5_files:
                
                # DIAGNOSTIC print
                print("setting up" + file + "for training")
                
                # open the h5 file to form tensor
                with h5py.File(PATH+file, mode = 'r') as h5_file:
                    
                    # extract features and labels
                    X_train = np.asarray(h5_file['X_train'])
                    Y_train = np.asarray(h5_file['Y_train'])
                    
                    # number of examples
                    (m, n_Htr, n_Wtr, n_Ctr) = h5_file['X_train'].shape
                    
                    # ??? - REASON WHY
                    num_minibatches = int(m/minibatch_size)

                    # generate minibatches
                    minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)

                    # iterate through the minibatches
                    for minibatch in minibatches:
                        
                        # procure minibatches
                        (minibatch_X, minibatch_Y) = minibatch
                        # optimize for cost, 
                        _ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: tf.one_hot(minibatch_Y, 1000)})
                        # cumulative minibatch cost
                        epoch_cost += minibatch_cost/num_minibatches
            
            # Print the cost after every 5 epochs
            if print_cost == True and epoch % 1 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
                costs.append(epoch_cost)
                
    return pretrain_parameters

In [12]:
_, _, parameters = model('YOLOv1_Pre_trained_Model.xml')

[7, 7, 3, 64]
[3, 3, 64, 192]
[1, 1, 192, 128]
[3, 3, 128, 256]
[1, 1, 256, 256]
[3, 3, 256, 512]
[1, 1, 512, 256]
[3, 3, 256, 512]
[1, 1, 512, 256]
[3, 3, 256, 512]
[1, 1, 512, 256]
[3, 3, 256, 512]
[1, 1, 512, 256]
[3, 3, 256, 512]
[1, 1, 512, 512]
[3, 3, 512, 1024]
[1, 1, 1024, 512]
[3, 3, 512, 1024]
[1, 1, 1024, 512]
[3, 3, 512, 1024]


ValueError: Shape must be rank 1 but is rank 2 for 'BiasAdd' (op: 'BiasAdd') with input shapes: [?,224,224,64], [64,1].

### Step 2
Implement the YOLOv1 model

- Implement Forward Propagation function
- Implement cost function
- Implement model function

Create placeholders for the feature and label tensors

In [None]:
# test script for the create-placeholder function
X, Y = create_placeholders(448, 448, 3, 1000)
print ("X = " + str(X))
print ("Y = " + str(Y))

Declare and initialize the parameters that are used in the model. Traditional implementation of a CNN would have had to initialize them randomly. But the YOLOv1 model is pre-trained on ImageNet. These weights can be procured from Step 1.

In [None]:
def initialize_parameters():
    """
    Yet to be coded
    """

Implement the YOLOv1 CNN forward propogation function. Facts that you need to paid attention
- Linear activation for the final layer, leaky relu for the rest with alpha = 0.1
- Any image is resized to 448x448. This is the standard input.
- Implement a function to load filter dimensions from an xml file

In [None]:
# experimenting with the ElementTree API
tf.reset_default_graph()
parameters = {}
tree = ET.parse('Configuration.xml')
root = tree.getroot()
for child in root:
    size = []
    for child1 in child:
        # print(child.attrib['name'], child1.tag, child1.text)
        if (child1.tag == 'dimension'):
            size.append((int)(child1.text))
            size.append((int)(child1.text))
        if (child1.tag == 'input'):
            size.append((int)(child1.text))
        if (child1.tag == 'output'):
            size.append((int)(child1.text))
    print(size)
    W = tf.get_variable(child.attrib['name'], size, initializer = tf.contrib.layers.xavier_initializer(seed = 0)) 
    parameters[child.attrib['name']] = W
    B = tf.Variable(tf.constant(0.01, shape=[size[-1]]))
    parameters['B'+(child.attrib['name'][1:])] = B
print(parameters)

In [None]:
for key in parameters:
    print(key, parameters[key])
    print(parameters[key].shape)

In [None]:
def load_and_initialize_weights(xml_file):
    '''
    Reads model parameter weights from xml_file and initializes filters and biases
    
    Args:
    xml_file - configuration xml with absolute path
    
    Returns:
    parameters - a dictionary containing initialized parameters
    '''
    parameters = {}
    tree = ET.parse(xml_file)
    root = tree.getroot()
    for child in root:
        size = []
        for child1 in child:
            # print(child.attrib['name'], child1.tag, child1.text)
            if (child1.tag == 'dimension'):
                size.append((int)(child1.text))
                size.append((int)(child1.text))
            if (child1.tag == 'input'):
                size.append((int)(child1.text))
            if (child1.tag == 'output'):
                size.append((int)(child1.text))
        print(size)
        W = tf.get_variable(child.attrib['name'], size, initializer = tf.contrib.layers.xavier_initializer(seed = 0)) 
        parameters[child.attrib['name']] = W
        B = tf.Variable(tf.constant(0.01, shape=[size[-1]]))
        parameters['B'+(child.attrib['name'][1:])] = B
        
    return parameters

#### Forward Propogation

In [None]:
def forward_propagation_YOLOv1(X, parameters):
    '''
    Args:
    X - placeholder for the initial feature tensor
    parameters - dictionary containing filters
    
    returns
    Z8 - output of the last LINEAR layer
    
    NOT IMPLEMENTED: NORMALIZATION
    '''
    
    Z1 = tf.nn.conv2d(X, parameters['W01'], [1,2,2,1], padding="VALID")
    Z1 = tf.nn.bias_add(Z1, parameters['B01'])
    A1 = tf.nn.leaky_relu(Z1, alpha=0.1)
    P1 = tf.nn.max_pool(A1, [1,2,2,1], [1,2,2,1], padding="VALID")
    
    
    Z2 = tf.nn.conv2d(P1, parameters['W02'], [1,1,1,1], padding="VALID")
    Z2 = tf.nn.bias_add(Z2, parameters['B02'])
    A2 = tf.nn.leaky_relu(Z2, alpha=0.1)
    P2 = tf.nn.max_pool(A2, [1,2,2,1], [1,2,2,1], padding="VALID")
    
    Z3 = tf.nn.conv2d(P2, parameters['W03'], [1,1,1,1], padding="VALID")
    Z3 = tf.nn.bias_add(Z3, parameters['B03'])
    A3 = tf.nn.leaky_relu(Z3, alpha=0.1)
    Z4 = tf.nn.conv2d(A3, parameters['W04'], [1,1,1,1], padding="VALID")
    Z4 = tf.nn.bias_add(Z4, parameters['B04'])
    A4 = tf.nn.leaky_relu(Z4, alpha=0.1)
    Z5 = tf.nn.conv2d(A4, parameters['W05'], [1,1,1,1], padding="VALID")
    Z5 = tf.nn.bias_add(Z5, parameters['B05'])
    A5 = tf.nn.leaky_relu(Z5, alpha=0.1)
    Z6 = tf.nn.conv2d(A5, parameters['W06'], [1,1,1,1], padding="VALID")
    Z6 = tf.nn.bias_add(Z6, parameters['B06'])
    A6 = tf.nn.leaky_relu(Z6, alpha=0.1)
    P3 = tf.nn.max_pool(A6, [1,2,2,1], [1,2,2,1], padding="VALID")
    
    Z7 = tf.nn.conv2d(P3, parameters['W07'], [1,1,1,1], padding="VALID")
    Z7 = tf.nn.bias_add(Z7, parameters['B07'])
    A7 = tf.nn.leaky_relu(Z7, alpha=0.1)
    Z8 = tf.nn.conv2d(A7, parameters['W08'], [1,1,1,1], padding="VALID")
    Z8 = tf.nn.bias_add(Z8, parameters['B08'])
    A8 = tf.nn.leaky_relu(Z8, alpha=0.1)
    Z9 = tf.nn.conv2d(A8, parameters['W09'], [1,1,1,1], padding="VALID")
    Z9 = tf.nn.bias_add(Z9, parameters['B09'])
    A9 = tf.nn.leaky_relu(Z9, alpha=0.1)
    Z10 = tf.nn.conv2d(A9, parameters['W10'], [1,1,1,1], padding="VALID")
    Z10 = tf.nn.bias_add(Z10, parameters['B10'])
    A10 = tf.nn.leaky_relu(Z10, alpha=0.1)
    Z11 = tf.nn.conv2d(P10, parameters['W11'], [1,1,1,1], padding="VALID")
    Z11 = tf.nn.bias_add(Z11, parameters['B11'])
    A11 = tf.nn.leaky_relu(Z11, alpha=0.1)
    Z12 = tf.nn.conv2d(A11, parameters['W12'], [1,1,1,1], padding="VALID")
    Z12 = tf.nn.bias_add(Z12, parameters['B12'])
    A12 = tf.nn.leaky_relu(Z12, alpha=0.1)
    Z13 = tf.nn.conv2d(A12, parameters['W13'], [1,1,1,1], padding="VALID")
    Z13 = tf.nn.bias_add(Z13, parameters['B13'])
    A13 = tf.nn.leaky_relu(Z13, alpha=0.1)
    Z14 = tf.nn.conv2d(A13, parameters['W14'], [1,1,1,1], padding="VALID")
    Z14 = tf.nn.bias_add(Z14, parameters['B14'])
    A14 = tf.nn.leaky_relu(Z14, alpha=0.1)
    Z15 = tf.nn.conv2d(P4, parameters['W15'], [1,1,1,1], padding="VALID")
    Z15 = tf.nn.bias_add(Z15, parameters['B15'])
    A15 = tf.nn.leaky_relu(Z15, alpha=0.1)
    Z16 = tf.nn.conv2d(A15, parameters['W16'], [1,1,1,1], padding="VALID")
    Z16 = tf.nn.bias_add(Z16, parameters['B16'])
    A16 = tf.nn.leaky_relu(Z16, alpha=0.1)
    P4 = tf.nn.max_pool(A16, [1,2,2,1], [1,2,2,1], padding="VALID")
    
    
    Z17 = tf.nn.conv2d(P4, parameters['W17'], [1,1,1,1], padding="VALID")
    Z17 = tf.nn.bias_add(Z17, parameters['B17'])
    A17 = tf.nn.leaky_relu(Z17, alpha=0.1)
    Z18 = tf.nn.conv2d(A17, parameters['W18'], [1,1,1,1], padding="VALID")
    Z18 = tf.nn.bias_add(Z18, parameters['B18'])
    A18 = tf.nn.leaky_relu(Z18, alpha=0.1)
    Z19 = tf.nn.conv2d(A18, parameters['W19'], [1,1,1,1], padding="VALID")
    Z19 = tf.nn.bias_add(Z19, parameters['B19'])
    A19 = tf.nn.leaky_relu(Z19, alpha=0.1)
    Z20 = tf.nn.conv2d(A19, parameters['W20'], [1,1,1,1], padding="VALID")
    Z20 = tf.nn.bias_add(Z20, parameters['B20'])
    A20 = tf.nn.leaky_relu(Z20, alpha=0.1)
    Z21 = tf.nn.conv2d(A20, parameters['W21'], [1,1,1,1], padding="VALID")
    Z21 = tf.nn.bias_add(Z21, parameters['B21'])
    A21 = tf.nn.leaky_relu(Z21, alpha=0.1)
    Z22 = tf.nn.conv2d(A20, parameters['W22'], [1,2,2,1], padding="VALID")
    Z22 = tf.nn.bias_add(Z22, parameters['B22'])
    A22 = tf.nn.leaky_relu(Z22, alpha=0.1)
    
    Z23 = tf.nn.conv2d(A22, parameters['W23'], [1,1,1,1], padding="VALID")
    Z23 = tf.nn.bias_add(Z23, parameters['B23'])
    A23 = tf.nn.leaky_relu(Z23, alpha=0.1)
    Z24 = tf.nn.conv2d(A23, parameters['W24'], [1,1,1,1], padding="VALID")
    Z24 = tf.nn.bias_add(Z24, parameters['B24'])
    A24 = tf.nn.leaky_relu(Z24, alpha=0.1)
    
    A24 = tf.contrib.layers.flatten(A24)
    FC1 = tf.contrib.layers.fully_connected(A24, 512, activation_fn=None)
    FC2 = tf.contrib.layers.fully_connected(FC1, 4096, activation_fn=None)
    FC3 = tf.contrib.layers.fully_connected(FC2, 1470, activation_fn=None)
    
    return FC3
    

#### Cost Function
The cost function is slightly tricky in YOLOv1. The model is optimized end-to-end and has a composite loss function. The cost function has been coded for an output tensor of shape 7x7x(2x5 + 20).

In [None]:
def compute_YOLOv1_cost(y_pred, y_ground):
    '''
    Calculates the loss for gradient descent
    
    y_pred - predicted values - a 7x7x(2x5 + 20) tensor
    y_ground - ground truth labels - a 7x7x(2x5 + 20) tensor
    '''
    predictedBoxScores = np.reshape(y_pred, [-1, 7, 7, 30])
    predictedClasses = predictedBoxScores[:, :, :, :20]
    predictedObjectConfidence = predictedBoxScores[:, :, :, 20:22]
    predictedBoxDimensions = predictedBoxScores[:, :, :, 22:]
    