In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16
import tensorflow_datasets as tfds
from tensorflow.keras.optimizers import SGD

import numpy as np
import math

import cv2
import matplotlib.pyplot as plt

## Load KITTI dataset

In [None]:
# dataset structure: # https://www.tensorflow.org/datasets/catalog/kitti
# classes: https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/datasets/kitti/kitti_dataset_builder.py
def load_kitti_dataset(split=None):
    """Load the KITTI dataset through TensorFlow Datasets.

    Args:
        split: Optional split specification forwarded to ``tfds.load``.
            For example ``'train[:5%]'`` selects the *first* 5% of the
            training split. With the default ``None`` every available split
            is loaded.

    Returns:
        The object returned by ``tfds.load('kitti', split=split)``. With
        ``split=None`` it is a mapping keyed by split name
        (``'train'``, ``'validation'``, ``'test'``).
    """
    return tfds.load('kitti', split=split)

In [None]:
# Load every KITTI split once; later cells index the result by split name.
dataset = load_kitti_dataset()

In [None]:
# Pull the three splits out of the loaded dataset by name.
training_set, test_set, validation_set = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

#### Show an example of images in the dataset

In [None]:
# Peek at one training sample: dump the raw feature dictionary first, then the
# class label of the first annotated object in that image.
for example in training_set.take(1):
    print(example)
    first_object_type = example["objects"]["type"].numpy()[0]
    print("Label: ", first_object_type)

In [None]:
def draw_3d_box(image, dimensions, location, orientation, K):
    """Draw the projected wireframe of a 3D bounding box onto ``image``.

    Args:
        image: Image array that ``cv2.line`` draws on (modified in place).
        dimensions: ``(height, width, length)`` of the box.
        location: 3-vector added to the rotated corners. Because the corner
            offsets span ``y = 0 .. -h`` and are centred in x and z, this is
            the centre of the box face lying at ``y = 0``.
        orientation: Rotation angle around the Y axis, in radians.
        K: 3x3 camera intrinsic matrix used for the pinhole projection.

    Returns:
        The same ``image`` object with the 12 box edges drawn in green.
        Edges with an endpoint at or behind the camera plane (projected
        depth <= 0) are skipped, because their projection is undefined.
    """
    # unpack dimensions (height, width, length)
    h, w, l = dimensions

    # corners of the bounding box in object coordinates (one column per corner)
    x_corners = [l/2, l/2, -l/2, -l/2, l/2, l/2, -l/2, -l/2]
    y_corners = [0, 0, 0, 0, -h, -h, -h, -h]
    z_corners = [w/2, -w/2, -w/2, w/2, w/2, -w/2, -w/2, w/2]
    corners = np.array([x_corners, y_corners, z_corners])

    # rotation matrix around the Y-axis
    cos_t, sin_t = np.cos(orientation), np.sin(orientation)
    R_y = np.array([[cos_t, 0, sin_t],
                    [0, 1, 0],
                    [-sin_t, 0, cos_t]])

    # rotate and translate the corners to camera coordinates -> shape (8, 3)
    corners_3d = (R_y @ corners).T + np.asarray(location)

    # project all corners at once: each row is K @ corner
    projected = corners_3d @ np.asarray(K).T
    depth = projected[:, 2]
    in_front = depth > 0
    # substitute a harmless divisor for invalid corners; they are never drawn
    safe_depth = np.where(in_front, depth, 1.0)
    corners_2d = (projected[:, :2] / safe_depth[:, None]).astype(np.int32)

    # define the edges of the bounding box
    edges = [
        (0, 1), (1, 2), (2, 3), (3, 0),  # face at y = 0
        (4, 5), (5, 6), (6, 7), (7, 4),  # face at y = -h
        (0, 4), (1, 5), (2, 6), (3, 7)   # edges joining the two faces
    ]

    for start, end in edges:
        if not (in_front[start] and in_front[end]):
            continue
        # plain Python ints: some OpenCV builds reject NumPy scalar coordinates
        pt1 = tuple(int(v) for v in corners_2d[start])
        pt2 = tuple(int(v) for v in corners_2d[end])
        cv2.line(image, pt1, pt2, color=(0, 255, 0), thickness=2)

    return image

In [None]:
# camera intrinsic matrix
K = np.array([[721.5377, 0, 609.5593],
              [0, 721.5377, 172.854],
              [0, 0, 1]])

for example in training_set.take(1):
    # TFDS decodes images as RGB, which is also what matplotlib displays, so
    # no colour-space conversion is needed (the box colour is pure green,
    # identical in RGB and BGR).
    image = example["image"].numpy()

    # annotations of the first object in the image
    objects = example["objects"]
    label = objects["type"].numpy()[0]
    dimensions = objects["dimensions"].numpy()[0]
    location = objects["location"].numpy()[0]
    alpha = objects["alpha"].numpy()[0]
    orientation = objects["rotation_y"].numpy()[0]

    print("Label: ", label)
    print("Dimension: ", dimensions)
    print("Location: ", location)
    print("Alpha: ", alpha)
    print("Rotation Y: ", orientation)

    # draw the 3D bounding box on a copy so the original image stays untouched
    image_with_bbox = draw_3d_box(image.copy(), dimensions, location, orientation, K)
    plt.imshow(image_with_bbox)
    plt.axis('off')
    plt.show()



## Create the model architecture


*   VGG16 model for feature extraction
*   2 FC layers for dimension estimation
*   2 FC layers for location estimation
*   2 FC layers + L2 normalization for orientation estimation
*   2 FC layers for class estimation






In [None]:
# model hyperparameters
# number of samples grouped into each batch by the data generator
batch_size = 32
# number of orientation bins the full 2*pi angle range is split into
num_bin = 2
input_shape = (224, 224, 3)   # (H, W, num_channels)
# step size handed to the SGD optimizer
learning_rate = 0.0001
# number of epochs requested from model.fit
epochs = 50

In [None]:
def multibin_model(input_shape=(224, 224, 3), num_bin=2):
    """Build the multi-head network: frozen VGG16 features feeding three FC heads.

    The input first passes through ``vgg16.preprocess_input`` inside the model,
    so callers should feed unscaled pixel values (0-255 range), not values
    already normalised to [0, 1].

    Args:
        input_shape: ``(H, W, channels)`` of the input images.
        num_bin: Number of orientation bins. The orientation head emits
            ``2 * num_bin`` values (one ``(cos, sin)`` pair per bin) and the
            confidence head emits ``num_bin`` softmax probabilities.

    Returns:
        A ``tf.keras`` functional model whose outputs are a dict with keys
        ``'dimension_model'`` (3 values), ``'orientation_model'``
        (``2 * num_bin`` values, each pair L2-normalised) and
        ``'confidence_model'`` (``num_bin`` probabilities).
    """
    vgg16 = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)
    vgg16.trainable = False  # use the pre-trained backbone as a fixed feature extractor

    # model for feature extraction: based on pre-trained VGG, as mentioned in the paper
    feature_model = tf.keras.Sequential([
        layers.Lambda(tf.keras.applications.vgg16.preprocess_input),
        vgg16,
        ],
        name="feature_model")

    # construction of the 3D box module, made up of 3 modules: estimation of dimensions, orientations and confidences
    # model for estimation of dimensions: 2 FC layers
    dimension_model = models.Sequential([
        layers.Flatten(),
        layers.Dense(512, activation='relu'),
        layers.Dense(3, name='dim_output')  # dimensions: dx, dy, dz
        ],
        name="dimension_model")

    def normalize_bins(x):
        # L2-normalise every (cos, sin) pair independently. Calling
        # l2_normalize without an axis would normalise over the whole tensor,
        # mixing bins and even different samples of the batch.
        pairs = tf.reshape(x, (-1, num_bin, 2))
        pairs = tf.math.l2_normalize(pairs, axis=-1)
        return tf.reshape(pairs, (-1, num_bin * 2))

    # model for estimation of orientations: 2 FC layers + L2 Norm
    orientation_model = models.Sequential([
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dense(num_bin*2, activation='linear'),  # rotation y axis
        layers.Lambda(normalize_bins, name='orient_output')
        ],
        name="orientation_model")

    # model for computing confidences: 2 FC layers
    confidence_model = models.Sequential([
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dense(num_bin, activation='softmax', name='conf_output')
        ],
        name="confidence_model")

    # create connections between the three heads to create the overall architecture
    input_tensor = tf.keras.Input(shape=input_shape)

    # every head consumes the same backbone feature map
    feature_output = feature_model(input_tensor)
    dimension_output = dimension_model(feature_output)
    orientation_output = orientation_model(feature_output)
    confidence_output = confidence_model(feature_output)

    model = models.Model(inputs=input_tensor,
                         outputs={'dimension_model': dimension_output,
                                  'orientation_model': orientation_output,
                                  'confidence_model': confidence_output})

    # summary() prints by itself and returns None, so it must not be wrapped in print()
    model.summary()
    return model

## Model training

### Data generator

In [None]:
# Report how many samples each split contains.
for caption, split in (
    ("Samples in training set:", training_set),
    ("Samples in validation set:", validation_set),
    ("Samples in test set:", test_set),
):
    print(caption, len(split))

In [None]:
# Data preparation
def preprocess_image(image):
    image = tf.image.resize(image, (224, 224))
    image = tf.cast(image, tf.float32) / 255.0
    return image

def prepare_data_batch(data_batch, num_bin):
    """Turn a list of raw samples into batched model inputs and targets.

    Only the first annotated object of every sample is used. Its
    ``rotation_y`` is assigned to one of ``num_bin`` equal bins covering
    ``[-pi, pi)``; the target for that bin is the ``(cos, sin)`` of the offset
    from the bin's lower edge, and all other bins stay zero.

    Args:
        data_batch: Iterable of samples with ``'image'`` and ``'objects'``
            entries.
        num_bin: Number of orientation bins.

    Returns:
        Tuple ``(images, dimensions, orientations, confidences)`` of float32
        tensors: stacked preprocessed images, per-sample object dimensions,
        flattened per-bin ``(cos, sin)`` targets (``2 * num_bin`` values) and
        one-hot bin indicators (``num_bin`` values).
    """
    bin_width = 2 * np.pi / num_bin
    image_list, dimension_list, orientation_list, confidence_list = [], [], [], []

    for sample in data_batch:
        first_object = sample['objects']

        image_list.append(preprocess_image(sample['image']))
        dimension_list.append(first_object['dimensions'][0])

        # locate the bin containing the ground-truth angle
        rotation = first_object['rotation_y'][0].numpy()
        bin_index = int((rotation + np.pi) / bin_width) % num_bin
        bin_start = bin_index * bin_width - np.pi
        angle_in_bin = rotation - bin_start

        # one-hot bin indicator
        one_hot = np.zeros(num_bin)
        one_hot[bin_index] = 1

        # (cos, sin) target only for the active bin
        cos_sin = np.zeros((num_bin, 2))
        cos_sin[bin_index] = [np.cos(angle_in_bin), np.sin(angle_in_bin)]

        orientation_list.append(cos_sin.ravel())
        confidence_list.append(one_hot)

    image_batch = tf.stack(image_list)
    dimension_batch = tf.convert_to_tensor(dimension_list, dtype=tf.float32)
    orientation_batch = tf.convert_to_tensor(orientation_list, dtype=tf.float32)
    confidence_batch = tf.convert_to_tensor(confidence_list, dtype=tf.float32)
    return image_batch, dimension_batch, orientation_batch, confidence_batch

def data_generator(dataset, batch_size=32, num_bin=2):
    """Yield ``(images, targets)`` batches of exactly ``batch_size`` samples.

    Samples are accumulated in iteration order; whenever ``batch_size`` of
    them are available they are converted with ``prepare_data_batch`` and
    yielded. Leftover samples that do not fill a final batch are not yielded.

    Args:
        dataset: Iterable of raw samples.
        batch_size: Number of samples per yielded batch.
        num_bin: Number of orientation bins forwarded to ``prepare_data_batch``.

    Yields:
        ``(images, targets)`` where ``targets`` is a dict keyed by
        ``'dimension_model'``, ``'orientation_model'`` and ``'confidence_model'``.
    """
    pending = []
    for sample in dataset:
        pending.append(sample)
        if len(pending) != batch_size:
            continue

        images, dimensions, orientations, confidences = prepare_data_batch(pending, num_bin)
        pending = []
        targets = {'dimension_model': dimensions,
                   'orientation_model': orientations,
                   'confidence_model': confidences}
        yield images, targets

In [None]:
# Both pipelines emit the same structure: a fixed-size image batch plus a dict
# with one target tensor per model output.
batch_signature = (
    tf.TensorSpec(shape=(batch_size, 224, 224, 3), dtype=tf.float32),
    {
        'dimension_model': tf.TensorSpec(shape=(batch_size, 3), dtype=tf.float32),
        'orientation_model': tf.TensorSpec(shape=(batch_size, 2 * num_bin), dtype=tf.float32),
        'confidence_model': tf.TensorSpec(shape=(batch_size, num_bin), dtype=tf.float32)
    }
)

# Wrap the Python generators as tf.data pipelines, repeat them indefinitely
# and prefetch batches in the background.
train_dataset = (
    tf.data.Dataset.from_generator(
        lambda: data_generator(training_set, batch_size=batch_size, num_bin=num_bin),
        output_signature=batch_signature,
    )
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

val_dataset = (
    tf.data.Dataset.from_generator(
        lambda: data_generator(validation_set, batch_size=batch_size, num_bin=num_bin),
        output_signature=batch_signature,
    )
    .repeat()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

### Model training

In [None]:
def localization_loss(loc_true, loc_pred):
    """Negative mean cosine similarity over the bins that carry a target angle.

    Both inputs are reshaped to ``(batch, num_bins, 2)`` where the last axis is
    a ``(cos, sin)`` pair and ``num_bins`` is half the width of ``loc_pred``.
    Bins whose target pair is all zeros hold no angle; they are excluded from
    the average so the loss magnitude does not shrink as the number of bins
    grows.

    Args:
        loc_true: Target tensor of flattened ``(cos, sin)`` pairs.
        loc_pred: Predicted tensor with the same layout.

    Returns:
        Scalar tensor: minus the mean cosine similarity over active bins
        (``0`` when the batch contains no active bin).
    """
    num_bins = loc_pred.shape[1] // 2

    # reshape for bins and [cos, sin], then normalise each pair to unit length
    pred_pairs = tf.math.l2_normalize(tf.reshape(loc_pred, [-1, num_bins, 2]), axis=2)
    true_pairs = tf.reshape(loc_true, [-1, num_bins, 2])
    true_unit = tf.math.l2_normalize(true_pairs, axis=2)

    # dot product of unit vectors == cosine of the angle between them
    cos_similarity = tf.reduce_sum(pred_pairs * true_unit, axis=2)

    # 1.0 for bins with a non-zero target pair, 0.0 otherwise
    active = tf.cast(tf.reduce_any(tf.not_equal(true_pairs, 0.0), axis=2),
                     cos_similarity.dtype)

    total_similarity = tf.reduce_sum(cos_similarity * active)
    # divide_no_nan returns 0 instead of NaN when no bin is active
    return -tf.math.divide_no_nan(total_similarity, tf.reduce_sum(active))

def confidence_loss(y_true, y_pred):
    """Categorical cross-entropy between ``y_true`` and ``y_pred``, as computed by Keras."""
    cross_entropy = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    return cross_entropy

def orientation_loss(y_true, y_pred):
    """Weighted angle loss for the orientation head.

    Both tensors hold ``num_bin`` consecutive ``(cos, sin)`` pairs: this is the
    layout produced by ``prepare_data_batch`` (``orientation.flatten()``) and
    by the orientation head (``Dense(num_bin*2)``). No part of them is a
    confidence vector, so slicing the first half off and feeding it to a
    cross-entropy is incorrect. Bin confidences are trained through the
    separate ``'confidence_model'`` output and its own loss; Keras sums the
    per-output losses, so the total keeps the intended
    ``confidence + 3.0 * localization`` weighting.

    Args:
        y_true: Target ``(cos, sin)`` pairs, shape ``(batch, 2 * num_bin)``.
        y_pred: Predicted ``(cos, sin)`` pairs, same shape.

    Returns:
        Scalar tensor: ``3.0 * localization_loss(y_true, y_pred)``.
    """
    localization_weight = 3.0   # weight associated to the localization loss function
    return localization_weight * localization_loss(y_true, y_pred)

In [None]:
# Number of whole batches available per epoch in each split.
training_set_dimension = len(training_set)
validation_set_dimension = len(validation_set)
train_steps_per_epoch = training_set_dimension // batch_size
val_steps_per_epoch = validation_set_dimension // batch_size

model = multibin_model(input_shape=input_shape, num_bin=num_bin)

# One loss per named model output.
losses = {
    'dimension_model': 'mse',
    'orientation_model': orientation_loss,
    'confidence_model': confidence_loss
}

optimizer = SGD(learning_rate=learning_rate)
model.compile(loss=losses, optimizer=optimizer)

# The datasets repeat forever, so the step counts define where an epoch ends.
fit_options = {
    'steps_per_epoch': train_steps_per_epoch,
    'validation_data': val_dataset,
    'validation_steps': val_steps_per_epoch,
    'epochs': epochs,
}
history = model.fit(train_dataset, **fit_options)

In [None]:
# Training vs. validation loss of the dimension head, per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['dimension_model_loss'])
ax.plot(history.history['val_dimension_model_loss'])
ax.set_title('dimension model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation loss of the orientation head, per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['orientation_model_loss'])
ax.plot(history.history['val_orientation_model_loss'])
ax.set_title('orientation model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Training vs. validation loss of the confidence head, per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['confidence_model_loss'])
ax.plot(history.history['val_confidence_model_loss'])
ax.set_title('confidence model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper right')
plt.show()

## Inference

In [None]:
def calculate_rotation(orientation, active_bin, num_bin):
    """Decode a rotation angle from per-bin ``(cos, sin)`` predictions.

    Args:
        orientation: Flat sequence of ``2 * num_bin`` values laid out as
            ``[cos_0, sin_0, cos_1, sin_1, ...]``.
        active_bin: Index of the bin to decode (typically the argmax of the
            confidence output).
        num_bin: Total number of bins covering the full ``2*pi`` range.

    Returns:
        The angle in radians as a Python float, wrapped to ``[-pi, pi)`` and
        rounded to 4 decimals.
    """
    bin_size = 2 * np.pi / num_bin  # bin size in radians
    # use the argument, not a same-named global from the calling cell
    bin_index = int(active_bin)

    cos_value = orientation[2 * bin_index]
    sin_value = orientation[2 * bin_index + 1]

    # offset from the lower edge of the bin, then shift by that edge
    angle_in_bin = np.arctan2(sin_value, cos_value)
    rotation = angle_in_bin + (bin_index * bin_size) - np.pi

    # arctan2 may return a negative offset, pushing the sum below -pi;
    # wrap back into the conventional [-pi, pi) interval
    rotation = (rotation + np.pi) % (2 * np.pi) - np.pi
    return round(float(rotation), 4)

In [None]:
def inference(model, image):
    """Run ``model`` on a single image.

    The image is passed through ``preprocess_image``, given a leading batch
    axis of size 1 and handed to ``model.predict``.

    Args:
        model: Object exposing ``predict``.
        image: A single image accepted by ``preprocess_image``.

    Returns:
        Whatever ``model.predict`` returns for the one-image batch.
    """
    single_image_batch = np.expand_dims(preprocess_image(image), axis=0)
    return model.predict(single_image_batch)

In [None]:
# camera intrinsic matrix
K = np.array([[721.5377, 0, 609.5593],
              [0, 721.5377, 172.854],
              [0, 0, 1]])

for i, example in enumerate(training_set.take(5)):
    image = example['image']
    # 2D box of the first object; currently not used by the drawing below
    bbox_2d = example['objects']['bbox'].numpy()[0]
    # the network does not predict a location, so the ground-truth one is used
    location = example['objects']['location'].numpy()[0]

    prediction = inference(model, image)
    dimension = prediction['dimension_model'][0]
    orientation = prediction['orientation_model'][0]
    # index of the most confident orientation bin for this single sample
    confidence = int(np.argmax(prediction['confidence_model'][0]))
    rotation = calculate_rotation(orientation, confidence, num_bin)  # rotation_y

    print("Sample", i+1)
    print("Dimension: ", dimension)
    print("Orientation: ", orientation)
    print("Confidence: ", confidence)
    print("Rotation_Y: ", rotation)

    # draw the 3D bounding box on a copy of the image
    image_with_bbox = draw_3d_box(image.numpy().copy(), dimension, location, rotation, K)
    # TFDS images are already RGB; a BGR->RGB conversion here would swap the
    # red and blue channels in the displayed picture
    plt.imshow(image_with_bbox)
    plt.axis('off')
    plt.show()
    print()

In [None]:
# actual values (ground_truth) for the same five samples, all annotated objects
for i, example in enumerate(training_set.take(5)):
    dimension = example["objects"]["dimensions"].numpy()
    orientation = example["objects"]["rotation_y"].numpy()

    print("Sample", i+1)
    print("Dimension: ", dimension)
    # the value shown is rotation_y, not the observation angle alpha
    print("Rotation_Y: ", orientation)
    print()
