In [None]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see in this runtime.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# Cityscapes Data

In [None]:
import tensorflow as tf
# Same GPU visibility check as the first cell of the notebook.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

In [None]:
from google.colab import drive
# Mount Google Drive so files stored there are reachable under /content/drive.
drive.mount('/content/drive')

# Extract the dataset archive from the mounted Drive into the current working
# directory. The zip must already exist at this Drive path.
!unzip /content/drive/My\ Drive/AER1515\ Project/final_data.zip

In [None]:
import os
import numpy as np
import cv2
import copy
import tqdm

from glob import glob
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from PIL import Image
import torchvision.transforms as T

from tensorflow.keras.layers import Conv2D, Activation, BatchNormalization
from tensorflow.keras.layers import UpSampling2D, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras import backend as K

In [None]:
# Seed NumPy and TensorFlow with the same fixed value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
# Root folders that load_data globs for "*/*/*_leftImg8bit.png" images and
# "*/*/*_labelIds.png" label images respectively.
IMAGE_PATH = "final_data/leftImg8bit_trainvaltest/leftImg8bit"
LABEL_PATH = "final_data/gtFine_trainvaltest/gtFine"
# Path prefix used by the ModelCheckpoint callback and by model.load_weights.
MODEL_CHECKPOINT = "ModelCheckpoint"
# Pixel values of the label images that are merged into the positive class.
# NOTE(review): presumably the Cityscapes "person" and "rider" label ids - confirm.
PED_ID = 24
RIDER_ID = 25
BATCH = 32  # batch size passed to tf_dataset for every split
IMAGE_SIZE = 256  # images and masks are resized to IMAGE_SIZE x IMAGE_SIZE
EPOCHS = 200  # maximum number of epochs passed to model.fit
LR = 1e-4  # learning rate given to the Nadam optimizer

In [None]:
def load_data(img_path, label_path):
    """Collect the sorted image and label-id file paths below two root folders.

    Args:
        img_path: Root directory searched for ``*/*/*_leftImg8bit.png`` files.
        label_path: Root directory searched for ``*/*/*_labelIds.png`` files.

    Returns:
        Tuple ``(images, labels)`` of path lists, each sorted lexicographically.
    """
    image_pattern = os.path.join(img_path, "*/*/*_leftImg8bit.png")
    label_pattern = os.path.join(label_path, "*/*/*_labelIds.png")

    image_files = glob(image_pattern)
    label_files = glob(label_pattern)
    image_files.sort()
    label_files.sort()
    return image_files, label_files

In [None]:
# Gather every image/label path and verify that the two sorted lists pair up.
imgs, labels = load_data(IMAGE_PATH, LABEL_PATH)

# Without these checks an empty listing would pass the pairing test vacuously,
# and uneven lists would either raise an opaque IndexError or pass silently.
assert len(imgs) > 0, "no images found under IMAGE_PATH"
assert len(imgs) == len(labels), "image and label counts differ"

# Both paths must reduce to the same relative "<dir>/<dir>/<frame>" key once
# the root folder and the per-type file suffix are stripped.
assert all(
    img.split("_leftImg8bit")[0].split("leftImg8bit/")[-1]
    == lbl.replace("_gtFine_", "_").split("_labelIds")[0].split("gtFine/")[-1]
    for img, lbl in zip(imgs, labels)
)

In [None]:
# Preview one sample image. OpenCV decodes to BGR channel order while
# matplotlib interprets arrays as RGB, so convert before plotting; otherwise
# the red and blue channels are shown swapped.
x = imgs[1]
x = cv2.imread(x, cv2.IMREAD_COLOR)
x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)
plt.figure(figsize = (5, 5))
plt.imshow(x)

In [None]:
# Binary mask for the same sample: 1.0 where the label id equals PED_ID or
# RIDER_ID, 0.0 everywhere else.
y = cv2.imread(labels[1], cv2.IMREAD_GRAYSCALE)
is_pedestrian = y == PED_ID
is_rider = y == RIDER_ID
y = (is_pedestrian | is_rider).astype(float)

In [None]:
# White-out every pixel of the sample image that lies outside the mask and
# display the result. `x` is modified in place.
h, w = y.shape
# One boolean-mask assignment replaces the per-pixel Python loop; the pixels
# selected (mask value 0) and the value written are unchanged.
x[y == 0] = [255, 255, 255]
plt.figure(figsize = (5, 5))
plt.imshow(x)

In [None]:
def load_data(images, masks, split=0.1):
    """Split paired image/mask path lists into train, validation and test sets.

    Images and masks are passed to ``train_test_split`` together, so each pair
    is shuffled with the same indices and mismatched list lengths are rejected
    by scikit-learn instead of silently producing misaligned pairs.

    NOTE: this definition shares its name with the path-globbing ``load_data``
    defined earlier in the notebook and replaces it once this cell has run.

    Args:
        images: Sequence of image paths.
        masks: Sequence of mask paths, aligned index-by-index with ``images``.
        split: Fraction of the full dataset used for the validation set and,
            separately, for the test set.

    Returns:
        ``(train_x, train_y), (valid_x, valid_y), (test_x, test_y)``.
    """
    total_size = len(images)
    valid_size = int(split * total_size)
    test_size = int(split * total_size)

    # First carve off the validation set, then take the test set from the rest.
    train_x, valid_x, train_y, valid_y = train_test_split(
        images, masks, test_size=valid_size, random_state=42)
    train_x, test_x, train_y, test_y = train_test_split(
        train_x, train_y, test_size=test_size, random_state=42)
    return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)

def downsize(x):
    """Resize ``x`` to exactly ``IMAGE_SIZE`` x ``IMAGE_SIZE``.

    Both sides are forced to the same length, so the aspect ratio of the input
    is not preserved.
    """
    target_size = (IMAGE_SIZE, IMAGE_SIZE)
    return T.functional.resize(x, target_size)
    
def read_image(path, train=False):
    """Load an image as an RGB array, resize it with ``downsize`` and divide by 255.

    Args:
        path: Image file path. Must be ``bytes`` when ``train`` is true, since
            it is decoded to ``str`` before reading.
        train: When true, call ``path.decode()`` first.

    Returns:
        The resized image as a NumPy array divided by 255.0.
    """
    filename = path.decode() if train else path
    rgb = read_and_rgb(filename)
    resized = downsize(Image.fromarray(rgb))
    return np.array(resized) / 255.0

def read_mask(path, train=False):
    """Load a label-id image and return the binary pedestrian/rider mask.

    The label image is resized with nearest-neighbour sampling. Label ids are
    categorical, so an interpolating resize blends neighbouring ids into values
    that can spuriously equal, or miss, ``PED_ID`` / ``RIDER_ID``.

    Args:
        path: Label image path. Must be ``bytes`` when ``train`` is true, since
            it is decoded to ``str`` before reading.
        train: When true, call ``path.decode()`` first.

    Returns:
        Float array of shape ``(IMAGE_SIZE, IMAGE_SIZE, 1)`` holding 1.0 where
        the label id is ``PED_ID`` or ``RIDER_ID`` and 0.0 elsewhere.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    if train:
        path = path.decode()
    label_ids = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if label_ids is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("could not read mask image: {}".format(path))

    label_ids = cv2.resize(
        label_ids, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_NEAREST)
    mask = (label_ids == PED_ID) | (label_ids == RIDER_ID)
    return np.expand_dims(mask.astype(float), axis=-1)

def read_and_rgb(x):
    """Read the image file at path ``x`` and return it in RGB channel order.

    Args:
        x: Path of the image file.

    Returns:
        The image array after converting OpenCV's BGR order to RGB.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    bgr = cv2.imread(x, cv2.IMREAD_COLOR)
    if bgr is None:
        # cv2.imread signals failure by returning None; fail here with the
        # offending path instead of inside cvtColor.
        raise FileNotFoundError("could not read image: {}".format(x))
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

def tf_parse(x, y):
    """Load one image/mask pair inside a ``tf.data`` pipeline.

    Args:
        x: Tensor holding the image path.
        y: Tensor holding the mask path.

    Returns:
        ``(image, mask)`` float64 tensors with static shapes
        ``(IMAGE_SIZE, IMAGE_SIZE, 3)`` and ``(IMAGE_SIZE, IMAGE_SIZE, 1)``.
    """
    def _load_pair(image_path, mask_path):
        # Paths arrive as bytes here, hence train=True.
        return read_image(image_path, train=True), read_mask(mask_path, train=True)

    image, mask = tf.numpy_function(_load_pair, [x, y], [tf.float64, tf.float64])
    # Declare the static shapes explicitly on the returned tensors.
    image.set_shape([IMAGE_SIZE, IMAGE_SIZE, 3])
    mask.set_shape([IMAGE_SIZE, IMAGE_SIZE, 1])
    return image, mask

def tf_dataset(x, y, batch=8):
    """Build an endlessly repeating, batched dataset of parsed image/mask pairs.

    Args:
        x: Sequence of image paths.
        y: Sequence of mask paths aligned with ``x``.
        batch: Batch size.

    Returns:
        A ``tf.data.Dataset`` that maps ``tf_parse`` over the path pairs,
        batches them and then repeats indefinitely.
    """
    return (
        tf.data.Dataset.from_tensor_slices((x, y))
        .map(tf_parse)
        .batch(batch)
        .repeat()
    )

In [None]:
# Split the paired paths; validation and test each get int(0.1 * total) samples.
(train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data(imgs, labels, split=0.1)

# Ceiling division: a trailing partial batch still counts as one step.
train_steps = (len(train_x) + BATCH - 1) // BATCH
valid_steps = (len(valid_x) + BATCH - 1) // BATCH

In [None]:
# One batched, repeating dataset per split, all using the same batch size.
train_dataset, valid_dataset, test_dataset = (
    tf_dataset(split_x, split_y, batch=BATCH)
    for split_x, split_y in ((train_x, train_y), (valid_x, valid_y), (test_x, test_y))
)

In [None]:
# Number of samples in the train, validation and test splits.
tuple(len(split_paths) for split_paths in (train_x, valid_x, test_x))

In [None]:
def PedSegmentModel():
    """Build the encoder-decoder segmentation network on a MobileNetV2 encoder.

    The encoder is an ImageNet-initialised MobileNetV2 without its top. The
    decoder starts from one encoder activation and, for every skip connection
    from deepest to shallowest, upsamples by 2, concatenates the skip tensor
    and applies two 3x3 Conv-BatchNorm-ReLU stages. A final 1x1 convolution
    with sigmoid activation produces the single-channel output.

    Returns:
        A ``tf.keras`` ``Model`` mapping an ``(IMAGE_SIZE, IMAGE_SIZE, 3)``
        input named ``"input_image"`` to the sigmoid mask.
    """
    inputs = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3), name="input_image")
    encoder = MobileNetV2(input_tensor=inputs, weights="imagenet", include_top=False)

    # Variants tried (decoder start | extra skip beyond the four below | filters):
    #   model v1: block_13_expand_relu | -                    | [16, 32, 48, 64]
    #   model v2: block_16_expand_relu | block_13_expand_relu | [16, 32, 48, 64, 80]
    #   model v3: block_10_expand_relu | -                    | [16, 32, 48, 64]
    #   model v4: block_7_expand_relu  | -                    | [16, 32, 48, 64]  (active)
    decoder_start = "block_7_expand_relu"
    skip_connection_names = ["input_image", "block_1_expand_relu", "block_3_expand_relu", "block_6_expand_relu"]
    filters = [16, 32, 48, 64]

    def conv_bn_relu(tensor, n_filters):
        # One 3x3 convolution stage of the decoder.
        tensor = Conv2D(n_filters, (3, 3), padding="same")(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation("relu")(tensor)

    x = encoder.get_layer(decoder_start).output
    # Walk the skips from the last (deepest) entry to the first (the input).
    for skip_name, n_filters in zip(reversed(skip_connection_names), reversed(filters)):
        x_skip = encoder.get_layer(skip_name).output
        x = UpSampling2D((2, 2))(x)
        x = Concatenate()([x, x_skip])
        x = conv_bn_relu(x, n_filters)
        x = conv_bn_relu(x, n_filters)

    x = Conv2D(1, (1, 1), padding="same")(x)
    x = Activation("sigmoid")(x)

    return Model(inputs, x)

# Small constant added to numerator and denominator of the IoU ratio.
smooth = 1e-15
def IOU(y_true, y_pred):
    """Soft intersection-over-union summed over every element of the batch.

    Args:
        y_true: Ground-truth mask tensor.
        y_pred: Predicted mask tensor.

    Returns:
        Scalar tensor ``(intersection + smooth) / (union + smooth)``.
    """
    flat_true = tf.keras.layers.Flatten()(y_true)
    flat_pred = tf.keras.layers.Flatten()(y_pred)
    intersection = tf.reduce_sum(flat_true * flat_pred)
    union = tf.reduce_sum(flat_true) + tf.reduce_sum(flat_pred) - intersection
    return (intersection + smooth) / (union + smooth)

def IOU_loss(y_true, y_pred):
    """Training loss: one minus the soft IoU of the batch."""
    return 1.0 - IOU(y_true, y_pred)
    # Alternative loss left for reference:
    # return tf.keras.losses.MSE(y_true, y_pred)

In [None]:
# Optimizer, metrics and callbacks used by model.compile / model.fit.
opt = tf.keras.optimizers.Nadam(LR)
metrics = [IOU, Recall(), Precision()]

# All callbacks that monitor a quantity watch the validation loss.
reduce_lr_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=4)
early_stop_cb = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=False)
# Only the best weights (lowest val_loss) are kept at MODEL_CHECKPOINT.
checkpoint_cb = ModelCheckpoint(
    filepath=MODEL_CHECKPOINT,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)
tensorboard_cb = TensorBoard(log_dir="logs/")
callbacks = [reduce_lr_cb, early_stop_cb, checkpoint_cb, tensorboard_cb]

In [None]:
# Build the network and attach the IoU loss, the optimizer and the metrics.
model = PedSegmentModel()
model.compile(optimizer=opt, loss=IOU_loss, metrics=metrics)

In [None]:
# To resume from previously saved weights, uncomment the next line.
# model.load_weights(MODEL_CHECKPOINT)

# The datasets repeat forever, so the number of steps per epoch is given explicitly.
fit_options = dict(
    validation_data=valid_dataset,
    epochs=EPOCHS,
    steps_per_epoch=train_steps,
    validation_steps=valid_steps,
    callbacks=callbacks,
)
model.fit(train_dataset, **fit_options)

In [None]:
# Ceiling division so a trailing partial batch is still evaluated.
test_steps = (len(test_x) + BATCH - 1) // BATCH

# Restore the weights saved by the ModelCheckpoint callback, then score the test split.
model.load_weights(MODEL_CHECKPOINT)
model.evaluate(test_dataset, steps=test_steps)

In [None]:
def overlay_img_on_mask(img, mask):
    """Black out, in place, every pixel of ``img`` where channel 0 of ``mask`` is falsy.

    The pixels to clear are derived from ``mask`` itself, so the function no
    longer depends on notebook-level ``h`` / ``w`` variables being set.

    Args:
        img: Array of shape ``(H, W, 3)``; modified in place.
        mask: Array of shape ``(H, W, C)``; only channel 0 is consulted.

    Returns:
        None.
    """
    background = ~np.asarray(mask)[..., 0].astype(bool)
    # Boolean-mask assignment replaces the per-pixel Python loop.
    img[background] = [0, 0, 0]
                
def mask_parse(mask):
    """Turn a single-channel mask into a 3-channel image by repeating it.

    Args:
        mask: Array that squeezes to shape ``(H, W)``.

    Returns:
        Array of shape ``(H, W, 3)`` whose three channels all equal the mask.
    """
    plane = np.squeeze(mask)
    channels_first = np.stack([plane, plane, plane], axis=0)
    return np.transpose(channels_first, (1, 2, 0))

# Qualitative check on a few test samples. Each figure shows, left to right:
# input image, ground-truth mask, thresholded prediction, and the input with
# everything outside the prediction blacked out.
sample_ids = [11, 44, 102, 106, 500]
show_x = [test_x[idx] for idx in sample_ids]
show_y = [test_y[idx] for idx in sample_ids]
# To browse the first samples instead:
# show_x = test_x[:10]
# show_y = test_y[:10]
for i, (x, y) in enumerate(zip(show_x, show_y)):
    x = read_image(x)
    y = read_mask(y)
    probabilities = model.predict(np.expand_dims(x, axis=0))[0]
    y_pred = probabilities > 0.5
    # h and w stay notebook-level names because overlay_img_on_mask reads them.
    h, w, _ = x.shape
    white_line = np.ones((h, 10, 3))

    overlay_img = x.copy()
    overlay_img_on_mask(overlay_img, y_pred)

    # Interleave the panels with white separators (none after the last panel).
    panels = [x, mask_parse(y), mask_parse(y_pred), overlay_img]
    all_images = []
    for panel in panels:
        all_images += [panel, white_line]
    all_images.pop()

    image = np.concatenate(all_images, axis=1)

    fig = plt.figure(figsize=(12, 12))
    a = fig.add_subplot(1, 1, 1)
    imgplot = plt.imshow(image)

In [None]:
%%shell
# Copy both checkpoint files from the session to Google Drive.
for suffix in data-00000-of-00001 index; do
  cp "/content/ModelCheckpoint.${suffix}" "/content/drive/My Drive/AER1515 Project/ModelCheckpoint.${suffix}"
done

# Final Model Comparison

In [None]:
%%shell
# Copy the checkpoint files of all four model variants from Google Drive into
# the current directory (same order as before: 13, 16, 10, 7).
for variant in 13 16 10 7; do
  for suffix in data-00000-of-00001 index; do
    cp "/content/drive/My Drive/AER1515 Project/ModelCheckpoint_${variant}.${suffix}" "ModelCheckpoint_${variant}.${suffix}"
  done
done

In [None]:
import torchvision
import torch

In [None]:
def downsize(x):
    """Return ``x`` resized to ``IMAGE_SIZE`` by ``IMAGE_SIZE``; both sides are forced to that length."""
    side = IMAGE_SIZE
    resized = T.functional.resize(x, (side, side))
    return resized
    
def read_image(path):
    """Load the image at ``path`` as an RGB array resized by ``downsize``.

    NOTE: this replaces the earlier notebook definition of the same name. It
    takes no ``train`` flag and does not divide the pixel values by 255.

    Args:
        path: Image file path.

    Returns:
        NumPy array of the resized RGB image.
    """
    pil_image = Image.fromarray(read_and_rgb(path))
    return np.array(downsize(pil_image))

def read_mask(path):
    """Load a label-id image and return the boolean pedestrian/rider mask.

    The label image is resized with nearest-neighbour sampling. Label ids are
    categorical, so an interpolating resize blends neighbouring ids into values
    that can spuriously equal, or miss, ``PED_ID`` / ``RIDER_ID``.

    NOTE: this replaces the earlier notebook definition of the same name. It
    takes no ``train`` flag and returns a 2-D boolean array.

    Args:
        path: Label image path.

    Returns:
        Boolean array of shape ``(IMAGE_SIZE, IMAGE_SIZE)``, true where the
        label id is ``PED_ID`` or ``RIDER_ID``.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    label_ids = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if label_ids is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("could not read mask image: {}".format(path))

    label_ids = cv2.resize(
        label_ids, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_NEAREST)
    return (label_ids == PED_ID) | (label_ids == RIDER_ID)

def read_and_rgb(x):
    """Read the image file at path ``x`` and return it in RGB channel order.

    Args:
        x: Path of the image file.

    Returns:
        The image array after converting OpenCV's BGR order to RGB.

    Raises:
        FileNotFoundError: If OpenCV cannot read the file.
    """
    bgr = cv2.imread(x, cv2.IMREAD_COLOR)
    if bgr is None:
        # cv2.imread signals failure by returning None; fail here with the
        # offending path instead of inside cvtColor.
        raise FileNotFoundError("could not read image: {}".format(x))
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)

def overlay_img_on_mask(img, mask):
    """Black out, in place, every pixel of ``img`` where ``mask`` is falsy.

    The pixels to clear are derived from the arguments, so the function no
    longer depends on notebook-level ``h`` / ``w`` variables being set.

    Args:
        img: Array of shape ``(H, W, 3)``; modified in place.
        mask: Array of shape ``(H, W)`` or ``(H, W, 1)``.

    Returns:
        None.
    """
    height, width = np.shape(img)[:2]
    background = ~np.asarray(mask, dtype=bool).reshape(height, width)
    # Boolean-mask assignment replaces the per-pixel Python loop.
    img[background] = [0, 0, 0]

def mask_parse(mask):
    """Repeat a single-channel mask three times to form an ``(H, W, 3)`` array.

    Args:
        mask: Array that squeezes to shape ``(H, W)``.

    Returns:
        Array of shape ``(H, W, 3)`` with the mask copied into every channel.
    """
    plane = np.squeeze(mask)
    stacked = np.array((plane,) * 3)
    return stacked.transpose(1, 2, 0)

In [None]:
def PedSegmentModel(version):
    """Build one of the four encoder-decoder variants compared in this notebook.

    Every variant uses an ImageNet-initialised MobileNetV2 encoder without its
    top. The decoder starts from one encoder activation and, for every skip
    connection from deepest to shallowest, upsamples by 2, concatenates the
    skip tensor and applies two 3x3 Conv-BatchNorm-ReLU stages. A final 1x1
    convolution with sigmoid activation produces the single-channel output.

    Args:
        version: Variant index. 0 starts the decoder at ``block_13_expand_relu``,
            1 at ``block_16_expand_relu`` (with ``block_13_expand_relu`` as an
            extra skip), 2 at ``block_10_expand_relu`` and 3 at
            ``block_7_expand_relu``.

    Returns:
        A ``tf.keras`` ``Model`` mapping an ``(IMAGE_SIZE, IMAGE_SIZE, 3)``
        input named ``"input_image"`` to the sigmoid mask.

    Raises:
        ValueError: If ``version`` is not one of 0, 1, 2, 3.
    """
    base_skips = ["input_image", "block_1_expand_relu", "block_3_expand_relu", "block_6_expand_relu"]
    # version -> (decoder start layer, skip-connection layers, filters per skip)
    variants = {
        0: ("block_13_expand_relu", base_skips, [16, 32, 48, 64]),
        1: ("block_16_expand_relu", base_skips + ["block_13_expand_relu"], [16, 32, 48, 64, 80]),
        2: ("block_10_expand_relu", base_skips, [16, 32, 48, 64]),
        3: ("block_7_expand_relu", base_skips, [16, 32, 48, 64]),
    }
    # Reject unknown versions up front; previously they surfaced later as an
    # unbound-local error after the encoder had already been built.
    if version not in variants:
        raise ValueError(
            "unknown model version {!r}; expected one of {}".format(version, sorted(variants)))
    encoder_output_name, skip_connection_names, f = variants[version]

    inputs = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 3), name="input_image")

    encoder = MobileNetV2(input_tensor=inputs, weights="imagenet", include_top=False) #, alpha=0.35)

    x = encoder.get_layer(encoder_output_name).output
    # Walk the skips from the last (deepest) entry to the first (the input).
    for i in range(1, len(skip_connection_names)+1, 1):
        x_skip = encoder.get_layer(skip_connection_names[-i]).output
        x = UpSampling2D((2, 2))(x)
        x = Concatenate()([x, x_skip])

        x = Conv2D(f[-i], (3, 3), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)

        x = Conv2D(f[-i], (3, 3), padding="same")(x)
        x = BatchNormalization()(x)
        x = Activation("relu")(x)

    x = Conv2D(1, (1, 1), padding="same")(x)
    x = Activation("sigmoid")(x)

    model = Model(inputs, x)
    return model

In [None]:
# Preprocessing applied to every image before it is fed to the FCN models:
# convert to a tensor, then normalise each channel with the given mean/std.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])
# Class index compared against the FCN argmax output.
# NOTE(review): presumably the "person" class of the pretrained weights - confirm.
PEDID = 15
fcn_101 = torchvision.models.segmentation.fcn_resnet101(pretrained=True)
fcn_50 = torchvision.models.segmentation.fcn_resnet50(pretrained=True)
# Newly constructed modules are in training mode. Switch to eval() so that
# BatchNorm uses its stored statistics when the models are used for inference.
fcn_101.cuda().eval()
fcn_50.cuda().eval()
def fcn_predict(fcn, x):
    """Predict a boolean mask of ``PEDID`` pixels for one image with an FCN model.

    The model is put in evaluation mode and run without gradient tracking, and
    the input is moved to whichever device the model's parameters live on.

    Args:
        fcn: Segmentation model whose forward pass returns a mapping with an
            ``'out'`` entry of per-class scores.
        x: Image accepted by ``transform``.

    Returns:
        Boolean NumPy array, true where the highest-scoring class is ``PEDID``.
    """
    fcn.eval()  # idempotent; guards against a model left in training mode
    device = next(fcn.parameters()).device

    inp = transform(x).unsqueeze(0).to(device)
    with torch.no_grad():
        scores = fcn(inp)['out'].squeeze(0)
    class_map = torch.argmax(scores, dim=0).cpu().numpy()
    return class_map == PEDID

# Model version -> checkpoint path prefix holding that variant's weights.
checkpoint_map = {0: "ModelCheckpoint_13", 1: "ModelCheckpoint_16", 2: "ModelCheckpoint_10", 3: "ModelCheckpoint_7"}
# Models already built and loaded, keyed by version. Re-running this cell
# clears the cache, which also picks up changed checkpoint files.
_loaded_models = {}
def model_predict(x, version):
    """Predict a boolean mask for one image with the given model variant.

    The network for each version is built and its weights loaded only on first
    use; later calls reuse the cached model instead of rebuilding the encoder
    and re-reading the checkpoint for every image.

    Args:
        x: Single image array without a batch dimension.
        version: Key of ``checkpoint_map`` selecting the variant.

    Returns:
        Boolean array: the model output for ``x`` thresholded at 0.5.
    """
    if version not in _loaded_models:
        model = PedSegmentModel(version)
        model.load_weights(checkpoint_map[version])
        _loaded_models[version] = model
    y_pred = _loaded_models[version].predict(np.expand_dims(x, axis=0))[0] > 0.5
    return y_pred

In [None]:
# Comparison grid: one column per test sample and, top to bottom, the input
# image, the ground-truth mask, the predictions of model versions 3, 2, 0, 1
# and the predictions of FCN-ResNet50 and FCN-ResNet101.
sample_ids = [11, 44, 102, 106, 500, 475, 451, 485, 389]
show_x = [test_x[idx] for idx in sample_ids]
show_y = [test_y[idx] for idx in sample_ids]
columns = []
for i, (x, y) in enumerate(zip(show_x, show_y)):
    x = read_image(x)
    y = read_mask(y)
    # The Keras models take the image divided by 255; compute it once.
    x_scaled = x/255

    y_pred_fcn_50 = fcn_predict(fcn_50, x)
    y_pred_fcn_101 = fcn_predict(fcn_101, x)
    y_pred_mv0 = model_predict(x_scaled, version=0)
    y_pred_mv1 = model_predict(x_scaled, version=1)
    y_pred_mv2 = model_predict(x_scaled, version=2)
    y_pred_mv3 = model_predict(x_scaled, version=3)

    panels = [
        x_scaled,
        mask_parse(y),
        mask_parse(y_pred_mv3),
        mask_parse(y_pred_mv2),
        mask_parse(y_pred_mv0),
        mask_parse(y_pred_mv1),
        mask_parse(y_pred_fcn_50),
        mask_parse(y_pred_fcn_101),
    ]

    # Separator sizes follow the actual image size and panel count instead of
    # the hard-coded 256 and 8.
    h, w, _ = x.shape
    white_line = np.ones((10, w, 3))
    long_white_line = np.ones(((h+10)*len(panels), 10, 3))

    # Every panel is followed by a horizontal white separator.
    all_images = []
    for panel in panels:
        all_images += [panel, white_line]
    image = np.concatenate(all_images, axis=0)

    # Every column is preceded by a vertical white separator.
    columns += [long_white_line, image]

# Join the columns once at the end rather than re-concatenating per sample.
images = np.concatenate(columns, axis=1)

fig = plt.figure(figsize=(12, 12))
imgplot = plt.imshow(images)

# Inference

In [None]:
import time

# Time single-image inference of the model currently bound to `model`.
m = model

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

dummy_input = np.random.rand(1, IMAGE_SIZE, IMAGE_SIZE, 3)

# WARM-UP
for _ in range(10):
    _ = m(dummy_input)

# MEASURE PERFORMANCE
t = 0
repetitions = 300
for r in range(repetitions):
    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    result = m(dummy_input)
    # Fetch the result so the measurement covers the completed computation,
    # not just the time needed to enqueue it on the device.
    result.numpy()
    end_time = time.perf_counter()
    t += (end_time - start_time)*1000
print("Mean inference time:", t/repetitions, " ms.")

# Tensorboard

If TensorBoard does not load here, download the `logs` directory and open it from a local Jupyter notebook instead.

In [None]:
# Load the TensorBoard notebook extension and point it at the "logs" directory,
# which is where the TensorBoard training callback writes (log_dir="logs/").
%load_ext tensorboard
%tensorboard --logdir logs

# Final Comparisons

My model v1 (encoder output: block 13; skip connections: input, blocks 1, 3, 6)
- number of param = 1,265,729
- model size = 14.8 MB
- mean inference time = 37.2 ms
- test IOU = 78.7%

My model v2 (encoder output: block 16; skip connections: input, blocks 1, 3, 6, 13)
- number of param = 3,050,401
- model size = 35.6 MB
- mean inference time = 44.6 ms
- test IOU = 79.5%

My model v3 (encoder output: block 10; skip connections: input, blocks 1, 3, 6)
- number of param = 813,761
- model size = 9.5 MB
- mean inference time = 31.16 ms
- test IOU = 76.6%

My model v4 (encoder output: block 7; skip connections: input, blocks 1, 3, 6)
- number of param = 645,953
- model size = 7.6 MB
- mean inference time = 23.81 ms
- test IOU = 75.7%

ResNet101 - FCN (pretrained)
- number of param = 54,314,346
- model size = 208 MB
- mean inference time = 34.42 ms
- test IOU = 52.4%

ResNet50 - FCN (pretrained)
- number of param = 35,322,218
- model size = 136 MB
- mean inference time = 20.07 ms
- test IOU = 52.6%