In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import add, Input, Dense, Conv2D, MaxPooling2D, AveragePooling2D, ZeroPadding2D, Dropout, Flatten, Concatenate, Reshape, Activation, BatchNormalization, GlobalAveragePooling2D, ZeroPadding2D
from tensorflow.nn import local_response_normalization
from tensorflow.python.keras.layers.merge import concatenate
from tensorflow.keras.activations import relu
import sys
import matplotlib.pyplot as plt
import math
import matplotlib
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.regularizers import l2, l1, l1_l2
from itertools import permutations, combinations
import cv2

In [2]:
# TF1-compat session options:
#   * allow_growth         -> grow the GPU memory allocation on demand
#   * log_device_placement -> log the device each operation ran on
# Optional fixed memory cap, currently disabled:
# config.gpu_options.per_process_gpu_memory_fraction = 0.60
gpu_options = tf.compat.v1.GPUOptions(allow_growth=True)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options, log_device_placement=True)

# Create the session with these options and hand it to the Keras backend.
sess = tf.compat.v1.Session(config=config)
tf.compat.v1.keras.backend.set_session(sess)

Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: GeForce RTX 3080, pci bus id: 0000:41:00.0, compute capability: 8.6



In [3]:
global_batch_size = 32
image_resize = 32

# Arguments shared by every directory iterator created in this cell.
flow_kwargs = dict(
    class_mode='categorical',
    target_size=(image_resize, image_resize),
    batch_size=global_batch_size,
)

########## Train
# Only pixel rescaling is applied. Augmented alternative kept for reference:
# datagen = ImageDataGenerator(rescale=1./255, rotation_range = 20, horizontal_flip = True, vertical_flip=True, height_shift_range = 0.2,
#                                    width_shift_range = 0.2, zoom_range = 0.2)
datagen = ImageDataGenerator(rescale=1./255)
train_it = datagen.flow_from_directory('cat_dog/train', **flow_kwargs)

############ Test
# validation_split=0.5 lets one generator serve two subsets of 'cat_dog/test':
# subset "training" becomes the validation iterator and subset "validation"
# becomes the test iterator.
test_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.5)
validation_it = test_datagen.flow_from_directory(
    'cat_dog/test', subset="training", seed=545, **flow_kwargs)
test_it = test_datagen.flow_from_directory(
    'cat_dog/test', subset="validation", seed=545, **flow_kwargs)

# Pull a single training batch to confirm the tensor shapes.
data_batch, labels_batch = next(iter(train_it))
print('data batch shape:', data_batch.shape)
print('labels batch shape:', labels_batch.shape)


Found 19999 images belonging to 2 classes.
Found 2500 images belonging to 2 classes.
Found 2499 images belonging to 2 classes.
data batch shape: (32, 32, 32, 3)
labels batch shape: (32, 2)


In [4]:
class CustomLoss(keras.losses.Loss):
    """Categorical cross-entropy multiplied by a constant factor.

    Args:
        factor: multiplier applied to the per-sample cross-entropy.
        **kwargs: forwarded to `keras.losses.Loss` (e.g. `name`, `reduction`),
            so the standard loss options stay available.
    """

    def __init__(self, factor, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, y_true, y_pred):
        """Return `factor * categorical_crossentropy(y_true, y_pred)`."""
        ce = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
        return ce * self.factor

    def get_config(self):
        """Return the base loss config extended with `factor`.

        Including `factor` lets `from_config` rebuild the loss with the same
        scaling instead of failing on the missing constructor argument.
        """
        config = super().get_config()
        config['factor'] = self.factor
        return config
    
class Residual_block(tf.keras.Model):
    """Two 3x3 conv + batch-norm layers with an additive shortcut.

    Args:
        num_channels: number of filters of every convolution in the block.
        use_1x1conv: when True, the shortcut goes through a 1x1 convolution
            (with the same `strides`) before being added.
        strides: stride of the first 3x3 convolution and of the 1x1 shortcut
            convolution.
    """

    def __init__(self, num_channels, use_1x1conv=False, strides=1):
        super(Residual_block, self).__init__()

        def make_conv(kernel_size, **extra):
            # Every convolution shares initializer and L2 regularisation settings.
            return Conv2D(
                num_channels,
                kernel_size=kernel_size,
                kernel_initializer='he_uniform',
                kernel_regularizer=l2(0.00001),
                bias_regularizer=l2(0.00001),
                **extra)

        # Attribute assignment order is kept as conv1, conv2, conv3, bn1, bn2.
        self.conv1 = make_conv(3, padding='same', strides=strides)
        self.conv2 = make_conv(3, padding='same')
        self.conv3 = make_conv(1, strides=strides) if use_1x1conv else None

        self.bn1 = BatchNormalization()
        self.bn2 = BatchNormalization()

    def call(self, X, training):
        """Apply the block to `X`.

        Any `training` value that does not compare equal to True or False
        (e.g. None or a sentinel) is replaced by False, i.e. inference mode.
        """
        if training not in (True, False):
            training = False

        # Main path: conv -> bn -> relu -> conv -> bn
        branch = self.bn1(self.conv1(X), training=training)
        branch = relu(branch)
        branch = self.bn2(self.conv2(branch), training=training)

        # Shortcut path: identity, or the 1x1 convolution when configured.
        shortcut = X if self.conv3 is None else self.conv3(X)

        return relu(branch + shortcut)


class CIFAR_ResNet(tf.keras.Model):
    """ResNet backbone with an exit classifier after the stem and after every residual block.

    The backbone is a 3x3 convolution stem (16 channels) followed by three
    stages of `model_n` residual blocks with 16, 32 and 64 channels. The first
    block of the second and third stage uses stride 2 and a 1x1 shortcut
    convolution. Every exit applies global average pooling, flattening and its
    own 2-unit softmax Dense layer.

    Args:
        model_n: number of residual blocks per stage.
        branch_number: number of exit heads to create. `call` uses one exit
            for the stem plus one per residual block, so at least
            `3 * model_n + 1` heads are required.

    Raises:
        ValueError: if `branch_number` is smaller than `3 * model_n + 1`.
    """

    def __init__(self, model_n, branch_number):
        super(CIFAR_ResNet, self).__init__()

        # `call` indexes exits 0 .. 3*model_n; fail here with a clear message
        # instead of an IndexError in the middle of a forward pass.
        required_branches = 3 * model_n + 1
        if branch_number < required_branches:
            raise ValueError(
                "branch_number=%d is too small for model_n=%d: the model has %d exits"
                % (branch_number, model_n, required_branches))

        self.model_n = model_n
        self.branch_number = branch_number
        self.residual_layers = []

        ######## Beginning layers
        # `image_resize` is the module-level image size defined in the data cell.
        self.conv1 = Conv2D(16, kernel_size=3, padding='same', kernel_initializer='he_uniform', input_shape=(image_resize, image_resize, 3), kernel_regularizer=l2(0.00001), bias_regularizer=l2(0.00001))
        self.bn1 = BatchNormalization()

        ######## ResNet layers
        ### first stage, 16 channels
        for i in range(self.model_n):
            self.residual_layers.append(Residual_block(16))

        ### second stage, 32 channels (first block downsamples)
        for i in range(self.model_n):
            if i == 0:
                self.residual_layers.append(Residual_block(32, use_1x1conv=True, strides=2))
            else:
                self.residual_layers.append(Residual_block(32))

        ### third stage, 64 channels (first block downsamples)
        # NOTE(review): `residual_layers_block3` is never filled or read in this
        # class; it is kept so the set of tracked attributes stays the same as
        # when the existing checkpoints were written - confirm before removing.
        self.residual_layers_block3 = []
        for i in range(self.model_n):
            if i == 0:
                self.residual_layers.append(Residual_block(64, use_1x1conv=True, strides=2))
            else:
                self.residual_layers.append(Residual_block(64))

        ######## OUTPUT layers
        # The pooling and flatten layers are shared by all exits; each exit has
        # its own Dense classifier.
        self.pool_out = GlobalAveragePooling2D()
        self.flat_out = Flatten()

        self.dense_layers = []
        for i in range(self.branch_number):
            self.dense_layers.append(Dense(2, kernel_initializer='he_uniform', kernel_regularizer=l2(0.00001), bias_regularizer=l2(0.00001), activation='softmax'))

    def call(self, inputs, training=None):
        """Run the backbone and every exit head.

        Args:
            inputs: batch of images.
            training: a real boolean is used as the batch-norm training flag.
                Any other value (None, or a sentinel such as the 1000 used by
                the profiling cells) switches to profiling mode: layers run
                in inference mode and timing data is returned as well.

        Returns:
            With a boolean `training`: a list of `branch_number` entries, one
            softmax output per exit (entries beyond the last exit stay `[]`).
            In profiling mode: a tuple `(outputs, backbone_latencies,
            exit_latencies)`, where the latency lists hold `tf.timestamp()`
            differences in seconds for each backbone step and exit head.
        """
        inference_flag = -1000
        layer_ind_counter = 0
        comp_latency_list_backbone = [0] * self.branch_number
        comp_latency_list_exitbranch = [0] * self.branch_number
        out_vector_list = [[] for _ in range(self.branch_number)]

        # Anything that is not True/False requests profiling output.
        if (training != False and training != True):
            inference_flag = 1000
            training = False

        # NOTE(review): the latencies are wall-clock differences of
        # tf.timestamp(); if device kernels are dispatched asynchronously they
        # may not include the full device execution time - confirm.

        ##### Beginning layers
        start_time = tf.timestamp()
        x = self.conv1(inputs)
        x = self.bn1(x, training=training)
        x = relu(x)
        comp_latency_list_backbone[layer_ind_counter] = tf.timestamp() - start_time

        # exit branch after the stem
        start_time = tf.timestamp()
        temp = self.pool_out(x)
        temp = self.flat_out(temp)
        out_vector_list[layer_ind_counter] = self.dense_layers[layer_ind_counter](temp)
        comp_latency_list_exitbranch[layer_ind_counter] = tf.timestamp() - start_time

        ##### ResNet layers
        for layer in self.residual_layers:
            layer_ind_counter += 1
            start_time = tf.timestamp()
            # Pass the resolved mode explicitly rather than relying on Keras
            # propagating the outer (possibly sentinel) `training` value.
            x = layer(x, training=training)
            comp_latency_list_backbone[layer_ind_counter] = tf.timestamp() - start_time

            # exit branch after each residual block
            start_time = tf.timestamp()
            temp = self.pool_out(x)
            temp = self.flat_out(temp)
            out_vector_list[layer_ind_counter] = self.dense_layers[layer_ind_counter](temp)
            comp_latency_list_exitbranch[layer_ind_counter] = tf.timestamp() - start_time

        if (inference_flag == 1000):
            return (out_vector_list, comp_latency_list_backbone, comp_latency_list_exitbranch)

        return out_vector_list

In [5]:
##### ResNet20 + cat & dog (32x32)

##### for ResNet20: 3 residual blocks per stage, 10 exit heads
model_n = 3
branch_number= 10

##### optimizer and one unit-weight loss per exit head
opt = tf.keras.optimizers.SGD(momentum=0.9)
ls = [CustomLoss(1) for _ in range(branch_number)]

##### build, compile, load weight, evaluate on the test iterator
model_CIFAR_ResNet = CIFAR_ResNet(model_n, branch_number)
model_CIFAR_ResNet.compile(loss=ls, optimizer=opt, metrics=['accuracy'])
model_CIFAR_ResNet.load_weights(
    'profiling_models/ResNet20_10out_bs32_epoch500_lr0.01_catdog(32)')
model_CIFAR_ResNet.evaluate(test_it);



In [5]:
##### ResNet20 + cat & dog (128x128)
# NOTE(review): `test_it` is created in the data cell with `image_resize`,
# which is set to 32 there; presumably that cell is re-run with 128 before
# this one - confirm.

##### for ResNet20: 3 residual blocks per stage, 10 exit heads
model_n = 3
branch_number= 10

##### optimizer and one unit-weight loss per exit head
opt = tf.keras.optimizers.SGD(momentum=0.9)
ls = [CustomLoss(1) for _ in range(branch_number)]

##### build, compile, load weight, evaluate on the test iterator
model_CIFAR_ResNet = CIFAR_ResNet(model_n, branch_number)
model_CIFAR_ResNet.compile(loss=ls, optimizer=opt, metrics=['accuracy'])
model_CIFAR_ResNet.load_weights(
    'profiling_models/ResNet20_10out_bs32_epoch200_lr0.01_catdog(128)')
model_CIFAR_ResNet.evaluate(test_it);



In [1]:
###### now the profiling part
###### TRAIN PART

In [7]:
##### passing TRAIN samples through model
##### this is to save computation time and exit rates for all the possible branches
# NOTE(review): this cell profiles whichever weights are currently loaded in
# `model_CIFAR_ResNet`; make sure the 32x32 loading cell was the last one run.
print("%%%%%%%%%%%%%%%%%%%%%%% ResNet20 + cat ans dog(32x32) %%%%%%%%%%%%%%%%%%%%%%%%%%%%")
per_sample_label_list = []
per_sample_out_vector_list = []
per_sample_comp_latency_backbone_list = []
per_sample_comp_latency_exitbranch_list = []


#### saving intermediate data once
# Samples are fed one at a time (batch of 1) so the latencies are per sample.
for i in range(len(train_it)):
    temp_batch = train_it[i]

    for j in range(len(temp_batch[0])):
        pic = temp_batch[0][j]
        label = temp_batch[1][j]
        per_sample_label_list.append(np.array(label).reshape(1, 2))

        # training=1000 is the sentinel that makes the model also return timings
        res = model_CIFAR_ResNet(np.array(pic.reshape(1, image_resize, image_resize, 3)), training=1000)

        per_sample_out_vector_list.append(res[0])
        per_sample_comp_latency_backbone_list.append(res[1])
        per_sample_comp_latency_exitbranch_list.append(res[2])

# tf.timestamp() differences are in seconds; *1000 reports milliseconds
print("computation latencies backbone(train data)   ", np.mean(per_sample_comp_latency_backbone_list, axis=0)*1000)
print("computation latencies exitbranch(train data)   ", np.mean(per_sample_comp_latency_exitbranch_list, axis=0)*1000)


##### exit-rate simulation on the saved TRAIN outputs

entropy_threshold_list = np.linspace(0.000000001, 0.999999, num=20)

### all possible branches, and chosen number
chosen_number = branch_number
I = list(range(1, branch_number+1))

#### number of train samples for simulation
sample_number = train_it.samples

#### normalized entropy of every saved exit output, computed ONCE
# The entropy does not depend on the threshold, so it is hoisted out of the
# threshold loop. tf.math.xlogy(p, p) equals p*log(p) but returns 0 for p == 0;
# the plain log(p)*p form gives NaN for a saturated softmax, and NaN < thresh
# is False, which pushed the most confident samples to the last exit.
exit_probs = tf.stack([tf.concat(outs, axis=0) for outs in per_sample_out_vector_list])  # (samples, exits, classes)
normalized_entropy = (-1 * tf.math.reduce_sum(
    tf.math.xlogy(exit_probs, exit_probs) / np.log(exit_probs.shape[-1]), axis=-1)).numpy()
normalized_entropy = normalized_entropy[:sample_number, :branch_number]


#### per placement, in this case it is only 1, all the possible branches
for item in combinations(I, chosen_number):
    placement = list(np.array(item))
    print("&&&&& selected exit ", placement, "  &&&&&&")

    ent_exitrate_list = []

    ##### per threshold
    for thresh in entropy_threshold_list:
        print("------------------ ", thresh, "------------------------")

        # A sample leaves at the first exit whose entropy is below the
        # threshold; the last exit always accepts. The comparison is done in
        # float32, the dtype of the model outputs.
        confident = normalized_entropy < np.float32(thresh)
        confident[:, -1] = True
        threshold_exit = np.argmax(confident, axis=1) + 1  # 1-based exit index per sample

        # handling exit percentage part
        unique, counts = np.unique(threshold_exit, return_counts=True)
        exitper_list_dict = dict(zip(unique, counts))
        print("exit rate per thresh ", exitper_list_dict)

        # number of samples leaving at each placed exit (0 when none did)
        threshold_exitrate = [exitper_list_dict.get(placement[k], 0) for k in range(branch_number)]
        ent_exitrate_list.append(np.array(threshold_exitrate) / sample_number)

    print("------------------------------------------------------------------")
    print("exit rate average ", np.mean(ent_exitrate_list, axis=0))
    break

%%%%%%%%%%%%%%%%%%%%%%% ResNet20 + cat ans dog(32x32) %%%%%%%%%%%%%%%%%%%%%%%%%%%%




computation latencies backbone(train data)    [1.00057505 1.52657419 1.33259182 1.3143112  1.67784431 1.35417646
 1.34030226 1.58788354 1.30361067 1.29685041]
computation latencies exitbranch(train data)    [0.55071364 0.55881715 0.5637968  0.55248329 0.55227588 0.54661874
 0.54181743 0.54673773 0.54091606 0.53955897]
&&&&& selected exit  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]   &&&&&&
------------------  1e-09 ------------------------
exit rate per thresh  {3: 3, 10: 19996}
------------------  0.052631527263157896 ------------------------
exit rate per thresh  {1: 217, 2: 1392, 3: 2210, 4: 1256, 5: 1159, 6: 1740, 7: 897, 8: 631, 9: 238, 10: 10259}
------------------  0.10526305352631579 ------------------------
exit rate per thresh  {1: 518, 2: 1876, 3: 2919, 4: 1585, 5: 1252, 6: 1852, 7: 849, 8: 569, 9: 262, 10: 8317}
------------------  0.1578945797894737 ------------------------
exit rate per thresh  {1: 878, 2: 2264, 3: 3280, 4: 1779, 5: 1293, 6: 1832, 7: 801, 8: 586, 9: 227, 10: 7059}
-

In [6]:
##### passing TRAIN samples through model
##### this is to save computation time and exit rates for all the possible branches
# NOTE(review): this cell profiles whichever weights are currently loaded in
# `model_CIFAR_ResNet` and reads `train_it` / `image_resize` from the data
# cell; presumably those were set up for 128x128 before running - confirm.
print("%%%%%%%%%%%%%%%%%%%%%%% ResNet20 + cat ans dog(128x128) %%%%%%%%%%%%%%%%%%%%%%%%%%%%")
per_sample_label_list = []
per_sample_out_vector_list = []
per_sample_comp_latency_backbone_list = []
per_sample_comp_latency_exitbranch_list = []


#### saving intermediate data once
# Samples are fed one at a time (batch of 1) so the latencies are per sample.
for i in range(len(train_it)):
    temp_batch = train_it[i]

    for j in range(len(temp_batch[0])):
        pic = temp_batch[0][j]
        label = temp_batch[1][j]
        per_sample_label_list.append(np.array(label).reshape(1, 2))

        # training=1000 is the sentinel that makes the model also return timings
        res = model_CIFAR_ResNet(np.array(pic.reshape(1, image_resize, image_resize, 3)), training=1000)

        per_sample_out_vector_list.append(res[0])
        per_sample_comp_latency_backbone_list.append(res[1])
        per_sample_comp_latency_exitbranch_list.append(res[2])

# tf.timestamp() differences are in seconds; *1000 reports milliseconds
print("computation latencies backbone(train data)   ", np.mean(per_sample_comp_latency_backbone_list, axis=0)*1000)
print("computation latencies exitbranch(train data)   ", np.mean(per_sample_comp_latency_exitbranch_list, axis=0)*1000)


##### exit-rate simulation on the saved TRAIN outputs

entropy_threshold_list = np.linspace(0.000000001, 0.999999, num=20)

### all possible branches, and chosen number
chosen_number = branch_number
I = list(range(1, branch_number+1))

#### number of train samples for simulation
sample_number = train_it.samples

#### normalized entropy of every saved exit output, computed ONCE
# The entropy does not depend on the threshold, so it is hoisted out of the
# threshold loop. tf.math.xlogy(p, p) equals p*log(p) but returns 0 for p == 0;
# the plain log(p)*p form gives NaN for a saturated softmax, and NaN < thresh
# is False, which pushed the most confident samples to the last exit.
exit_probs = tf.stack([tf.concat(outs, axis=0) for outs in per_sample_out_vector_list])  # (samples, exits, classes)
normalized_entropy = (-1 * tf.math.reduce_sum(
    tf.math.xlogy(exit_probs, exit_probs) / np.log(exit_probs.shape[-1]), axis=-1)).numpy()
normalized_entropy = normalized_entropy[:sample_number, :branch_number]


#### per placement, in this case it is only 1, all the possible branches
for item in combinations(I, chosen_number):
    placement = list(np.array(item))
    print("&&&&& selected exit ", placement, "  &&&&&&")

    ent_exitrate_list = []

    ##### per threshold
    for thresh in entropy_threshold_list:
        print("------------------ ", thresh, "------------------------")

        # A sample leaves at the first exit whose entropy is below the
        # threshold; the last exit always accepts. The comparison is done in
        # float32, the dtype of the model outputs.
        confident = normalized_entropy < np.float32(thresh)
        confident[:, -1] = True
        threshold_exit = np.argmax(confident, axis=1) + 1  # 1-based exit index per sample

        # handling exit percentage part
        unique, counts = np.unique(threshold_exit, return_counts=True)
        exitper_list_dict = dict(zip(unique, counts))
        print("exit rate per thresh ", exitper_list_dict)

        # number of samples leaving at each placed exit (0 when none did)
        threshold_exitrate = [exitper_list_dict.get(placement[k], 0) for k in range(branch_number)]
        ent_exitrate_list.append(np.array(threshold_exitrate) / sample_number)

    print("------------------------------------------------------------------")
    print("exit rate average ", np.mean(ent_exitrate_list, axis=0))
    break

%%%%%%%%%%%%%%%%%%%%%%% ResNet20 + cat ans dog(128x128) %%%%%%%%%%%%%%%%%%%%%%%%%%%%




computation latencies backbone(train data)    [1.05533315 1.59532795 1.40126899 1.3827306  1.6223228  1.34523489
 1.33511933 1.58681718 1.34406202 1.33806372]
computation latencies exitbranch(train data)    [0.55596838 0.55857674 0.56180903 0.54995784 0.54561086 0.5383982
 0.53693254 0.54295068 0.54104106 0.53888817]
&&&&& selected exit  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]   &&&&&&
------------------  1e-09 ------------------------
exit rate per thresh  {7: 15, 8: 41, 9: 24, 10: 19919}
------------------  0.052631527263157896 ------------------------
exit rate per thresh  {1: 1, 2: 844, 3: 945, 4: 2030, 5: 2899, 6: 2450, 7: 1294, 8: 1154, 9: 692, 10: 7690}
------------------  0.10526305352631579 ------------------------
exit rate per thresh  {1: 6, 2: 1284, 3: 1634, 4: 2758, 5: 3218, 6: 2304, 7: 1098, 8: 1006, 9: 595, 10: 6096}
------------------  0.1578945797894737 ------------------------
exit rate per thresh  {1: 26, 2: 1689, 3: 2280, 4: 3132, 5: 3234, 6: 2141, 7: 1012, 8: 872, 9: 498, 

In [10]:
##### TEST PART, FOR SIMULATION ONLY

In [8]:
##### passing TEST samples through model once and save the intermediate data
#### this is for doing the simulation

per_sample_label_list = []
per_sample_out_vector_list = []
per_sample_comp_latency_backbone_list = []
per_sample_comp_latency_exitbranch_list = []

for batch_index in range(len(test_it)):
    temp_batch = test_it[batch_index]
    batch_images, batch_labels = temp_batch[0], temp_batch[1]

    # Feed the samples one by one (batch of 1) to record per-sample results.
    for pic, label in zip(batch_images, batch_labels):
        per_sample_label_list.append(np.array(label).reshape(1, 2))

        single_image = np.array(pic.reshape(1, image_resize, image_resize, 3))
        # training=1000 is the sentinel that makes the model also return timings
        res = model_CIFAR_ResNet(single_image, training=1000)

        per_sample_out_vector_list.append(res[0])
        per_sample_comp_latency_backbone_list.append(res[1])
        per_sample_comp_latency_exitbranch_list.append(res[2])

# Average over samples; the factor 1000 converts seconds to milliseconds.
backbone_latency_ms = np.mean(per_sample_comp_latency_backbone_list, axis=0) * 1000
exitbranch_latency_ms = np.mean(per_sample_comp_latency_exitbranch_list, axis=0) * 1000
print("computation latencies backbone(test data)   ", backbone_latency_ms)
print("computation latencies exitbranch(test data)   ", exitbranch_latency_ms)


computation latencies backbone(test data)    [1.11432074 1.7882254  1.5120326  1.49918471 1.89505629 1.50203495
 1.48380924 2.07751615 1.87093451 1.85276709]
computation latencies exitbranch(test data)    [0.69392238 0.69457877 0.68776786 0.68741877 0.69685372 0.6818094
 0.68248296 0.70836488 0.70242006 0.70320219]
